kodexa 7.5.514404640805__py3-none-any.whl → 8.0.14958192442__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kodexa/dataclasses/__init__.py +1 -1
- kodexa/model/__init__.py +2 -2
- kodexa/model/objects.py +21 -1
- kodexa/model/utils.py +1 -1
- kodexa/pipeline/pipeline.py +1 -1
- kodexa/platform/client.py +1 -2
- kodexa/platform/kodexa.py +4 -1
- kodexa/platform/manifest.py +447 -0
- kodexa/selectors/__init__.py +1 -1
- kodexa/selectors/ast.py +371 -98
- kodexa/selectors/error.py +29 -0
- kodexa/selectors/kodexa-ast-visitor.py +268 -0
- kodexa/selectors/parser.py +91 -0
- kodexa/selectors/resources/KodexaSelector.interp +99 -0
- kodexa/selectors/resources/KodexaSelector.tokens +56 -0
- kodexa/selectors/resources/KodexaSelectorLexer.interp +119 -0
- kodexa/selectors/resources/KodexaSelectorLexer.py +204 -0
- kodexa/selectors/resources/KodexaSelectorLexer.tokens +56 -0
- kodexa/selectors/resources/KodexaSelectorListener.py +570 -0
- kodexa/selectors/resources/KodexaSelectorParser.py +3246 -0
- kodexa/selectors/resources/KodexaSelectorVisitor.py +323 -0
- kodexa/selectors/visitor.py +265 -0
- kodexa/steps/__init__.py +4 -2
- kodexa/steps/common.py +0 -68
- kodexa/testing/test_utils.py +1 -1
- {kodexa-7.5.514404640805.dist-info → kodexa-8.0.14958192442.dist-info}/METADATA +7 -3
- kodexa-8.0.14958192442.dist-info/RECORD +53 -0
- {kodexa-7.5.514404640805.dist-info → kodexa-8.0.14958192442.dist-info}/WHEEL +1 -1
- kodexa/model/model.py +0 -3259
- kodexa/model/persistence.py +0 -2017
- kodexa/selectors/core.py +0 -124
- kodexa/selectors/lexrules.py +0 -137
- kodexa/selectors/lextab.py +0 -83
- kodexa/selectors/lextab.pyi +0 -1
- kodexa/selectors/parserules.py +0 -414
- kodexa/selectors/parserules.pyi +0 -1
- kodexa/selectors/parsetab.py +0 -4149
- kodexa/selectors/parsetab.pyi +0 -1
- kodexa-7.5.514404640805.dist-info/RECORD +0 -50
- {kodexa-7.5.514404640805.dist-info → kodexa-8.0.14958192442.dist-info}/LICENSE +0 -0
kodexa/model/persistence.py
DELETED
@@ -1,2017 +0,0 @@
|
|
1
|
-
import dataclasses
|
2
|
-
import logging
|
3
|
-
import pathlib
|
4
|
-
import sqlite3
|
5
|
-
import tempfile
|
6
|
-
import time
|
7
|
-
import uuid
|
8
|
-
from typing import List, Optional
|
9
|
-
|
10
|
-
import msgpack
|
11
|
-
|
12
|
-
from kodexa.model import Document, ContentNode, SourceMetadata
|
13
|
-
from kodexa.model.model import (
|
14
|
-
DocumentMetadata,
|
15
|
-
ContentFeature,
|
16
|
-
ContentException,
|
17
|
-
ModelInsight, ProcessingStep,
|
18
|
-
)
|
19
|
-
from kodexa.model.objects import DocumentTaxonValidation
|
20
|
-
|
21
|
-
logger = logging.getLogger()
|
22
|
-
|
23
|
-
# Heavily used SQL
|
24
|
-
EXCEPTION_INSERT = "INSERT INTO content_exceptions (tag, message, exception_details, group_uuid, tag_uuid, exception_type, severity, node_uuid, exception_type_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"
|
25
|
-
EXCEPTION_SELECT = "select tag, message, exception_details, group_uuid, tag_uuid, exception_type, severity, node_uuid, exception_type_id from content_exceptions"
|
26
|
-
|
27
|
-
MODEL_INSIGHT_INSERT = "INSERT INTO model_insights (model_insight) VALUES (?)"
|
28
|
-
MODEL_INSIGHT_SELECT = "select model_insight from model_insights"
|
29
|
-
|
30
|
-
FEATURE_INSERT = "INSERT INTO ft (id, cn_id, f_type, binary_value, single, tag_uuid) VALUES (?,?,?,?,?,?)"
|
31
|
-
FEATURE_DELETE = "DELETE FROM ft where cn_id=? and f_type=?"
|
32
|
-
|
33
|
-
CONTENT_NODE_INSERT = "INSERT INTO cn (pid, nt, idx) VALUES (?,?,?)"
|
34
|
-
CONTENT_NODE_UPDATE = "UPDATE cn set pid=?, nt=?, idx=? WHERE id=?"
|
35
|
-
|
36
|
-
CONTENT_NODE_PART_INSERT = (
|
37
|
-
"INSERT INTO cnp (cn_id, pos, content, content_idx) VALUES (?,?,?,?)"
|
38
|
-
)
|
39
|
-
NOTE_TYPE_INSERT = "insert into n_type(name) values (?)"
|
40
|
-
NODE_TYPE_LOOKUP = "select id from n_type where name = ?"
|
41
|
-
FEATURE_TYPE_INSERT = "insert into f_type(name) values (?)"
|
42
|
-
FEATURE_TYPE_LOOKUP = "select id from f_type where name = ?"
|
43
|
-
METADATA_INSERT = "insert into metadata(id,metadata) values (1,?)"
|
44
|
-
METADATA_DELETE = "delete from metadata where id=1"
|
45
|
-
|
46
|
-
# Configuration constants
|
47
|
-
CACHE_SIZE = 10000 # Number of nodes to cache
|
48
|
-
BATCH_SIZE = 1000 # Size of batches for bulk operations
|
49
|
-
SLOW_QUERY_THRESHOLD = 1.0 # Seconds
|
50
|
-
MAX_CONNECTIONS = 5 # Maximum number of database connections
|
51
|
-
|
52
|
-
def monitor_performance(func):
|
53
|
-
"""Performance monitoring decorator"""
|
54
|
-
def wrapper(*args, **kwargs):
|
55
|
-
start_time = time.time()
|
56
|
-
result = func(*args, **kwargs)
|
57
|
-
duration = time.time() - start_time
|
58
|
-
if duration > SLOW_QUERY_THRESHOLD:
|
59
|
-
logger.warning(f"Slow operation detected: {func.__name__}, duration: {duration}s")
|
60
|
-
return result
|
61
|
-
return wrapper
|
62
|
-
|
63
|
-
class SqliteDocumentPersistence(object):
|
64
|
-
"""
|
65
|
-
The Sqlite persistence engine to support large scale documents (part of the V4 Kodexa Document Architecture)
|
66
|
-
|
67
|
-
Attributes:
|
68
|
-
document (Document): The document to be persisted.
|
69
|
-
filename (str): The name of the file where the document is stored.
|
70
|
-
delete_on_close (bool): If True, the file will be deleted when the connection is closed.
|
71
|
-
"""
|
72
|
-
|
73
|
-
"""
|
74
|
-
The Sqlite persistence engine to support large scale documents (part of the V4 Kodexa Document Architecture)
|
75
|
-
"""
|
76
|
-
|
77
|
-
def __init__(self, document: Document, filename: str = None, delete_on_close=False, inmemory=False, persistence_manager=None):
|
78
|
-
self.document = document
|
79
|
-
|
80
|
-
self.node_types = {}
|
81
|
-
self.node_type_id_by_name = {}
|
82
|
-
self.feature_type_id_by_name = {}
|
83
|
-
self.feature_type_names = {}
|
84
|
-
self.delete_on_close = delete_on_close
|
85
|
-
|
86
|
-
import sqlite3
|
87
|
-
|
88
|
-
self.is_new = True
|
89
|
-
if filename is not None:
|
90
|
-
self.is_tmp = False
|
91
|
-
path = pathlib.Path(filename)
|
92
|
-
if path.exists():
|
93
|
-
# At this point we need to load the db
|
94
|
-
self.is_new = False
|
95
|
-
else:
|
96
|
-
from kodexa import KodexaPlatform
|
97
|
-
|
98
|
-
new_file, filename = tempfile.mkstemp(
|
99
|
-
suffix=".kddb", dir=KodexaPlatform.get_tempdir()
|
100
|
-
)
|
101
|
-
self.is_tmp = True
|
102
|
-
|
103
|
-
self.current_filename = filename
|
104
|
-
|
105
|
-
if inmemory:
|
106
|
-
self.inmemory=True
|
107
|
-
self.connection = self.create_in_memory_database(filename)
|
108
|
-
else:
|
109
|
-
self.inmemory=False
|
110
|
-
self.connection = sqlite3.connect(filename)
|
111
|
-
|
112
|
-
self.cursor = self.connection.cursor()
|
113
|
-
self.cursor.execute("PRAGMA journal_mode=OFF")
|
114
|
-
self.cursor.execute("PRAGMA temp_store=MEMORY")
|
115
|
-
self.cursor.execute("PRAGMA mmap_size=30000000000")
|
116
|
-
self.cursor.execute("PRAGMA cache_size=10000")
|
117
|
-
self.cursor.execute("PRAGMA page_size=4096")
|
118
|
-
|
119
|
-
try:
|
120
|
-
# We need to populate node_type_id_by_name
|
121
|
-
for n_type in self.cursor.execute("select id,name from n_type"):
|
122
|
-
self.node_types[n_type[0]] = n_type[1]
|
123
|
-
self.node_type_id_by_name[n_type[1]] = n_type[0]
|
124
|
-
except:
|
125
|
-
pass
|
126
|
-
|
127
|
-
def create_in_memory_database(self, disk_db_path: str):
|
128
|
-
# Connect to the in-memory database
|
129
|
-
mem_conn = sqlite3.connect(':memory:')
|
130
|
-
mem_cursor = mem_conn.cursor()
|
131
|
-
|
132
|
-
# Connect to the database on disk
|
133
|
-
disk_conn = sqlite3.connect(disk_db_path)
|
134
|
-
disk_cursor = disk_conn.cursor()
|
135
|
-
|
136
|
-
# Load the contents of the disk database into memory
|
137
|
-
disk_cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='table';")
|
138
|
-
tables = disk_cursor.fetchall()
|
139
|
-
for table_name, create_table_sql in tables:
|
140
|
-
if "sqlite" in table_name:
|
141
|
-
continue
|
142
|
-
|
143
|
-
# Create the table structure in the in-memory database
|
144
|
-
mem_cursor.execute(create_table_sql)
|
145
|
-
|
146
|
-
# Populate the table with data from the disk database
|
147
|
-
disk_cursor.execute(f"SELECT * FROM {table_name}")
|
148
|
-
rows = disk_cursor.fetchall()
|
149
|
-
for row in rows:
|
150
|
-
placeholders = ', '.join('?' * len(row))
|
151
|
-
mem_cursor.execute(f"INSERT INTO {table_name} VALUES ({placeholders})", row)
|
152
|
-
|
153
|
-
# Commit changes and close disk connection
|
154
|
-
mem_conn.commit()
|
155
|
-
disk_conn.close()
|
156
|
-
|
157
|
-
return mem_conn
|
158
|
-
|
159
|
-
@monitor_performance
|
160
|
-
def get_all_tags(self):
|
161
|
-
"""
|
162
|
-
Retrieves all tags from the document.
|
163
|
-
|
164
|
-
Returns:
|
165
|
-
list: A list of all tags in the document.
|
166
|
-
"""
|
167
|
-
features = []
|
168
|
-
for feature in self.cursor.execute(
|
169
|
-
"select name from f_type where name like 'tag:%'"
|
170
|
-
).fetchall():
|
171
|
-
features.append(feature[0].split(":")[1])
|
172
|
-
|
173
|
-
return features
|
174
|
-
|
175
|
-
@monitor_performance
|
176
|
-
def update_features(self, node):
|
177
|
-
"""
|
178
|
-
Updates the features of a given node in the document.
|
179
|
-
|
180
|
-
Args:
|
181
|
-
node (Node): The node whose features are to be updated.
|
182
|
-
"""
|
183
|
-
|
184
|
-
next_feature_id = self.get_max_feature_id()
|
185
|
-
all_features = []
|
186
|
-
for feature in node.get_features():
|
187
|
-
binary_value = sqlite3.Binary(
|
188
|
-
msgpack.packb(feature.value, use_bin_type=True)
|
189
|
-
)
|
190
|
-
|
191
|
-
tag_uuid = None
|
192
|
-
if feature.feature_type == "tag" and "uuid" in feature.value[0]:
|
193
|
-
tag_uuid = feature.value[0]["uuid"]
|
194
|
-
|
195
|
-
all_features.append(
|
196
|
-
[
|
197
|
-
next_feature_id,
|
198
|
-
node.uuid,
|
199
|
-
self.get_feature_type_id(feature),
|
200
|
-
binary_value,
|
201
|
-
feature.single,
|
202
|
-
tag_uuid,
|
203
|
-
]
|
204
|
-
)
|
205
|
-
|
206
|
-
next_feature_id = next_feature_id + 1
|
207
|
-
|
208
|
-
self.cursor.execute("DELETE FROM ft where cn_id=?", [node.uuid])
|
209
|
-
self.cursor.executemany(FEATURE_INSERT, all_features)
|
210
|
-
|
211
|
-
@monitor_performance
|
212
|
-
def update_node(self, node):
|
213
|
-
"""
|
214
|
-
Updates a given node in the document.
|
215
|
-
|
216
|
-
Args:
|
217
|
-
node (Node): The node to be updated.
|
218
|
-
"""
|
219
|
-
self.cursor.execute(
|
220
|
-
"update cn set idx=?, pid=? where id=?",
|
221
|
-
[node.index, node._parent_uuid, node.uuid],
|
222
|
-
)
|
223
|
-
|
224
|
-
@monitor_performance
|
225
|
-
def get_content_nodes(self, node_type, parent_node: ContentNode, include_children):
|
226
|
-
"""
|
227
|
-
Retrieves content nodes from the document based on the given parameters.
|
228
|
-
|
229
|
-
Args:
|
230
|
-
node_type (str): The type of the node to be retrieved.
|
231
|
-
parent_node (ContentNode): The parent node of the nodes to be retrieved.
|
232
|
-
include_children (bool): If True, child nodes will also be retrieved.
|
233
|
-
|
234
|
-
Returns:
|
235
|
-
list: A list of content nodes that match the given parameters.
|
236
|
-
"""
|
237
|
-
nodes = []
|
238
|
-
if not self.connection.in_transaction:
|
239
|
-
self.cursor.execute("BEGIN TRANSACTION")
|
240
|
-
if include_children:
|
241
|
-
if node_type == "*":
|
242
|
-
query = """
|
243
|
-
with recursive
|
244
|
-
parent_node(id, pid, nt, idx, path) AS (
|
245
|
-
VALUES (?,?,?,?,?)
|
246
|
-
UNION ALL
|
247
|
-
SELECT cns.id, cns.pid, cns.nt, cns.idx, parent_node.path || substr('0000000' || cns.idx, -6, 6)
|
248
|
-
FROM cn cns, parent_node
|
249
|
-
WHERE parent_node.id = cns.pid
|
250
|
-
)
|
251
|
-
SELECT id, pid, nt, idx, path from parent_node order by path
|
252
|
-
"""
|
253
|
-
|
254
|
-
try:
|
255
|
-
results = self.cursor.execute(
|
256
|
-
query,
|
257
|
-
[
|
258
|
-
parent_node.uuid,
|
259
|
-
parent_node.get_parent().uuid
|
260
|
-
if parent_node.get_parent()
|
261
|
-
else None,
|
262
|
-
next(
|
263
|
-
key
|
264
|
-
for key, value in self.node_types.items()
|
265
|
-
if value == parent_node.get_node_type()
|
266
|
-
),
|
267
|
-
parent_node.index,
|
268
|
-
f"{parent_node.index}".zfill(6),
|
269
|
-
],
|
270
|
-
).fetchall()
|
271
|
-
except StopIteration:
|
272
|
-
return []
|
273
|
-
else:
|
274
|
-
query = """
|
275
|
-
with recursive
|
276
|
-
parent_node(id, pid, nt, idx, path) AS (
|
277
|
-
VALUES (?,?,?,?,?)
|
278
|
-
UNION ALL
|
279
|
-
SELECT cns.id, cns.pid, cns.nt, cns.idx, parent_node.path || substr('000000' || cns.idx, -6, 6)
|
280
|
-
FROM cn cns, parent_node
|
281
|
-
WHERE parent_node.id = cns.pid
|
282
|
-
)
|
283
|
-
SELECT id, pid, nt, idx, path from parent_node where nt=? order by path
|
284
|
-
"""
|
285
|
-
|
286
|
-
try:
|
287
|
-
results = self.cursor.execute(
|
288
|
-
query,
|
289
|
-
[
|
290
|
-
parent_node.uuid,
|
291
|
-
parent_node.get_parent().uuid
|
292
|
-
if parent_node.get_parent()
|
293
|
-
else None,
|
294
|
-
next(
|
295
|
-
key
|
296
|
-
for key, value in self.node_types.items()
|
297
|
-
if value == parent_node.get_node_type()
|
298
|
-
),
|
299
|
-
parent_node.index,
|
300
|
-
f"{parent_node.index}".zfill(6),
|
301
|
-
next(
|
302
|
-
key
|
303
|
-
for key, value in self.node_types.items()
|
304
|
-
if value == node_type
|
305
|
-
),
|
306
|
-
],
|
307
|
-
).fetchall()
|
308
|
-
except StopIteration:
|
309
|
-
self.connection.commit()
|
310
|
-
return []
|
311
|
-
else:
|
312
|
-
query = "select id, pid, nt, idx from cn where pid=? and nt=? order by idx"
|
313
|
-
try:
|
314
|
-
results = self.cursor.execute(
|
315
|
-
query,
|
316
|
-
[
|
317
|
-
parent_node.uuid,
|
318
|
-
next(
|
319
|
-
key
|
320
|
-
for key, value in self.node_types.items()
|
321
|
-
if value == node_type
|
322
|
-
),
|
323
|
-
],
|
324
|
-
).fetchall()
|
325
|
-
except StopIteration:
|
326
|
-
self.connection.commit()
|
327
|
-
return []
|
328
|
-
|
329
|
-
for raw_node in list(results):
|
330
|
-
nodes.append(self.__build_node(raw_node))
|
331
|
-
|
332
|
-
self.connection.commit()
|
333
|
-
|
334
|
-
return nodes
|
335
|
-
|
336
|
-
def initialize(self):
|
337
|
-
"""
|
338
|
-
Initializes the SqliteDocumentPersistence object by either building a new database or loading an existing one.
|
339
|
-
"""
|
340
|
-
if self.is_new:
|
341
|
-
self.__build_db()
|
342
|
-
else:
|
343
|
-
self.__load_document()
|
344
|
-
|
345
|
-
def close(self):
|
346
|
-
"""
|
347
|
-
Closes the connection to the database. If delete_on_close is True, the file will also be deleted.
|
348
|
-
"""
|
349
|
-
if self.is_tmp or self.delete_on_close:
|
350
|
-
pathlib.Path(self.current_filename).unlink()
|
351
|
-
else:
|
352
|
-
self.cursor.close()
|
353
|
-
self.connection.close()
|
354
|
-
|
355
|
-
@monitor_performance
|
356
|
-
def get_max_feature_id(self):
|
357
|
-
"""
|
358
|
-
Retrieves the maximum feature id from the document.
|
359
|
-
|
360
|
-
Returns:
|
361
|
-
int: The maximum feature id.
|
362
|
-
"""
|
363
|
-
max_id = self.cursor.execute("select max(id) from ft").fetchone()
|
364
|
-
if max_id[0] is None:
|
365
|
-
return 1
|
366
|
-
|
367
|
-
return max_id[0] + 1
|
368
|
-
|
369
|
-
def __build_db(self):
|
370
|
-
"""
|
371
|
-
Builds a new database for the document.
|
372
|
-
"""
|
373
|
-
self.cursor.execute(
|
374
|
-
"CREATE TABLE metadata (id integer primary key, metadata text)"
|
375
|
-
)
|
376
|
-
self.cursor.execute(
|
377
|
-
"CREATE TABLE cn (id integer primary key, nt INTEGER, pid INTEGER, idx INTEGER)"
|
378
|
-
)
|
379
|
-
self.cursor.execute(
|
380
|
-
"CREATE TABLE cnp (id integer primary key, cn_id INTEGER, pos integer, content text, content_idx integer)"
|
381
|
-
)
|
382
|
-
|
383
|
-
self.cursor.execute("CREATE TABLE n_type (id integer primary key, name text)")
|
384
|
-
self.cursor.execute("CREATE TABLE f_type (id integer primary key, name text)")
|
385
|
-
self.cursor.execute(
|
386
|
-
"""CREATE TABLE ft
|
387
|
-
(
|
388
|
-
id integer primary key,
|
389
|
-
cn_id integer,
|
390
|
-
f_type INTEGER,
|
391
|
-
binary_value blob,
|
392
|
-
single integer,
|
393
|
-
tag_uuid text
|
394
|
-
)"""
|
395
|
-
)
|
396
|
-
|
397
|
-
self.cursor.execute("CREATE UNIQUE INDEX n_type_uk ON n_type(name);")
|
398
|
-
self.cursor.execute("CREATE UNIQUE INDEX f_type_uk ON f_type(name);")
|
399
|
-
self.cursor.execute("CREATE INDEX cn_perf ON cn(nt);")
|
400
|
-
self.cursor.execute("CREATE INDEX cn_perf2 ON cn(pid);")
|
401
|
-
self.cursor.execute("CREATE INDEX cnp_perf ON cnp(cn_id, pos);")
|
402
|
-
self.cursor.execute("CREATE INDEX f_perf ON ft(cn_id);")
|
403
|
-
self.cursor.execute("CREATE INDEX f_perf2 ON ft(tag_uuid);")
|
404
|
-
self.cursor.execute(
|
405
|
-
"""CREATE TABLE content_exceptions
|
406
|
-
(
|
407
|
-
id integer primary key,
|
408
|
-
tag text,
|
409
|
-
message text,
|
410
|
-
exception_details text,
|
411
|
-
group_uuid text,
|
412
|
-
tag_uuid text,
|
413
|
-
exception_type text,
|
414
|
-
exception_type_id text,
|
415
|
-
severity text,
|
416
|
-
node_uuid text
|
417
|
-
)"""
|
418
|
-
)
|
419
|
-
self.cursor.execute(
|
420
|
-
"CREATE TABLE model_insights (id integer primary key,model_insight text);"
|
421
|
-
)
|
422
|
-
self.document.version = "6.0.0"
|
423
|
-
|
424
|
-
self.__update_metadata()
|
425
|
-
|
426
|
-
@monitor_performance
|
427
|
-
def content_node_count(self):
|
428
|
-
"""
|
429
|
-
Counts the number of content nodes in the document.
|
430
|
-
|
431
|
-
Returns:
|
432
|
-
int: The number of content nodes in the document.
|
433
|
-
"""
|
434
|
-
self.cursor.execute("select * from cn").fetchall()
|
435
|
-
|
436
|
-
@monitor_performance
|
437
|
-
def get_feature_type_id(self, feature):
|
438
|
-
"""
|
439
|
-
Retrieves the id of a given feature.
|
440
|
-
|
441
|
-
Args:
|
442
|
-
feature (Feature): The feature whose id is to be retrieved.
|
443
|
-
|
444
|
-
Returns:
|
445
|
-
int: The id of the feature.
|
446
|
-
"""
|
447
|
-
return self.__resolve_f_type(feature)
|
448
|
-
|
449
|
-
def __resolve_f_type(self, feature):
|
450
|
-
"""
|
451
|
-
Resolves the feature type of a given feature.
|
452
|
-
|
453
|
-
Args:
|
454
|
-
feature (Feature): The feature whose feature type is to be resolved.
|
455
|
-
|
456
|
-
Returns:
|
457
|
-
int: The id of the feature type.
|
458
|
-
"""
|
459
|
-
feature_type_name = feature.feature_type + ":" + feature.name
|
460
|
-
|
461
|
-
if feature_type_name in self.feature_type_id_by_name:
|
462
|
-
return self.feature_type_id_by_name[feature_type_name]
|
463
|
-
|
464
|
-
result = self.cursor.execute(
|
465
|
-
FEATURE_TYPE_LOOKUP, [feature_type_name]
|
466
|
-
).fetchone()
|
467
|
-
if result is None:
|
468
|
-
new_feature_type_name_id = self.cursor.execute(
|
469
|
-
FEATURE_TYPE_INSERT, [feature_type_name]
|
470
|
-
).lastrowid
|
471
|
-
self.feature_type_names[new_feature_type_name_id] = feature_type_name
|
472
|
-
self.feature_type_id_by_name[feature_type_name] = new_feature_type_name_id
|
473
|
-
return new_feature_type_name_id
|
474
|
-
|
475
|
-
return result[0]
|
476
|
-
|
477
|
-
def __resolve_n_type(self, n_type):
|
478
|
-
"""
|
479
|
-
Resolves the node type of a given node.
|
480
|
-
|
481
|
-
Args:
|
482
|
-
n_type (str): The node type to be resolved.
|
483
|
-
|
484
|
-
Returns:
|
485
|
-
int: The id of the node type.
|
486
|
-
"""
|
487
|
-
if n_type in self.node_type_id_by_name:
|
488
|
-
return self.node_type_id_by_name[n_type]
|
489
|
-
result = self.cursor.execute(NODE_TYPE_LOOKUP, [n_type]).fetchone()
|
490
|
-
if result is None:
|
491
|
-
new_type_id = self.cursor.execute(NOTE_TYPE_INSERT, [n_type]).lastrowid
|
492
|
-
self.node_types[new_type_id] = n_type
|
493
|
-
self.node_type_id_by_name[n_type] = new_type_id
|
494
|
-
return new_type_id
|
495
|
-
|
496
|
-
return result[0]
|
497
|
-
|
498
|
-
@monitor_performance
|
499
|
-
def __insert_node(self, node: ContentNode, parent, execute=True):
|
500
|
-
"""
|
501
|
-
Inserts a node into the document.
|
502
|
-
|
503
|
-
Args:
|
504
|
-
node (ContentNode): The node to be inserted.
|
505
|
-
parent (Node): The parent node of the node to be inserted.
|
506
|
-
execute (bool, optional): If True, the node will be inserted immediately. Defaults to True.
|
507
|
-
|
508
|
-
Returns:
|
509
|
-
tuple: A tuple containing the values of the node and its parts.
|
510
|
-
"""
|
511
|
-
|
512
|
-
if node.index is None:
|
513
|
-
node.index = 0
|
514
|
-
|
515
|
-
if parent:
|
516
|
-
node._parent_uuid = parent.uuid
|
517
|
-
|
518
|
-
if node.uuid:
|
519
|
-
# Delete the existing node
|
520
|
-
cn_values = [
|
521
|
-
node._parent_uuid,
|
522
|
-
self.__resolve_n_type(node.node_type),
|
523
|
-
node.index,
|
524
|
-
node.uuid,
|
525
|
-
]
|
526
|
-
|
527
|
-
# Make sure we load the content parts if we haven't
|
528
|
-
node.get_content_parts()
|
529
|
-
|
530
|
-
if execute:
|
531
|
-
self.cursor.execute("DELETE FROM cn where id=?", [node.uuid])
|
532
|
-
self.cursor.execute(
|
533
|
-
"INSERT INTO cn (pid, nt, idx, id) VALUES (?,?,?,?)", cn_values
|
534
|
-
)
|
535
|
-
self.cursor.execute("DELETE FROM cnp where cn_id=?", [node.uuid])
|
536
|
-
|
537
|
-
cn_parts_values = []
|
538
|
-
for idx, part in enumerate(node.get_content_parts()):
|
539
|
-
cn_parts_values.append(
|
540
|
-
[
|
541
|
-
node.uuid,
|
542
|
-
idx,
|
543
|
-
part if isinstance(part, str) else None,
|
544
|
-
part if not isinstance(part, str) else None,
|
545
|
-
]
|
546
|
-
)
|
547
|
-
|
548
|
-
if execute:
|
549
|
-
self.cursor.executemany(CONTENT_NODE_PART_INSERT, cn_parts_values)
|
550
|
-
|
551
|
-
return ([cn_values], cn_parts_values)
|
552
|
-
|
553
|
-
raise Exception("Node must have a UUID?")
|
554
|
-
|
555
|
-
def __clean_none_values(self, d):
|
556
|
-
"""
|
557
|
-
Cleans a dictionary by removing keys with None values.
|
558
|
-
|
559
|
-
Args:
|
560
|
-
d (dict): The dictionary to be cleaned.
|
561
|
-
|
562
|
-
Returns:
|
563
|
-
dict: The cleaned dictionary.
|
564
|
-
"""
|
565
|
-
clean = {}
|
566
|
-
for k, v in d.items():
|
567
|
-
if isinstance(v, dict):
|
568
|
-
nested = self.__clean_none_values(v)
|
569
|
-
if len(nested.keys()) > 0:
|
570
|
-
clean[k] = nested
|
571
|
-
elif v is not None:
|
572
|
-
clean[k] = v
|
573
|
-
return clean
|
574
|
-
|
575
|
-
def __update_metadata(self):
|
576
|
-
"""
|
577
|
-
Updates the metadata of the document.
|
578
|
-
"""
|
579
|
-
document_metadata = {
|
580
|
-
"version": Document.CURRENT_VERSION,
|
581
|
-
"metadata": self.document.metadata,
|
582
|
-
"source": self.__clean_none_values(
|
583
|
-
dataclasses.asdict(self.document.source)
|
584
|
-
),
|
585
|
-
"mixins": self.document.get_mixins(),
|
586
|
-
"labels": self.document.labels,
|
587
|
-
"uuid": self.document.uuid,
|
588
|
-
}
|
589
|
-
self.cursor.execute(METADATA_DELETE)
|
590
|
-
self.cursor.execute(
|
591
|
-
METADATA_INSERT,
|
592
|
-
[sqlite3.Binary(msgpack.packb(document_metadata, use_bin_type=True))],
|
593
|
-
)
|
594
|
-
|
595
|
-
def __load_document(self):
|
596
|
-
"""
|
597
|
-
Loads an existing document from the database.
|
598
|
-
"""
|
599
|
-
for n_type in self.cursor.execute("select id,name from n_type"):
|
600
|
-
self.node_types[n_type[0]] = n_type[1]
|
601
|
-
for f_type in self.cursor.execute("select id,name from f_type"):
|
602
|
-
self.feature_type_names[f_type[0]] = f_type[1]
|
603
|
-
|
604
|
-
metadata = msgpack.unpackb(
|
605
|
-
self.cursor.execute("select * from metadata").fetchone()[1]
|
606
|
-
)
|
607
|
-
self.document.metadata = DocumentMetadata(metadata["metadata"])
|
608
|
-
self.document.version = (
|
609
|
-
metadata["version"]
|
610
|
-
if "version" in metadata and metadata["version"]
|
611
|
-
else Document.PREVIOUS_VERSION
|
612
|
-
)
|
613
|
-
# some older docs don't have a version or it's None
|
614
|
-
|
615
|
-
self.uuid = (
|
616
|
-
metadata["uuid"]
|
617
|
-
if "uuid" in metadata
|
618
|
-
else str(uuid.uuid5(uuid.NAMESPACE_DNS, "kodexa.com"))
|
619
|
-
)
|
620
|
-
if "source" in metadata and metadata["source"]:
|
621
|
-
self.document.source = SourceMetadata.from_dict(metadata["source"])
|
622
|
-
if "labels" in metadata and metadata["labels"]:
|
623
|
-
self.document.labels = metadata["labels"]
|
624
|
-
if "mixins" in metadata and metadata["mixins"]:
|
625
|
-
self.document._mixins = metadata["mixins"]
|
626
|
-
|
627
|
-
self.uuid = metadata.get("uuid")
|
628
|
-
|
629
|
-
import semver
|
630
|
-
|
631
|
-
root_node = self.cursor.execute(
|
632
|
-
"select id, pid, nt, idx from cn where pid is null"
|
633
|
-
).fetchone()
|
634
|
-
if root_node:
|
635
|
-
self.document.content_node = self.__build_node(root_node)
|
636
|
-
|
637
|
-
if semver.compare(self.document.version, "4.0.1") < 0:
|
638
|
-
# We need to migrate this to a 4.0.1 document
|
639
|
-
self.cursor.execute(
|
640
|
-
"""CREATE TABLE ft
|
641
|
-
(
|
642
|
-
id integer primary key,
|
643
|
-
cn_id integer,
|
644
|
-
f_type INTEGER,
|
645
|
-
binary_value blob,
|
646
|
-
single integer,
|
647
|
-
tag_uuid text
|
648
|
-
)"""
|
649
|
-
)
|
650
|
-
self.cursor.execute(
|
651
|
-
"insert into ft select f.id, f.cn_id, f.f_type, fv.binary_value, fv.single, null from f, f_value fv where fv.id = f.fvalue_id"
|
652
|
-
)
|
653
|
-
# we will create a new feature table
|
654
|
-
self.cursor.execute("drop table f")
|
655
|
-
self.cursor.execute("drop table f_value")
|
656
|
-
self.cursor.execute("CREATE INDEX f_perf ON ft(cn_id);")
|
657
|
-
self.cursor.execute("CREATE INDEX f_perf2 ON ft(tag_uuid);")
|
658
|
-
|
659
|
-
# We always run this
|
660
|
-
self.cursor.execute(
|
661
|
-
"""CREATE TABLE IF NOT EXISTS content_exceptions
|
662
|
-
(
|
663
|
-
id integer primary key,
|
664
|
-
tag text,
|
665
|
-
message text,
|
666
|
-
exception_details text,
|
667
|
-
group_uuid text,
|
668
|
-
tag_uuid text,
|
669
|
-
exception_type text,
|
670
|
-
severity text,
|
671
|
-
node_uuid text
|
672
|
-
)"""
|
673
|
-
)
|
674
|
-
self.cursor.execute(
|
675
|
-
"""CREATE TABLE IF NOT EXISTS model_insights
|
676
|
-
(
|
677
|
-
id integer primary key,
|
678
|
-
model_insight text
|
679
|
-
)"""
|
680
|
-
)
|
681
|
-
|
682
|
-
if semver.compare(self.document.version, "6.0.0") < 0:
|
683
|
-
from sqlite3 import OperationalError
|
684
|
-
|
685
|
-
try:
|
686
|
-
self.cursor.execute(
|
687
|
-
"ALTER TABLE content_exceptions ADD COLUMN exception_type_id text"
|
688
|
-
)
|
689
|
-
except OperationalError:
|
690
|
-
logger.info("exception_type_id column already exists")
|
691
|
-
pass
|
692
|
-
self.document.version = "6.0.0"
|
693
|
-
self.update_metadata()
|
694
|
-
|
695
|
-
def get_content_parts(self, new_node):
|
696
|
-
"""
|
697
|
-
Retrieves the content parts of a given node.
|
698
|
-
|
699
|
-
Args:
|
700
|
-
new_node (Node): The node whose content parts are to be retrieved.
|
701
|
-
|
702
|
-
Returns:
|
703
|
-
list: A list of the content parts of the node.
|
704
|
-
"""
|
705
|
-
content_parts = self.cursor.execute(
|
706
|
-
"select cn_id, pos, content, content_idx from cnp where cn_id = ? order by pos",
|
707
|
-
[new_node.uuid],
|
708
|
-
).fetchall()
|
709
|
-
|
710
|
-
parts = []
|
711
|
-
for content_part in content_parts:
|
712
|
-
if content_part[3] is None:
|
713
|
-
parts.append(content_part[2])
|
714
|
-
else:
|
715
|
-
parts.append(content_part[3])
|
716
|
-
return parts
|
717
|
-
|
718
|
-
def __build_node(self, node_row):
|
719
|
-
"""
|
720
|
-
Builds a node from a given row of the database.
|
721
|
-
|
722
|
-
Args:
|
723
|
-
node_row (tuple): A tuple containing the values of the node.
|
724
|
-
|
725
|
-
Returns:
|
726
|
-
Node: The built node.
|
727
|
-
"""
|
728
|
-
new_node = ContentNode(
|
729
|
-
self.document,
|
730
|
-
self.node_types[node_row[2]],
|
731
|
-
parent=self.get_node(node_row[1]),
|
732
|
-
)
|
733
|
-
new_node.uuid = node_row[0]
|
734
|
-
new_node.index = node_row[3]
|
735
|
-
return new_node
|
736
|
-
|
737
|
-
def add_content_node(self, node, parent, execute=True):
|
738
|
-
"""
|
739
|
-
Adds a content node to the document.
|
740
|
-
|
741
|
-
Args:
|
742
|
-
node (Node): The node to be added.
|
743
|
-
parent (Node): The parent node of the node to be added.
|
744
|
-
execute (bool, optional): If True, the node will be added immediately. Defaults to True.
|
745
|
-
|
746
|
-
Returns:
|
747
|
-
tuple: A tuple containing the values of the node and its parts.
|
748
|
-
"""
|
749
|
-
return self.__insert_node(node, parent, execute)
|
750
|
-
|
751
|
-
def remove_feature(self, node, feature_type, name):
|
752
|
-
"""
|
753
|
-
Removes a feature from a given node.
|
754
|
-
|
755
|
-
Args:
|
756
|
-
node (Node): The node from which the feature is to be removed.
|
757
|
-
feature_type (str): The type of the feature to be removed.
|
758
|
-
name (str): The name of the feature to be removed.
|
759
|
-
"""
|
760
|
-
|
761
|
-
feature = ContentFeature(feature_type, name, None)
|
762
|
-
f_values = [node.uuid, self.__resolve_f_type(feature)]
|
763
|
-
self.cursor.execute(FEATURE_DELETE, f_values)
|
764
|
-
|
765
|
-
def get_children(self, content_node):
|
766
|
-
"""
|
767
|
-
Retrieves the children of a given node.
|
768
|
-
|
769
|
-
Args:
|
770
|
-
content_node (ContentNode): The node whose children are to be retrieved.
|
771
|
-
|
772
|
-
Returns:
|
773
|
-
list: A list of the children of the node.
|
774
|
-
"""
|
775
|
-
|
776
|
-
# We need to get the child nodes
|
777
|
-
children = []
|
778
|
-
for child_node in self.cursor.execute(
|
779
|
-
"select id, pid, nt, idx from cn where pid = ? order by idx",
|
780
|
-
[content_node.uuid],
|
781
|
-
).fetchall():
|
782
|
-
children.append(self.__build_node(child_node))
|
783
|
-
return children
|
784
|
-
|
785
|
-
def get_child_ids(self, content_node):
|
786
|
-
"""
|
787
|
-
Retrieves the ids of the children of a given node.
|
788
|
-
|
789
|
-
Args:
|
790
|
-
content_node (ContentNode): The node whose children's ids are to be retrieved.
|
791
|
-
|
792
|
-
Returns:
|
793
|
-
list: A list of the ids of the children of the node.
|
794
|
-
"""
|
795
|
-
|
796
|
-
# We need to get the child nodes
|
797
|
-
children = []
|
798
|
-
for child_node in self.cursor.execute(
|
799
|
-
"select id, pid, nt, idx from cn where pid = ? order by idx",
|
800
|
-
[content_node.uuid],
|
801
|
-
).fetchall():
|
802
|
-
children.append(child_node[0])
|
803
|
-
return children
|
804
|
-
|
805
|
-
def get_node(self, node_id):
|
806
|
-
"""
|
807
|
-
Retrieves a node by its id.
|
808
|
-
|
809
|
-
Args:
|
810
|
-
node_id (int): The id of the node to be retrieved.
|
811
|
-
|
812
|
-
Returns:
|
813
|
-
Node: The node with the given id.
|
814
|
-
"""
|
815
|
-
node_row = self.cursor.execute(
|
816
|
-
"select id, pid, nt, idx from cn where id = ?", [node_id]
|
817
|
-
).fetchone()
|
818
|
-
if node_row:
|
819
|
-
return self.__build_node(node_row)
|
820
|
-
|
821
|
-
return None
|
822
|
-
|
823
|
-
def get_parent(self, content_node):
|
824
|
-
"""
|
825
|
-
Retrieves the parent of a given node.
|
826
|
-
|
827
|
-
Args:
|
828
|
-
content_node (ContentNode): The node whose parent is to be retrieved.
|
829
|
-
|
830
|
-
Returns:
|
831
|
-
Node: The parent of the node.
|
832
|
-
"""
|
833
|
-
|
834
|
-
parent = self.cursor.execute(
|
835
|
-
"select pid from cn where id = ?", [content_node.uuid]
|
836
|
-
).fetchone()
|
837
|
-
if parent:
|
838
|
-
return self.get_node(parent[0])
|
839
|
-
|
840
|
-
return None
|
841
|
-
|
842
|
-
def update_metadata(self):
|
843
|
-
"""
|
844
|
-
Updates the metadata of the document.
|
845
|
-
"""
|
846
|
-
self.__update_metadata()
|
847
|
-
|
848
|
-
def __rebuild_from_document(self):
|
849
|
-
"""
|
850
|
-
Rebuilds the database from the document.
|
851
|
-
"""
|
852
|
-
self.cursor.execute("DELETE FROM cn")
|
853
|
-
self.cursor.execute("DELETE FROM cnp")
|
854
|
-
self.cursor.execute("DELETE FROM ft")
|
855
|
-
|
856
|
-
self.__update_metadata()
|
857
|
-
if self.document.content_node:
|
858
|
-
self.__insert_node(self.document.content_node, None)
|
859
|
-
|
860
|
-
def sync(self):
    """
    Synchronizes the database with the document.

    Flushes metadata, commits the current transaction, then VACUUMs the
    file and re-applies the session PRAGMA settings. The statement order
    matters: VACUUM cannot run inside a transaction, so the commit comes
    first, and the cursor is recreated afterwards.
    """
    self.__update_metadata()
    # Let SQLite refresh its query-planner statistics before committing.
    self.cursor.execute("pragma optimize")
    self.connection.commit()
    self.cursor.execute("VACUUM")
    # VACUUM rebuilds the database file; grab a fresh cursor and restore
    # the performance-oriented PRAGMAs for this connection.
    self.cursor = self.connection.cursor()
    self.cursor.execute("PRAGMA journal_mode=OFF")
    self.cursor.execute("PRAGMA temp_store=MEMORY")
    self.cursor.execute("PRAGMA mmap_size=30000000000")
    self.cursor.execute("PRAGMA cache_size=10000")
    self.cursor.execute("PRAGMA page_size=4096")
|
874
|
-
|
875
|
-
def dump_in_memory_db_to_file(self):
    """
    Copies the in-memory database to the file at self.current_filename.

    Uses SQLite's backup API. The file connection is closed in a finally
    block so it cannot leak if the backup raises (the original only closed
    it on the success path).
    """
    # Connect to a new or existing database file
    disk_conn = sqlite3.connect(self.current_filename)
    try:
        # Connection-as-context-manager wraps the backup in a transaction.
        with disk_conn:
            self.connection.backup(disk_conn)
    finally:
        # Always release the file handle, even when the backup fails.
        disk_conn.close()
|
885
|
-
|
886
|
-
def get_bytes(self):
    """
    Retrieves the document as bytes.

    Syncs pending changes first; if the database is held in memory it is
    dumped to ``self.current_filename`` before the file is read back.

    Returns:
        bytes: The serialized document database file contents.
    """
    self.sync()

    if self.inmemory:
        # In-memory databases have no backing file until explicitly dumped.
        self.dump_in_memory_db_to_file()

    with open(self.current_filename, "rb") as f:
        return f.read()
|
900
|
-
|
901
|
-
def get_features(self, node):
    """
    Retrieves the features of a given node.

    Args:
        node (Node): The node whose features are to be retrieved.

    Returns:
        list: A list of ContentFeature objects for the node.
    """
    # We need to get the features back
    features = []
    for feature in self.cursor.execute(
        "select id, cn_id, f_type, binary_value, single from ft where cn_id = ?",
        [node.uuid],
    ).fetchall():
        # Feature type names are stored as "<feature_type>:<name>".
        feature_type_name = self.feature_type_names[feature[2]]
        # Split once instead of twice per row; indexes [0]/[1] match the
        # original split(":")[0] / split(":")[1] behavior exactly.
        name_parts = feature_type_name.split(":")
        single = feature[4] == 1
        value = msgpack.unpackb(feature[3])
        features.append(
            ContentFeature(
                name_parts[0],
                name_parts[1],
                value,
                single=single,
            )
        )

    return features
|
931
|
-
|
932
|
-
def update_content_parts(self, node, content_parts):
    """
    Updates the content parts of a given node.

    Args:
        node (Node): The node whose content parts are to be updated.
        content_parts (list): The new content parts of the node.
    """
    # Replace the node's existing parts wholesale.
    self.cursor.execute("delete from cnp where cn_id=?", [node.uuid])

    # String parts go in the text column, anything else in the index column.
    rows = [
        [
            node.uuid,
            idx,
            part if isinstance(part, str) else None,
            None if isinstance(part, str) else part,
        ]
        for idx, part in enumerate(content_parts)
    ]
    self.cursor.executemany(CONTENT_NODE_PART_INSERT, rows)
|
953
|
-
|
954
|
-
def remove_content_node(self, node):
    """
    Removes a node and its whole (non-virtual) subtree from the document.

    Args:
        node (Node): The node to be removed.

    Returns:
        list: The ids of the removed non-virtual nodes on success;
        implicitly None when the deletion fails and is rolled back.
    """

    def get_all_node_ids(node):
        """
        This function recursively traverses a node tree, collecting the ids of all non-virtual nodes.
        """
        all_node_ids = []
        if not node.virtual:
            all_node_ids.append(node.uuid)  # Append the uuid directly, not as a list
        for child in node.get_children():
            all_node_ids.extend(get_all_node_ids(child))
        return all_node_ids

    all_child_ids = get_all_node_ids(node)
    parameter_tuples = [(id,) for id in all_child_ids]  # Prepare the parameters as tuples

    # Assuming `self.cursor` is part of a larger transaction management system
    try:
        # Delete parts, node rows and features for every id in the subtree.
        self.cursor.executemany("delete from cnp where cn_id=?", parameter_tuples)
        self.cursor.executemany("delete from cn where id=?", parameter_tuples)
        self.cursor.executemany("delete from ft where cn_id=?", parameter_tuples)
        self.connection.commit()  # Commit the transaction if part of one
        return all_child_ids
    except Exception as e:
        self.connection.rollback()  # Rollback in case of error
        # NOTE(review): the error is logged and swallowed; callers receive
        # None instead of an exception — confirm this best-effort contract.
        logger.error(f"An error occurred: {e}")
|
986
|
-
|
987
|
-
def remove_all_features(self, node):
    """
    Removes all features from a given node.

    Delegates to remove_all_features_by_id so both code paths share the
    same delete statement instead of duplicating the SQL.

    Args:
        node (Node): The node from which all features are to be removed.
    """
    self.remove_all_features_by_id(node.uuid)
|
995
|
-
|
996
|
-
def remove_all_features_by_id(self, node_id):
|
997
|
-
"""
|
998
|
-
Removes all features from a node by its id.
|
999
|
-
|
1000
|
-
Args:
|
1001
|
-
node_id (int): The id of the node from which all features are to be removed.
|
1002
|
-
"""
|
1003
|
-
self.cursor.execute("delete from ft where cn_id=?", [node_id])
|
1004
|
-
|
1005
|
-
def get_next_node_id(self):
    """
    Retrieves the next available node id for the document.

    Returns:
        int: One greater than the current maximum node id, or 1 when the
        node table is empty.
    """
    row = self.cursor.execute("select max(id) from cn").fetchone()
    # max(id) over an empty table yields a single NULL.
    max_id = row[0]
    return 1 if max_id is None else max_id + 1
|
1017
|
-
|
1018
|
-
def get_tagged_nodes(self, tag, tag_uuid=None):
    """
    Retrieves nodes with a given tag.

    The tag and tag_uuid values are bound as SQL parameters; the original
    interpolated them into the statement with f-strings, which allowed SQL
    injection through crafted tag names. LIKE semantics are preserved by
    passing "tag:<tag>" as the bound pattern.

    Args:
        tag (str): The tag of the nodes to be retrieved.
        tag_uuid (str, optional): The uuid of the tag. Defaults to None.

    Returns:
        list: A list of nodes with the given tag.
    """
    if tag_uuid is None:
        query = (
            "select distinct(cn_id) from ft where f_type in "
            "(select id from f_type where name like ?)"
        )
        params = [f"tag:{tag}"]
    else:
        query = (
            "select distinct(cn_id) from ft where f_type in "
            "(select id from f_type where name like ?) and tag_uuid = ?"
        )
        params = [f"tag:{tag}", tag_uuid]

    content_nodes = []
    for content_node_ids in self.cursor.execute(query, params).fetchall():
        content_nodes.append(self.get_node(content_node_ids[0]))

    return content_nodes
|
1038
|
-
|
1039
|
-
def add_model_insight(self, model_insights: ModelInsight):
    """
    Adds a model insight to the document.

    Args:
        model_insights (ModelInsight): The model insight to be added.
    """
    # Use the pydantic v2 serialization API for consistency with
    # get_model_insights, which reads rows back via model_validate_json();
    # .json() is the deprecated v1 spelling of the same call.
    self.cursor.execute(MODEL_INSIGHT_INSERT, [model_insights.model_dump_json()])
|
1047
|
-
|
1048
|
-
def get_model_insights(self) -> List[ModelInsight]:
|
1049
|
-
"""
|
1050
|
-
Retrieves all model insights from the document.
|
1051
|
-
|
1052
|
-
Returns:
|
1053
|
-
list: A list of all model insights in the document.
|
1054
|
-
"""
|
1055
|
-
model_insights = []
|
1056
|
-
for model_insight in self.cursor.execute(MODEL_INSIGHT_SELECT).fetchall():
|
1057
|
-
model_insights.append(ModelInsight.model_validate_json(model_insight[0]))
|
1058
|
-
|
1059
|
-
return model_insights
|
1060
|
-
|
1061
|
-
def add_exception(self, exception: ContentException):
|
1062
|
-
"""
|
1063
|
-
Adds an exception to the document.
|
1064
|
-
|
1065
|
-
Args:
|
1066
|
-
exception (ContentException): The exception to be added.
|
1067
|
-
"""
|
1068
|
-
# Add an exception to the exception table
|
1069
|
-
self.cursor.execute(
|
1070
|
-
EXCEPTION_INSERT,
|
1071
|
-
[
|
1072
|
-
exception.tag,
|
1073
|
-
exception.message,
|
1074
|
-
exception.exception_details,
|
1075
|
-
exception.group_uuid,
|
1076
|
-
exception.tag_uuid,
|
1077
|
-
exception.exception_type,
|
1078
|
-
exception.severity,
|
1079
|
-
exception.node_uuid,
|
1080
|
-
exception.exception_type_id,
|
1081
|
-
],
|
1082
|
-
)
|
1083
|
-
|
1084
|
-
def get_exceptions(self) -> List[ContentException]:
    """
    Retrieves all exceptions from the document.

    Returns:
        list: A list of all exceptions in the document.
    """
    exceptions = []
    for row in self.cursor.execute(EXCEPTION_SELECT).fetchall():
        # Unpack the row positionally; column order is fixed by EXCEPTION_SELECT.
        (tag, message, details, group_uuid, tag_uuid,
         exc_type, severity, node_uuid, exc_type_id) = row
        exceptions.append(
            ContentException(
                tag=tag,
                message=message,
                exception_details=details,
                group_uuid=group_uuid,
                tag_uuid=tag_uuid,
                exception_type=exc_type,
                severity=severity,
                node_uuid=node_uuid,
                exception_type_id=exc_type_id,
            )
        )
    return exceptions
|
1107
|
-
|
1108
|
-
def replace_exceptions(self, exceptions: List[ContentException]):
|
1109
|
-
"""
|
1110
|
-
Replaces all exceptions in the document with a given list of exceptions.
|
1111
|
-
|
1112
|
-
Args:
|
1113
|
-
exceptions (list): The new list of exceptions.
|
1114
|
-
"""
|
1115
|
-
self.cursor.execute("delete from content_exceptions")
|
1116
|
-
for exception in exceptions:
|
1117
|
-
self.add_exception(exception)
|
1118
|
-
|
1119
|
-
def clear_model_insights(self):
|
1120
|
-
"""
|
1121
|
-
Clears all model insights from the document.
|
1122
|
-
"""
|
1123
|
-
self.cursor.execute("delete from model_insights")
|
1124
|
-
|
1125
|
-
def get_all_tagged_nodes(self):
    """
    Retrieves all nodes that carry at least one tag feature.

    Returns:
        list: A list of all tagged nodes in the document.
    """
    query = (
        "select distinct(cn_id) from ft where f_type in "
        "(select id from f_type where name like 'tag:%')"
    )
    # Resolve each distinct node id into a full node object.
    return [
        self.get_node(row[0]) for row in self.cursor.execute(query).fetchall()
    ]
|
1138
|
-
|
1139
|
-
def get_nodes_by_type(self, node_type):
    """
    Retrieves nodes of a given type from the document.

    Args:
        node_type (str): The type of the nodes to be retrieved.

    Returns:
        list: A list of nodes of the given type, ordered by index.
    """
    # Map the type name to its numeric id; None when the type is unknown.
    node_type_id = self.node_type_id_by_name.get(node_type)

    query = "select id, pid, nt, idx from cn where nt = ? order by idx"
    rows = self.cursor.execute(query, [node_type_id]).fetchall()
    return [self.__build_node(row) for row in rows]
|
1158
|
-
|
1159
|
-
def __ensure_validations_table_exists(self):
|
1160
|
-
"""
|
1161
|
-
Ensure the 'validations' table exists in the database.
|
1162
|
-
Creates the table if it does not exist and initializes it with an empty list.
|
1163
|
-
"""
|
1164
|
-
self.cursor.execute("""
|
1165
|
-
CREATE TABLE IF NOT EXISTS validations (
|
1166
|
-
obj BLOB
|
1167
|
-
)
|
1168
|
-
""")
|
1169
|
-
|
1170
|
-
# Check if the table has any rows, if not, insert an initial empty row
|
1171
|
-
result = self.cursor.execute("SELECT COUNT(*) FROM validations").fetchone()
|
1172
|
-
if result[0] == 0:
|
1173
|
-
self.cursor.execute("INSERT INTO validations (obj) VALUES (?)", [sqlite3.Binary(msgpack.packb([]))])
|
1174
|
-
|
1175
|
-
def set_validations(self, validations: List[DocumentTaxonValidation]):
|
1176
|
-
"""
|
1177
|
-
Sets the validations for the document.
|
1178
|
-
|
1179
|
-
Args:
|
1180
|
-
validations (List[DocumentTaxonValidation]): The validations to store.
|
1181
|
-
"""
|
1182
|
-
self.__ensure_validations_table_exists()
|
1183
|
-
serialized_data = sqlite3.Binary(msgpack.packb([v.model_dump(by_alias=True) for v in validations]))
|
1184
|
-
self.cursor.execute("UPDATE validations SET obj = ? WHERE rowid = 1", [serialized_data])
|
1185
|
-
self.connection.commit()
|
1186
|
-
|
1187
|
-
def get_validations(self) -> List[DocumentTaxonValidation]:
|
1188
|
-
"""
|
1189
|
-
Gets the validations associated with this document.
|
1190
|
-
|
1191
|
-
Returns:
|
1192
|
-
List[DocumentTaxonValidation]: The list of validations stored in the validations table.
|
1193
|
-
"""
|
1194
|
-
self.__ensure_validations_table_exists()
|
1195
|
-
result = self.cursor.execute("SELECT obj FROM validations WHERE rowid = 1").fetchone()
|
1196
|
-
if result and result[0]:
|
1197
|
-
return [DocumentTaxonValidation.model_validate(v) for v in msgpack.unpackb(result[0])]
|
1198
|
-
return []
|
1199
|
-
|
1200
|
-
def set_external_data(self, external_data: dict, key: str = "default"):
|
1201
|
-
"""
|
1202
|
-
Sets the external data for the document for a specific key.
|
1203
|
-
|
1204
|
-
Args:
|
1205
|
-
external_data (dict): The external data to store, must be JSON serializable.
|
1206
|
-
key (str): The key to store the data under, defaults to "default"
|
1207
|
-
"""
|
1208
|
-
self.__ensure_ed_table_exists()
|
1209
|
-
serialized_data = sqlite3.Binary(msgpack.packb(external_data))
|
1210
|
-
self.cursor.execute("DELETE FROM ed WHERE key = ?", [key])
|
1211
|
-
self.cursor.execute("INSERT INTO ed (key, obj) VALUES (?, ?)", [key, serialized_data])
|
1212
|
-
self.connection.commit()
|
1213
|
-
|
1214
|
-
def get_external_data(self, key: str = "default") -> dict:
|
1215
|
-
"""
|
1216
|
-
Gets the external data associated with this document for a specific key.
|
1217
|
-
|
1218
|
-
Args:
|
1219
|
-
key (str): The key to retrieve data for, defaults to "default"
|
1220
|
-
|
1221
|
-
Returns:
|
1222
|
-
dict: The external data stored in the ed table for the given key.
|
1223
|
-
"""
|
1224
|
-
self.__ensure_ed_table_exists()
|
1225
|
-
result = self.cursor.execute("SELECT obj FROM ed WHERE key = ?", [key]).fetchone()
|
1226
|
-
if result and result[0]:
|
1227
|
-
return msgpack.unpackb(result[0])
|
1228
|
-
return {}
|
1229
|
-
|
1230
|
-
def get_external_data_keys(self) -> List[str]:
|
1231
|
-
"""
|
1232
|
-
Gets all keys under which external data is stored.
|
1233
|
-
|
1234
|
-
Returns:
|
1235
|
-
List[str]: A list of all keys that have external data stored.
|
1236
|
-
"""
|
1237
|
-
self.__ensure_ed_table_exists()
|
1238
|
-
results = self.cursor.execute("SELECT key FROM ed").fetchall()
|
1239
|
-
return [row[0] for row in results]
|
1240
|
-
|
1241
|
-
def __ensure_ed_table_exists(self):
    """
    Ensure the 'ed' (external data) table exists in the database.

    Handles three cases:
      1. A legacy 'ed' table without a 'key' column: its first row (if any)
         is migrated into a new keyed table under the "default" key.
      2. An 'ed' table that already has the 'key' column: left untouched.
      3. No 'ed' table: one is created and seeded with an empty msgpack
         dict under the "default" key.
    """
    # First check if the old table exists and has key column
    old_table = self.cursor.execute("""
        SELECT name FROM sqlite_master
        WHERE type='table' AND name='ed'
    """).fetchone()

    if old_table:
        # Check if table has key column
        # PRAGMA table_info rows are (cid, name, type, notnull, dflt_value, pk).
        table_info = self.cursor.execute("PRAGMA table_info(ed)").fetchall()
        has_key_column = any(col[1] == 'key' for col in table_info)

        if not has_key_column:
            # Get the old data and drop the table
            # NOTE(review): fetchone() migrates only the first row — assumes
            # the legacy table held at most one row; confirm.
            data = self.cursor.execute("SELECT obj FROM ed").fetchone()
            self.cursor.execute("DROP TABLE ed")

            # Create new table with key column
            self.cursor.execute("""
                CREATE TABLE ed (
                    key TEXT PRIMARY KEY,
                    obj BLOB
                )
            """)

            # If there was data in the old table, insert it with default key
            if data:
                self.cursor.execute("INSERT INTO ed (key, obj) VALUES (?, ?)",
                                    ["default", data[0]])
        else:
            # Table exists and has key column - do nothing
            return
    else:
        # Create new table if it doesn't exist
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS ed (
                key TEXT PRIMARY KEY,
                obj BLOB
            )
        """)

        # Check if default key exists, if not insert empty data
        result = self.cursor.execute("SELECT COUNT(*) FROM ed WHERE key = 'default'").fetchone()
        if result[0] == 0:
            self.cursor.execute("INSERT INTO ed (key, obj) VALUES (?, ?)",
                                ["default", sqlite3.Binary(msgpack.packb({}))])
|
1291
|
-
|
1292
|
-
def __ensure_steps_table_exists(self):
|
1293
|
-
"""
|
1294
|
-
Ensure the 'steps' table exists in the database.
|
1295
|
-
Creates the table if it does not exist.
|
1296
|
-
"""
|
1297
|
-
self.cursor.execute("""
|
1298
|
-
CREATE TABLE IF NOT EXISTS steps (
|
1299
|
-
obj BLOB
|
1300
|
-
)
|
1301
|
-
""")
|
1302
|
-
|
1303
|
-
# Check if the table has any rows, if not, insert an initial empty row
|
1304
|
-
result = self.cursor.execute("SELECT COUNT(*) FROM steps").fetchone()
|
1305
|
-
if result[0] == 0:
|
1306
|
-
self.cursor.execute("INSERT INTO steps (obj) VALUES (?)", [sqlite3.Binary(msgpack.packb([]))])
|
1307
|
-
|
1308
|
-
def set_steps(self, steps: List[ProcessingStep]):
|
1309
|
-
"""
|
1310
|
-
Sets the processing steps for the document.
|
1311
|
-
|
1312
|
-
Args:
|
1313
|
-
steps (List[ProcessingStep]): A list of ProcessingStep objects to store.
|
1314
|
-
"""
|
1315
|
-
self.__ensure_steps_table_exists()
|
1316
|
-
serialized_steps = [step.to_dict() for step in steps]
|
1317
|
-
packed_data = sqlite3.Binary(msgpack.packb(serialized_steps))
|
1318
|
-
self.cursor.execute("UPDATE steps SET obj = ? WHERE rowid = 1", [packed_data])
|
1319
|
-
self.connection.commit()
|
1320
|
-
|
1321
|
-
def get_steps(self) -> List[ProcessingStep]:
|
1322
|
-
"""
|
1323
|
-
Gets the processing steps associated with this document.
|
1324
|
-
|
1325
|
-
Returns:
|
1326
|
-
List[ProcessingStep]: A list of ProcessingStep objects.
|
1327
|
-
"""
|
1328
|
-
self.__ensure_steps_table_exists()
|
1329
|
-
result = self.cursor.execute("SELECT obj FROM steps WHERE rowid = 1").fetchone()
|
1330
|
-
if result and result[0]:
|
1331
|
-
unpacked_data = msgpack.unpackb(result[0])
|
1332
|
-
return [ProcessingStep(**step) for step in unpacked_data]
|
1333
|
-
return []
|
1334
|
-
|
1335
|
-
|
1336
|
-
class SimpleObjectCache(object):
    """
    A simple cache based on ID'd objects, where we will build ID's for new
    objects, store them and also a dirty flag so that it is easy to pull all
    dirty objects and store them as needed.
    """
    # NOTE: the original carried three near-identical copies of this
    # docstring; the duplicates have been removed.

    def __init__(self):
        # Map of uuid -> cached object.
        self.objs = {}
        # Next uuid to hand out to an object that arrives without one.
        self.next_id = 1
        # Set of uuids whose objects have unflushed changes.
        self.dirty_objs = set()

    def get_obj(self, obj_id) -> Optional[ContentNode]:
        """
        Get the object with the given ID.

        Args:
            obj_id (int): The ID of the object.

        Returns:
            object: The object with the given ID if it exists, None otherwise.
        """
        return self.objs.get(obj_id)

    def add_obj(self, obj: ContentNode):
        """
        Add an object to the cache and mark it dirty.

        Args:
            obj (object): The object to add. If the object does not have a
                uuid, one will be assigned.
        """
        if obj.uuid is None:
            obj.uuid = self.next_id
            self.next_id += 1
        self.objs[obj.uuid] = obj
        self.dirty_objs.add(obj.uuid)

    def remove_obj(self, obj: ContentNode):
        """
        Remove an object from the cache.

        Args:
            obj (object): The object to remove. A no-op when the object is
                None or not cached.
        """
        if obj and obj.uuid in self.objs:
            self.objs.pop(obj.uuid)
            # discard is safe whether or not the object was dirty.
            self.dirty_objs.discard(obj.uuid)

    def get_dirty_objs(self) -> list[ContentNode]:
        """
        Get all dirty objects in the cache.

        Returns:
            list: A list of all dirty objects still present in the cache.
        """
        results = []
        # Iterate a snapshot so undirty() calls during flushing cannot
        # invalidate the iteration.
        for obj_id in set(self.dirty_objs):
            node = self.get_obj(obj_id)
            if node is not None:
                results.append(node)
        return results

    def undirty(self, obj):
        """
        Mark an object as not dirty.

        Args:
            obj (object): The object to mark as not dirty.
        """
        # discard rather than remove: marking an already-clean object is a
        # harmless no-op instead of raising KeyError.
        self.dirty_objs.discard(obj.uuid)
|
1421
|
-
|
1422
|
-
|
1423
|
-
class PersistenceManager(object):
|
1424
|
-
"""
|
1425
|
-
The persistence manager supports holding the document and only flushing objects to the persistence layer
|
1426
|
-
as needed. This is implemented to allow us to work with large complex documents in a performance centered way.
|
1427
|
-
|
1428
|
-
Attributes:
|
1429
|
-
document (Document): The document to be managed.
|
1430
|
-
node_cache (SimpleObjectCache): Cache for nodes.
|
1431
|
-
child_cache (dict): Cache for child nodes.
|
1432
|
-
child_id_cache (dict): Cache for child node IDs.
|
1433
|
-
feature_cache (dict): Cache for features.
|
1434
|
-
content_parts_cache (dict): Cache for content parts.
|
1435
|
-
node_parent_cache (dict): Cache for node parents.
|
1436
|
-
_underlying_persistence (SqliteDocumentPersistence): The underlying persistence layer.
|
1437
|
-
"""
|
1438
|
-
|
1439
|
-
"""
|
1440
|
-
The persistence manager supports holding the document and only flushing objects to the persistence layer
|
1441
|
-
as needed. This is implemented to allow us to work with large complex documents in a performance centered way.
|
1442
|
-
|
1443
|
-
Attributes:
|
1444
|
-
document (Document): The document to be managed.
|
1445
|
-
node_cache (SimpleObjectCache): Cache for nodes.
|
1446
|
-
child_cache (dict): Cache for child nodes.
|
1447
|
-
child_id_cache (dict): Cache for child node IDs.
|
1448
|
-
feature_cache (dict): Cache for features.
|
1449
|
-
content_parts_cache (dict): Cache for content parts.
|
1450
|
-
node_parent_cache (dict): Cache for node parents.
|
1451
|
-
_underlying_persistence (SqliteDocumentPersistence): The underlying persistence layer.
|
1452
|
-
"""
|
1453
|
-
"""
|
1454
|
-
The persistence manager supports holding the document and only flushing objects to the persistence layer
|
1455
|
-
as needed.
|
1456
|
-
|
1457
|
-
This is implemented to allow us to work with large complex documents in a performance centered way.
|
1458
|
-
"""
|
1459
|
-
|
1460
|
-
def __init__(self, document: Document, filename: str = None, delete_on_close=False, inmemory=False):
|
1461
|
-
self.document = document
|
1462
|
-
self.node_cache = SimpleObjectCache()
|
1463
|
-
self.child_cache = {}
|
1464
|
-
self.child_id_cache = {}
|
1465
|
-
self.feature_cache = {}
|
1466
|
-
self.content_parts_cache = {}
|
1467
|
-
self.node_parent_cache = {}
|
1468
|
-
|
1469
|
-
self._underlying_persistence = SqliteDocumentPersistence(
|
1470
|
-
document, filename, delete_on_close, inmemory=inmemory, persistence_manager=self
|
1471
|
-
)
|
1472
|
-
|
1473
|
-
def get_steps(self) -> list[ProcessingStep]:
|
1474
|
-
"""
|
1475
|
-
Gets the processing steps for this document
|
1476
|
-
|
1477
|
-
:return:
|
1478
|
-
"""
|
1479
|
-
return self._underlying_persistence.get_steps()
|
1480
|
-
|
1481
|
-
def set_steps(self, steps: list[ProcessingStep]):
|
1482
|
-
self._underlying_persistence.set_steps(steps)
|
1483
|
-
|
1484
|
-
def set_validations(self, validations: list[DocumentTaxonValidation]):
|
1485
|
-
self._underlying_persistence.set_validations(validations)
|
1486
|
-
|
1487
|
-
def get_validations(self) -> list[DocumentTaxonValidation]:
|
1488
|
-
return self._underlying_persistence.get_validations()
|
1489
|
-
|
1490
|
-
def get_external_data(self, key="default") -> dict:
|
1491
|
-
"""
|
1492
|
-
Gets the external data object associated with this document
|
1493
|
-
|
1494
|
-
:return: dict of the external data
|
1495
|
-
"""
|
1496
|
-
return self._underlying_persistence.get_external_data(key)
|
1497
|
-
|
1498
|
-
def get_external_data_keys(self) -> List[str]:
|
1499
|
-
"""
|
1500
|
-
Gets all keys under which external data is stored.
|
1501
|
-
|
1502
|
-
Returns:
|
1503
|
-
List[str]: A list of all keys that have external data stored.
|
1504
|
-
"""
|
1505
|
-
return self._underlying_persistence.get_external_data_keys()
|
1506
|
-
|
1507
|
-
def set_external_data(self, external_data: dict, key="default"):
    """
    Sets the external data for this document.

    Delegates to the underlying persistence layer.

    Args:
        external_data (dict): The external data to store; must be
            msgpack/JSON serializable.
        key (str): The key to store the data under, defaults to "default".
    """
    self._underlying_persistence.set_external_data(external_data, key)
|
1515
|
-
|
1516
|
-
def get_nodes_by_type(self, node_type: str) -> List[ContentNode]:
|
1517
|
-
"""
|
1518
|
-
Retrieves all nodes of a given type from the underlying persistence layer.
|
1519
|
-
|
1520
|
-
Args:
|
1521
|
-
node_type (str): The type of the nodes to be retrieved.
|
1522
|
-
|
1523
|
-
Returns:
|
1524
|
-
List[ContentNode]: A list of all nodes of the given type.
|
1525
|
-
"""
|
1526
|
-
return self._underlying_persistence.get_nodes_by_type(node_type)
|
1527
|
-
|
1528
|
-
def get_node_by_uuid(self, uuid: int) -> ContentNode:
    """
    Retrieves a node by its uuid, preferring the in-memory cache.

    Args:
        uuid (str): The uuid of the node to be retrieved.

    Returns:
        ContentNode: The node with the given uuid.
    """
    cached = self.node_cache.get_obj(uuid)
    if cached is not None:
        # Cache hit: single lookup instead of the original's double lookup.
        return cached

    # Cache miss: load from the underlying persistence and cache any result.
    node = self._underlying_persistence.get_node(uuid)
    if node:
        self.node_cache.add_obj(node)
    return node
|
1545
|
-
|
1546
|
-
def add_model_insight(self, model_insight: ModelInsight):
|
1547
|
-
"""
|
1548
|
-
Adds a model insight to the underlying persistence layer.
|
1549
|
-
|
1550
|
-
Args:
|
1551
|
-
model_insight (ModelInsight): The model insight to be added.
|
1552
|
-
"""
|
1553
|
-
self._underlying_persistence.add_model_insight(model_insight)
|
1554
|
-
|
1555
|
-
def clear_model_insights(self):
|
1556
|
-
"""
|
1557
|
-
Clears all model insights from the underlying persistence layer.
|
1558
|
-
"""
|
1559
|
-
self._underlying_persistence.clear_model_insights()
|
1560
|
-
|
1561
|
-
def get_model_insights(self) -> List[ModelInsight]:
|
1562
|
-
"""
|
1563
|
-
Retrieves all model insights from the underlying persistence layer.
|
1564
|
-
|
1565
|
-
Returns:
|
1566
|
-
List[ModelInsight]: A list of all model insights.
|
1567
|
-
"""
|
1568
|
-
return self._underlying_persistence.get_model_insights()
|
1569
|
-
|
1570
|
-
def add_exception(self, exception: ContentException):
|
1571
|
-
"""
|
1572
|
-
Adds an exception to the underlying persistence layer.
|
1573
|
-
|
1574
|
-
Args:
|
1575
|
-
exception (ContentException): The exception to be added.
|
1576
|
-
"""
|
1577
|
-
self._underlying_persistence.add_exception(exception)
|
1578
|
-
|
1579
|
-
def get_exceptions(self) -> List[ContentException]:
|
1580
|
-
"""
|
1581
|
-
Retrieves all exceptions from the underlying persistence layer.
|
1582
|
-
|
1583
|
-
Returns:
|
1584
|
-
List[ContentException]: A list of all exceptions.
|
1585
|
-
"""
|
1586
|
-
return self._underlying_persistence.get_exceptions()
|
1587
|
-
|
1588
|
-
def replace_exceptions(self, exceptions: List[ContentException]):
|
1589
|
-
"""
|
1590
|
-
Replaces all exceptions in the underlying persistence layer with the provided list.
|
1591
|
-
|
1592
|
-
Args:
|
1593
|
-
exceptions (List[ContentException]): The list of exceptions to replace with.
|
1594
|
-
"""
|
1595
|
-
self._underlying_persistence.replace_exceptions(exceptions)
|
1596
|
-
|
1597
|
-
def get_all_tags(self):
|
1598
|
-
"""
|
1599
|
-
Retrieves all tags from the underlying persistence layer.
|
1600
|
-
|
1601
|
-
Returns:
|
1602
|
-
List[str]: A list of all tags.
|
1603
|
-
"""
|
1604
|
-
return self._underlying_persistence.get_all_tags()
|
1605
|
-
|
1606
|
-
def get_tagged_nodes(self, tag, tag_uuid=None):
|
1607
|
-
"""
|
1608
|
-
Retrieves all nodes tagged with the specified tag from the underlying persistence layer.
|
1609
|
-
|
1610
|
-
Args:
|
1611
|
-
tag (str): The tag to filter nodes by.
|
1612
|
-
tag_uuid (str, optional): The UUID of the tag to filter nodes by. Defaults to None.
|
1613
|
-
|
1614
|
-
Returns:
|
1615
|
-
List[Node]: A list of nodes tagged with the specified tag.
|
1616
|
-
"""
|
1617
|
-
return self._underlying_persistence.get_tagged_nodes(tag, tag_uuid)
|
1618
|
-
|
1619
|
-
def get_all_tagged_nodes(self):
|
1620
|
-
"""
|
1621
|
-
Retrieves all tagged nodes from the underlying persistence layer.
|
1622
|
-
|
1623
|
-
Returns:
|
1624
|
-
List[Node]: A list of all tagged nodes.
|
1625
|
-
"""
|
1626
|
-
return self._underlying_persistence.get_all_tagged_nodes()
|
1627
|
-
|
1628
|
-
def initialize(self):
|
1629
|
-
"""
|
1630
|
-
Initializes the persistence manager by setting up the underlying persistence layer and node cache.
|
1631
|
-
"""
|
1632
|
-
self._underlying_persistence.initialize()
|
1633
|
-
|
1634
|
-
self.node_cache.next_id = self._underlying_persistence.get_next_node_id()
|
1635
|
-
|
1636
|
-
def get_parent(self, node):
    """
    Retrieves the parent of the specified node.

    Args:
        node (Node): The node to get the parent of.

    Returns:
        Node: The parent of the specified node.
    """
    # Prefer the parent-id cache; fall back to the persistence layer.
    if node.uuid in self.node_parent_cache:
        parent_id = self.node_parent_cache[node.uuid]
        return self.node_cache.get_obj(parent_id)

    return self._underlying_persistence.get_parent(node)
|
1650
|
-
|
1651
|
-
def close(self):
|
1652
|
-
"""
|
1653
|
-
Closes the underlying persistence layer.
|
1654
|
-
"""
|
1655
|
-
self._underlying_persistence.close()
|
1656
|
-
|
1657
|
-
@monitor_performance
def flush_cache(self):
    """
    Flush dirty cached nodes into the underlying persistence layer.

    Collects every dirty (non-virtual) node from the node cache, serialises
    its content parts and features, then replays them into the database in a
    single batch: delete stale rows for the affected node ids, re-insert the
    nodes, content parts and features, and commit.

    A no-op when there are no dirty nodes. Begins a transaction if the
    connection is not already in one.
    """
    all_node_ids = []
    all_nodes = []
    all_content_parts = []
    all_features = []
    node_id_with_features = []
    dirty_nodes = self.node_cache.get_dirty_objs()

    if len(dirty_nodes) == 0:
        return

    if not self._underlying_persistence.connection.in_transaction:
        self._underlying_persistence.connection.execute("BEGIN TRANSACTION")

    # Feature rows need unique ids; continue from the current maximum.
    next_feature_id = self._underlying_persistence.get_max_feature_id()
    for node in dirty_nodes:
        if not node.virtual:
            all_node_ids.append([node.uuid])
            # execute=False returns the row data instead of writing it,
            # so everything can be batched via executemany below.
            node_obj, content_parts = self._underlying_persistence.add_content_node(
                node, None, execute=False
            )
            all_nodes.extend(node_obj)
            all_content_parts.extend(content_parts)
            # Fix: the original tested this same membership condition twice,
            # nested; a single check is sufficient.
            if node.uuid in self.feature_cache:
                node_id_with_features.append([node.uuid])

                for feature in self.feature_cache[node.uuid]:
                    # Feature values are stored msgpack-encoded as a BLOB.
                    binary_value = sqlite3.Binary(
                        msgpack.packb(feature.value, use_bin_type=True)
                    )

                    tag_uuid = None
                    if feature.feature_type == "tag" and "uuid" in feature.value[0]:
                        tag_uuid = feature.value[0]["uuid"]

                    all_features.append(
                        [
                            next_feature_id,
                            node.uuid,
                            self._underlying_persistence.get_feature_type_id(
                                feature
                            ),
                            binary_value,
                            feature.single,
                            tag_uuid,
                        ]
                    )
                    next_feature_id = next_feature_id + 1

        # NOTE(review): undirtying every dirty node (virtual ones included) so
        # the early-return fast path works on the next flush — the original's
        # indentation was ambiguous here; confirm virtual nodes were undirtied too.
        self.node_cache.undirty(node)

    # Replace the affected rows wholesale: delete then re-insert in batches.
    self._underlying_persistence.cursor.executemany(
        "DELETE FROM cn where id=?", all_node_ids
    )
    self._underlying_persistence.cursor.executemany(
        "DELETE FROM ft where cn_id=?", node_id_with_features
    )
    self._underlying_persistence.cursor.executemany(
        "INSERT INTO cn (pid, nt, idx, id) VALUES (?,?,?,?)", all_nodes
    )
    self._underlying_persistence.cursor.executemany(
        "DELETE FROM cnp where cn_id=?", all_node_ids
    )
    self._underlying_persistence.cursor.executemany(
        CONTENT_NODE_PART_INSERT, all_content_parts
    )
    self._underlying_persistence.cursor.executemany(FEATURE_INSERT, all_features)
    self._underlying_persistence.connection.commit()
|
1730
|
-
|
1731
|
-
def get_content_nodes(self, node_type, parent_node, include_children):
    """
    Query the underlying persistence layer for content nodes matching the
    given type under the given parent.

    Args:
        node_type (str): The node type to match.
        parent_node (Node): The parent node to search beneath.
        include_children (bool): Whether descendants are included in the search.

    Returns:
        List[Node]: The matching nodes.
    """
    persistence = self._underlying_persistence
    return persistence.get_content_nodes(node_type, parent_node, include_children)
|
1746
|
-
|
1747
|
-
def get_bytes(self):
    """
    Serialise the document to bytes.

    Pending cached changes are flushed and the underlying store synced first,
    so the returned bytes reflect the current in-memory state.

    Returns:
        bytes: The serialised document.
    """
    self.flush_cache()
    persistence = self._underlying_persistence
    persistence.sync()
    return persistence.get_bytes()
|
1757
|
-
|
1758
|
-
def update_metadata(self):
    """
    Push the current metadata down to the underlying persistence layer.
    """
    self._underlying_persistence.update_metadata()
|
1763
|
-
|
1764
|
-
def add_content_node(self, node, parent):
    """
    Adds a content node to the cache and updates the child and parent caches accordingly.

    Ensures the node has an index and a uuid, persists it if it is new to the
    underlying store, then reconciles the parent/child lookup caches — moving
    the node between parents if its parent uuid changed.

    Args:
        node (Node): The node to be added.
        parent (Node): The parent of the node to be added.
    """

    # Default the position within the parent's children.
    if node.index is None:
        node.index = 0

    # Check if the node exists in the DB; a missing uuid means it is brand
    # new, so allocate the next cached id.
    if node.uuid is None:
        node.uuid = self.node_cache.next_id
        self.node_cache.next_id += 1

    # Only write through to the store when the store does not know the node yet.
    if self._underlying_persistence.get_node(node.uuid) is None:
        self._underlying_persistence.add_content_node(node, parent)

    if parent:
        node._parent_uuid = parent.uuid
        self.node_cache.add_obj(parent)

    self.node_cache.add_obj(node)

    # Tracks whether the parent's child list/id-set needs updating below.
    update_child_cache = False

    if node.uuid not in self.node_parent_cache:
        self.node_parent_cache[node.uuid] = node._parent_uuid
        update_child_cache = True

    # The node was already known but under a different parent: detach it from
    # the old parent's caches before recording the new parent.
    if (
        node.uuid in self.node_parent_cache
        and node._parent_uuid != self.node_parent_cache[node.uuid]
    ):
        # Remove from the old parent
        self.child_id_cache[self.node_parent_cache[node.uuid]].remove(node.uuid)
        self.child_cache[self.node_parent_cache[node.uuid]].remove(node)
        # Add to the new parent
        self.node_parent_cache[node.uuid] = node._parent_uuid
        update_child_cache = True

    if update_child_cache:
        if node._parent_uuid not in self.child_cache:
            # First child seen for this parent: start fresh caches.
            self.child_cache[node._parent_uuid] = [node]
            self.child_id_cache[node._parent_uuid] = {node.uuid}
        else:
            if node.uuid not in self.child_id_cache[node._parent_uuid]:
                self.child_id_cache[node._parent_uuid].add(node.uuid)
                current_children = self.child_cache[node._parent_uuid]
                if (
                    len(current_children) == 0
                    or node.index >= current_children[-1].index
                ):
                    # Node belongs at the end: append keeps the list ordered.
                    self.child_cache[node._parent_uuid].append(node)
                else:
                    # Node lands mid-list: append then re-sort by index to
                    # restore ordering.
                    self.child_cache[node._parent_uuid].append(node)
                    self.child_cache[node._parent_uuid] = sorted(
                        self.child_cache[node._parent_uuid], key=lambda x: x.index
                    )
|
1825
|
-
|
1826
|
-
def get_node(self, node_id):
    """
    Retrieves a node by its ID from the cache or the underlying persistence layer.

    On a cache miss the node is fetched from the store, cached, and its parent
    mapping recorded; the parent itself is then loaded recursively so the
    ancestor chain ends up cached too.

    Args:
        node_id (str): The ID of the node to retrieve.

    Returns:
        Node: The node with the specified ID, or None if it is unknown to
        both the cache and the store.
    """

    node = self.node_cache.get_obj(node_id)
    if node is None:
        node = self._underlying_persistence.get_node(node_id)
        if node is not None:
            self.node_cache.add_obj(node)
            if node._parent_uuid:
                self.node_parent_cache[node.uuid] = node._parent_uuid
                # Recursively prime the cache with the parent (and, by
                # induction, the whole ancestor chain) if we have not seen it.
                if node._parent_uuid not in self.child_id_cache:
                    self.get_node(node._parent_uuid)

    return node
|
1848
|
-
|
1849
|
-
def remove_content_node(self, node):
    """
    Removes a content node from the cache and the underlying persistence layer.

    Evicts the node from every cache (node, parent, child, content-part and
    feature caches), deletes it from the store, and then evicts any of its
    descendants that the store reports as removed.

    Args:
        node (Node): The node to be removed.
    """

    self.node_cache.remove_obj(node)

    if node.uuid in self.node_parent_cache:
        # Best-effort eviction from the parent's child list: the entry may
        # already be gone (ValueError) or the parent may not be cached
        # (KeyError).
        try:
            self.child_cache[self.node_parent_cache[node.uuid]].remove(node)
        except ValueError:
            pass
        except KeyError:
            pass

        # We have a situation where we seem to fail here?
        # NOTE(review): original author's open question — the same two
        # exception types are tolerated as above.
        try:
            self.child_id_cache[self.node_parent_cache[node.uuid]].remove(node.uuid)
        except ValueError:
            pass
        except KeyError:
            pass
        del self.node_parent_cache[node.uuid]

    self.content_parts_cache.pop(node.uuid, None)
    self.feature_cache.pop(node.uuid, None)

    # The store returns every id it removed (presumably the node plus its
    # descendants — confirm against the persistence implementation).
    all_ids = self._underlying_persistence.remove_content_node(node)

    # remove all the ids from the cache
    for id in all_ids:
        tmp_node = self.node_cache.get_obj(id)
        if tmp_node is not None:
            self.node_cache.remove_obj(tmp_node)
        # Also drop the id from the dirty set so it is not flushed later.
        self.node_cache.dirty_objs.remove(id) if id in self.node_cache.dirty_objs else None
|
1887
|
-
|
1888
|
-
def get_children(self, node):
    """
    Return the children of *node*, ordered by index.

    Child ids come from the id cache when available, otherwise from the
    underlying persistence layer. The resolved, sorted child list is cached
    on first use and returned from the cache thereafter.

    Args:
        node (Node): The node whose children are wanted.

    Returns:
        List[Node]: The children of the given node, sorted by index.
    """
    if node.uuid in self.child_id_cache:
        child_ids = self.child_id_cache[node.uuid]
    else:
        child_ids = self._underlying_persistence.get_child_ids(node)

    if node.uuid not in self.child_cache:
        resolved = []
        for child_id in child_ids:
            cached = self.node_cache.get_obj(child_id)
            if cached is None:
                # Not in the node cache: load (and cache) it via get_node.
                cached = self.get_node(child_id)
            resolved.append(cached)

        self.child_cache[node.uuid] = sorted(resolved, key=lambda child: child.index)
        self.child_id_cache[node.uuid] = set(child_ids)

    return self.child_cache[node.uuid]
|
1918
|
-
|
1919
|
-
def update_node(self, node):
    """
    Persist changes to a node.

    The parent-lookup cache is refreshed first so it stays consistent with
    the node's (possibly changed) parent uuid, then the update is written
    through to the underlying persistence layer.

    Args:
        node (Node): The node to be updated.
    """
    # We need to also update the parent mapping before writing through.
    self.node_parent_cache[node.uuid] = node._parent_uuid
    self._underlying_persistence.update_node(node)
|
1930
|
-
|
1931
|
-
def update_content_parts(self, node, content_parts):
    """
    Replace the cached content parts for a node.

    Args:
        node (Node): The node whose content parts are being replaced.
        content_parts (List[ContentPart]): The new content parts.
    """
    self.content_parts_cache[node.uuid] = content_parts
|
1940
|
-
|
1941
|
-
def get_content_parts(self, node):
    """
    Return the content parts of a node, from cache when possible.

    A node without a uuid has no stored parts and yields an empty list. On a
    cache miss the parts are fetched from the underlying persistence layer
    and cached (unless the fetch returned None).

    Args:
        node (Node): The node whose content parts are wanted.

    Returns:
        List[ContentPart]: The content parts of the node.
    """
    if node.uuid is None:
        return []

    parts = self.content_parts_cache.get(node.uuid)
    if parts is None:
        parts = self._underlying_persistence.get_content_parts(node)
        if parts is not None:
            self.content_parts_cache[node.uuid] = parts

    return parts
|
1965
|
-
|
1966
|
-
def remove_feature(self, node, feature_type, name):
    """
    Remove one feature from a node, in both the store and the feature cache.

    Args:
        node (Node): The node to remove the feature from.
        feature_type (str): The type of the feature to remove.
        name (str): The name of the feature to remove.
    """
    existing = self.get_features(node)
    self._underlying_persistence.remove_feature(node, feature_type, name)

    # Keep every feature except the (type, name) pair being removed.
    retained = []
    for feat in existing:
        if feat.feature_type == feature_type and feat.name == name:
            continue
        retained.append(feat)

    self.feature_cache[node.uuid] = retained
    self.node_cache.add_obj(node)
|
1985
|
-
|
1986
|
-
def get_features(self, node):
    """
    Return the features of a node, loading them into the cache on first use.

    Args:
        node (Node): The node whose features are wanted.

    Returns:
        List[Feature]: The features of the node.
    """
    if node.uuid not in self.feature_cache:
        self.feature_cache[node.uuid] = self._underlying_persistence.get_features(node)

    return self.feature_cache[node.uuid]
|
2002
|
-
|
2003
|
-
def add_feature(self, node, feature):
    """
    Append a feature to a node's cached feature list.

    The node's existing features are loaded from the underlying persistence
    layer first (if not already cached) so the new feature is added to the
    complete list; the node is also (re-)cached so the change is tracked.

    Args:
        node (Node): The node to add the feature to.
        feature (Feature): The feature to be added.
    """
    if node.uuid not in self.feature_cache:
        self.feature_cache[node.uuid] = self._underlying_persistence.get_features(node)

    self.node_cache.add_obj(node)
    self.feature_cache[node.uuid].append(feature)
|