kodexa 7.5.514404640805__py3-none-any.whl → 8.0.14958192442__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. kodexa/dataclasses/__init__.py +1 -1
  2. kodexa/model/__init__.py +2 -2
  3. kodexa/model/objects.py +21 -1
  4. kodexa/model/utils.py +1 -1
  5. kodexa/pipeline/pipeline.py +1 -1
  6. kodexa/platform/client.py +1 -2
  7. kodexa/platform/kodexa.py +4 -1
  8. kodexa/platform/manifest.py +447 -0
  9. kodexa/selectors/__init__.py +1 -1
  10. kodexa/selectors/ast.py +371 -98
  11. kodexa/selectors/error.py +29 -0
  12. kodexa/selectors/kodexa-ast-visitor.py +268 -0
  13. kodexa/selectors/parser.py +91 -0
  14. kodexa/selectors/resources/KodexaSelector.interp +99 -0
  15. kodexa/selectors/resources/KodexaSelector.tokens +56 -0
  16. kodexa/selectors/resources/KodexaSelectorLexer.interp +119 -0
  17. kodexa/selectors/resources/KodexaSelectorLexer.py +204 -0
  18. kodexa/selectors/resources/KodexaSelectorLexer.tokens +56 -0
  19. kodexa/selectors/resources/KodexaSelectorListener.py +570 -0
  20. kodexa/selectors/resources/KodexaSelectorParser.py +3246 -0
  21. kodexa/selectors/resources/KodexaSelectorVisitor.py +323 -0
  22. kodexa/selectors/visitor.py +265 -0
  23. kodexa/steps/__init__.py +4 -2
  24. kodexa/steps/common.py +0 -68
  25. kodexa/testing/test_utils.py +1 -1
  26. {kodexa-7.5.514404640805.dist-info → kodexa-8.0.14958192442.dist-info}/METADATA +7 -3
  27. kodexa-8.0.14958192442.dist-info/RECORD +53 -0
  28. {kodexa-7.5.514404640805.dist-info → kodexa-8.0.14958192442.dist-info}/WHEEL +1 -1
  29. kodexa/model/model.py +0 -3259
  30. kodexa/model/persistence.py +0 -2017
  31. kodexa/selectors/core.py +0 -124
  32. kodexa/selectors/lexrules.py +0 -137
  33. kodexa/selectors/lextab.py +0 -83
  34. kodexa/selectors/lextab.pyi +0 -1
  35. kodexa/selectors/parserules.py +0 -414
  36. kodexa/selectors/parserules.pyi +0 -1
  37. kodexa/selectors/parsetab.py +0 -4149
  38. kodexa/selectors/parsetab.pyi +0 -1
  39. kodexa-7.5.514404640805.dist-info/RECORD +0 -50
  40. {kodexa-7.5.514404640805.dist-info → kodexa-8.0.14958192442.dist-info}/LICENSE +0 -0
@@ -1,2017 +0,0 @@
1
import dataclasses
import functools
import logging
import pathlib
import sqlite3
import tempfile
import time
import uuid
from typing import List, Optional

import msgpack

from kodexa.model import Document, ContentNode, SourceMetadata
from kodexa.model.model import (
    DocumentMetadata,
    ContentFeature,
    ContentException,
    ModelInsight, ProcessingStep,
)
from kodexa.model.objects import DocumentTaxonValidation
20
-
21
- logger = logging.getLogger()
22
-
23
# Heavily used SQL
EXCEPTION_INSERT = "INSERT INTO content_exceptions (tag, message, exception_details, group_uuid, tag_uuid, exception_type, severity, node_uuid, exception_type_id) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"
EXCEPTION_SELECT = "select tag, message, exception_details, group_uuid, tag_uuid, exception_type, severity, node_uuid, exception_type_id from content_exceptions"

MODEL_INSIGHT_INSERT = "INSERT INTO model_insights (model_insight) VALUES (?)"
MODEL_INSIGHT_SELECT = "select model_insight from model_insights"

FEATURE_INSERT = "INSERT INTO ft (id, cn_id, f_type, binary_value, single, tag_uuid) VALUES (?,?,?,?,?,?)"
FEATURE_DELETE = "DELETE FROM ft where cn_id=? and f_type=?"

CONTENT_NODE_INSERT = "INSERT INTO cn (pid, nt, idx) VALUES (?,?,?)"
CONTENT_NODE_UPDATE = "UPDATE cn set pid=?, nt=?, idx=? WHERE id=?"

CONTENT_NODE_PART_INSERT = (
    "INSERT INTO cnp (cn_id, pos, content, content_idx) VALUES (?,?,?,?)"
)
# NOTE(review): "NOTE_TYPE" looks like a historical typo for "NODE_TYPE" — the
# statement targets the n_type (node type) table. The name is kept because
# other code may reference it.
NOTE_TYPE_INSERT = "insert into n_type(name) values (?)"
NODE_TYPE_LOOKUP = "select id from n_type where name = ?"
FEATURE_TYPE_INSERT = "insert into f_type(name) values (?)"
FEATURE_TYPE_LOOKUP = "select id from f_type where name = ?"
METADATA_INSERT = "insert into metadata(id,metadata) values (1,?)"
METADATA_DELETE = "delete from metadata where id=1"

# Configuration constants
CACHE_SIZE = 10000  # Number of nodes to cache
BATCH_SIZE = 1000  # Size of batches for bulk operations
SLOW_QUERY_THRESHOLD = 1.0  # Seconds
MAX_CONNECTIONS = 5  # Maximum number of database connections


def monitor_performance(func):
    """
    Decorator that logs a warning when the wrapped callable takes longer
    than SLOW_QUERY_THRESHOLD seconds.

    Args:
        func: The callable to monitor.

    Returns:
        The wrapped callable; its return value is passed through unchanged.
    """

    @functools.wraps(func)  # preserve __name__/__doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments (time.time could be)
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        duration = time.perf_counter() - start_time
        if duration > SLOW_QUERY_THRESHOLD:
            # Lazy %-formatting: the message is only built when actually logged
            logger.warning(
                "Slow operation detected: %s, duration: %ss", func.__name__, duration
            )
        return result

    return wrapper
62
-
63
- class SqliteDocumentPersistence(object):
64
- """
65
- The Sqlite persistence engine to support large scale documents (part of the V4 Kodexa Document Architecture)
66
-
67
- Attributes:
68
- document (Document): The document to be persisted.
69
- filename (str): The name of the file where the document is stored.
70
- delete_on_close (bool): If True, the file will be deleted when the connection is closed.
71
- """
72
-
73
- """
74
- The Sqlite persistence engine to support large scale documents (part of the V4 Kodexa Document Architecture)
75
- """
76
-
77
- def __init__(self, document: Document, filename: str = None, delete_on_close=False, inmemory=False, persistence_manager=None):
78
- self.document = document
79
-
80
- self.node_types = {}
81
- self.node_type_id_by_name = {}
82
- self.feature_type_id_by_name = {}
83
- self.feature_type_names = {}
84
- self.delete_on_close = delete_on_close
85
-
86
- import sqlite3
87
-
88
- self.is_new = True
89
- if filename is not None:
90
- self.is_tmp = False
91
- path = pathlib.Path(filename)
92
- if path.exists():
93
- # At this point we need to load the db
94
- self.is_new = False
95
- else:
96
- from kodexa import KodexaPlatform
97
-
98
- new_file, filename = tempfile.mkstemp(
99
- suffix=".kddb", dir=KodexaPlatform.get_tempdir()
100
- )
101
- self.is_tmp = True
102
-
103
- self.current_filename = filename
104
-
105
- if inmemory:
106
- self.inmemory=True
107
- self.connection = self.create_in_memory_database(filename)
108
- else:
109
- self.inmemory=False
110
- self.connection = sqlite3.connect(filename)
111
-
112
- self.cursor = self.connection.cursor()
113
- self.cursor.execute("PRAGMA journal_mode=OFF")
114
- self.cursor.execute("PRAGMA temp_store=MEMORY")
115
- self.cursor.execute("PRAGMA mmap_size=30000000000")
116
- self.cursor.execute("PRAGMA cache_size=10000")
117
- self.cursor.execute("PRAGMA page_size=4096")
118
-
119
- try:
120
- # We need to populate node_type_id_by_name
121
- for n_type in self.cursor.execute("select id,name from n_type"):
122
- self.node_types[n_type[0]] = n_type[1]
123
- self.node_type_id_by_name[n_type[1]] = n_type[0]
124
- except:
125
- pass
126
-
127
- def create_in_memory_database(self, disk_db_path: str):
128
- # Connect to the in-memory database
129
- mem_conn = sqlite3.connect(':memory:')
130
- mem_cursor = mem_conn.cursor()
131
-
132
- # Connect to the database on disk
133
- disk_conn = sqlite3.connect(disk_db_path)
134
- disk_cursor = disk_conn.cursor()
135
-
136
- # Load the contents of the disk database into memory
137
- disk_cursor.execute("SELECT name, sql FROM sqlite_master WHERE type='table';")
138
- tables = disk_cursor.fetchall()
139
- for table_name, create_table_sql in tables:
140
- if "sqlite" in table_name:
141
- continue
142
-
143
- # Create the table structure in the in-memory database
144
- mem_cursor.execute(create_table_sql)
145
-
146
- # Populate the table with data from the disk database
147
- disk_cursor.execute(f"SELECT * FROM {table_name}")
148
- rows = disk_cursor.fetchall()
149
- for row in rows:
150
- placeholders = ', '.join('?' * len(row))
151
- mem_cursor.execute(f"INSERT INTO {table_name} VALUES ({placeholders})", row)
152
-
153
- # Commit changes and close disk connection
154
- mem_conn.commit()
155
- disk_conn.close()
156
-
157
- return mem_conn
158
-
159
- @monitor_performance
160
- def get_all_tags(self):
161
- """
162
- Retrieves all tags from the document.
163
-
164
- Returns:
165
- list: A list of all tags in the document.
166
- """
167
- features = []
168
- for feature in self.cursor.execute(
169
- "select name from f_type where name like 'tag:%'"
170
- ).fetchall():
171
- features.append(feature[0].split(":")[1])
172
-
173
- return features
174
-
175
- @monitor_performance
176
- def update_features(self, node):
177
- """
178
- Updates the features of a given node in the document.
179
-
180
- Args:
181
- node (Node): The node whose features are to be updated.
182
- """
183
-
184
- next_feature_id = self.get_max_feature_id()
185
- all_features = []
186
- for feature in node.get_features():
187
- binary_value = sqlite3.Binary(
188
- msgpack.packb(feature.value, use_bin_type=True)
189
- )
190
-
191
- tag_uuid = None
192
- if feature.feature_type == "tag" and "uuid" in feature.value[0]:
193
- tag_uuid = feature.value[0]["uuid"]
194
-
195
- all_features.append(
196
- [
197
- next_feature_id,
198
- node.uuid,
199
- self.get_feature_type_id(feature),
200
- binary_value,
201
- feature.single,
202
- tag_uuid,
203
- ]
204
- )
205
-
206
- next_feature_id = next_feature_id + 1
207
-
208
- self.cursor.execute("DELETE FROM ft where cn_id=?", [node.uuid])
209
- self.cursor.executemany(FEATURE_INSERT, all_features)
210
-
211
- @monitor_performance
212
- def update_node(self, node):
213
- """
214
- Updates a given node in the document.
215
-
216
- Args:
217
- node (Node): The node to be updated.
218
- """
219
- self.cursor.execute(
220
- "update cn set idx=?, pid=? where id=?",
221
- [node.index, node._parent_uuid, node.uuid],
222
- )
223
-
224
- @monitor_performance
225
- def get_content_nodes(self, node_type, parent_node: ContentNode, include_children):
226
- """
227
- Retrieves content nodes from the document based on the given parameters.
228
-
229
- Args:
230
- node_type (str): The type of the node to be retrieved.
231
- parent_node (ContentNode): The parent node of the nodes to be retrieved.
232
- include_children (bool): If True, child nodes will also be retrieved.
233
-
234
- Returns:
235
- list: A list of content nodes that match the given parameters.
236
- """
237
- nodes = []
238
- if not self.connection.in_transaction:
239
- self.cursor.execute("BEGIN TRANSACTION")
240
- if include_children:
241
- if node_type == "*":
242
- query = """
243
- with recursive
244
- parent_node(id, pid, nt, idx, path) AS (
245
- VALUES (?,?,?,?,?)
246
- UNION ALL
247
- SELECT cns.id, cns.pid, cns.nt, cns.idx, parent_node.path || substr('0000000' || cns.idx, -6, 6)
248
- FROM cn cns, parent_node
249
- WHERE parent_node.id = cns.pid
250
- )
251
- SELECT id, pid, nt, idx, path from parent_node order by path
252
- """
253
-
254
- try:
255
- results = self.cursor.execute(
256
- query,
257
- [
258
- parent_node.uuid,
259
- parent_node.get_parent().uuid
260
- if parent_node.get_parent()
261
- else None,
262
- next(
263
- key
264
- for key, value in self.node_types.items()
265
- if value == parent_node.get_node_type()
266
- ),
267
- parent_node.index,
268
- f"{parent_node.index}".zfill(6),
269
- ],
270
- ).fetchall()
271
- except StopIteration:
272
- return []
273
- else:
274
- query = """
275
- with recursive
276
- parent_node(id, pid, nt, idx, path) AS (
277
- VALUES (?,?,?,?,?)
278
- UNION ALL
279
- SELECT cns.id, cns.pid, cns.nt, cns.idx, parent_node.path || substr('000000' || cns.idx, -6, 6)
280
- FROM cn cns, parent_node
281
- WHERE parent_node.id = cns.pid
282
- )
283
- SELECT id, pid, nt, idx, path from parent_node where nt=? order by path
284
- """
285
-
286
- try:
287
- results = self.cursor.execute(
288
- query,
289
- [
290
- parent_node.uuid,
291
- parent_node.get_parent().uuid
292
- if parent_node.get_parent()
293
- else None,
294
- next(
295
- key
296
- for key, value in self.node_types.items()
297
- if value == parent_node.get_node_type()
298
- ),
299
- parent_node.index,
300
- f"{parent_node.index}".zfill(6),
301
- next(
302
- key
303
- for key, value in self.node_types.items()
304
- if value == node_type
305
- ),
306
- ],
307
- ).fetchall()
308
- except StopIteration:
309
- self.connection.commit()
310
- return []
311
- else:
312
- query = "select id, pid, nt, idx from cn where pid=? and nt=? order by idx"
313
- try:
314
- results = self.cursor.execute(
315
- query,
316
- [
317
- parent_node.uuid,
318
- next(
319
- key
320
- for key, value in self.node_types.items()
321
- if value == node_type
322
- ),
323
- ],
324
- ).fetchall()
325
- except StopIteration:
326
- self.connection.commit()
327
- return []
328
-
329
- for raw_node in list(results):
330
- nodes.append(self.__build_node(raw_node))
331
-
332
- self.connection.commit()
333
-
334
- return nodes
335
-
336
- def initialize(self):
337
- """
338
- Initializes the SqliteDocumentPersistence object by either building a new database or loading an existing one.
339
- """
340
- if self.is_new:
341
- self.__build_db()
342
- else:
343
- self.__load_document()
344
-
345
- def close(self):
346
- """
347
- Closes the connection to the database. If delete_on_close is True, the file will also be deleted.
348
- """
349
- if self.is_tmp or self.delete_on_close:
350
- pathlib.Path(self.current_filename).unlink()
351
- else:
352
- self.cursor.close()
353
- self.connection.close()
354
-
355
- @monitor_performance
356
- def get_max_feature_id(self):
357
- """
358
- Retrieves the maximum feature id from the document.
359
-
360
- Returns:
361
- int: The maximum feature id.
362
- """
363
- max_id = self.cursor.execute("select max(id) from ft").fetchone()
364
- if max_id[0] is None:
365
- return 1
366
-
367
- return max_id[0] + 1
368
-
369
- def __build_db(self):
370
- """
371
- Builds a new database for the document.
372
- """
373
- self.cursor.execute(
374
- "CREATE TABLE metadata (id integer primary key, metadata text)"
375
- )
376
- self.cursor.execute(
377
- "CREATE TABLE cn (id integer primary key, nt INTEGER, pid INTEGER, idx INTEGER)"
378
- )
379
- self.cursor.execute(
380
- "CREATE TABLE cnp (id integer primary key, cn_id INTEGER, pos integer, content text, content_idx integer)"
381
- )
382
-
383
- self.cursor.execute("CREATE TABLE n_type (id integer primary key, name text)")
384
- self.cursor.execute("CREATE TABLE f_type (id integer primary key, name text)")
385
- self.cursor.execute(
386
- """CREATE TABLE ft
387
- (
388
- id integer primary key,
389
- cn_id integer,
390
- f_type INTEGER,
391
- binary_value blob,
392
- single integer,
393
- tag_uuid text
394
- )"""
395
- )
396
-
397
- self.cursor.execute("CREATE UNIQUE INDEX n_type_uk ON n_type(name);")
398
- self.cursor.execute("CREATE UNIQUE INDEX f_type_uk ON f_type(name);")
399
- self.cursor.execute("CREATE INDEX cn_perf ON cn(nt);")
400
- self.cursor.execute("CREATE INDEX cn_perf2 ON cn(pid);")
401
- self.cursor.execute("CREATE INDEX cnp_perf ON cnp(cn_id, pos);")
402
- self.cursor.execute("CREATE INDEX f_perf ON ft(cn_id);")
403
- self.cursor.execute("CREATE INDEX f_perf2 ON ft(tag_uuid);")
404
- self.cursor.execute(
405
- """CREATE TABLE content_exceptions
406
- (
407
- id integer primary key,
408
- tag text,
409
- message text,
410
- exception_details text,
411
- group_uuid text,
412
- tag_uuid text,
413
- exception_type text,
414
- exception_type_id text,
415
- severity text,
416
- node_uuid text
417
- )"""
418
- )
419
- self.cursor.execute(
420
- "CREATE TABLE model_insights (id integer primary key,model_insight text);"
421
- )
422
- self.document.version = "6.0.0"
423
-
424
- self.__update_metadata()
425
-
426
- @monitor_performance
427
- def content_node_count(self):
428
- """
429
- Counts the number of content nodes in the document.
430
-
431
- Returns:
432
- int: The number of content nodes in the document.
433
- """
434
- self.cursor.execute("select * from cn").fetchall()
435
-
436
- @monitor_performance
437
- def get_feature_type_id(self, feature):
438
- """
439
- Retrieves the id of a given feature.
440
-
441
- Args:
442
- feature (Feature): The feature whose id is to be retrieved.
443
-
444
- Returns:
445
- int: The id of the feature.
446
- """
447
- return self.__resolve_f_type(feature)
448
-
449
- def __resolve_f_type(self, feature):
450
- """
451
- Resolves the feature type of a given feature.
452
-
453
- Args:
454
- feature (Feature): The feature whose feature type is to be resolved.
455
-
456
- Returns:
457
- int: The id of the feature type.
458
- """
459
- feature_type_name = feature.feature_type + ":" + feature.name
460
-
461
- if feature_type_name in self.feature_type_id_by_name:
462
- return self.feature_type_id_by_name[feature_type_name]
463
-
464
- result = self.cursor.execute(
465
- FEATURE_TYPE_LOOKUP, [feature_type_name]
466
- ).fetchone()
467
- if result is None:
468
- new_feature_type_name_id = self.cursor.execute(
469
- FEATURE_TYPE_INSERT, [feature_type_name]
470
- ).lastrowid
471
- self.feature_type_names[new_feature_type_name_id] = feature_type_name
472
- self.feature_type_id_by_name[feature_type_name] = new_feature_type_name_id
473
- return new_feature_type_name_id
474
-
475
- return result[0]
476
-
477
- def __resolve_n_type(self, n_type):
478
- """
479
- Resolves the node type of a given node.
480
-
481
- Args:
482
- n_type (str): The node type to be resolved.
483
-
484
- Returns:
485
- int: The id of the node type.
486
- """
487
- if n_type in self.node_type_id_by_name:
488
- return self.node_type_id_by_name[n_type]
489
- result = self.cursor.execute(NODE_TYPE_LOOKUP, [n_type]).fetchone()
490
- if result is None:
491
- new_type_id = self.cursor.execute(NOTE_TYPE_INSERT, [n_type]).lastrowid
492
- self.node_types[new_type_id] = n_type
493
- self.node_type_id_by_name[n_type] = new_type_id
494
- return new_type_id
495
-
496
- return result[0]
497
-
498
    @monitor_performance
    def __insert_node(self, node: ContentNode, parent, execute=True):
        """
        Inserts (or replaces) a node and its content parts in the database.

        Args:
            node (ContentNode): The node to be inserted; must have a uuid.
            parent: The parent node of the node to be inserted, or None.
            execute (bool, optional): If True, statements run immediately;
                if False, only the row values are computed and returned
                (presumably for external batching — confirm with callers).

        Returns:
            tuple: ([cn row values], [cnp row values]) for this node.

        Raises:
            Exception: If the node has no uuid.
        """

        if node.index is None:
            node.index = 0

        if parent:
            node._parent_uuid = parent.uuid

        if node.uuid:
            # Delete the existing node
            cn_values = [
                node._parent_uuid,
                self.__resolve_n_type(node.node_type),
                node.index,
                node.uuid,
            ]

            # Make sure we load the content parts if we haven't
            node.get_content_parts()

            if execute:
                # Replace-in-place: remove any existing row for this uuid,
                # then re-insert with the current parent/type/index
                self.cursor.execute("DELETE FROM cn where id=?", [node.uuid])
                self.cursor.execute(
                    "INSERT INTO cn (pid, nt, idx, id) VALUES (?,?,?,?)", cn_values
                )
                self.cursor.execute("DELETE FROM cnp where cn_id=?", [node.uuid])

            cn_parts_values = []
            for idx, part in enumerate(node.get_content_parts()):
                # String parts fill the content column; any other value is
                # stored in content_idx instead (the two are mutually exclusive)
                cn_parts_values.append(
                    [
                        node.uuid,
                        idx,
                        part if isinstance(part, str) else None,
                        part if not isinstance(part, str) else None,
                    ]
                )

            if execute:
                self.cursor.executemany(CONTENT_NODE_PART_INSERT, cn_parts_values)

            return ([cn_values], cn_parts_values)

        raise Exception("Node must have a UUID?")
554
-
555
- def __clean_none_values(self, d):
556
- """
557
- Cleans a dictionary by removing keys with None values.
558
-
559
- Args:
560
- d (dict): The dictionary to be cleaned.
561
-
562
- Returns:
563
- dict: The cleaned dictionary.
564
- """
565
- clean = {}
566
- for k, v in d.items():
567
- if isinstance(v, dict):
568
- nested = self.__clean_none_values(v)
569
- if len(nested.keys()) > 0:
570
- clean[k] = nested
571
- elif v is not None:
572
- clean[k] = v
573
- return clean
574
-
575
    def __update_metadata(self):
        """
        Updates the metadata of the document.

        Serializes version, metadata, source, mixins, labels and uuid with
        msgpack and rewrites the single metadata row (id=1).
        """
        document_metadata = {
            "version": Document.CURRENT_VERSION,
            "metadata": self.document.metadata,
            # None values are stripped from the source dict before packing
            "source": self.__clean_none_values(
                dataclasses.asdict(self.document.source)
            ),
            "mixins": self.document.get_mixins(),
            "labels": self.document.labels,
            "uuid": self.document.uuid,
        }
        # Delete-then-insert keeps exactly one metadata row with id=1
        self.cursor.execute(METADATA_DELETE)
        self.cursor.execute(
            METADATA_INSERT,
            [sqlite3.Binary(msgpack.packb(document_metadata, use_bin_type=True))],
        )
594
-
595
    def __load_document(self):
        """
        Loads an existing document from the database, applying in-place schema
        migrations for pre-4.0.1 and pre-6.0.0 documents.
        """
        # Prime the type-name caches from the existing tables
        for n_type in self.cursor.execute("select id,name from n_type"):
            self.node_types[n_type[0]] = n_type[1]
        for f_type in self.cursor.execute("select id,name from f_type"):
            self.feature_type_names[f_type[0]] = f_type[1]

        metadata = msgpack.unpackb(
            self.cursor.execute("select * from metadata").fetchone()[1]
        )
        self.document.metadata = DocumentMetadata(metadata["metadata"])
        self.document.version = (
            metadata["version"]
            if "version" in metadata and metadata["version"]
            else Document.PREVIOUS_VERSION
        )
        # some older docs don't have a version or it's None

        self.uuid = (
            metadata["uuid"]
            if "uuid" in metadata
            else str(uuid.uuid5(uuid.NAMESPACE_DNS, "kodexa.com"))
        )
        if "source" in metadata and metadata["source"]:
            self.document.source = SourceMetadata.from_dict(metadata["source"])
        if "labels" in metadata and metadata["labels"]:
            self.document.labels = metadata["labels"]
        if "mixins" in metadata and metadata["mixins"]:
            self.document._mixins = metadata["mixins"]

        # NOTE(review): this overwrites the uuid5 fallback assigned above with
        # None whenever the "uuid" key is absent — confirm which behavior is
        # actually intended before changing either assignment
        self.uuid = metadata.get("uuid")

        import semver

        root_node = self.cursor.execute(
            "select id, pid, nt, idx from cn where pid is null"
        ).fetchone()
        if root_node:
            self.document.content_node = self.__build_node(root_node)

        if semver.compare(self.document.version, "4.0.1") < 0:
            # We need to migrate this to a 4.0.1 document: features move from
            # the old f/f_value table pair into the single ft table
            self.cursor.execute(
                """CREATE TABLE ft
                           (
                               id integer primary key,
                               cn_id integer,
                               f_type INTEGER,
                               binary_value blob,
                               single integer,
                               tag_uuid text
                           )"""
            )
            self.cursor.execute(
                "insert into ft select f.id, f.cn_id, f.f_type, fv.binary_value, fv.single, null from f, f_value fv where fv.id = f.fvalue_id"
            )
            # we will create a new feature table
            self.cursor.execute("drop table f")
            self.cursor.execute("drop table f_value")
            self.cursor.execute("CREATE INDEX f_perf ON ft(cn_id);")
            self.cursor.execute("CREATE INDEX f_perf2 ON ft(tag_uuid);")

        # We always run this (idempotent via IF NOT EXISTS)
        self.cursor.execute(
            """CREATE TABLE IF NOT EXISTS content_exceptions
                       (
                           id integer primary key,
                           tag text,
                           message text,
                           exception_details text,
                           group_uuid text,
                           tag_uuid text,
                           exception_type text,
                           severity text,
                           node_uuid text
                       )"""
        )
        self.cursor.execute(
            """CREATE TABLE IF NOT EXISTS model_insights
                       (
                           id integer primary key,
                           model_insight text
                       )"""
        )

        if semver.compare(self.document.version, "6.0.0") < 0:
            from sqlite3 import OperationalError

            try:
                self.cursor.execute(
                    "ALTER TABLE content_exceptions ADD COLUMN exception_type_id text"
                )
            except OperationalError:
                # Column was already added by a previous (partial) migration
                logger.info("exception_type_id column already exists")
                pass
            self.document.version = "6.0.0"
            self.update_metadata()
695
- def get_content_parts(self, new_node):
696
- """
697
- Retrieves the content parts of a given node.
698
-
699
- Args:
700
- new_node (Node): The node whose content parts are to be retrieved.
701
-
702
- Returns:
703
- list: A list of the content parts of the node.
704
- """
705
- content_parts = self.cursor.execute(
706
- "select cn_id, pos, content, content_idx from cnp where cn_id = ? order by pos",
707
- [new_node.uuid],
708
- ).fetchall()
709
-
710
- parts = []
711
- for content_part in content_parts:
712
- if content_part[3] is None:
713
- parts.append(content_part[2])
714
- else:
715
- parts.append(content_part[3])
716
- return parts
717
-
718
- def __build_node(self, node_row):
719
- """
720
- Builds a node from a given row of the database.
721
-
722
- Args:
723
- node_row (tuple): A tuple containing the values of the node.
724
-
725
- Returns:
726
- Node: The built node.
727
- """
728
- new_node = ContentNode(
729
- self.document,
730
- self.node_types[node_row[2]],
731
- parent=self.get_node(node_row[1]),
732
- )
733
- new_node.uuid = node_row[0]
734
- new_node.index = node_row[3]
735
- return new_node
736
-
737
- def add_content_node(self, node, parent, execute=True):
738
- """
739
- Adds a content node to the document.
740
-
741
- Args:
742
- node (Node): The node to be added.
743
- parent (Node): The parent node of the node to be added.
744
- execute (bool, optional): If True, the node will be added immediately. Defaults to True.
745
-
746
- Returns:
747
- tuple: A tuple containing the values of the node and its parts.
748
- """
749
- return self.__insert_node(node, parent, execute)
750
-
751
- def remove_feature(self, node, feature_type, name):
752
- """
753
- Removes a feature from a given node.
754
-
755
- Args:
756
- node (Node): The node from which the feature is to be removed.
757
- feature_type (str): The type of the feature to be removed.
758
- name (str): The name of the feature to be removed.
759
- """
760
-
761
- feature = ContentFeature(feature_type, name, None)
762
- f_values = [node.uuid, self.__resolve_f_type(feature)]
763
- self.cursor.execute(FEATURE_DELETE, f_values)
764
-
765
- def get_children(self, content_node):
766
- """
767
- Retrieves the children of a given node.
768
-
769
- Args:
770
- content_node (ContentNode): The node whose children are to be retrieved.
771
-
772
- Returns:
773
- list: A list of the children of the node.
774
- """
775
-
776
- # We need to get the child nodes
777
- children = []
778
- for child_node in self.cursor.execute(
779
- "select id, pid, nt, idx from cn where pid = ? order by idx",
780
- [content_node.uuid],
781
- ).fetchall():
782
- children.append(self.__build_node(child_node))
783
- return children
784
-
785
- def get_child_ids(self, content_node):
786
- """
787
- Retrieves the ids of the children of a given node.
788
-
789
- Args:
790
- content_node (ContentNode): The node whose children's ids are to be retrieved.
791
-
792
- Returns:
793
- list: A list of the ids of the children of the node.
794
- """
795
-
796
- # We need to get the child nodes
797
- children = []
798
- for child_node in self.cursor.execute(
799
- "select id, pid, nt, idx from cn where pid = ? order by idx",
800
- [content_node.uuid],
801
- ).fetchall():
802
- children.append(child_node[0])
803
- return children
804
-
805
- def get_node(self, node_id):
806
- """
807
- Retrieves a node by its id.
808
-
809
- Args:
810
- node_id (int): The id of the node to be retrieved.
811
-
812
- Returns:
813
- Node: The node with the given id.
814
- """
815
- node_row = self.cursor.execute(
816
- "select id, pid, nt, idx from cn where id = ?", [node_id]
817
- ).fetchone()
818
- if node_row:
819
- return self.__build_node(node_row)
820
-
821
- return None
822
-
823
- def get_parent(self, content_node):
824
- """
825
- Retrieves the parent of a given node.
826
-
827
- Args:
828
- content_node (ContentNode): The node whose parent is to be retrieved.
829
-
830
- Returns:
831
- Node: The parent of the node.
832
- """
833
-
834
- parent = self.cursor.execute(
835
- "select pid from cn where id = ?", [content_node.uuid]
836
- ).fetchone()
837
- if parent:
838
- return self.get_node(parent[0])
839
-
840
- return None
841
-
842
- def update_metadata(self):
843
- """
844
- Updates the metadata of the document.
845
- """
846
- self.__update_metadata()
847
-
848
- def __rebuild_from_document(self):
849
- """
850
- Rebuilds the database from the document.
851
- """
852
- self.cursor.execute("DELETE FROM cn")
853
- self.cursor.execute("DELETE FROM cnp")
854
- self.cursor.execute("DELETE FROM ft")
855
-
856
- self.__update_metadata()
857
- if self.document.content_node:
858
- self.__insert_node(self.document.content_node, None)
859
-
860
- def sync(self):
861
- """
862
- Synchronizes the database with the document.
863
- """
864
- self.__update_metadata()
865
- self.cursor.execute("pragma optimize")
866
- self.connection.commit()
867
- self.cursor.execute("VACUUM")
868
- self.cursor = self.connection.cursor()
869
- self.cursor.execute("PRAGMA journal_mode=OFF")
870
- self.cursor.execute("PRAGMA temp_store=MEMORY")
871
- self.cursor.execute("PRAGMA mmap_size=30000000000")
872
- self.cursor.execute("PRAGMA cache_size=10000")
873
- self.cursor.execute("PRAGMA page_size=4096")
874
-
875
- def dump_in_memory_db_to_file(self):
876
- # Connect to a new or existing database file
877
- disk_conn = sqlite3.connect(self.current_filename)
878
-
879
- # Use the backup API to copy the in-memory database to the disk file
880
- with disk_conn:
881
- self.connection.backup(disk_conn)
882
-
883
- # Close the file-based database connection
884
- disk_conn.close()
885
-
886
- def get_bytes(self):
887
- """
888
- Retrieves the document as bytes.
889
-
890
- Returns:
891
- bytes: The document as bytes.
892
- """
893
- self.sync()
894
-
895
- if self.inmemory:
896
- self.dump_in_memory_db_to_file()
897
-
898
- with open(self.current_filename, "rb") as f:
899
- return f.read()
900
-
901
- def get_features(self, node):
902
- """
903
- Retrieves the features of a given node.
904
-
905
- Args:
906
- node (Node): The node whose features are to be retrieved.
907
-
908
- Returns:
909
- list: A list of the features of the node.
910
- """
911
- # We need to get the features back
912
-
913
- features = []
914
- for feature in self.cursor.execute(
915
- "select id, cn_id, f_type, binary_value, single from ft where cn_id = ?",
916
- [node.uuid],
917
- ).fetchall():
918
- feature_type_name = self.feature_type_names[feature[2]]
919
- single = feature[4] == 1
920
- value = msgpack.unpackb(feature[3])
921
- features.append(
922
- ContentFeature(
923
- feature_type_name.split(":")[0],
924
- feature_type_name.split(":")[1],
925
- value,
926
- single=single,
927
- )
928
- )
929
-
930
- return features
931
-
932
- def update_content_parts(self, node, content_parts):
933
- """
934
- Updates the content parts of a given node.
935
-
936
- Args:
937
- node (Node): The node whose content parts are to be updated.
938
- content_parts (list): The new content parts of the node.
939
- """
940
- self.cursor.execute("delete from cnp where cn_id=?", [node.uuid])
941
-
942
- all_parts = []
943
- for idx, part in enumerate(content_parts):
944
- all_parts.append(
945
- [
946
- node.uuid,
947
- idx,
948
- part if isinstance(part, str) else None,
949
- part if not isinstance(part, str) else None,
950
- ]
951
- )
952
- self.cursor.executemany(CONTENT_NODE_PART_INSERT, all_parts)
953
-
954
    def remove_content_node(self, node):
        """
        Removes a node from the document.

        Deletes the node, its content parts, its features, and all of its
        non-virtual descendants in one batch.

        Args:
            node (Node): The node to be removed.

        Returns:
            list | None: The ids of the removed nodes, or None when the
            delete failed and was rolled back.
        """

        def get_all_node_ids(node):
            """
            This function recursively traverses a node tree, collecting the ids of all non-virtual nodes.
            """
            all_node_ids = []
            if not node.virtual:
                all_node_ids.append(node.uuid)  # Append the uuid directly, not as a list
            for child in node.get_children():
                all_node_ids.extend(get_all_node_ids(child))
            return all_node_ids

        all_child_ids = get_all_node_ids(node)
        parameter_tuples = [(id,) for id in all_child_ids]  # Prepare the parameters as tuples

        # Assuming `self.cursor` is part of a larger transaction management system
        try:
            self.cursor.executemany("delete from cnp where cn_id=?", parameter_tuples)
            self.cursor.executemany("delete from cn where id=?", parameter_tuples)
            self.cursor.executemany("delete from ft where cn_id=?", parameter_tuples)
            self.connection.commit()  # Commit the transaction if part of one
            return all_child_ids
        except Exception as e:
            self.connection.rollback()  # Rollback in case of error
            # NOTE(review): the error is logged and swallowed, so callers get
            # None on failure — confirm this best-effort contract is intended
            logger.error(f"An error occurred: {e}")
986
-
987
- def remove_all_features(self, node):
988
- """
989
- Removes all features from a given node.
990
-
991
- Args:
992
- node (Node): The node from which all features are to be removed.
993
- """
994
- self.cursor.execute("delete from ft where cn_id=?", [node.uuid])
995
-
996
- def remove_all_features_by_id(self, node_id):
997
- """
998
- Removes all features from a node by its id.
999
-
1000
- Args:
1001
- node_id (int): The id of the node from which all features are to be removed.
1002
- """
1003
- self.cursor.execute("delete from ft where cn_id=?", [node_id])
1004
-
1005
- def get_next_node_id(self):
1006
- """
1007
- Retrieves the next node id from the document.
1008
-
1009
- Returns:
1010
- int: The next node id.
1011
- """
1012
- next_id = self.cursor.execute("select max(id) from cn").fetchone()
1013
- if next_id[0] is None:
1014
- return 1
1015
-
1016
- return next_id[0] + 1
1017
-
1018
- def get_tagged_nodes(self, tag, tag_uuid=None):
1019
- """
1020
- Retrieves nodes with a given tag.
1021
-
1022
- Args:
1023
- tag (str): The tag of the nodes to be retrieved.
1024
- tag_uuid (str, optional): The uuid of the tag. Defaults to None.
1025
-
1026
- Returns:
1027
- list: A list of nodes with the given tag.
1028
- """
1029
- content_nodes = []
1030
- if tag_uuid is None:
1031
- query = f"select distinct(cn_id) from ft where f_type in (select id from f_type where name like 'tag:{tag}')"
1032
- else:
1033
- query = f"select distinct(cn_id) from ft where f_type in (select id from f_type where name like 'tag:{tag}') and tag_uuid = '{tag_uuid}'"
1034
- for content_node_ids in self.cursor.execute(query).fetchall():
1035
- content_nodes.append(self.get_node(content_node_ids[0]))
1036
-
1037
- return content_nodes
1038
-
1039
- def add_model_insight(self, model_insights: ModelInsight):
1040
- """
1041
- Adds a model insight to the document.
1042
-
1043
- Args:
1044
- model_insights (ModelInsight): The model insight to be added.
1045
- """
1046
- self.cursor.execute(MODEL_INSIGHT_INSERT, [model_insights.json()])
1047
-
1048
- def get_model_insights(self) -> List[ModelInsight]:
1049
- """
1050
- Retrieves all model insights from the document.
1051
-
1052
- Returns:
1053
- list: A list of all model insights in the document.
1054
- """
1055
- model_insights = []
1056
- for model_insight in self.cursor.execute(MODEL_INSIGHT_SELECT).fetchall():
1057
- model_insights.append(ModelInsight.model_validate_json(model_insight[0]))
1058
-
1059
- return model_insights
1060
-
1061
- def add_exception(self, exception: ContentException):
1062
- """
1063
- Adds an exception to the document.
1064
-
1065
- Args:
1066
- exception (ContentException): The exception to be added.
1067
- """
1068
- # Add an exception to the exception table
1069
- self.cursor.execute(
1070
- EXCEPTION_INSERT,
1071
- [
1072
- exception.tag,
1073
- exception.message,
1074
- exception.exception_details,
1075
- exception.group_uuid,
1076
- exception.tag_uuid,
1077
- exception.exception_type,
1078
- exception.severity,
1079
- exception.node_uuid,
1080
- exception.exception_type_id,
1081
- ],
1082
- )
1083
-
1084
- def get_exceptions(self) -> List[ContentException]:
1085
- """
1086
- Retrieves all exceptions from the document.
1087
-
1088
- Returns:
1089
- list: A list of all exceptions in the document.
1090
- """
1091
- exceptions = []
1092
- for exception in self.cursor.execute(EXCEPTION_SELECT).fetchall():
1093
- exceptions.append(
1094
- ContentException(
1095
- tag=exception[0],
1096
- message=exception[1],
1097
- exception_details=exception[2],
1098
- group_uuid=exception[3],
1099
- tag_uuid=exception[4],
1100
- exception_type=exception[5],
1101
- severity=exception[6],
1102
- node_uuid=exception[7],
1103
- exception_type_id=exception[8],
1104
- )
1105
- )
1106
- return exceptions
1107
-
1108
- def replace_exceptions(self, exceptions: List[ContentException]):
1109
- """
1110
- Replaces all exceptions in the document with a given list of exceptions.
1111
-
1112
- Args:
1113
- exceptions (list): The new list of exceptions.
1114
- """
1115
- self.cursor.execute("delete from content_exceptions")
1116
- for exception in exceptions:
1117
- self.add_exception(exception)
1118
-
1119
- def clear_model_insights(self):
1120
- """
1121
- Clears all model insights from the document.
1122
- """
1123
- self.cursor.execute("delete from model_insights")
1124
-
1125
- def get_all_tagged_nodes(self):
1126
- """
1127
- Retrieves all nodes with tags from the document.
1128
-
1129
- Returns:
1130
- list: A list of all nodes with tags in the document.
1131
- """
1132
- content_nodes = []
1133
- query = "select distinct(cn_id) from ft where f_type in (select id from f_type where name like 'tag:%')"
1134
- for content_node_ids in self.cursor.execute(query).fetchall():
1135
- content_nodes.append(self.get_node(content_node_ids[0]))
1136
-
1137
- return content_nodes
1138
-
1139
- def get_nodes_by_type(self, node_type):
1140
- """
1141
- Retrieves nodes of a given type from the document.
1142
-
1143
- Args:
1144
- node_type (str): The type of the nodes to be retrieved.
1145
-
1146
- Returns:
1147
- list: A list of nodes of the given type.
1148
- """
1149
- content_nodes = []
1150
-
1151
- node_type_id = self.node_type_id_by_name.get(node_type)
1152
-
1153
- query = "select id, pid, nt, idx from cn where nt = ? order by idx"
1154
- for content_node in self.cursor.execute(query, [node_type_id]).fetchall():
1155
- content_nodes.append(self.__build_node(content_node))
1156
-
1157
- return content_nodes
1158
-
1159
- def __ensure_validations_table_exists(self):
1160
- """
1161
- Ensure the 'validations' table exists in the database.
1162
- Creates the table if it does not exist and initializes it with an empty list.
1163
- """
1164
- self.cursor.execute("""
1165
- CREATE TABLE IF NOT EXISTS validations (
1166
- obj BLOB
1167
- )
1168
- """)
1169
-
1170
- # Check if the table has any rows, if not, insert an initial empty row
1171
- result = self.cursor.execute("SELECT COUNT(*) FROM validations").fetchone()
1172
- if result[0] == 0:
1173
- self.cursor.execute("INSERT INTO validations (obj) VALUES (?)", [sqlite3.Binary(msgpack.packb([]))])
1174
-
1175
- def set_validations(self, validations: List[DocumentTaxonValidation]):
1176
- """
1177
- Sets the validations for the document.
1178
-
1179
- Args:
1180
- validations (List[DocumentTaxonValidation]): The validations to store.
1181
- """
1182
- self.__ensure_validations_table_exists()
1183
- serialized_data = sqlite3.Binary(msgpack.packb([v.model_dump(by_alias=True) for v in validations]))
1184
- self.cursor.execute("UPDATE validations SET obj = ? WHERE rowid = 1", [serialized_data])
1185
- self.connection.commit()
1186
-
1187
- def get_validations(self) -> List[DocumentTaxonValidation]:
1188
- """
1189
- Gets the validations associated with this document.
1190
-
1191
- Returns:
1192
- List[DocumentTaxonValidation]: The list of validations stored in the validations table.
1193
- """
1194
- self.__ensure_validations_table_exists()
1195
- result = self.cursor.execute("SELECT obj FROM validations WHERE rowid = 1").fetchone()
1196
- if result and result[0]:
1197
- return [DocumentTaxonValidation.model_validate(v) for v in msgpack.unpackb(result[0])]
1198
- return []
1199
-
1200
- def set_external_data(self, external_data: dict, key: str = "default"):
1201
- """
1202
- Sets the external data for the document for a specific key.
1203
-
1204
- Args:
1205
- external_data (dict): The external data to store, must be JSON serializable.
1206
- key (str): The key to store the data under, defaults to "default"
1207
- """
1208
- self.__ensure_ed_table_exists()
1209
- serialized_data = sqlite3.Binary(msgpack.packb(external_data))
1210
- self.cursor.execute("DELETE FROM ed WHERE key = ?", [key])
1211
- self.cursor.execute("INSERT INTO ed (key, obj) VALUES (?, ?)", [key, serialized_data])
1212
- self.connection.commit()
1213
-
1214
- def get_external_data(self, key: str = "default") -> dict:
1215
- """
1216
- Gets the external data associated with this document for a specific key.
1217
-
1218
- Args:
1219
- key (str): The key to retrieve data for, defaults to "default"
1220
-
1221
- Returns:
1222
- dict: The external data stored in the ed table for the given key.
1223
- """
1224
- self.__ensure_ed_table_exists()
1225
- result = self.cursor.execute("SELECT obj FROM ed WHERE key = ?", [key]).fetchone()
1226
- if result and result[0]:
1227
- return msgpack.unpackb(result[0])
1228
- return {}
1229
-
1230
- def get_external_data_keys(self) -> List[str]:
1231
- """
1232
- Gets all keys under which external data is stored.
1233
-
1234
- Returns:
1235
- List[str]: A list of all keys that have external data stored.
1236
- """
1237
- self.__ensure_ed_table_exists()
1238
- results = self.cursor.execute("SELECT key FROM ed").fetchall()
1239
- return [row[0] for row in results]
1240
-
1241
- def __ensure_ed_table_exists(self):
1242
- """
1243
- Ensure the 'ed' table exists in the database.
1244
- Creates the table if it does not exist.
1245
- """
1246
- # First check if the old table exists and has key column
1247
- old_table = self.cursor.execute("""
1248
- SELECT name FROM sqlite_master
1249
- WHERE type='table' AND name='ed'
1250
- """).fetchone()
1251
-
1252
- if old_table:
1253
- # Check if table has key column
1254
- table_info = self.cursor.execute("PRAGMA table_info(ed)").fetchall()
1255
- has_key_column = any(col[1] == 'key' for col in table_info)
1256
-
1257
- if not has_key_column:
1258
- # Get the old data and drop the table
1259
- data = self.cursor.execute("SELECT obj FROM ed").fetchone()
1260
- self.cursor.execute("DROP TABLE ed")
1261
-
1262
- # Create new table with key column
1263
- self.cursor.execute("""
1264
- CREATE TABLE ed (
1265
- key TEXT PRIMARY KEY,
1266
- obj BLOB
1267
- )
1268
- """)
1269
-
1270
- # If there was data in the old table, insert it with default key
1271
- if data:
1272
- self.cursor.execute("INSERT INTO ed (key, obj) VALUES (?, ?)",
1273
- ["default", data[0]])
1274
- else:
1275
- # Table exists and has key column - do nothing
1276
- return
1277
- else:
1278
- # Create new table if it doesn't exist
1279
- self.cursor.execute("""
1280
- CREATE TABLE IF NOT EXISTS ed (
1281
- key TEXT PRIMARY KEY,
1282
- obj BLOB
1283
- )
1284
- """)
1285
-
1286
- # Check if default key exists, if not insert empty data
1287
- result = self.cursor.execute("SELECT COUNT(*) FROM ed WHERE key = 'default'").fetchone()
1288
- if result[0] == 0:
1289
- self.cursor.execute("INSERT INTO ed (key, obj) VALUES (?, ?)",
1290
- ["default", sqlite3.Binary(msgpack.packb({}))])
1291
-
1292
- def __ensure_steps_table_exists(self):
1293
- """
1294
- Ensure the 'steps' table exists in the database.
1295
- Creates the table if it does not exist.
1296
- """
1297
- self.cursor.execute("""
1298
- CREATE TABLE IF NOT EXISTS steps (
1299
- obj BLOB
1300
- )
1301
- """)
1302
-
1303
- # Check if the table has any rows, if not, insert an initial empty row
1304
- result = self.cursor.execute("SELECT COUNT(*) FROM steps").fetchone()
1305
- if result[0] == 0:
1306
- self.cursor.execute("INSERT INTO steps (obj) VALUES (?)", [sqlite3.Binary(msgpack.packb([]))])
1307
-
1308
- def set_steps(self, steps: List[ProcessingStep]):
1309
- """
1310
- Sets the processing steps for the document.
1311
-
1312
- Args:
1313
- steps (List[ProcessingStep]): A list of ProcessingStep objects to store.
1314
- """
1315
- self.__ensure_steps_table_exists()
1316
- serialized_steps = [step.to_dict() for step in steps]
1317
- packed_data = sqlite3.Binary(msgpack.packb(serialized_steps))
1318
- self.cursor.execute("UPDATE steps SET obj = ? WHERE rowid = 1", [packed_data])
1319
- self.connection.commit()
1320
-
1321
- def get_steps(self) -> List[ProcessingStep]:
1322
- """
1323
- Gets the processing steps associated with this document.
1324
-
1325
- Returns:
1326
- List[ProcessingStep]: A list of ProcessingStep objects.
1327
- """
1328
- self.__ensure_steps_table_exists()
1329
- result = self.cursor.execute("SELECT obj FROM steps WHERE rowid = 1").fetchone()
1330
- if result and result[0]:
1331
- unpacked_data = msgpack.unpackb(result[0])
1332
- return [ProcessingStep(**step) for step in unpacked_data]
1333
- return []
1334
-
1335
-
1336
- class SimpleObjectCache(object):
1337
- """
1338
- A simple cache based on ID'd objects, where we will build ID's for new
1339
- objects, store them and also a dirty flag so that it is easy to pull all
1340
- dirty objects and store them as needed.
1341
- """
1342
-
1343
- """
1344
- A simple cache based on ID'd objects, where we will build ID's for new
1345
- objects, store them and also a dirty flag so that it is easy to pull all
1346
- dirty objects and store them as needed.
1347
- """
1348
- """
1349
- A simple cache based on ID'd objects, where we will build ID's for new
1350
- objects, store them and also a dirty flag so that it is easy to pull all
1351
- dirty objects and store them as needed
1352
- """
1353
-
1354
- def __init__(self):
1355
- self.objs = {}
1356
- self.next_id = 1
1357
- self.dirty_objs = set()
1358
-
1359
- def get_obj(self, obj_id) -> Optional[ContentNode]:
1360
- """
1361
- Get the object with the given ID.
1362
-
1363
- Args:
1364
- obj_id (int): The ID of the object.
1365
-
1366
- Returns:
1367
- object: The object with the given ID if it exists, None otherwise.
1368
- """
1369
- if obj_id in self.objs:
1370
- return self.objs[obj_id]
1371
-
1372
- return None
1373
-
1374
- def add_obj(self, obj: ContentNode):
1375
- """
1376
- Add an object to the cache.
1377
-
1378
- Args:
1379
- obj (object): The object to add. If the object does not have a uuid, one will be assigned.
1380
- """
1381
- if obj.uuid is None:
1382
- obj.uuid = self.next_id
1383
- self.next_id += 1
1384
- self.objs[obj.uuid] = obj
1385
- self.dirty_objs.add(obj.uuid)
1386
-
1387
- def remove_obj(self, obj: ContentNode):
1388
- """
1389
- Remove an object from the cache.
1390
-
1391
- Args:
1392
- obj (object): The object to remove.
1393
- """
1394
- if obj and obj.uuid in self.objs:
1395
- self.objs.pop(obj.uuid)
1396
- if obj.uuid in self.dirty_objs:
1397
- self.dirty_objs.remove(obj.uuid)
1398
-
1399
- def get_dirty_objs(self) -> list[ContentNode]:
1400
- """
1401
- Get all dirty objects in the cache.
1402
-
1403
- Returns:
1404
- list: A list of all dirty objects in the cache.
1405
- """
1406
- results = []
1407
- for set_id in set(self.dirty_objs):
1408
- node = self.get_obj(set_id)
1409
- if node is not None:
1410
- results.append(node)
1411
- return results
1412
-
1413
- def undirty(self, obj):
1414
- """
1415
- Mark an object as not dirty.
1416
-
1417
- Args:
1418
- obj (object): The object to mark as not dirty.
1419
- """
1420
- self.dirty_objs.remove(obj.uuid)
1421
-
1422
-
1423
- class PersistenceManager(object):
1424
- """
1425
- The persistence manager supports holding the document and only flushing objects to the persistence layer
1426
- as needed. This is implemented to allow us to work with large complex documents in a performance centered way.
1427
-
1428
- Attributes:
1429
- document (Document): The document to be managed.
1430
- node_cache (SimpleObjectCache): Cache for nodes.
1431
- child_cache (dict): Cache for child nodes.
1432
- child_id_cache (dict): Cache for child node IDs.
1433
- feature_cache (dict): Cache for features.
1434
- content_parts_cache (dict): Cache for content parts.
1435
- node_parent_cache (dict): Cache for node parents.
1436
- _underlying_persistence (SqliteDocumentPersistence): The underlying persistence layer.
1437
- """
1438
-
1439
- """
1440
- The persistence manager supports holding the document and only flushing objects to the persistence layer
1441
- as needed. This is implemented to allow us to work with large complex documents in a performance centered way.
1442
-
1443
- Attributes:
1444
- document (Document): The document to be managed.
1445
- node_cache (SimpleObjectCache): Cache for nodes.
1446
- child_cache (dict): Cache for child nodes.
1447
- child_id_cache (dict): Cache for child node IDs.
1448
- feature_cache (dict): Cache for features.
1449
- content_parts_cache (dict): Cache for content parts.
1450
- node_parent_cache (dict): Cache for node parents.
1451
- _underlying_persistence (SqliteDocumentPersistence): The underlying persistence layer.
1452
- """
1453
- """
1454
- The persistence manager supports holding the document and only flushing objects to the persistence layer
1455
- as needed.
1456
-
1457
- This is implemented to allow us to work with large complex documents in a performance centered way.
1458
- """
1459
-
1460
- def __init__(self, document: Document, filename: str = None, delete_on_close=False, inmemory=False):
1461
- self.document = document
1462
- self.node_cache = SimpleObjectCache()
1463
- self.child_cache = {}
1464
- self.child_id_cache = {}
1465
- self.feature_cache = {}
1466
- self.content_parts_cache = {}
1467
- self.node_parent_cache = {}
1468
-
1469
- self._underlying_persistence = SqliteDocumentPersistence(
1470
- document, filename, delete_on_close, inmemory=inmemory, persistence_manager=self
1471
- )
1472
-
1473
- def get_steps(self) -> list[ProcessingStep]:
1474
- """
1475
- Gets the processing steps for this document
1476
-
1477
- :return:
1478
- """
1479
- return self._underlying_persistence.get_steps()
1480
-
1481
- def set_steps(self, steps: list[ProcessingStep]):
1482
- self._underlying_persistence.set_steps(steps)
1483
-
1484
- def set_validations(self, validations: list[DocumentTaxonValidation]):
1485
- self._underlying_persistence.set_validations(validations)
1486
-
1487
- def get_validations(self) -> list[DocumentTaxonValidation]:
1488
- return self._underlying_persistence.get_validations()
1489
-
1490
- def get_external_data(self, key="default") -> dict:
1491
- """
1492
- Gets the external data object associated with this document
1493
-
1494
- :return: dict of the external data
1495
- """
1496
- return self._underlying_persistence.get_external_data(key)
1497
-
1498
- def get_external_data_keys(self) -> List[str]:
1499
- """
1500
- Gets all keys under which external data is stored.
1501
-
1502
- Returns:
1503
- List[str]: A list of all keys that have external data stored.
1504
- """
1505
- return self._underlying_persistence.get_external_data_keys()
1506
-
1507
- def set_external_data(self, external_data:dict, key="default"):
1508
- """
1509
- Sets the external data for this document
1510
-
1511
- :param external_data: dict representing the external data, must be JSON serializable
1512
- :return:
1513
- """
1514
- self._underlying_persistence.set_external_data(external_data, key)
1515
-
1516
- def get_nodes_by_type(self, node_type: str) -> List[ContentNode]:
1517
- """
1518
- Retrieves all nodes of a given type from the underlying persistence layer.
1519
-
1520
- Args:
1521
- node_type (str): The type of the nodes to be retrieved.
1522
-
1523
- Returns:
1524
- List[ContentNode]: A list of all nodes of the given type.
1525
- """
1526
- return self._underlying_persistence.get_nodes_by_type(node_type)
1527
-
1528
- def get_node_by_uuid(self, uuid: int) -> ContentNode:
1529
- """
1530
- Retrieves a node by its uuid.
1531
-
1532
- Args:
1533
- uuid (str): The uuid of the node to be retrieved.
1534
-
1535
- Returns:
1536
- ContentNode: The node with the given uuid.
1537
- """
1538
- if self.node_cache.get_obj(uuid) is None:
1539
- node = self._underlying_persistence.get_node(uuid)
1540
- if node:
1541
- self.node_cache.add_obj(node)
1542
- return node
1543
-
1544
- return self.node_cache.get_obj(uuid) # return the cached version
1545
-
1546
- def add_model_insight(self, model_insight: ModelInsight):
1547
- """
1548
- Adds a model insight to the underlying persistence layer.
1549
-
1550
- Args:
1551
- model_insight (ModelInsight): The model insight to be added.
1552
- """
1553
- self._underlying_persistence.add_model_insight(model_insight)
1554
-
1555
- def clear_model_insights(self):
1556
- """
1557
- Clears all model insights from the underlying persistence layer.
1558
- """
1559
- self._underlying_persistence.clear_model_insights()
1560
-
1561
- def get_model_insights(self) -> List[ModelInsight]:
1562
- """
1563
- Retrieves all model insights from the underlying persistence layer.
1564
-
1565
- Returns:
1566
- List[ModelInsight]: A list of all model insights.
1567
- """
1568
- return self._underlying_persistence.get_model_insights()
1569
-
1570
- def add_exception(self, exception: ContentException):
1571
- """
1572
- Adds an exception to the underlying persistence layer.
1573
-
1574
- Args:
1575
- exception (ContentException): The exception to be added.
1576
- """
1577
- self._underlying_persistence.add_exception(exception)
1578
-
1579
- def get_exceptions(self) -> List[ContentException]:
1580
- """
1581
- Retrieves all exceptions from the underlying persistence layer.
1582
-
1583
- Returns:
1584
- List[ContentException]: A list of all exceptions.
1585
- """
1586
- return self._underlying_persistence.get_exceptions()
1587
-
1588
- def replace_exceptions(self, exceptions: List[ContentException]):
1589
- """
1590
- Replaces all exceptions in the underlying persistence layer with the provided list.
1591
-
1592
- Args:
1593
- exceptions (List[ContentException]): The list of exceptions to replace with.
1594
- """
1595
- self._underlying_persistence.replace_exceptions(exceptions)
1596
-
1597
- def get_all_tags(self):
1598
- """
1599
- Retrieves all tags from the underlying persistence layer.
1600
-
1601
- Returns:
1602
- List[str]: A list of all tags.
1603
- """
1604
- return self._underlying_persistence.get_all_tags()
1605
-
1606
- def get_tagged_nodes(self, tag, tag_uuid=None):
1607
- """
1608
- Retrieves all nodes tagged with the specified tag from the underlying persistence layer.
1609
-
1610
- Args:
1611
- tag (str): The tag to filter nodes by.
1612
- tag_uuid (str, optional): The UUID of the tag to filter nodes by. Defaults to None.
1613
-
1614
- Returns:
1615
- List[Node]: A list of nodes tagged with the specified tag.
1616
- """
1617
- return self._underlying_persistence.get_tagged_nodes(tag, tag_uuid)
1618
-
1619
- def get_all_tagged_nodes(self):
1620
- """
1621
- Retrieves all tagged nodes from the underlying persistence layer.
1622
-
1623
- Returns:
1624
- List[Node]: A list of all tagged nodes.
1625
- """
1626
- return self._underlying_persistence.get_all_tagged_nodes()
1627
-
1628
- def initialize(self):
1629
- """
1630
- Initializes the persistence manager by setting up the underlying persistence layer and node cache.
1631
- """
1632
- self._underlying_persistence.initialize()
1633
-
1634
- self.node_cache.next_id = self._underlying_persistence.get_next_node_id()
1635
-
1636
- def get_parent(self, node):
1637
- """
1638
- Retrieves the parent of the specified node.
1639
-
1640
- Args:
1641
- node (Node): The node to get the parent of.
1642
-
1643
- Returns:
1644
- Node: The parent of the specified node.
1645
- """
1646
- if node.uuid in self.node_parent_cache:
1647
- return self.node_cache.get_obj(self.node_parent_cache[node.uuid])
1648
-
1649
- return self._underlying_persistence.get_parent(node)
1650
-
1651
- def close(self):
1652
- """
1653
- Closes the underlying persistence layer.
1654
- """
1655
- self._underlying_persistence.close()
1656
-
1657
- @monitor_performance
1658
- def flush_cache(self):
1659
- """
1660
- Flushes the cache by merging it with the underlying persistence layer.
1661
- """
1662
- all_node_ids = []
1663
- all_nodes = []
1664
- all_content_parts = []
1665
- all_features = []
1666
- node_id_with_features = []
1667
- dirty_nodes = self.node_cache.get_dirty_objs()
1668
-
1669
- if len(dirty_nodes) == 0:
1670
- return
1671
-
1672
- if not self._underlying_persistence.connection.in_transaction:
1673
- self._underlying_persistence.connection.execute("BEGIN TRANSACTION")
1674
-
1675
- next_feature_id = self._underlying_persistence.get_max_feature_id()
1676
- for node in dirty_nodes:
1677
- if not node.virtual:
1678
- all_node_ids.append([node.uuid])
1679
- node_obj, content_parts = self._underlying_persistence.add_content_node(
1680
- node, None, execute=False
1681
- )
1682
- all_nodes.extend(node_obj)
1683
- all_content_parts.extend(content_parts)
1684
- if node.uuid in self.feature_cache:
1685
- if node.uuid in self.feature_cache:
1686
- node_id_with_features.append([node.uuid])
1687
-
1688
- for feature in self.feature_cache[node.uuid]:
1689
- binary_value = sqlite3.Binary(
1690
- msgpack.packb(feature.value, use_bin_type=True)
1691
- )
1692
-
1693
- tag_uuid = None
1694
- if feature.feature_type == "tag" and "uuid" in feature.value[0]:
1695
- tag_uuid = feature.value[0]["uuid"]
1696
-
1697
- all_features.append(
1698
- [
1699
- next_feature_id,
1700
- node.uuid,
1701
- self._underlying_persistence.get_feature_type_id(
1702
- feature
1703
- ),
1704
- binary_value,
1705
- feature.single,
1706
- tag_uuid,
1707
- ]
1708
- )
1709
- next_feature_id = next_feature_id + 1
1710
-
1711
- self.node_cache.undirty(node)
1712
-
1713
- self._underlying_persistence.cursor.executemany(
1714
- "DELETE FROM cn where id=?", all_node_ids
1715
- )
1716
- self._underlying_persistence.cursor.executemany(
1717
- "DELETE FROM ft where cn_id=?", node_id_with_features
1718
- )
1719
- self._underlying_persistence.cursor.executemany(
1720
- "INSERT INTO cn (pid, nt, idx, id) VALUES (?,?,?,?)", all_nodes
1721
- )
1722
- self._underlying_persistence.cursor.executemany(
1723
- "DELETE FROM cnp where cn_id=?", all_node_ids
1724
- )
1725
- self._underlying_persistence.cursor.executemany(
1726
- CONTENT_NODE_PART_INSERT, all_content_parts
1727
- )
1728
- self._underlying_persistence.cursor.executemany(FEATURE_INSERT, all_features)
1729
- self._underlying_persistence.connection.commit()
1730
-
1731
- def get_content_nodes(self, node_type, parent_node, include_children):
1732
- """
1733
- Retrieves content nodes of the specified type and parent from the underlying persistence layer.
1734
-
1735
- Args:
1736
- node_type (str): The type of nodes to retrieve.
1737
- parent_node (Node): The parent node to filter nodes by.
1738
- include_children (bool): Whether to include child nodes.
1739
-
1740
- Returns:
1741
- List[Node]: A list of nodes that match the specified criteria.
1742
- """
1743
- return self._underlying_persistence.get_content_nodes(
1744
- node_type, parent_node, include_children
1745
- )
1746
-
1747
- def get_bytes(self):
1748
- """
1749
- Retrieves the bytes of the document from the underlying persistence layer.
1750
-
1751
- Returns:
1752
- bytes: The bytes of the document.
1753
- """
1754
- self.flush_cache()
1755
- self._underlying_persistence.sync()
1756
- return self._underlying_persistence.get_bytes()
1757
-
1758
- def update_metadata(self):
1759
- """
1760
- Updates the metadata in the underlying persistence layer.
1761
- """
1762
- self._underlying_persistence.update_metadata()
1763
-
1764
- def add_content_node(self, node, parent):
1765
- """
1766
- Adds a content node to the cache and updates the child and parent caches accordingly.
1767
-
1768
- Args:
1769
- node (Node): The node to be added.
1770
- parent (Node): The parent of the node to be added.
1771
- """
1772
-
1773
- if node.index is None:
1774
- node.index = 0
1775
-
1776
- # Check if the node exists in the DB
1777
- if node.uuid is None:
1778
- node.uuid = self.node_cache.next_id
1779
- self.node_cache.next_id += 1
1780
-
1781
- if self._underlying_persistence.get_node(node.uuid) is None:
1782
- self._underlying_persistence.add_content_node(node, parent)
1783
-
1784
- if parent:
1785
- node._parent_uuid = parent.uuid
1786
- self.node_cache.add_obj(parent)
1787
-
1788
- self.node_cache.add_obj(node)
1789
-
1790
- update_child_cache = False
1791
-
1792
- if node.uuid not in self.node_parent_cache:
1793
- self.node_parent_cache[node.uuid] = node._parent_uuid
1794
- update_child_cache = True
1795
-
1796
- if (
1797
- node.uuid in self.node_parent_cache
1798
- and node._parent_uuid != self.node_parent_cache[node.uuid]
1799
- ):
1800
- # Remove from the old parent
1801
- self.child_id_cache[self.node_parent_cache[node.uuid]].remove(node.uuid)
1802
- self.child_cache[self.node_parent_cache[node.uuid]].remove(node)
1803
- # Add to the new parent
1804
- self.node_parent_cache[node.uuid] = node._parent_uuid
1805
- update_child_cache = True
1806
-
1807
- if update_child_cache:
1808
- if node._parent_uuid not in self.child_cache:
1809
- self.child_cache[node._parent_uuid] = [node]
1810
- self.child_id_cache[node._parent_uuid] = {node.uuid}
1811
- else:
1812
- if node.uuid not in self.child_id_cache[node._parent_uuid]:
1813
- self.child_id_cache[node._parent_uuid].add(node.uuid)
1814
- current_children = self.child_cache[node._parent_uuid]
1815
- if (
1816
- len(current_children) == 0
1817
- or node.index >= current_children[-1].index
1818
- ):
1819
- self.child_cache[node._parent_uuid].append(node)
1820
- else:
1821
- self.child_cache[node._parent_uuid].append(node)
1822
- self.child_cache[node._parent_uuid] = sorted(
1823
- self.child_cache[node._parent_uuid], key=lambda x: x.index
1824
- )
1825
-
1826
- def get_node(self, node_id):
1827
- """
1828
- Retrieves a node by its ID from the cache or the underlying persistence layer.
1829
-
1830
- Args:
1831
- node_id (str): The ID of the node to retrieve.
1832
-
1833
- Returns:
1834
- Node: The node with the specified ID.
1835
- """
1836
-
1837
- node = self.node_cache.get_obj(node_id)
1838
- if node is None:
1839
- node = self._underlying_persistence.get_node(node_id)
1840
- if node is not None:
1841
- self.node_cache.add_obj(node)
1842
- if node._parent_uuid:
1843
- self.node_parent_cache[node.uuid] = node._parent_uuid
1844
- if node._parent_uuid not in self.child_id_cache:
1845
- self.get_node(node._parent_uuid)
1846
-
1847
- return node
1848
-
1849
- def remove_content_node(self, node):
1850
- """
1851
- Removes a content node from the cache and the underlying persistence layer.
1852
-
1853
- Args:
1854
- node (Node): The node to be removed.
1855
- """
1856
-
1857
- self.node_cache.remove_obj(node)
1858
-
1859
- if node.uuid in self.node_parent_cache:
1860
- try:
1861
- self.child_cache[self.node_parent_cache[node.uuid]].remove(node)
1862
- except ValueError:
1863
- pass
1864
- except KeyError:
1865
- pass
1866
-
1867
- # We have a sitation where we seem to fail here?
1868
- try:
1869
- self.child_id_cache[self.node_parent_cache[node.uuid]].remove(node.uuid)
1870
- except ValueError:
1871
- pass
1872
- except KeyError:
1873
- pass
1874
- del self.node_parent_cache[node.uuid]
1875
-
1876
- self.content_parts_cache.pop(node.uuid, None)
1877
- self.feature_cache.pop(node.uuid, None)
1878
-
1879
- all_ids = self._underlying_persistence.remove_content_node(node)
1880
-
1881
- # remove all the ids from the cache
1882
- for id in all_ids:
1883
- tmp_node = self.node_cache.get_obj(id)
1884
- if tmp_node is not None:
1885
- self.node_cache.remove_obj(tmp_node)
1886
- self.node_cache.dirty_objs.remove(id) if id in self.node_cache.dirty_objs else None
1887
-
1888
def get_children(self, node):
    """
    Retrieves the children of the specified node from the cache or the underlying persistence layer.

    Args:
        node (Node): The node to get the children of.

    Returns:
        List[Node]: The children of the specified node, ordered by index.
    """
    uuid = node.uuid

    if uuid in self.child_id_cache:
        child_ids = self.child_id_cache[uuid]
    else:
        child_ids = self._underlying_persistence.get_child_ids(node)

    if uuid not in self.child_cache:
        # Resolve each child, preferring the cached object and falling
        # back to a full lookup (which also populates the cache).
        resolved = []
        for child_id in child_ids:
            cached = self.node_cache.get_obj(child_id)
            resolved.append(cached if cached is not None else self.get_node(child_id))

        resolved.sort(key=lambda child: child.index)
        self.child_cache[uuid] = resolved
        self.child_id_cache[uuid] = set(child_ids)

    return self.child_cache[uuid]
1918
-
1919
def update_node(self, node):
    """
    Updates a node in the cache and the underlying persistence layer.

    Args:
        node (Node): The node to be updated.
    """
    # Keep the parent-lookup cache in sync with the node's current
    # parent before delegating the write.
    self.node_parent_cache[node.uuid] = node._parent_uuid

    self._underlying_persistence.update_node(node)
1930
-
1931
def update_content_parts(self, node, content_parts):
    """
    Updates the content parts of a node in the cache.

    Note: this only writes to the local cache, not directly to the
    underlying persistence layer.

    Args:
        node (Node): The node to update the content parts of.
        content_parts (List[ContentPart]): The new content parts of the node.
    """
    cache = self.content_parts_cache
    cache[node.uuid] = content_parts
1940
-
1941
def get_content_parts(self, node):
    """
    Retrieves the content parts of a node from the cache or the underlying persistence layer.

    Args:
        node (Node): The node to get the content parts of.

    Returns:
        List[ContentPart]: The content parts of the node (empty list for
        a node without a uuid).
    """
    if node.uuid is None:
        return []

    parts = self.content_parts_cache.get(node.uuid)

    if parts is None:
        # Cache miss: fetch from the persistence layer and memoize any
        # non-None result.
        parts = self._underlying_persistence.get_content_parts(node)
        if parts is not None:
            self.content_parts_cache[node.uuid] = parts

    return parts
1965
-
1966
def remove_feature(self, node, feature_type, name):
    """
    Removes a feature from a node in the cache and the underlying persistence layer.

    Args:
        node (Node): The node to remove the feature from.
        feature_type (str): The type of the feature to remove.
        name (str): The name of the feature to remove.
    """
    current = self.get_features(node)
    self._underlying_persistence.remove_feature(node, feature_type, name)

    # Drop every feature matching both type and name from the cached copy.
    self.feature_cache[node.uuid] = [
        feature
        for feature in current
        if feature.feature_type != feature_type or feature.name != name
    ]

    # Re-add the node to the node cache so the change is tracked there.
    self.node_cache.add_obj(node)
1985
-
1986
def get_features(self, node):
    """
    Retrieves the features of a node from the cache or the underlying persistence layer.

    Args:
        node (Node): The node to get the features of.

    Returns:
        List[Feature]: The features of the node.
    """
    try:
        return self.feature_cache[node.uuid]
    except KeyError:
        # Cache miss: load from the persistence layer and memoize.
        fetched = self._underlying_persistence.get_features(node)
        self.feature_cache[node.uuid] = fetched
        return fetched
2002
-
2003
def add_feature(self, node, feature):
    """
    Adds a feature to a node in the feature cache.

    The feature is only recorded in the local caches here; the node is
    re-added to the node cache, presumably so the change is flushed to
    the underlying persistence layer later (TODO confirm flush path).

    Args:
        node (Node): The node to add the feature to.
        feature (Feature): The feature to be added.
    """
    # Delegate cache-miss handling to get_features instead of
    # duplicating the load-and-memoize logic inline.
    features = self.get_features(node)

    self.node_cache.add_obj(node)
    features.append(feature)