kodexa-document 7.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kodexa-document might be problematic. Click here for more details.

@@ -0,0 +1,2057 @@
1
+ import dataclasses
2
+ import json
3
+ import logging
4
+ import pathlib
5
+ import sqlite3
6
+ import tempfile
7
+ import time
8
+ import uuid
9
+ from typing import List, Optional
10
+
11
+ import msgpack
12
+
13
+ from kodexa_document.model import (
14
+ Document,
15
+ ContentNode,
16
+ SourceMetadata,
17
+ ContentFeature,
18
+ Tag,
19
+ )
20
+ from kodexa_document.model import (
21
+ DocumentMetadata,
22
+ ContentException,
23
+ ModelInsight,
24
+ ProcessingStep,
25
+ )
26
+ from kodexa_document.model import DocumentTaxonValidation
27
+
28
# Module-wide logger. getLogger() called without a name returns the root logger.
logger = logging.getLogger()

# Configuration constants
CACHE_SIZE = 10000  # Number of nodes to cache
BATCH_SIZE = 1000  # Size of batches for bulk operations
SLOW_QUERY_THRESHOLD = 1.0  # Seconds; monitor_performance warns about calls slower than this
MAX_CONNECTIONS = 5  # Maximum number of database connections
35
+
36
+
37
def monitor_performance(func):
    """Decorate ``func`` so that slow calls are reported.

    Every invocation of the returned wrapper is timed. When a call takes
    longer than ``SLOW_QUERY_THRESHOLD`` seconds, a warning naming the function
    and the elapsed time is written to the module logger.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns its result unchanged. Exceptions raised by
        ``func`` propagate untouched (no warning is logged for such calls).
    """
    import functools

    # functools.wraps copies __name__, __doc__ etc. onto the wrapper so that
    # introspection and tracebacks still identify the decorated function.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot become
        # negative or inflated when the wall clock is adjusted mid-call.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        duration = time.perf_counter() - start_time
        if duration > SLOW_QUERY_THRESHOLD:
            logger.warning(
                f"Slow operation detected: {func.__name__}, duration: {duration}s"
            )
        return result

    return wrapper
51
+
52
+
53
+ class SqliteDocumentPersistence(object):
54
+ """
55
+ The Sqlite persistence engine to support large scale documents (part of the V4 Kodexa Document Architecture)
56
+ using Peewee ORM
57
+ """
58
+
59
def __init__(
    self,
    document: Document,
    filename: str = None,
    delete_on_close=False,
    inmemory=False,
    persistence_manager=None,
):
    """
    Sets up the persistence layer for a document and opens its database.

    When ``filename`` is given, the database is never opened in place: a
    temporary ``.kddb`` file is created and, if ``filename`` already exists,
    its contents are copied into that temporary file first. When ``filename``
    is None a fresh temporary ``.kddb`` file is used and marked as temporary.

    Args:
        document (Document): The document this persistence object serves.
        filename (str): Path of an existing or to-be-created database, or None.
        delete_on_close (bool): If True, ``close`` removes the working file.
        inmemory (bool): If True the database is initialized with ``:memory:``
            instead of the working file (the working file is still created).
        persistence_manager: Accepted but not used by this constructor.
    """
    import os

    self.document = document
    self.delete_on_close = delete_on_close
    self.is_tmp = False
    self.inmemory = inmemory
    self.filename = filename

    if filename is not None:
        self.is_new = not pathlib.Path(filename).exists()
        # Create a temporary copy of the file to work with
        handle, newfile = tempfile.mkstemp(suffix=".kddb")
        # mkstemp returns an open OS-level descriptor; only the path is
        # needed here, so close it right away instead of leaking it.
        os.close(handle)
        if not self.is_new:
            import shutil

            shutil.copy2(filename, newfile)
        filename = newfile
        # NOTE(review): is_tmp stays False here, so close() only removes this
        # working copy when delete_on_close is set - confirm that is intended.
        logger.debug(f"Using temporary file: {filename}")
    else:
        handle, filename = tempfile.mkstemp(suffix=".kddb")
        os.close(handle)
        self.is_tmp = True
        self.is_new = True

    self.current_filename = filename

    from kodexa_document.persistence_models import initialize_database, database

    initialize_database(filename if not inmemory else ":memory:")

    self.connection = database
    self.node_type_cache = {}
    self.feature_type_cache = {}

    # Runs the legacy-format migration check on every open; the method
    # returns early when no migration is required.
    self.__convert_old_db()
104
+
105
def initialize(self):
    """
    Prepares the database for use.

    A new database gets its metadata built, an existing one is loaded into
    the document. Afterwards the processing-steps and validations tables are
    seeded if they hold no rows yet.
    """
    prepare = self.__build_db if self.is_new else self.__load_document
    prepare()

    # Seed the single-row tables so later reads always find a record
    self.__ensure_processing_steps_table()
    self.__ensure_validations_table()
118
+
119
+
120
def __ensure_processing_steps_table(self):
    """
    Seeds the processing steps table with an empty, msgpack-encoded list
    when the table contains no rows.
    """
    from kodexa_document.persistence_models import ProcessingSteps, database

    with database.atomic():
        already_seeded = ProcessingSteps.select().exists()
        if already_seeded:
            return

        # No row yet: store an empty list of steps
        empty_steps = msgpack.packb([], use_bin_type=True)
        ProcessingSteps.create(steps=empty_steps)
132
+
133
def __ensure_validations_table(self):
    """
    Seeds the validations table with an empty, msgpack-encoded list when
    the table contains no rows.
    """
    from kodexa_document.persistence_models import Validations, database

    with database.atomic():
        already_seeded = Validations.select().exists()
        if already_seeded:
            return

        # No row yet: store an empty list of validations
        empty_validations = msgpack.packb([], use_bin_type=True)
        Validations.create(validations=empty_validations)
145
+
146
def get_validations(self) -> list[DocumentTaxonValidation]:
    """
    Reads the validations stored for this document.

    The most recent row of the validations table is unpacked with msgpack and
    each entry is turned into a DocumentTaxonValidation. Any failure is logged
    and results in an empty list.

    Returns:
        List[DocumentTaxonValidation]: The stored validations, or an empty
        list when nothing is stored or the data cannot be read.
    """
    try:
        from kodexa_document.persistence_models import Validations

        newest_first = Validations.select().order_by(Validations.id.desc())
        latest = newest_first.first()

        if not latest or not latest.validations:
            return []

        try:
            loaded = []
            for validation_data in msgpack.unpackb(latest.validations):
                try:
                    parsed = DocumentTaxonValidation.model_validate(validation_data)
                except (TypeError, ValueError) as e:
                    logger.warning(f"Error deserializing validation: {e}, trying alternative method")
                    # Second attempt only makes sense for mapping-like entries
                    if not isinstance(validation_data, dict):
                        continue
                    parsed = DocumentTaxonValidation(**validation_data)
                loaded.append(parsed)
            return loaded
        except Exception as e:
            logger.error(f"Error unpacking validations data: {e}")
            import traceback
            logger.error(traceback.format_exc())
            return []

    except Exception as e:
        logger.error(f"Error retrieving validations: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
185
+
186
def set_validations(self, validations: list[DocumentTaxonValidation]):
    """
    Replaces the stored validations of the document.

    Each validation is converted to a plain dictionary, the list is packed
    with msgpack, and the table content is swapped inside one transaction.
    Errors are logged and not re-raised.

    Args:
        validations (List[DocumentTaxonValidation]): The validations to store.
    """

    def to_plain(item):
        # Preferred converters first: Pydantic v2, Pydantic v1, dataclass
        try:
            if hasattr(item, "model_dump"):
                return item.model_dump(by_alias=True)
            if hasattr(item, "dict"):
                return item.dict(by_alias=True)
            return dataclasses.asdict(item)
        except Exception:
            # Fallbacks when the preferred converter failed
            if hasattr(item, "to_dict"):
                return item.to_dict()
            return item.__dict__

    try:
        from kodexa_document.persistence_models import Validations, database

        serialized_validations = []
        for validation in validations:
            serialized_validations.append(to_plain(validation))

        packed_data = msgpack.packb(serialized_validations, use_bin_type=True)

        # Swap old content for new atomically
        with database.atomic():
            Validations.delete().execute()
            Validations.create(validations=packed_data)

    except Exception as e:
        logger.error(f"Error setting validations: {e}")
        import traceback
        logger.error(traceback.format_exc())
229
+
230
def __check_for_updates(self):
    """
    Runs the legacy-format conversion when the ``kddb_metadata`` table is
    missing from the connected database.
    """
    if self.connection.table_exists("kddb_metadata"):
        return

    # Without kddb_metadata the file is treated as an old-format database
    print("Converting old database format to new kddb format...")
    self.__convert_old_db()
239
+
240
def get_steps(self) -> list[ProcessingStep]:
    """
    Reads the processing steps stored for this document.

    The most recent row of the processing steps table is unpacked with
    msgpack and each entry is turned into a ProcessingStep. Any failure is
    logged and results in an empty list.

    Returns:
        List[ProcessingStep]: The stored steps, or an empty list when nothing
        is stored or the data cannot be read.
    """
    try:
        from kodexa_document.persistence_models import ProcessingSteps

        newest_first = ProcessingSteps.select().order_by(ProcessingSteps.id.desc())
        latest = newest_first.first()

        if not latest or not latest.steps:
            return []

        try:
            loaded = []
            for step_data in msgpack.unpackb(latest.steps):
                try:
                    step = ProcessingStep(**step_data)
                except (TypeError, ValueError) as e:
                    logger.warning(f"Error deserializing processing step: {e}, trying alternative method")
                    # Second attempt only makes sense for mapping-like entries
                    if not isinstance(step_data, dict):
                        continue
                    # Build from the type alone, then copy the other fields over
                    step = ProcessingStep(type=step_data.get('type', 'unknown'))
                    for key, value in step_data.items():
                        if key == 'type':
                            continue
                        setattr(step, key, value)
                loaded.append(step)
            return loaded
        except Exception as e:
            logger.error(f"Error unpacking processing steps data: {e}")
            import traceback
            logger.error(traceback.format_exc())
            return []

    except Exception as e:
        logger.error(f"Error retrieving processing steps: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return []
282
+
283
def set_steps(self, steps: list[ProcessingStep]):
    """
    Replaces the stored processing steps of the document.

    Each step is converted to a plain dictionary, the list is packed with
    msgpack, and the table content is swapped inside one transaction. Errors
    are logged and not re-raised.

    Args:
        steps (List[ProcessingStep]): The processing steps to store.
    """

    def to_plain(item):
        # Preferred converters first: Pydantic v2, Pydantic v1, dataclass
        try:
            if hasattr(item, "model_dump"):
                return item.model_dump()
            if hasattr(item, "dict"):
                return item.dict()
            return dataclasses.asdict(item)
        except Exception:
            # Fallbacks when the preferred converter failed
            if hasattr(item, "to_dict"):
                return item.to_dict()
            return item.__dict__

    try:
        from kodexa_document.persistence_models import ProcessingSteps, database

        serialized_steps = []
        for step in steps:
            serialized_steps.append(to_plain(step))

        packed_data = msgpack.packb(serialized_steps, use_bin_type=True)

        # Swap old content for new atomically
        with database.atomic():
            ProcessingSteps.delete().execute()
            ProcessingSteps.create(steps=packed_data)

    except Exception as e:
        logger.error(f"Error setting processing steps: {e}")
        import traceback
        logger.error(traceback.format_exc())
326
+
327
def __convert_old_db(self):
    """
    Converts the old database to the new database.

    The migration only runs when a legacy ``cn`` table exists and
    ``kddb_content_nodes`` holds no rows; otherwise the method logs and
    returns. All statements run inside one ``atomic()`` block with foreign
    key enforcement switched off beforehand and switched back on in a
    ``finally`` clause.

    Legacy tables read: ``n_type``, ``f_type``, ``cn``, ``cnp``, ``ft``,
    ``metadata`` and, when present, ``steps``, ``validations`` and
    ``external_data``.

    Raises:
        Exception: Any error not handled by the inner best-effort blocks
            (metadata conversion, external data) is logged and re-raised.
    """
    logging.info("Converting old database format to new kddb format...")

    # Turn off foreign key constraints during migration
    self.connection.execute_sql("PRAGMA foreign_keys = OFF;")

    try:
        with self.connection.atomic():
            # Check if migration is needed
            cursor = self.connection.execute_sql(
                """
                SELECT CASE
                    WHEN EXISTS(SELECT 1 FROM sqlite_master WHERE type='table' AND name='cn')
                    AND NOT EXISTS(SELECT 1 FROM kddb_content_nodes LIMIT 1)
                    THEN 1 ELSE 0 END AS should_migrate;
                """
            )
            should_migrate = cursor.fetchone()[0]

            if not should_migrate:
                logging.info("Migration not needed or already done.")
                return

            logging.info("Starting database migration...")

            # Create temporary mapping table
            # NOTE(review): nothing in this method reads or writes
            # temp_do_mapping before it is dropped - possibly a leftover.
            self.connection.execute_sql(
                """
                CREATE TEMP TABLE IF NOT EXISTS temp_do_mapping (
                    cn_id INTEGER PRIMARY KEY,
                    do_id INTEGER
                );
                """
            )

            # Migrate node types (n_type → kddb_node_types)
            self.connection.execute_sql(
                """
                INSERT OR IGNORE INTO kddb_node_types (name)
                SELECT name FROM n_type;
                """
            )

            # Populate the node_type_cache with all node types
            from kodexa_document.persistence_models import NodeType

            for node_type in NodeType.select():
                self.node_type_cache[node_type.name] = node_type.name

            # Check if steps table exists and migrate it to kddb_processing_steps
            cursor = self.connection.execute_sql(
                """
                SELECT CASE
                    WHEN EXISTS(SELECT 1 FROM sqlite_master WHERE type='table' AND name='steps')
                    THEN 1 ELSE 0 END AS steps_exists;
                """
            )
            steps_exists = cursor.fetchone()[0]

            # Migrate data from steps table if it exists
            if steps_exists:
                from kodexa_document.persistence_models import ProcessingSteps

                logging.info("Migrating processing steps from old format...")
                cursor = self.connection.execute_sql("SELECT obj FROM steps")
                # Only the first row of the legacy table is carried over
                steps_data = cursor.fetchone()

                if steps_data and steps_data[0]:
                    # Check if we already have data in the new table
                    if not ProcessingSteps.select().exists():
                        # Insert the steps data into the new table using Peewee
                        ProcessingSteps.create(steps=steps_data[0])
                        logging.info("Successfully migrated processing steps data")
            else:
                # Initialize with empty steps if no old table exists
                from kodexa_document.persistence_models import ProcessingSteps

                if not ProcessingSteps.select().exists():
                    ProcessingSteps.create(steps=msgpack.packb([], use_bin_type=True))

            # Check if validations table exists and migrate it to kddb_validations
            cursor = self.connection.execute_sql(
                """
                SELECT CASE
                    WHEN EXISTS(SELECT 1 FROM sqlite_master WHERE type='table' AND name='validations')
                    THEN 1 ELSE 0 END AS validations_exists;
                """
            )
            validations_exists = cursor.fetchone()[0]

            # Migrate data from validations table if it exists
            if validations_exists:
                from kodexa_document.persistence_models import Validations

                logging.info("Migrating validations from old format...")
                cursor = self.connection.execute_sql("SELECT obj FROM validations")
                # Only the first row of the legacy table is carried over
                validations_data = cursor.fetchone()

                if validations_data and validations_data[0]:
                    # Check if we already have data in the new table
                    if not Validations.select().exists():
                        # Insert the validations data into the new table using Peewee
                        Validations.create(validations=validations_data[0])
                        logging.info("Successfully migrated validations data")
            else:
                # Initialize with empty validations if no old table exists
                from kodexa_document.persistence_models import Validations

                if not Validations.select().exists():
                    Validations.create(validations=msgpack.packb([], use_bin_type=True))

            # Migrate feature types (f_type → kddb_feature_types)
            self.connection.execute_sql(
                """
                INSERT OR IGNORE INTO kddb_feature_types (name)
                SELECT name FROM f_type;
                """
            )

            # Create temporary table for node hierarchy
            # NOTE(review): node_levels is filled for depths 0-5 only and is
            # not read by any later statement in this method before being
            # dropped - confirm whether it is still needed.
            self.connection.execute_sql(
                """
                CREATE TEMP TABLE node_levels (
                    id INTEGER PRIMARY KEY,
                    pid INTEGER,
                    level INTEGER
                );
                """
            )

            # Insert nodes at each level
            self.connection.execute_sql(
                """
                -- First insert root nodes (level 0)
                INSERT INTO node_levels (id, pid, level)
                SELECT id, pid, 0
                FROM cn
                WHERE pid IS NULL;
                """
            )

            self.connection.execute_sql(
                """
                -- Insert level 1 nodes
                INSERT INTO node_levels (id, pid, level)
                SELECT c.id, c.pid, 1
                FROM cn c
                JOIN node_levels p ON c.pid = p.id
                WHERE p.level = 0;
                """
            )

            self.connection.execute_sql(
                """
                -- Insert level 2 nodes
                INSERT INTO node_levels (id, pid, level)
                SELECT c.id, c.pid, 2
                FROM cn c
                JOIN node_levels p ON c.pid = p.id
                WHERE p.level = 1;
                """
            )

            self.connection.execute_sql(
                """
                -- Insert level 3 nodes
                INSERT INTO node_levels (id, pid, level)
                SELECT c.id, c.pid, 3
                FROM cn c
                JOIN node_levels p ON c.pid = p.id
                WHERE p.level = 2;
                """
            )

            self.connection.execute_sql(
                """
                -- Insert level 4 nodes
                INSERT INTO node_levels (id, pid, level)
                SELECT c.id, c.pid, 4
                FROM cn c
                JOIN node_levels p ON c.pid = p.id
                WHERE p.level = 3;
                """
            )

            self.connection.execute_sql(
                """
                -- Insert level 5 nodes
                INSERT INTO node_levels (id, pid, level)
                SELECT c.id, c.pid, 5
                FROM cn c
                JOIN node_levels p ON c.pid = p.id
                WHERE p.level = 4;
                """
            )

            # Create index for faster lookups
            self.connection.execute_sql(
                """
                CREATE INDEX node_levels_idx ON node_levels(level, id, pid);
                """
            )

            # Migrate content nodes
            # Ids are preserved; content is taken from the legacy part at pos 0
            self.connection.execute_sql(
                """
                INSERT INTO kddb_content_nodes (id, parent_id, node_type, content, created, modified, "index")
                SELECT cn.id, cn.pid, nt.name, CASE WHEN cnp.content IS NOT NULL THEN cnp.content ELSE NULL END,
                       datetime('now'), datetime('now'), cn.idx
                FROM cn
                JOIN n_type nt ON cn.nt = nt.id
                LEFT JOIN cnp ON cn.id = cnp.cn_id AND cnp.pos = 0;
                """
            )

            # Make sure the content field exists in kddb_content_nodes
            cursor = self.connection.execute_sql(
                """
                PRAGMA table_info(kddb_content_nodes);
                """
            )
            # Index 1 of each table_info row is the column name
            columns = [column[1] for column in cursor.fetchall()]

            # NOTE(review): the INSERT above already writes "content" and
            # "index", so both guards below look unreachable on a schema where
            # that INSERT succeeded. The nesting of the two UPDATE statements
            # under their guards is an assumption - verify against the
            # upstream file. Run unconditionally, the first would overwrite
            # content before kddb_content_node_parts is populated further down.
            if "content" not in columns:
                self.connection.execute_sql(
                    """
                    ALTER TABLE kddb_content_nodes ADD COLUMN content TEXT;
                    """
                )

                # Update content from content parts
                self.connection.execute_sql(
                    """
                    UPDATE kddb_content_nodes
                    SET content = (
                        SELECT cnp.content
                        FROM kddb_content_node_parts cnp
                        WHERE cnp.content_node_id = kddb_content_nodes.id
                        AND cnp.pos = 0
                        AND cnp.content IS NOT NULL
                        LIMIT 1
                    );
                    """
                )

            # Make sure the index field exists in kddb_content_nodes
            if "index" not in columns:
                self.connection.execute_sql(
                    """
                    ALTER TABLE kddb_content_nodes ADD COLUMN "index" INTEGER;
                    """
                )

                # Update index values if possible (e.g., from data object idx)
                self.connection.execute_sql(
                    """
                    UPDATE kddb_content_nodes
                    SET "index" = (
                        SELECT do.idx
                        FROM kddb_data_objects do
                        WHERE do.id = kddb_content_nodes.data_object_id
                        LIMIT 1
                    );
                    """
                )

            # Migrate content node parts
            self.connection.execute_sql(
                """
                INSERT INTO kddb_content_node_parts (content_node_id, pos, content, content_idx)
                SELECT cn_id, pos, content, content_idx
                FROM cnp;
                """
            )

            # Migrate features
            self.connection.execute_sql(
                """
                INSERT INTO kddb_features (id, feature_type_id)
                SELECT ft.id, ft.f_type
                FROM ft;
                """
            )

            # Populate the new ContentNodeFeatureLink table
            self.connection.execute_sql(
                """
                INSERT INTO kddb_content_node_feature_links (content_node_id, feature_id)
                SELECT ft.cn_id, ft.id
                FROM ft
                WHERE ft.cn_id IS NOT NULL;
                """
            )

            # Migrate feature binary data
            self.connection.execute_sql(
                """
                INSERT INTO kddb_feature_blob (feature_id, binary_value)
                SELECT id, binary_value
                FROM ft
                WHERE binary_value IS NOT NULL;
                """
            )

            # Migrate metadata if it exists
            # NOTE(review): the EXISTS clause guards against an empty table,
            # not a missing one - presumably the legacy metadata table is
            # always present; confirm.
            self.connection.execute_sql(
                """
                INSERT OR IGNORE INTO kddb_metadata (id, metadata)
                SELECT id, metadata FROM metadata
                WHERE EXISTS(SELECT 1 FROM metadata);
                """
            )

            # Convert existing metadata from JSON text to msgpack blob if needed
            # Best effort: a failure here is logged as a warning and does not
            # abort the migration.
            try:
                metadata_record = self.connection.execute_sql(
                    "SELECT id, metadata FROM kddb_metadata"
                ).fetchone()
                if metadata_record and metadata_record[1]:
                    # Check if metadata is a JSON string (text) that needs conversion
                    try:
                        # Try to decode as text - if this works, it's the old JSON format
                        metadata_text = metadata_record[1].decode("utf-8")
                        metadata_dict = json.loads(metadata_text)

                        # Convert to msgpack blob
                        metadata_blob = msgpack.packb(
                            metadata_dict, use_bin_type=True
                        )

                        # Update the record with the blob
                        self.connection.execute_sql(
                            "UPDATE kddb_metadata SET metadata = ? WHERE id = ?",
                            (metadata_blob, metadata_record[0]),
                        )
                        logging.info(
                            "Converted metadata from JSON to msgpack blob format"
                        )
                    except (UnicodeDecodeError, json.JSONDecodeError):
                        # If this fails, it's probably already in the new format
                        logging.info(
                            "Metadata is already in the msgpack blob format"
                        )
            except Exception as e:
                logging.warning(f"Error converting metadata: {e}")

            # Migrate external data if the old table exists
            # Best effort as well: errors are logged with a traceback and the
            # migration continues.
            try:
                # Check if old external_data table exists
                cursor = self.connection.execute_sql(
                    """
                    SELECT CASE
                        WHEN EXISTS(SELECT 1 FROM sqlite_master WHERE type='table' AND name='external_data')
                        THEN 1 ELSE 0 END AS table_exists;
                    """
                )
                external_data_exists = cursor.fetchone()[0]

                if external_data_exists:
                    logging.info("Migrating external data from old format...")

                    # Create kddb_external_data table if it doesn't exist
                    self.connection.execute_sql(
                        """
                        CREATE TABLE IF NOT EXISTS kddb_external_data (
                            id INTEGER PRIMARY KEY AUTOINCREMENT,
                            taxonomy_id INTEGER,
                            key TEXT NOT NULL,
                            data BLOB,
                            FOREIGN KEY (taxonomy_id) REFERENCES kddb_taxonomies (id)
                        );
                        """
                    )

                    # Get all entries from the old external_data table
                    cursor = self.connection.execute_sql(
                        "SELECT key, data FROM external_data"
                    )
                    entries = cursor.fetchall()

                    for key, data in entries:
                        # Check if data is a JSON string that needs conversion
                        try:
                            # Try to decode as text - if this works, it's the old JSON format
                            data_text = data.decode("utf-8")
                            data_dict = json.loads(data_text)

                            # Convert to msgpack blob
                            data_blob = msgpack.packb(data_dict, use_bin_type=True)

                            # Insert into the new table
                            self.connection.execute_sql(
                                "INSERT INTO kddb_external_data (taxonomy_id, key, data) VALUES (?, ?, ?)",
                                (None, key, data_blob),
                            )
                            logging.info(f"Migrated external data for key: {key}")
                        except (
                            UnicodeDecodeError,
                            json.JSONDecodeError,
                            AttributeError,
                        ):
                            # If data is None or already binary, pack it directly
                            # (AttributeError covers values without .decode,
                            # such as None or str)
                            if data is not None:
                                data_blob = msgpack.packb(data, use_bin_type=True)
                                # Insert into the new table
                                self.connection.execute_sql(
                                    "INSERT INTO kddb_external_data (taxonomy_id, key, data) VALUES (?, ?, ?)",
                                    (None, key, data_blob),
                                )
                                logging.info(
                                    f"Migrated external data for key: {key} (already in binary format)"
                                )
            except Exception as e:
                logging.warning(f"Error migrating external data: {e}")
                import traceback

                logging.warning(traceback.format_exc())

            # Clean up temporary tables
            self.connection.execute_sql("DROP TABLE IF EXISTS temp_do_mapping;")
            self.connection.execute_sql("DROP TABLE IF EXISTS node_levels;")

            # Check if sqlite_sequence exists before updating it
            cursor = self.connection.execute_sql(
                "SELECT name FROM sqlite_master WHERE type='table' AND name='sqlite_sequence';"
            )
            if cursor.fetchone():
                # Update sequence values
                # Rows were inserted with explicit ids, so each sequence is
                # moved to the highest id now present in its table.
                self.connection.execute_sql(
                    """
                    UPDATE sqlite_sequence SET seq = (SELECT MAX(id) FROM kddb_content_nodes)
                    WHERE name = 'kddb_content_nodes'
                    """
                )

                self.connection.execute_sql(
                    """
                    UPDATE sqlite_sequence SET seq = (SELECT MAX(id) FROM kddb_content_node_parts)
                    WHERE name = 'kddb_content_node_parts'
                    """
                )

                self.connection.execute_sql(
                    """
                    UPDATE sqlite_sequence SET seq = (SELECT MAX(id) FROM kddb_data_objects)
                    WHERE name = 'kddb_data_objects'
                    """
                )

                self.connection.execute_sql(
                    """
                    UPDATE sqlite_sequence SET seq = (SELECT MAX(id) FROM kddb_features)
                    WHERE name = 'kddb_features'
                    """
                )

                self.connection.execute_sql(
                    """
                    UPDATE sqlite_sequence SET seq = (SELECT MAX(id) FROM kddb_feature_blob)
                    WHERE name = 'kddb_feature_blob'
                    """
                )

            logging.info("Database migration completed successfully.")

    except Exception as e:
        logging.error(f"Error during database migration: {e}")
        raise

    finally:
        # Turn foreign key constraints back on
        self.connection.execute_sql("PRAGMA foreign_keys = ON;")
802
+
803
def __build_db(self):
    """
    Writes the initial metadata row (id 1) for a new database and sets the
    in-memory document version to "6.0.0".
    """
    from kodexa_document.persistence_models import Metadata
    import uuid as uuid_lib

    document = self.document

    # Assemble the payload field by field; None values are stripped from the
    # source description before it is stored.
    document_metadata = {}
    document_metadata["version"] = Document.CURRENT_VERSION
    document_metadata["metadata"] = document.metadata
    source_fields = dataclasses.asdict(document.source)
    document_metadata["source"] = self.__clean_none_values(source_fields)
    document_metadata["mixins"] = document.get_mixins()
    document_metadata["labels"] = getattr(document, "labels", [])
    generated_uuid = str(uuid_lib.uuid4())
    document_metadata["uuid"] = getattr(document, "uuid", generated_uuid)

    packed = msgpack.packb(document_metadata, use_bin_type=True)
    Metadata.create(id=1, metadata=packed)
    self.document.version = "6.0.0"
826
+
827
+ def __clean_none_values(self, d):
828
+ """
829
+ Cleans a dictionary by removing keys with None values.
830
+ """
831
+ clean = {}
832
+ for k, v in d.items():
833
+ if isinstance(v, dict):
834
+ nested = self.__clean_none_values(v)
835
+ if len(nested.keys()) > 0:
836
+ clean[k] = nested
837
+ elif v is not None:
838
+ clean[k] = v
839
+ return clean
840
+
841
def __load_document(self):
    """
    Populates the document from an existing database.

    Fills the node-type and feature-type caches, restores the document
    metadata from the row with id 1 (msgpack first, JSON text as fallback,
    in which case the row is rewritten as msgpack), builds the root content
    node, and finally stamps version "6.0.0" and writes the metadata back.
    """
    from kodexa_document.persistence_models import Metadata, NodeType, FeatureType

    # Node types can be looked up by numeric id as well as by name
    for type_row in NodeType.select():
        type_name = type_row.name
        self.node_type_cache[type_row.id] = type_name
        self.node_type_cache[type_name] = type_name

    for feature_row in FeatureType.select():
        self.feature_type_cache[feature_row.id] = feature_row.name

    stored = Metadata.get_or_none(Metadata.id == 1)
    if stored:
        metadata = None
        try:
            metadata = msgpack.unpackb(stored.metadata)
        except Exception as e:
            # Older files keep the metadata as JSON text
            try:
                metadata = json.loads(stored.metadata.decode("utf-8"))

                # Rewrite the row so the next load takes the msgpack path
                logging.info("Converting JSON metadata to msgpack format")
                Metadata.delete().where(Metadata.id == 1).execute()
                repacked = msgpack.packb(metadata, use_bin_type=True)
                Metadata.create(id=1, metadata=repacked)
            except Exception as inner_e:
                logging.error(
                    f"Failed to load metadata: {e}, Fallback error: {inner_e}"
                )

        if metadata:
            self.document.metadata = DocumentMetadata(metadata["metadata"])
            self.document.version = metadata.get("version", Document.PREVIOUS_VERSION)

            fallback_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, "kodexa.com"))
            self.id = metadata.get("uuid", fallback_id)

            # Optional sections are only applied when present and non-empty
            if metadata.get("source"):
                self.document.source = SourceMetadata.from_dict(metadata["source"])
            if metadata.get("labels"):
                self.document.labels = metadata["labels"]
            if metadata.get("mixins"):
                self.document._mixins = metadata["mixins"]

    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode

    # The root is the node without a parent; "== None" is intentional because
    # the ORM builds the query expression from the comparison operator.
    root_row = PeeweeContentNode.get_or_none(PeeweeContentNode.parent == None)
    if root_row:
        self.document.content_node = self.__build_node(root_row)

    # Loaded documents are always stamped with the latest version
    self.document.version = "6.0.0"
    self.update_metadata()
910
+
911
def __build_node(self, peewee_node):
    """
    Creates a document ContentNode from a stored (Peewee) content node row.

    The node type is resolved through the node type cache (unknown values are
    added to it), the parent is materialised via ``get_node``, and the new
    node receives the id of the stored row.
    """
    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode

    # Cached value wins; an unknown type is cached under itself
    stored_type = peewee_node.node_type
    resolved_type = self.node_type_cache.setdefault(stored_type, stored_type)

    parent_row = peewee_node.parent
    parent_node = self.get_node(parent_row.id) if parent_row else None

    new_node = ContentNode(
        document=self.document,
        node_type=resolved_type,
        content=peewee_node.content,
        parent=parent_node,
        index=peewee_node.index,
    )
    new_node.id = peewee_node.id
    return new_node
935
+
936
def close(self):
    """
    Closes the connection to the database. If the working file is temporary
    or delete_on_close is True, the working file is also deleted.

    Calling close more than once is safe: a working file that has already
    been removed is ignored instead of raising FileNotFoundError.
    """
    from kodexa_document.persistence_models import close_database

    close_database()

    if self.is_tmp or self.delete_on_close:
        # missing_ok keeps a repeated close() (or an externally removed file)
        # from turning cleanup into an error.
        pathlib.Path(self.current_filename).unlink(missing_ok=True)
946
+
947
def update_metadata(self):
    """
    Rewrites the metadata row (id 1) from the current state of the document.
    """
    from kodexa_document.persistence_models import Metadata
    import uuid as uuid_lib

    document = self.document

    # Assemble the payload field by field; None values are stripped from the
    # source description before it is stored.
    document_metadata = {}
    document_metadata["version"] = Document.CURRENT_VERSION
    document_metadata["metadata"] = document.metadata
    source_fields = dataclasses.asdict(document.source)
    document_metadata["source"] = self.__clean_none_values(source_fields)
    document_metadata["mixins"] = document.get_mixins()
    document_metadata["labels"] = getattr(document, "labels", [])
    generated_uuid = str(uuid_lib.uuid4())
    document_metadata["uuid"] = getattr(document, "uuid", generated_uuid)

    # Replace the existing row rather than updating it in place
    Metadata.delete().where(Metadata.id == 1).execute()
    packed = msgpack.packb(document_metadata, use_bin_type=True)
    Metadata.create(id=1, metadata=packed)
970
+
971
def get_node(self, node_id: Optional[int] = None) -> ContentNode:
    """
    Looks up a content node by id.

    Returns None when ``node_id`` is None and raises ValueError when no node
    with that id is stored.
    """
    if node_id is None:
        return None

    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode

    stored = PeeweeContentNode.get_or_none(PeeweeContentNode.id == node_id)
    if not stored:
        raise ValueError(f"Node {node_id} not found")
    return self.__build_node(stored)
986
+
987
def get_parent(self, content_node: ContentNode):
    """
    Resolves the parent of the given node through get_node, using the
    node's stored _parent_id.
    """
    parent_id = content_node._parent_id
    return self.get_node(parent_id)
992
+
993
def get_children(self, content_node: ContentNode):
    """
    Builds the child nodes of the given node, ordered by their index.

    An empty list is returned when the node has no row in the database.
    """
    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode

    parent_record = PeeweeContentNode.get_or_none(
        PeeweeContentNode.id == content_node.id
    )
    if not parent_record:
        return []

    # Children are found through the parent foreign key
    ordered_children = (
        PeeweeContentNode.select()
        .where(PeeweeContentNode.parent == parent_record)
        .order_by(PeeweeContentNode.index)
    )
    return [self.__build_node(child_record) for child_record in ordered_children]
1016
+
1017
def get_child_ids(self, content_node):
    """
    Retrieves the ids of the children of a given node.

    The ids are ordered by the children's index, i.e. in the same order in
    which get_children returns the nodes. An empty list is returned when the
    node has no row in the database.
    """
    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode

    parent_record = PeeweeContentNode.get_or_none(
        PeeweeContentNode.id == content_node.id
    )
    if not parent_record:
        return []

    # Only the id column is needed; order by index so the result lines up
    # with get_children instead of depending on the storage order.
    child_rows = (
        PeeweeContentNode.select(PeeweeContentNode.id)
        .where(PeeweeContentNode.parent == parent_record)
        .order_by(PeeweeContentNode.index)
    )
    return [child_row.id for child_row in child_rows]
1038
+
1039
def add_content_node(self, node: ContentNode, parent: Optional[ContentNode] = None):
    """
    Adds a content node to the document.

    Virtual nodes are ignored (None is returned). A node without an id is
    inserted together with its content parts and receives the new row id; a
    node that already has an id is re-attached to the given parent when its
    stored parent differs. In both cases node._parent_id is updated to match
    what was stored.

    Raises ValueError when the parent has no id, when the parent row does not
    exist, or when the node has an id that is not in the database.
    """

    if node.virtual:
        return

    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode
    from kodexa_document.persistence_models import (
        ContentNodePart as PeeweeContentNodePart,
    )

    if node.index is None:
        # Append after the existing children, or use 0 when there is no parent
        node.index = len(self.get_children(parent)) if parent is not None else 0

    with self.connection.atomic():
        # Resolve the parent row if a parent was provided
        parent_node = None
        if parent is not None:
            if not parent.id:
                raise ValueError("Parent node ID is required to add a content node")
            parent_node = PeeweeContentNode.get_or_none(
                PeeweeContentNode.id == parent.id
            )
            if parent_node is None:
                raise ValueError(f"Parent node {parent.id} not found")

        if node.id is None:
            # Create ContentNode with parent relationship
            peewee_node = PeeweeContentNode.create(
                node_type=node.node_type,
                parent=parent_node,
                content=node.content,
                index=node.index,
            )

            # Create ContentNodePart entries
            for idx, part in enumerate(node.get_content_parts()):
                part_is_text = isinstance(part, str)
                PeeweeContentNodePart.create(
                    content_node=peewee_node.id,
                    pos=idx,
                    content=part if part_is_text else None,
                    content_idx=None if part_is_text else part,
                )

            node.id = peewee_node.id

            # Record the parent on the in-memory node as well; previously this
            # was checked only after node.id had been assigned and never ran.
            if parent_node is not None:
                node._parent_id = parent_node.id
        else:
            peewee_node = PeeweeContentNode.get_or_none(
                PeeweeContentNode.id == node.id
            )
            if peewee_node is None:
                raise ValueError(f"Node {node.id} not found")

            stored_parent = peewee_node.parent
            # Compare against the resolved parent row so that parent=None
            # (detaching the node) does not dereference None.
            requested_parent_id = parent_node.id if parent_node is not None else None
            if stored_parent is None or stored_parent.id != requested_parent_id:
                peewee_node.parent = parent_node
                peewee_node.index = node.index
                peewee_node.save()

            current_parent = peewee_node.parent
            node._parent_id = current_parent.id if current_parent is not None else None

    return node
1116
+
1117
def get_content_parts(self, node):
    """
    Loads the stored content parts of a node in position order.

    Each entry is the part's content, or its content_idx when that is set.
    Nodes without an id yield an empty list.
    """
    from kodexa_document.persistence_models import (
        ContentNodePart as PeeweeContentNodePart,
    )

    if not node.id:
        return []

    stored_parts = (
        PeeweeContentNodePart.select()
        .where(PeeweeContentNodePart.content_node == node.id)
        .order_by(PeeweeContentNodePart.pos)
    )
    return [
        row.content if row.content_idx is None else row.content_idx
        for row in stored_parts
    ]
1140
+
1141
def update_content_parts(self, node: ContentNode, content_parts: List[str | int]):
    """
    Replaces the stored content parts of a node with the given list.

    String parts are written to the content column, every other part to
    content_idx. Nothing happens for a node without an id.
    """
    from kodexa_document.persistence_models import (
        ContentNodePart as PeeweeContentNodePart,
    )

    # A node that was never stored has no parts to replace
    if node.id is None:
        return

    with self.connection.atomic():
        # Clear out whatever is stored for this node
        stale_parts = PeeweeContentNodePart.delete().where(
            PeeweeContentNodePart.content_node == node.id
        )
        stale_parts.execute()

        # Write the replacement parts in order
        for position, piece in enumerate(content_parts):
            piece_is_text = isinstance(piece, str)
            PeeweeContentNodePart.create(
                content_node=node.id,
                pos=position,
                content=piece if piece_is_text else None,
                content_idx=None if piece_is_text else piece,
            )
1167
+
1168
def get_features(self, node: ContentNode) -> List[ContentFeature]:
    """
    Loads every feature linked to the given node.

    The stored feature type name is split at the first colon into feature
    type and feature name. Tag features yield Tag objects, spatial:bbox
    features yield [x1, y1, x2, y2] lists, and everything else yields the
    msgpack-decoded blob values (None for empty blobs). Nodes without an id
    yield an empty list.
    """
    from kodexa_document.persistence_models import (
        Feature as PeeweeFeature,
        FeatureBlob,
        FeatureType,
    )
    from kodexa_document.persistence_models import FeatureTag, FeatureBBox
    from kodexa_document.persistence_models import ContentNodeFeatureLink

    loaded_features: List[ContentFeature] = []
    if not node.id:
        return loaded_features

    linked_features = (
        PeeweeFeature.select()
        .join(
            ContentNodeFeatureLink,
            on=(PeeweeFeature.id == ContentNodeFeatureLink.feature_id),
        )
        .where(ContentNodeFeatureLink.content_node_id == node.id)
    )

    for feature_row in linked_features:
        type_row = FeatureType.get_by_id(feature_row.feature_type_id)
        # "type:name" - anything after the first colon belongs to the name
        kind, _, label = type_row.name.partition(":")

        if kind == "tag":
            stored_tags = FeatureTag.select().where(
                FeatureTag.feature_id == feature_row.id
            )
            values = [
                Tag(
                    start=stored.start_pos,
                    end=stored.end_pos,
                    value=stored.tag_value,
                    uuid=stored.uuid,
                    data=msgpack.unpackb(stored.data) if stored.data else None,
                    confidence=stored.confidence,
                    group_uuid=stored.group_uuid,
                    parent_group_uuid=stored.parent_group_uuid,
                    cell_index=stored.cell_index,
                    index=stored.index,
                    note=stored.note,
                    status=stored.status,
                    owner_uri=stored.owner_uri,
                    is_dirty=(
                        None if stored.is_dirty is None else bool(stored.is_dirty)
                    ),
                )
                for stored in stored_tags
            ]
        elif kind == "spatial" and label == "bbox":
            stored_boxes = FeatureBBox.select().where(
                FeatureBBox.feature_id == feature_row.id
            )
            values = [[box.x1, box.y1, box.x2, box.y2] for box in stored_boxes]
        else:
            stored_blobs = FeatureBlob.select().where(
                FeatureBlob.feature_id == feature_row.id
            )
            values = [
                None
                if blob.binary_value is None
                else msgpack.unpackb(blob.binary_value)
                for blob in stored_blobs
            ]

        loaded_features.append(ContentFeature(kind, label, values))

    return loaded_features
1257
+
1258
def add_feature(self, node: ContentNode, feature: ContentFeature, replace=False):
    """
    Stores the values of a feature for the given node.

    The feature row (and its link to the node) is created on first use. With
    replace=True the values already stored for the feature are deleted first;
    otherwise the new values are added to them. A legacy spatial:bbox value
    given as a flat list of four numbers is wrapped into a list of boxes.

    After writing, feature.value is reloaded from the database so that it
    holds every stored value, in the same shape get_features produces.

    Raises ValueError when the node has no id, when a tag feature holds a
    value that is not a Tag, or when a spatial:bbox value is not a list of
    four numbers.
    """
    if node.id is None:
        raise ValueError("Node ID is required to add a feature")

    from kodexa_document.persistence_models import (
        Feature as PeeweeFeature,
        FeatureBlob,
        FeatureType,
    )
    from kodexa_document.persistence_models import (
        FeatureTag,
        FeatureBBox,
    )
    from kodexa_document.persistence_models import ContentNodeFeatureLink

    is_tag = feature.feature_type == "tag"
    is_bbox = feature.feature_type == "spatial" and feature.name == "bbox"

    def _is_box(candidate):
        # A box is a list of exactly four numeric coordinates
        return (
            isinstance(candidate, list)
            and len(candidate) == 4
            and all(isinstance(coord, (int, float)) for coord in candidate)
        )

    with self.connection.atomic():
        feature_type_name = f"{feature.feature_type}:{feature.name}"
        db_feature_type, _ = FeatureType.get_or_create(name=feature_type_name)

        peewee_feature = (
            PeeweeFeature.select()
            .join(
                ContentNodeFeatureLink,
                on=(PeeweeFeature.id == ContentNodeFeatureLink.feature_id),
            )
            .where(
                (ContentNodeFeatureLink.content_node_id == node.id)
                & (PeeweeFeature.feature_type_id == db_feature_type.id)
            )
            .first()
        )

        if not peewee_feature:
            peewee_feature = PeeweeFeature.create(feature_type=db_feature_type)
            ContentNodeFeatureLink.create(
                content_node_id=node.id, feature_id=peewee_feature.id
            )

        if replace:
            for value_model in (FeatureTag, FeatureBBox, FeatureBlob):
                value_model.delete().where(
                    value_model.feature_id == peewee_feature.id
                ).execute()

        # Legacy handling for spatial bbox: a flat list of 4 numbers is a single box
        if is_bbox and _is_box(feature.value):
            feature.value = [feature.value]

        for item_value in feature.value:  # feature.value is always a list
            if is_tag:
                if not isinstance(item_value, Tag):
                    raise ValueError(
                        f"Expected Tag object for feature type 'tag', got {type(item_value)}"
                    )
                tag_obj: Tag = item_value
                FeatureTag.create(
                    feature=peewee_feature,
                    tag_value=tag_obj.value,
                    start_pos=tag_obj.start,
                    end_pos=tag_obj.end,
                    uuid=tag_obj.uuid,
                    data=(
                        msgpack.packb(tag_obj.data, use_bin_type=True)
                        if tag_obj.data is not None
                        else None
                    ),
                    confidence=tag_obj.confidence,
                    group_uuid=tag_obj.group_uuid,
                    parent_group_uuid=tag_obj.parent_group_uuid,
                    cell_index=tag_obj.cell_index,
                    index=tag_obj.index,
                    note=tag_obj.note,
                    status=tag_obj.status,
                    owner_uri=tag_obj.owner_uri,
                    is_dirty=1 if tag_obj.is_dirty else 0,
                )
            elif is_bbox:
                if not _is_box(item_value):
                    raise ValueError(
                        f"Expected a list of 4 numeric coordinates for 'spatial:bbox', got {item_value}"
                    )
                FeatureBBox.create(
                    feature=peewee_feature,
                    x1=item_value[0],
                    y1=item_value[1],
                    x2=item_value[2],
                    y2=item_value[3],
                )
            else:
                FeatureBlob.create(
                    feature=peewee_feature,
                    binary_value=(
                        msgpack.packb(item_value, use_bin_type=True)
                        if item_value is not None
                        else None
                    ),
                )

        # Reload so feature.value reflects every stored value. The rows are
        # decoded exactly as get_features decodes them: tags are built by
        # keyword with their data unpacked, and empty blobs come back as None
        # instead of being handed to msgpack.
        if is_tag:
            feature.value = [
                Tag(
                    start=stored.start_pos,
                    end=stored.end_pos,
                    value=stored.tag_value,
                    uuid=stored.uuid,
                    data=msgpack.unpackb(stored.data) if stored.data else None,
                    confidence=stored.confidence,
                    group_uuid=stored.group_uuid,
                    parent_group_uuid=stored.parent_group_uuid,
                    cell_index=stored.cell_index,
                    index=stored.index,
                    note=stored.note,
                    status=stored.status,
                    owner_uri=stored.owner_uri,
                    is_dirty=(
                        bool(stored.is_dirty)
                        if stored.is_dirty is not None
                        else None
                    ),
                )
                for stored in FeatureTag.select().where(
                    FeatureTag.feature_id == peewee_feature.id
                )
            ]
        elif is_bbox:
            feature.value = [
                [box.x1, box.y1, box.x2, box.y2]
                for box in FeatureBBox.select().where(
                    FeatureBBox.feature_id == peewee_feature.id
                )
            ]
        else:
            feature.value = [
                (
                    msgpack.unpackb(blob.binary_value)
                    if blob.binary_value is not None
                    else None
                )
                for blob in FeatureBlob.select().where(
                    FeatureBlob.feature_id == peewee_feature.id
                )
            ]
1407
+
1408
def remove_feature(self, node, feature_type, name):
    """
    Deletes the feature identified by feature_type and name from a node.

    All stored values (tags, bboxes, blobs), the node/feature links and the
    feature rows themselves are removed in one transaction. Nothing happens
    when the feature type is unknown or the node has no such feature.
    """
    from kodexa_document.persistence_models import (
        Feature as PeeweeFeature,
        FeatureBlob,
        FeatureType,
        FeatureTag,
        FeatureBBox,
        ContentNodeFeatureLink,
    )

    type_row = FeatureType.get_or_none(
        FeatureType.name == f"{feature_type}:{name}"
    )
    if not type_row:
        return

    # Features of this type that are linked to the node
    linked_features = (
        PeeweeFeature.select()
        .join(
            ContentNodeFeatureLink,
            on=(PeeweeFeature.id == ContentNodeFeatureLink.feature),
        )
        .where(
            ContentNodeFeatureLink.content_node == node.id,
            PeeweeFeature.feature_type == type_row,
        )
    )
    doomed_ids = [row.id for row in linked_features]
    if not doomed_ids:
        return

    with self.connection.atomic():
        # Stored values first: tags, then bboxes, then blobs
        for value_model in (FeatureTag, FeatureBBox, FeatureBlob):
            value_model.delete().where(
                value_model.feature_id.in_(doomed_ids)
            ).execute()

        # Then the links between the node and the features
        ContentNodeFeatureLink.delete().where(
            ContentNodeFeatureLink.content_node == node.id,
            ContentNodeFeatureLink.feature_id.in_(doomed_ids),
        ).execute()

        # And finally the feature rows themselves
        PeeweeFeature.delete().where(PeeweeFeature.id.in_(doomed_ids)).execute()
1471
+
1472
def remove_all_features(self, node):
    """
    Deletes every feature linked to the given node.

    Stored values (tags, bboxes, blobs), the node/feature links and the
    feature rows are removed in one transaction. Nothing happens when the
    node has no features.
    """
    from kodexa_document.persistence_models import (
        Feature as PeeweeFeature,
        FeatureBlob,
        FeatureTag,
        FeatureBBox,
        ContentNodeFeatureLink,
    )

    # Every feature that is linked to this node
    linked_features = (
        PeeweeFeature.select()
        .join(
            ContentNodeFeatureLink,
            on=(PeeweeFeature.id == ContentNodeFeatureLink.feature),
        )
        .where(ContentNodeFeatureLink.content_node == node.id)
    )
    doomed_ids = [row.id for row in linked_features]
    if not doomed_ids:
        return

    with self.connection.atomic():
        # Stored values first: tags, then bboxes, then blobs
        for value_model in (FeatureTag, FeatureBBox, FeatureBlob):
            value_model.delete().where(
                value_model.feature_id.in_(doomed_ids)
            ).execute()

        # Then the links between the node and the features
        ContentNodeFeatureLink.delete().where(
            ContentNodeFeatureLink.content_node == node.id,
            ContentNodeFeatureLink.feature_id.in_(doomed_ids),
        ).execute()

        # And finally the feature rows themselves
        PeeweeFeature.delete().where(PeeweeFeature.id.in_(doomed_ids)).execute()
1524
+
1525
def remove_all_features_by_id(self, node_id):
    """
    Removes all features from a node by its id.

    remove_all_features only reads the id attribute of the node it is given,
    so a lightweight stand-in carrying that id is passed instead of building
    a full ContentNode.
    """
    from types import SimpleNamespace

    self.remove_all_features(SimpleNamespace(id=node_id))
1530
+
1531
def remove_content_node(self, node: "ContentNode"):
    """
    Deletes a node together with all of its descendants.

    Features, content parts and the node rows are removed in one transaction.
    Returns the ids of the removed (non-virtual) nodes in pre-order, or an
    empty list when the removal failed; failures are logged, not raised.
    """
    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode
    from kodexa_document.persistence_models import (
        ContentNodePart as PeeweeContentNodePart,
    )

    # Pre-order walk of the subtree, collecting the ids of non-virtual nodes
    doomed_ids = []
    pending = [node]
    while pending:
        current = pending.pop()
        if not current.virtual:
            doomed_ids.append(current.id)
        # Reversed so the first child is the next one popped
        pending.extend(reversed(list(current.get_children())))

    try:
        with self.connection.atomic():
            # Features of every node in the subtree
            for doomed_id in doomed_ids:
                self.remove_all_features_by_id(doomed_id)

            # Content parts of every node in the subtree
            parts_delete = PeeweeContentNodePart.delete().where(
                PeeweeContentNodePart.content_node.in_(doomed_ids)
            )
            parts_delete.execute()

            # The node rows themselves
            nodes_delete = PeeweeContentNode.delete().where(
                PeeweeContentNode.id.in_(doomed_ids)
            )
            nodes_delete.execute()

        return doomed_ids

    except Exception as e:
        self.connection.rollback()
        logger.error(f"Error removing content node: {e}")
        return []
1575
+
1576
def update_node(self, node: ContentNode):
    """
    Writes the node's type, content, index and content parts back to its row.

    A node without a matching row is left alone. Errors are logged and not
    raised.
    """
    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode

    try:
        record = PeeweeContentNode.get_or_none(PeeweeContentNode.id == node.id)
        if record is not None:
            # Copy the in-memory properties onto the stored row
            record.node_type = node.node_type
            record.content = node.content
            record.index = node.index
            record.save()

            # Keep the stored content parts in step with the node
            current_parts = node.get_content_parts()
            self.update_content_parts(node, current_parts)

            logger.debug(f"Successfully updated node {node.id}")
    except Exception as e:
        logger.error(f"Failed to update node: {e}")
1598
+
1599
def get_nodes_by_type(self, node_type):
    """
    Builds every node in the document whose node type matches.

    Errors are logged and not raised; the nodes built before the error are
    still returned.
    """
    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode

    matches = []

    try:
        typed_rows = PeeweeContentNode.select().where(
            PeeweeContentNode.node_type == node_type
        )
        for row in typed_rows:
            built = self.__build_node(row)
            matches.append(built)
    except Exception as e:
        logger.error(f"Error retrieving nodes by type: {e}")

    return matches
1621
+
1622
def get_content_nodes(self, node_type, parent_node, include_children):
    """
    Collects nodes below parent_node, optionally filtered by node type.

    With include_children the whole subtree is walked (or, when the parent
    has no row, every node in the document is considered); otherwise only
    the direct children are used. A node_type of "*" matches every type.
    Results are sorted by index. Errors are logged and not raised.
    """
    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode

    collected = []
    any_type = node_type == "*"

    try:
        with self.connection.atomic():
            anchor = PeeweeContentNode.get_or_none(
                PeeweeContentNode.id == parent_node.id
            )

            if not include_children:
                # Only the direct children of the parent row
                direct_children = PeeweeContentNode.select().where(
                    PeeweeContentNode.parent_id == anchor.id
                )
                if not any_type:
                    direct_children = direct_children.where(
                        PeeweeContentNode.node_type == node_type
                    )

                for row in sorted(
                    list(direct_children), key=lambda r: getattr(r, "index", 0) or 0
                ):
                    collected.append(self.__build_node(row))
            else:

                def walk_subtree(root_id):
                    # Depth-first: each child is followed by its own subtree
                    found = []
                    for child_row in list(
                        PeeweeContentNode.select().where(
                            PeeweeContentNode.parent_id == root_id
                        )
                    ):
                        found.append(child_row)
                        found.extend(walk_subtree(child_row.id))
                    return found

                if anchor is not None:
                    candidates = walk_subtree(anchor.id)
                elif any_type:
                    candidates = PeeweeContentNode.select()
                else:
                    candidates = PeeweeContentNode.select().where(
                        PeeweeContentNode.node_type == node_type
                    )

                # Narrow down to the requested type when one was given
                if not any_type:
                    candidates = [
                        row for row in candidates if row.node_type == node_type
                    ]

                for row in sorted(candidates, key=lambda r: r.index or 0):
                    collected.append(self.__build_node(row))
    except Exception as e:
        logger.error(f"Error getting content nodes: {e}")
        self.connection.rollback()

    return collected
1694
+
1695
def get_all_tags(self):
    """
    Retrieves all tags from the document.

    Returns the tag names, i.e. everything after the "tag:" prefix of each
    stored tag feature type.
    """
    from kodexa_document.persistence_models import FeatureType

    tag_feature_types = FeatureType.select().where(
        FeatureType.name.startswith("tag:")
    )

    # Split only at the first colon: the stored name is "tag:<name>" and the
    # tag name itself may contain further colons.
    return [
        feature_type.name.split(":", 1)[1] for feature_type in tag_feature_types
    ]
1710
+
1711
def get_tagged_nodes(self, tag, tag_uuid=None):
    """
    Retrieves nodes with a given tag.

    Nodes are resolved through the node/feature link table, which is where
    add_feature records the node a feature belongs to. When tag_uuid is
    given, only features holding a tag with that uuid are considered.
    Errors are logged and not raised.
    """
    from kodexa_document.persistence_models import (
        Feature as PeeweeFeature,
        FeatureType,
    )
    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode
    from kodexa_document.persistence_models import FeatureTag
    from kodexa_document.persistence_models import ContentNodeFeatureLink

    content_nodes = []
    tag_name = f"tag:{tag}"

    try:
        # Find the feature type for this tag
        feature_type = FeatureType.get_or_none(FeatureType.name == tag_name)
        if not feature_type:
            return content_nodes

        # Ids of the nodes linked to a feature of this type
        node_id_query = (
            ContentNodeFeatureLink.select(ContentNodeFeatureLink.content_node_id)
            .join(
                PeeweeFeature,
                on=(PeeweeFeature.id == ContentNodeFeatureLink.feature_id),
            )
            .where(PeeweeFeature.feature_type_id == feature_type.id)
            .distinct()
        )

        if tag_uuid:
            # If we have a tag UUID, look for matching tags
            matching_tags = FeatureTag.select(FeatureTag.feature).where(
                FeatureTag.uuid == tag_uuid
            )
            matching_feature_ids = [row.feature_id for row in matching_tags]

            if matching_feature_ids:
                node_id_query = node_id_query.where(
                    PeeweeFeature.id.in_(matching_feature_ids)
                )
            else:
                # Older format: the uuid may be stored on the feature row.
                # NOTE(review): the Feature model is not visible here, so the
                # column is only used when the model actually declares it.
                legacy_uuid_column = getattr(PeeweeFeature, "tag_uuid", None)
                if legacy_uuid_column is None:
                    return content_nodes
                node_id_query = node_id_query.where(legacy_uuid_column == tag_uuid)

        for (linked_node_id,) in node_id_query.tuples():
            peewee_node = PeeweeContentNode.get_or_none(
                PeeweeContentNode.id == linked_node_id
            )
            if peewee_node:
                content_nodes.append(self.__build_node(peewee_node))

    except Exception as e:
        logger.error(f"Error retrieving tagged nodes: {e}")

    return content_nodes
1764
+
1765
def get_all_tagged_nodes(self):
    """
    Retrieves all nodes with tags from the document.

    Nodes are resolved through the node/feature link table, which is where
    add_feature records the node a feature belongs to. Each node is returned
    once, however many tag features it carries. Errors are logged and not
    raised.
    """
    from kodexa_document.persistence_models import (
        Feature as PeeweeFeature,
        FeatureType,
    )
    from kodexa_document.persistence_models import ContentNode as PeeweeContentNode
    from kodexa_document.persistence_models import ContentNodeFeatureLink

    content_nodes = []

    try:
        # Find all tag feature types
        tag_type_ids = [
            feature_type.id
            for feature_type in FeatureType.select().where(
                FeatureType.name.startswith("tag:")
            )
        ]
        if not tag_type_ids:
            return content_nodes

        # Ids of the nodes linked to a feature of any tag type
        node_id_query = (
            ContentNodeFeatureLink.select(ContentNodeFeatureLink.content_node_id)
            .join(
                PeeweeFeature,
                on=(PeeweeFeature.id == ContentNodeFeatureLink.feature_id),
            )
            .where(PeeweeFeature.feature_type_id.in_(tag_type_ids))
            .distinct()
        )

        # Deduplicate by id rather than scanning the result list per node
        seen_node_ids = set()
        for (linked_node_id,) in node_id_query.tuples():
            if linked_node_id in seen_node_ids:
                continue
            seen_node_ids.add(linked_node_id)

            peewee_node = PeeweeContentNode.get_or_none(
                PeeweeContentNode.id == linked_node_id
            )
            if peewee_node:
                content_nodes.append(self.__build_node(peewee_node))
    except Exception as e:
        logger.error(f"Error retrieving all tagged nodes: {e}")

    return content_nodes
1806
+
1807
def add_model_insight(self, model_insights: ModelInsight):
    """
    Adds a model insight to the document.

    The insight is serialized to JSON and inserted into the model_insights
    table.
    """
    from kodexa_document.persistence_models import database

    # model_dump_json is the counterpart of the model_validate_json call that
    # get_model_insights uses to read the rows back.
    serialized_insight = model_insights.model_dump_json()

    with database.atomic():
        # Execute raw SQL since there's no dedicated model
        database.execute_sql(
            "INSERT INTO model_insights (model_insight) VALUES (?)",
            (serialized_insight,),
        )
1819
+
1820
def get_model_insights(self) -> List[ModelInsight]:
    """
    Reads every row of the model_insights table and parses each stored
    JSON document into a ModelInsight.
    """
    from kodexa_document.persistence_models import database

    stored_rows = database.execute_sql(
        "SELECT model_insight FROM model_insights"
    ).fetchall()

    return [ModelInsight.model_validate_json(stored[0]) for stored in stored_rows]
1833
+
1834
def clear_model_insights(self):
    """
    Deletes every row of the model_insights table inside a transaction.
    """
    from kodexa_document.persistence_models import database as insights_db

    with insights_db.atomic():
        insights_db.execute_sql("DELETE FROM model_insights")
1842
+
1843
def add_exception(self, exception: ContentException):
    """
    Stores a content exception as a new, open row.

    Message, details, type, severity, node uuid and exception type id are
    copied from the given exception; data_object, path and closing_comment
    are stored as None.
    """
    from kodexa_document.persistence_models import (
        ContentException as PeeweeContentException,
    )

    with self.connection.atomic():
        row_values = dict(
            data_object=None,  # no data_object relationship
            message=exception.message,
            exception_details=exception.exception_details,
            exception_type=exception.exception_type,
            severity=exception.severity,
            path=None,  # not part of the in-memory exception
            closing_comment=None,  # not part of the in-memory exception
            open=True,  # new exceptions start out open
            node_uuid=exception.node_uuid,
            exception_type_id=exception.exception_type_id,
        )
        PeeweeContentException.create(**row_values)
1864
+
1865
def get_exceptions(self) -> List[ContentException]:
    """
    Loads every stored content exception.

    tag, group_uuid and tag_uuid are not stored and are therefore None on
    the returned objects.
    """
    from kodexa_document.persistence_models import (
        ContentException as PeeweeContentException,
    )

    return [
        ContentException(
            tag=None,  # not stored
            message=stored.message,
            exception_details=stored.exception_details,
            group_uuid=None,  # not stored
            tag_uuid=None,  # not stored
            exception_type=stored.exception_type,
            severity=stored.severity,
            node_uuid=stored.node_uuid,
            exception_type_id=stored.exception_type_id,
        )
        for stored in PeeweeContentException.select()
    ]
1892
+
1893
def replace_exceptions(self, exceptions: List[ContentException]):
    """
    Replaces all exceptions in the document with a given list of exceptions.

    The existing rows are deleted and the new ones inserted inside a single
    transaction, so a failure while inserting does not leave the document
    with its previous exceptions removed.
    """
    from kodexa_document.persistence_models import (
        ContentException as PeeweeContentException,
    )

    with self.connection.atomic():
        PeeweeContentException.delete().execute()

        # Kept inside the transaction on purpose: add_exception opens its own
        # nested atomic block, and everything commits or rolls back together.
        for exception in exceptions:
            self.add_exception(exception)
1906
+
1907
def get_bytes(self):
    """
    Retrieves the document as bytes.

    The document is synced first. For an in-memory database the content is
    copied into the file at current_filename with the sqlite3 backup API;
    the bytes of that file are then returned.
    """
    self.sync()

    if self.inmemory:
        import sqlite3

        # Underlying sqlite3 connection of the in-memory database
        source_conn = self.connection.connection()

        dest_conn = sqlite3.connect(self.current_filename)
        try:
            # Back up the in-memory database to the file
            source_conn.backup(dest_conn)
        finally:
            # Always close the destination, also when the backup fails, so
            # the file handle is released and the copy is flushed to disk.
            dest_conn.close()

    with open(self.current_filename, "rb") as f:
        return f.read()
1931
+
1932
def sync(self):
    """
    Writes the document metadata to the database and commits the connection.
    """
    # Metadata first, so the commit below includes it
    self.update_metadata()

    database = self.connection
    database.commit()
1938
+
1939
def debug_tags(self):
    """
    Debug method to print tag information from the database.

    Prints the tag feature types, and for every feature of such a type the
    ids of the nodes it is linked to, the number of stored tags and blobs
    and the decoded content of the first blob. Returns True on success and
    False (after printing the traceback) when anything fails.
    """
    from kodexa_document.persistence_models import (
        Feature as PeeweeFeature,
        FeatureType,
    )
    from kodexa_document.persistence_models import FeatureTag, FeatureBlob
    from kodexa_document.persistence_models import ContentNodeFeatureLink

    try:
        # Log tag feature types
        tag_types = FeatureType.select().where(FeatureType.name.startswith("tag:"))
        print(f"Tag feature types: {[t.name for t in tag_types]}")

        # Log features with tag types
        for tag_type in tag_types:
            features = PeeweeFeature.select().where(
                PeeweeFeature.feature_type == tag_type
            )
            print(f"Features for tag type {tag_type.name}: {features.count()}")

            for feature in features:
                # The owning nodes are recorded in the link table (that is
                # where add_feature writes them), not on the feature row.
                linked_node_ids = [
                    linked_node_id
                    for (linked_node_id,) in ContentNodeFeatureLink.select(
                        ContentNodeFeatureLink.content_node_id
                    )
                    .where(ContentNodeFeatureLink.feature_id == feature.id)
                    .tuples()
                ]
                print(
                    f"Feature ID: {feature.id}, Content Node ID: {linked_node_ids}"
                )

                # Check for FeatureTags
                tags = FeatureTag.select().where(FeatureTag.feature == feature)
                print(f"Feature Tags: {tags.count()}")

                # Check for feature blob
                blobs = FeatureBlob.select().where(FeatureBlob.feature == feature)
                blob_count = blobs.count()
                print(f"Feature Blobs: {blob_count}")

                if blob_count > 0:
                    blob = blobs.first()
                    # A blob may be stored without a value; don't feed None
                    # to msgpack.
                    blob_content = (
                        msgpack.unpackb(blob.binary_value)
                        if blob.binary_value is not None
                        else None
                    )
                    print(f"Blob content: {blob_content}")

        # Log the overall number of features
        feature_count = PeeweeFeature.select().count()
        print(f"Total features in the database: {feature_count}")

        return True
    except Exception as e:
        print(f"Error in debug_tags: {e}")
        import traceback

        print(traceback.format_exc())
        return False
1991
+
1992
def get_external_data(self, key="default") -> dict:
    """
    Get external data stored with the given key.

    Args:
        key: The key for the external data, defaults to "default"

    Returns:
        A dictionary of the external data. An empty dictionary is returned
        when nothing is stored under the key, or when the lookup or the
        decoding fails (the failure is logged, not raised).
    """
    from kodexa_document.persistence_models import ExternalData

    try:
        external_data = ExternalData.get_or_none(ExternalData.key == key)
        if not external_data:
            return {}

        # The stored payload is a msgpack-encoded dictionary whose keys are
        # not restricted to strings. msgpack rejects non-str/bytes map keys
        # on unpack by default, which would end up in the handler below and
        # make valid data read back as {}; relax the check so it round-trips.
        return msgpack.unpackb(external_data.data, strict_map_key=False)
    except Exception as e:
        logger.error(f"Error getting external data: {e}")
        return {}
2012
+
2013
def get_external_data_keys(self) -> List[str]:
    """
    List every key under which external data is stored.

    Returns:
        The distinct stored keys. "default" is always part of the result:
        it is placed first when no row uses it, and it is the only entry
        when the query fails (the failure is logged).
    """
    from kodexa_document.persistence_models import ExternalData

    try:
        distinct_rows = ExternalData.select(ExternalData.key).distinct()
        found = [row.key for row in distinct_rows]

        # Always include 'default' in the list of keys
        if "default" in found:
            return found
        return ["default", *found]
    except Exception as e:
        logger.error(f"Error getting external data keys: {e}")
        return ["default"]  # Return default key even on error
2034
+
2035
def set_external_data(self, external_data: dict, key="default"):
    """
    Persist a dictionary of external data under a key.

    Whatever was stored under the key before is removed and the new payload
    is written in the same atomic block. Failures are logged rather than
    raised.

    Args:
        external_data: A dictionary of data to store
        key: The key to store the data under, defaults to "default"
    """
    from kodexa_document.persistence_models import ExternalData

    try:
        with self.connection.atomic():
            # Drop the previous entry for this key, if any
            stale_rows = ExternalData.delete().where(ExternalData.key == key)
            stale_rows.execute()

            # Serialise and store the replacement
            payload = msgpack.packb(external_data, use_bin_type=True)
            ExternalData.create(
                taxonomy=None,  # Could be linked to taxonomy in the future
                key=key,
                data=payload,
            )
    except Exception as e:
        logger.error(f"Error setting external data: {e}")