longparser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,656 @@
1
+ """MongoDB client and CRUD operations for LongParser API.
2
+
3
+ Uses Motor (async MongoDB driver) with tenant-scoped queries and
4
+ materialized path hierarchy indexes.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import logging
10
+ from datetime import datetime, timedelta, timezone
11
+ from typing import Optional
12
+
13
+ from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase
14
+
15
+ from .schemas import (
16
+ JobStatus,
17
+ ReviewStatus,
18
+ FinalizePolicy,
19
+ Revision,
20
+ JobResponse,
21
+ ReviewProgress,
22
+ BlockResponse,
23
+ ChunkResponse,
24
+ )
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Database client
31
+ # ---------------------------------------------------------------------------
32
+
33
class Database:
    """Async MongoDB data-access layer for LongParser.

    Wraps a Motor client and exposes CRUD helpers for jobs, blocks, chunks,
    revisions, index versions, chat sessions and chat turns.  Every query
    except :meth:`get_expired_sessions` filters on ``tenant_id``.
    """

    def __init__(self, mongo_url: Optional[str] = None, db_name: str = "longparser"):
        """Open a Motor client and bind the collection handles.

        Args:
            mongo_url: MongoDB connection string.  When ``None`` the
                ``LONGPARSER_MONGO_URL`` environment variable is used, falling
                back to ``mongodb://localhost:27017``.
            db_name: Name of the database that holds all collections.
        """
        import os  # local import: only needed to resolve the default URL

        if mongo_url is None:
            mongo_url = os.getenv("LONGPARSER_MONGO_URL", "mongodb://localhost:27017")
        self.client = AsyncIOMotorClient(mongo_url)
        self.db: AsyncIOMotorDatabase = self.client[db_name]

        # Collection handles used by the CRUD helpers below.
        self.jobs = self.db["jobs"]
        self.blocks = self.db["blocks"]
        self.chunks = self.db["chunks"]
        self.block_revisions = self.db["block_revisions"]
        self.chunk_revisions = self.db["chunk_revisions"]
        self.index_versions = self.db["index_versions"]
        self.chat_sessions = self.db["chat_sessions"]
        self.chat_turns = self.db["chat_turns"]

    async def create_indexes(self) -> None:
        """Create every index the queries in this class rely on.

        Indexes are created one after another, in the order listed, and a
        single info-level log line is emitted at the end.
        """
        # Jobs: unique per (tenant, job) plus a status lookup.
        await self.jobs.create_index(
            [("tenant_id", 1), ("job_id", 1)], unique=True
        )
        await self.jobs.create_index([("tenant_id", 1), ("status", 1)])

        # Blocks: unique id plus one index per supported filter field.
        await self.blocks.create_index(
            [("tenant_id", 1), ("job_id", 1), ("block_id", 1)], unique=True
        )
        await self.blocks.create_index(
            [("tenant_id", 1), ("job_id", 1), ("review_status", 1)]
        )
        await self.blocks.create_index(
            [("tenant_id", 1), ("job_id", 1), ("type", 1)]
        )
        await self.blocks.create_index(
            [("tenant_id", 1), ("job_id", 1), ("hierarchy_path", 1)]
        )

        # Chunks: unique id, review status and section path.
        await self.chunks.create_index(
            [("tenant_id", 1), ("job_id", 1), ("chunk_id", 1)], unique=True
        )
        await self.chunks.create_index(
            [("tenant_id", 1), ("job_id", 1), ("review_status", 1)]
        )
        await self.chunks.create_index(
            [("tenant_id", 1), ("job_id", 1), ("section_path", 1)]
        )

        # Revisions: per-entity history ordered by timestamp.
        await self.block_revisions.create_index(
            [("tenant_id", 1), ("job_id", 1), ("block_id", 1), ("timestamp", 1)]
        )
        await self.chunk_revisions.create_index(
            [("tenant_id", 1), ("job_id", 1), ("chunk_id", 1), ("timestamp", 1)]
        )

        # Index versions: unique version per job plus a status lookup.
        await self.index_versions.create_index(
            [("tenant_id", 1), ("job_id", 1), ("index_version", 1)], unique=True
        )
        await self.index_versions.create_index(
            [("tenant_id", 1), ("job_id", 1), ("status", 1)]
        )

        # Chat sessions: unique session id and a per-job lookup.
        await self.chat_sessions.create_index(
            [("tenant_id", 1), ("session_id", 1)], unique=True
        )
        await self.chat_sessions.create_index(
            [("tenant_id", 1), ("job_id", 1)]
        )
        # TTL index: documents whose ``deleted_at`` is a date expire 30 days
        # after that date; the partial filter leaves live sessions
        # (``deleted_at`` is None) out of the index.
        await self.chat_sessions.create_index(
            [("deleted_at", 1)],
            expireAfterSeconds=30 * 24 * 3600,
            partialFilterExpression={"deleted_at": {"$type": "date"}},
        )

        # Chat turns: chronological listing, idempotency-key uniqueness
        # (enforced only when the key is a string) and an archive lookup.
        await self.chat_turns.create_index(
            [("tenant_id", 1), ("session_id", 1), ("created_at", 1)]
        )
        await self.chat_turns.create_index(
            [("tenant_id", 1), ("session_id", 1), ("idempotency_key", 1)],
            unique=True,
            partialFilterExpression={"idempotency_key": {"$type": "string"}},
        )
        await self.chat_turns.create_index(
            [("session_id", 1), ("archived", 1)]
        )

        logger.info("MongoDB indexes created.")

    # -----------------------------------------------------------------------
    # Jobs CRUD
    # -----------------------------------------------------------------------

    async def create_job(
        self, tenant_id: str, job_id: str, source_file: str, file_hash: str
    ) -> dict:
        """Insert a new job in the queued state and return its document.

        Args:
            tenant_id: Owning tenant.
            job_id: Identifier of the new job.
            source_file: Stored as ``source_file`` on the job.
            file_hash: Stored as ``file_hash`` on the job.

        Returns:
            The inserted document without MongoDB's ``_id`` field, matching
            the shape returned by :meth:`get_job`.
        """
        doc = {
            "tenant_id": tenant_id,
            "job_id": job_id,
            "status": JobStatus.QUEUED.value,
            "source_file": source_file,
            "file_hash": file_hash,
            "total_pages": 0,
            "total_blocks": 0,
            "total_chunks": 0,
            "progress": {"pages_done": 0, "blocks_saved": 0, "chunks_saved": 0, "embeddings_done": 0},
            "created_at": datetime.now(timezone.utc),
            "finalized_at": None,
            "error": None,
        }
        await self.jobs.insert_one(doc)
        # insert_one adds an ObjectId ``_id`` to the dict in place; strip it so
        # the result is consistent with the ``{"_id": 0}`` read paths.
        doc.pop("_id", None)
        return doc

    async def get_job(self, tenant_id: str, job_id: str) -> Optional[dict]:
        """Return the job document for ``tenant_id``/``job_id`` or ``None``."""
        selector = {"tenant_id": tenant_id, "job_id": job_id}
        return await self.jobs.find_one(selector, {"_id": 0})

    async def list_jobs(
        self, tenant_id: str, status: Optional[str] = None,
        skip: int = 0, limit: int = 50
    ) -> tuple[list[dict], int]:
        """List a tenant's jobs, optionally filtered by status.

        Args:
            tenant_id: Owning tenant.
            status: When truthy, only jobs with this ``status`` are returned.
            skip: Number of matching jobs to skip.
            limit: Maximum number of jobs to return.

        Returns:
            ``(page, total)`` where ``total`` counts every matching job, not
            just the returned page.  No sort order is applied.
        """
        query = {"tenant_id": tenant_id}
        if status:
            query["status"] = status
        total = await self.jobs.count_documents(query)
        cursor = self.jobs.find(query, {"_id": 0}).skip(skip).limit(limit)
        page = await cursor.to_list(length=limit)
        return page, total

    async def update_job(self, tenant_id: str, job_id: str, updates: dict) -> bool:
        """Apply ``updates`` to a job with ``$set``.

        Returns:
            ``True`` only when a document was actually modified.  ``False``
            means no job matched, the values were already identical, or
            ``updates`` was empty.
        """
        if not updates:
            # An empty ``$set`` is rejected by older MongoDB servers and is a
            # no-op on newer ones; skip the round trip either way.
            return False
        outcome = await self.jobs.update_one(
            {"tenant_id": tenant_id, "job_id": job_id},
            {"$set": updates},
        )
        return outcome.modified_count > 0

    async def delete_job(self, tenant_id: str, job_id: str) -> None:
        """Delete a job together with its blocks, chunks, revisions and index versions.

        Dependent collections are cleared first and the job document last.
        Chat sessions and turns are not touched by this method.
        """
        scope = {"tenant_id": tenant_id, "job_id": job_id}
        dependents = (
            self.blocks,
            self.chunks,
            self.block_revisions,
            self.chunk_revisions,
            self.index_versions,
        )
        for collection in dependents:
            await collection.delete_many(scope)
        await self.jobs.delete_one(scope)

    # -----------------------------------------------------------------------
    # Blocks CRUD
    # -----------------------------------------------------------------------

    async def _upsert_reviewable(
        self, collection, id_field: str, tenant_id: str, job_id: str, entity: dict
    ) -> None:
        """Stamp tenant/job and review defaults onto ``entity`` and upsert it."""
        # NOTE: ``entity`` is mutated in place (the caller's dict gains these keys).
        entity["tenant_id"] = tenant_id
        entity["job_id"] = job_id
        entity.setdefault("review_status", ReviewStatus.PENDING.value)
        entity.setdefault("current_revision_id", None)
        entity.setdefault("version", 1)
        # NOTE(review): because the defaults are part of ``$set``, re-upserting
        # an entity that does not carry its own review fields overwrites the
        # stored review_status / current_revision_id / version — confirm that
        # resetting review state on re-ingest is intended.
        await collection.update_one(
            {"tenant_id": tenant_id, "job_id": job_id, id_field: entity[id_field]},
            {"$set": entity},
            upsert=True,
        )

    async def upsert_block(self, tenant_id: str, job_id: str, block: dict) -> None:
        """Insert or overwrite a block keyed by ``block["block_id"]``.

        ``block`` is modified in place: ``tenant_id`` and ``job_id`` are set
        and missing ``review_status`` (pending), ``current_revision_id``
        (``None``) and ``version`` (1) are filled in before the write.

        Raises:
            KeyError: If ``block`` has no ``"block_id"`` key.
        """
        await self._upsert_reviewable(self.blocks, "block_id", tenant_id, job_id, block)

    async def get_blocks(
        self, tenant_id: str, job_id: str,
        status: Optional[str] = None,
        block_type: Optional[str] = None,
        page: Optional[int] = None,
        skip: int = 0, limit: int = 100,
    ) -> list[dict]:
        """Return a page of a job's blocks.

        Args:
            tenant_id: Owning tenant.
            job_id: Job whose blocks are listed.
            status: When truthy, filter on ``review_status``.
            block_type: When truthy, filter on ``type``.
            page: When not ``None``, filter on ``page_number``.
            skip: Number of matching blocks to skip.
            limit: Maximum number of blocks to return.

        Returns:
            Block documents without the ``_id`` and ``confidence`` fields.
        """
        query: dict = {"tenant_id": tenant_id, "job_id": job_id}
        if status:
            query["review_status"] = status
        if block_type:
            query["type"] = block_type
        if page is not None:
            query["page_number"] = page
        projection = {"_id": 0, "confidence": 0}
        cursor = self.blocks.find(query, projection).skip(skip).limit(limit)
        return await cursor.to_list(length=limit)

    async def update_block_review(
        self, tenant_id: str, job_id: str, block_id: str,
        review_status: str, version: int,
        edited_text: Optional[str] = None,
        edited_type: Optional[str] = None,
        revision_id: Optional[str] = None,
    ) -> Optional[dict]:
        """Set a block's review state, guarded by optimistic locking.

        The update only applies when the stored ``version`` equals
        ``version``; on success the stored version is incremented by one.
        ``current_revision_id`` is always overwritten with ``revision_id``
        (including ``None``); ``edited_text`` / ``edited_type`` are written
        only when provided.

        Returns:
            The updated block (without ``_id`` and ``confidence``), or
            ``None`` when no block matched the id and version.
        """
        changes: dict = {
            "review_status": review_status,
            "current_revision_id": revision_id,
        }
        if edited_text is not None:
            changes["edited_text"] = edited_text
        if edited_type is not None:
            changes["edited_type"] = edited_type

        selector = {
            "tenant_id": tenant_id,
            "job_id": job_id,
            "block_id": block_id,
            "version": version,
        }
        return await self.blocks.find_one_and_update(
            selector,
            {"$set": changes, "$inc": {"version": 1}},
            return_document=True,  # True == pymongo ReturnDocument.AFTER
            projection={"_id": 0, "confidence": 0},
        )

    # -----------------------------------------------------------------------
    # Chunks CRUD
    # -----------------------------------------------------------------------

    async def upsert_chunk(self, tenant_id: str, job_id: str, chunk: dict) -> None:
        """Insert or overwrite a chunk keyed by ``chunk["chunk_id"]``.

        ``chunk`` is modified in place: ``tenant_id`` and ``job_id`` are set
        and missing ``review_status`` (pending), ``current_revision_id``
        (``None``) and ``version`` (1) are filled in before the write.

        Raises:
            KeyError: If ``chunk`` has no ``"chunk_id"`` key.
        """
        await self._upsert_reviewable(self.chunks, "chunk_id", tenant_id, job_id, chunk)

    async def get_chunks(
        self, tenant_id: str, job_id: str,
        status: Optional[str] = None,
        chunk_type: Optional[str] = None,
        skip: int = 0, limit: int = 100,
    ) -> list[dict]:
        """Return a page of a job's chunks.

        Args:
            tenant_id: Owning tenant.
            job_id: Job whose chunks are listed.
            status: When truthy, filter on ``review_status``.
            chunk_type: When truthy, filter on ``chunk_type``.
            skip: Number of matching chunks to skip.
            limit: Maximum number of chunks to return.

        Returns:
            Chunk documents without the ``_id`` field.
        """
        query: dict = {"tenant_id": tenant_id, "job_id": job_id}
        if status:
            query["review_status"] = status
        if chunk_type:
            query["chunk_type"] = chunk_type
        cursor = self.chunks.find(query, {"_id": 0}).skip(skip).limit(limit)
        return await cursor.to_list(length=limit)

    async def update_chunk_review(
        self, tenant_id: str, job_id: str, chunk_id: str,
        review_status: str, version: int,
        edited_text: Optional[str] = None,
        revision_id: Optional[str] = None,
    ) -> Optional[dict]:
        """Set a chunk's review state, guarded by optimistic locking.

        The update only applies when the stored ``version`` equals
        ``version``; on success the stored version is incremented by one.
        ``current_revision_id`` is always overwritten with ``revision_id``;
        ``edited_text`` is written only when provided.

        Returns:
            The updated chunk (without ``_id``), or ``None`` when no chunk
            matched the id and version.
        """
        changes: dict = {
            "review_status": review_status,
            "current_revision_id": revision_id,
        }
        if edited_text is not None:
            changes["edited_text"] = edited_text

        selector = {
            "tenant_id": tenant_id,
            "job_id": job_id,
            "chunk_id": chunk_id,
            "version": version,
        }
        return await self.chunks.find_one_and_update(
            selector,
            {"$set": changes, "$inc": {"version": 1}},
            return_document=True,  # True == pymongo ReturnDocument.AFTER
            projection={"_id": 0},
        )

    # -----------------------------------------------------------------------
    # Revisions (append-only)
    # -----------------------------------------------------------------------

    async def create_revision(self, tenant_id: str, job_id: str, revision: "Revision") -> None:
        """Append a revision record.

        The revision is dumped with ``model_dump()``, stamped with
        ``tenant_id``/``job_id`` and inserted into ``block_revisions`` when
        ``revision.entity_type == "block"``, otherwise into
        ``chunk_revisions``.
        """
        doc = revision.model_dump()
        doc["tenant_id"] = tenant_id
        doc["job_id"] = job_id
        if revision.entity_type == "block":
            target = self.block_revisions
        else:
            target = self.chunk_revisions
        await target.insert_one(doc)

    async def get_audit_trail(
        self, tenant_id: str, job_id: str, skip: int = 0, limit: int = 200
    ) -> list[dict]:
        """Return one page of a job's merged block + chunk revision history.

        Args:
            tenant_id: Owning tenant.
            job_id: Job whose revisions are listed.
            skip: Number of merged revisions to skip (negative values are
                treated as 0).
            limit: Maximum number of revisions to return; a non-positive
                value yields an empty list.

        Returns:
            Revisions ordered by ascending ``timestamp`` (block revisions
            first on ties), without the ``_id`` field.
        """
        skip = max(skip, 0)
        if limit <= 0:
            return []

        # The requested page may come entirely from either collection, so each
        # one must contribute its first ``skip + limit`` revisions; fetching
        # only ``limit`` would truncate every page after the first.
        window = skip + limit
        scope = {"tenant_id": tenant_id, "job_id": job_id}
        block_revs = await (
            self.block_revisions.find(scope, {"_id": 0})
            .sort("timestamp", 1)
            .to_list(length=window)
        )
        chunk_revs = await (
            self.chunk_revisions.find(scope, {"_id": 0})
            .sort("timestamp", 1)
            .to_list(length=window)
        )

        merged = sorted(
            block_revs + chunk_revs,
            # Revisions without a timestamp sort first.
            key=lambda rev: rev.get("timestamp", datetime.min),
        )
        return merged[skip:window]

    # -----------------------------------------------------------------------
    # Review progress
    # -----------------------------------------------------------------------

    async def get_review_progress(self, tenant_id: str, job_id: str) -> "ReviewProgress":
        """Count a job's blocks per review status.

        Only the ``blocks`` collection is aggregated.  Counts for the
        statuses ``approved``, ``edited``, ``rejected`` and ``pending`` are
        copied onto the same-named attributes of a fresh ``ReviewProgress``;
        any other status value is ignored.
        """
        pipeline = [
            {"$match": {"tenant_id": tenant_id, "job_id": job_id}},
            {"$group": {"_id": "$review_status", "count": {"$sum": 1}}},
        ]
        tracked = ("approved", "edited", "rejected", "pending")
        progress = ReviewProgress()
        async for bucket in self.blocks.aggregate(pipeline):
            status = bucket["_id"]
            if status in tracked:
                setattr(progress, status, bucket["count"])
        return progress

    # -----------------------------------------------------------------------
    # Finalize
    # -----------------------------------------------------------------------

    async def apply_finalize_policy(
        self, tenant_id: str, job_id: str, policy: "FinalizePolicy"
    ) -> int:
        """Apply a finalize policy to a job's pending blocks and chunks.

        * ``REQUIRE_ALL_APPROVED``: nothing is modified; the number of
          still-pending blocks plus chunks is returned so the caller can
          reject the request when it is non-zero.
        * ``REJECT_PENDING``: pending items become rejected.
        * any other policy: pending items become approved.

        Returns:
            The pending count (first case) or the number of documents
            modified (other cases).
        """
        pending = {
            "tenant_id": tenant_id,
            "job_id": job_id,
            "review_status": ReviewStatus.PENDING.value,
        }

        if policy == FinalizePolicy.REQUIRE_ALL_APPROVED:
            remaining = await self.blocks.count_documents(pending)
            remaining += await self.chunks.count_documents(pending)
            return remaining  # caller checks if > 0 → 400

        if policy == FinalizePolicy.REJECT_PENDING:
            new_status = ReviewStatus.REJECTED.value
        else:
            new_status = ReviewStatus.APPROVED.value

        change = {"$set": {"review_status": new_status}}
        blocks_result = await self.blocks.update_many(pending, change)
        chunks_result = await self.chunks.update_many(pending, change)
        return blocks_result.modified_count + chunks_result.modified_count

    async def get_approved_chunks(self, tenant_id: str, job_id: str) -> list[dict]:
        """Return every chunk of a job whose review status is approved or edited."""
        accepted = [ReviewStatus.APPROVED.value, ReviewStatus.EDITED.value]
        query = {
            "tenant_id": tenant_id,
            "job_id": job_id,
            "review_status": {"$in": accepted},
        }
        return await self.chunks.find(query, {"_id": 0}).to_list(length=None)

    # -----------------------------------------------------------------------
    # Index versions
    # -----------------------------------------------------------------------

    async def create_index_version(
        self, tenant_id: str, job_id: str, index_version: str, config: dict
    ) -> None:
        """Upsert an index-version record in the ``embedding`` state.

        Args:
            tenant_id: Owning tenant.
            job_id: Job the index belongs to.
            index_version: Version identifier (unique per tenant and job).
            config: Extra fields merged into the record.  Because it is
                unpacked last, a key in ``config`` that collides with a fixed
                field (e.g. ``status``) overrides it.
        """
        doc = {
            "tenant_id": tenant_id,
            "job_id": job_id,
            "index_version": index_version,
            "status": "embedding",
            "created_at": datetime.now(timezone.utc),
            **config,
        }
        selector = {
            "tenant_id": tenant_id,
            "job_id": job_id,
            "index_version": index_version,
        }
        # Upsert with $set: calling this again for the same version rewrites
        # the listed fields on the existing record.
        await self.index_versions.update_one(selector, {"$set": doc}, upsert=True)

    async def get_latest_index_version(
        self, tenant_id: str, job_id: str
    ) -> Optional[dict]:
        """Return the newest index version with status ``indexed``, or ``None``.

        "Newest" is decided by descending ``created_at``.
        """
        query = {"tenant_id": tenant_id, "job_id": job_id, "status": "indexed"}
        cursor = (
            self.index_versions.find(query, projection={"_id": 0})
            .sort("created_at", -1)
            .limit(1)
        )
        newest = await cursor.to_list(length=1)
        if not newest:
            return None
        return newest[0]

    async def list_index_versions(self, tenant_id: str, job_id: str) -> list[dict]:
        """Return all index-version records of a job (for cleanup on delete)."""
        scope = {"tenant_id": tenant_id, "job_id": job_id}
        return await self.index_versions.find(scope, {"_id": 0}).to_list(length=None)

    # -----------------------------------------------------------------------
    # Chat Sessions
    # -----------------------------------------------------------------------

    async def create_chat_session(
        self, tenant_id: str, session_id: str, job_id: str
    ) -> dict:
        """Insert a new, empty chat session and return its document.

        Args:
            tenant_id: Owning tenant.
            session_id: Identifier of the new session.
            job_id: Job the session is attached to.

        Returns:
            The inserted document without MongoDB's ``_id`` field;
            ``created_at`` and ``updated_at`` carry the same timestamp.
        """
        now = datetime.now(timezone.utc)
        doc = {
            "tenant_id": tenant_id,
            "session_id": session_id,
            "job_id": job_id,
            "turn_count": 0,
            "rolling_summary": "",
            "long_term_facts": [],
            "version": 1,
            "created_at": now,
            "updated_at": now,
            "deleted_at": None,
        }
        await self.chat_sessions.insert_one(doc)
        # insert_one adds an ObjectId ``_id`` to the dict in place; strip it so
        # the result is consistent with :meth:`get_chat_session`.
        doc.pop("_id", None)
        return doc

    async def get_chat_session(
        self, tenant_id: str, session_id: str
    ) -> Optional[dict]:
        """Return a live (not soft-deleted) chat session, or ``None``."""
        selector = {
            "tenant_id": tenant_id,
            "session_id": session_id,
            "deleted_at": None,
        }
        return await self.chat_sessions.find_one(selector, {"_id": 0})

    async def soft_delete_chat_session(
        self, tenant_id: str, session_id: str
    ) -> bool:
        """Mark a live session as deleted by setting ``deleted_at`` to now.

        Returns:
            ``True`` when a live session was found and marked; ``False`` when
            it does not exist or was already soft-deleted.
        """
        live = {"tenant_id": tenant_id, "session_id": session_id, "deleted_at": None}
        outcome = await self.chat_sessions.update_one(
            live,
            {"$set": {"deleted_at": datetime.now(timezone.utc)}},
        )
        return outcome.modified_count > 0

    async def _set_versioned_session_field(
        self, tenant_id: str, session_id: str, field: str, value, version: int
    ) -> bool:
        """Set one session field if the stored ``version`` still matches, then bump it."""
        outcome = await self.chat_sessions.update_one(
            {
                "tenant_id": tenant_id,
                "session_id": session_id,
                "version": version,
            },
            {
                "$set": {
                    field: value,
                    "updated_at": datetime.now(timezone.utc),
                },
                "$inc": {"version": 1},
            },
        )
        return outcome.modified_count > 0

    async def update_rolling_summary(
        self, tenant_id: str, session_id: str, summary: str, version: int
    ) -> bool:
        """Replace a session's ``rolling_summary`` using optimistic locking.

        Returns:
            ``True`` when the stored version matched ``version`` and the
            session was updated (its version is then incremented); ``False``
            otherwise.
        """
        return await self._set_versioned_session_field(
            tenant_id, session_id, "rolling_summary", summary, version
        )

    async def update_long_term_facts(
        self, tenant_id: str, session_id: str, facts: list[dict], version: int
    ) -> bool:
        """Replace a session's ``long_term_facts`` using optimistic locking.

        Returns:
            ``True`` when the stored version matched ``version`` and the
            session was updated (its version is then incremented); ``False``
            otherwise.
        """
        return await self._set_versioned_session_field(
            tenant_id, session_id, "long_term_facts", facts, version
        )

    # -----------------------------------------------------------------------
    # Chat Turns
    # -----------------------------------------------------------------------

    async def save_turn(
        self, tenant_id: str, session_id: str, turn
    ) -> None:
        """Store a chat turn and bump the owning session's ``turn_count``.

        ``turn`` must provide ``model_dump(mode="json")``; the dump is stamped
        with ``tenant_id``/``session_id`` before insertion.  The insert and
        the counter update are two separate writes: if the insert raises, the
        counter is left untouched.
        """
        doc = turn.model_dump(mode="json")
        doc["tenant_id"] = tenant_id
        doc["session_id"] = session_id
        await self.chat_turns.insert_one(doc)

        # ``$inc`` keeps the counter correct under concurrent writers.
        await self.chat_sessions.update_one(
            {"tenant_id": tenant_id, "session_id": session_id},
            {
                "$inc": {"turn_count": 1},
                "$set": {"updated_at": datetime.now(timezone.utc)},
            },
        )

    async def get_recent_turns(
        self, tenant_id: str, session_id: str, n: int = 8
    ) -> list[dict]:
        """Return the ``n`` newest non-archived turns in chronological order."""
        query = {
            "tenant_id": tenant_id,
            "session_id": session_id,
            "archived": False,
        }
        cursor = self.chat_turns.find(query, {"_id": 0}).sort("created_at", -1).limit(n)
        newest_first = await cursor.to_list(length=n)
        newest_first.reverse()  # fetched newest-first; callers get oldest-first
        return newest_first

    async def get_turn_by_idempotency_key(
        self, tenant_id: str, session_id: str, idempotency_key: str
    ) -> Optional[dict]:
        """Return the session's turn stored under ``idempotency_key``, or ``None``."""
        selector = {
            "tenant_id": tenant_id,
            "session_id": session_id,
            "idempotency_key": idempotency_key,
        }
        return await self.chat_turns.find_one(selector, {"_id": 0})

    async def get_all_turns(
        self, tenant_id: str, session_id: str
    ) -> list[dict]:
        """Return every turn of a session (archived included), oldest first."""
        scope = {"tenant_id": tenant_id, "session_id": session_id}
        cursor = self.chat_turns.find(scope, {"_id": 0}).sort("created_at", 1)
        return await cursor.to_list(length=None)

    async def get_unarchived_turns(
        self, tenant_id: str, session_id: str
    ) -> list[dict]:
        """Return every non-archived turn of a session, oldest first."""
        query = {
            "tenant_id": tenant_id,
            "session_id": session_id,
            "archived": False,
        }
        cursor = self.chat_turns.find(query, {"_id": 0}).sort("created_at", 1)
        return await cursor.to_list(length=None)

    async def archive_turns(
        self, tenant_id: str, session_id: str, turn_ids: list[str]
    ) -> int:
        """Set ``archived`` to ``True`` on the given turns of a session.

        Returns:
            The number of turns actually modified (turns that were already
            archived are not counted).
        """
        outcome = await self.chat_turns.update_many(
            {
                "tenant_id": tenant_id,
                "session_id": session_id,
                "turn_id": {"$in": turn_ids},
            },
            {"$set": {"archived": True}},
        )
        return outcome.modified_count

    async def purge_turns_for_session(
        self, tenant_id: str, session_id: str
    ) -> int:
        """Hard-delete every turn of a session and return how many were removed."""
        outcome = await self.chat_turns.delete_many(
            {"tenant_id": tenant_id, "session_id": session_id}
        )
        return outcome.deleted_count

    async def get_expired_sessions(
        self, ttl_days: int = 30
    ) -> list[dict]:
        """Find soft-deleted sessions whose ``deleted_at`` is at least ``ttl_days`` old.

        This query spans all tenants.

        Returns:
            Documents holding only ``session_id`` and ``tenant_id``.
        """
        # NOTE(review): the TTL index built in create_indexes also expires
        # sessions 30 days after ``deleted_at``; with the default ttl_days a
        # session may already be gone before this query sees it — confirm the
        # intended ordering with the purge worker.
        cutoff = datetime.now(timezone.utc) - timedelta(days=ttl_days)
        cursor = self.chat_sessions.find(
            {"deleted_at": {"$lte": cutoff}},
            {"session_id": 1, "tenant_id": 1, "_id": 0},
        )
        return await cursor.to_list(length=None)

    # -----------------------------------------------------------------------
    # Lifecycle
    # -----------------------------------------------------------------------

    async def close(self) -> None:
        """Close the underlying Motor client."""
        self.client.close()