longparser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1045 @@
1
+ """LongParser FastAPI application — HITL review + embedding + search.
2
+
3
+ Start with:
4
+ uv run uvicorn longparser.server.app:app --reload --port 8000
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
# Local-development convenience: pull variables from a .env file into the
# process environment.  python-dotenv is an optional dependency, so a failed
# import is deliberately ignored and startup continues with the existing
# environment.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass
15
+
16
+ import hashlib
17
+ import io
18
+ import logging
19
+ import os
20
+ import shutil
21
+ import uuid
22
+ import zipfile
23
+ from contextlib import asynccontextmanager
24
+ from datetime import datetime, timezone
25
+ from pathlib import Path
26
+ from typing import Optional
27
+ import time as _time
28
+
29
+ from fastapi import (
30
+ FastAPI,
31
+ File,
32
+ Header,
33
+ HTTPException,
34
+ Query,
35
+ Request,
36
+ UploadFile,
37
+ )
38
+ from fastapi.responses import JSONResponse, StreamingResponse
39
+
40
+ from .db import Database
41
+ from .queue import ARQBackend
42
+ from .schemas import (
43
+ BlockResponse,
44
+ BlockReviewUpdate,
45
+ ChunkResponse,
46
+ ChunkReviewUpdate,
47
+ EmbedRequest,
48
+ FinalizePolicy,
49
+ FinalizeRequest,
50
+ JobListResponse,
51
+ JobResponse,
52
+ JobStatus,
53
+ ReviewProgress,
54
+ ReviewStatus,
55
+ Revision,
56
+ SearchRequest,
57
+ SearchResponse,
58
+ SearchResult,
59
+ )
60
+
61
+ logger = logging.getLogger(__name__)
62
+
63
+ # ---------------------------------------------------------------------------
64
+ # Config
65
+ # ---------------------------------------------------------------------------
66
+
67
# Upper bound, in bytes, for a single uploaded document (100 MB).
MAX_UPLOAD_SIZE = 100 * 1024 ** 2

# Content types accepted by the upload route.
ALLOWED_MIMES = {
    "text/csv",
    "application/pdf",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/octet-stream",  # fallback for unknown MIME
}

# Absolute root directory for uploaded files; override with
# LONGPARSER_UPLOAD_DIR.
UPLOAD_DIR = Path(os.getenv("LONGPARSER_UPLOAD_DIR", "./uploads")).resolve()
77
+
78
+ # ---------------------------------------------------------------------------
79
+ # App lifecycle
80
+ # ---------------------------------------------------------------------------
81
+
82
# Module-wide MongoDB handle; connection URL and database name come from the
# environment, with localhost defaults.
db = Database(
    mongo_url=os.getenv("LONGPARSER_MONGO_URL", "mongodb://localhost:27017"),
    db_name=os.getenv("LONGPARSER_DB_NAME", "longparser"),
)

# Module-wide task queue backend; Redis URL comes from the environment, with
# a localhost default.
queue = ARQBackend(redis_url=os.getenv("LONGPARSER_REDIS_URL", "redis://localhost:6379"))
89
+
90
+
91
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run application startup and shutdown steps around the serving phase.

    Startup: create the database indexes.
    Shutdown: close the queue, then the database, then the chat engine if
    one was stored on ``app.state``.
    """
    # --- startup ---
    await db.create_indexes()
    logger.info("LongParser API started")

    yield

    # --- shutdown ---
    await queue.close()
    await db.close()
    # The chat engine only exists if something stored it on app.state.
    if hasattr(app.state, "chat_engine"):
        await app.state.chat_engine.close()
    logger.info("LongParser API stopped")
102
+
103
+
104
# The ASGI application object served by uvicorn; startup/shutdown work is
# delegated to ``lifespan``.
app = FastAPI(
    lifespan=lifespan,
    title="LongParser API",
    version="0.3.0",
    description="Document intelligence engine with HITL review, embedding, and vector search.",
)
110
+
111
+
112
+ # ---------------------------------------------------------------------------
113
+ # Auth middleware (API key — v1)
114
+ # ---------------------------------------------------------------------------
115
+
116
def _get_tenant(x_api_key: str = Header(...)) -> str:
    """Derive the tenant identifier from the caller's API key.

    v1: the API key itself identifies the tenant; the tenant id is the first
    16 hex characters of the key's SHA-256 digest.
    v2 (planned): look up a hashed key -> tenant mapping in the DB.

    Raises:
        HTTPException: 401 when the key is empty or shorter than 8 characters.
    """
    key_is_acceptable = bool(x_api_key) and len(x_api_key) >= 8
    if not key_is_acceptable:
        raise HTTPException(status_code=401, detail="Invalid API key")

    digest = hashlib.sha256(x_api_key.encode()).hexdigest()
    return digest[:16]
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # Upload helpers
130
+ # ---------------------------------------------------------------------------
131
+
132
async def _stream_upload(upload: UploadFile, dest: Path) -> tuple[str, int]:
    """Stream an uploaded file to ``dest`` in 64 KiB chunks.

    Args:
        upload: The incoming upload to read from.
        dest: Target path; its parent directory is created if missing.

    Returns:
        ``(sha256_hex_digest, size_in_bytes)`` of the data written.

    Raises:
        HTTPException: 413 when the upload exceeds ``MAX_UPLOAD_SIZE``.

    On any failure (size limit, read/write error, cancellation) the partially
    written file is removed before the exception propagates.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    sha = hashlib.sha256()
    size = 0

    try:
        with open(dest, "wb") as f:
            while True:
                chunk = await upload.read(64 * 1024)  # 64KB chunks
                if not chunk:
                    break
                size += len(chunk)
                if size > MAX_UPLOAD_SIZE:
                    raise HTTPException(
                        status_code=413,
                        detail=f"File exceeds {MAX_UPLOAD_SIZE // (1024*1024)}MB limit",
                    )
                sha.update(chunk)
                f.write(chunk)
            # Push Python's userspace buffer to the OS first; fsync alone only
            # syncs what the OS has already received.
            f.flush()
            os.fsync(f.fileno())
    except BaseException:
        # Clean up after the file handle is closed (unlinking an open file
        # fails on Windows), then let the original error propagate.
        dest.unlink(missing_ok=True)
        raise

    return sha.hexdigest(), size
155
+
156
+
157
+ # ---------------------------------------------------------------------------
158
+ # Routes: Jobs
159
+ # ---------------------------------------------------------------------------
160
+
161
@app.post("/jobs", response_model=JobResponse, status_code=201)
async def create_job(
    file: UploadFile = File(...),
    x_api_key: str = Header(...),
):
    """Upload a document, persist it, and enqueue extraction.

    The file is stored under ``UPLOAD_DIR/<tenant_id>/<job_id>/<name>``, a job
    record is created, and an ``extract_job`` task is queued.

    Raises:
        HTTPException: 401 for an invalid API key, 415 for a content type
            outside ``ALLOWED_MIMES``, 413 when the upload is too large.
    """
    tenant_id = _get_tenant(x_api_key)

    # Validate content type (a missing content type is let through).
    if file.content_type and file.content_type not in ALLOWED_MIMES:
        raise HTTPException(
            status_code=415,
            detail=f"Unsupported file type: {file.content_type}",
        )

    # The filename is client-controlled.  Keep only its final path component
    # so values like "../../x" or "/etc/passwd" cannot escape the job
    # directory; treat backslashes as separators and drop NUL bytes too.
    raw_name = (file.filename or "").replace("\\", "/").replace("\x00", "")
    safe_name = Path(raw_name).name
    if safe_name in ("", ".", ".."):
        safe_name = "document"

    # Generate job ID and save file
    job_id = str(uuid.uuid4())
    dest = UPLOAD_DIR / tenant_id / job_id / safe_name
    file_hash, _file_size = await _stream_upload(file, dest)

    # Create job in MongoDB
    job_doc = await db.create_job(
        tenant_id=tenant_id,
        job_id=job_id,
        source_file=safe_name,
        file_hash=file_hash,
    )

    # Enqueue extraction
    await queue.enqueue("extract_job", {
        "tenant_id": tenant_id,
        "job_id": job_id,
        "file_path": str(dest),
    })

    return JobResponse(
        job_id=job_id,
        tenant_id=tenant_id,
        status=JobStatus.QUEUED,
        source_file=safe_name,
        file_hash=file_hash,
        created_at=job_doc["created_at"],
    )
204
+
205
+
206
@app.get("/jobs", response_model=JobListResponse)
async def list_jobs(
    x_api_key: str = Header(...),
    status: Optional[str] = Query(None),
    skip: int = Query(0, ge=0),
    limit: int = Query(50, ge=1, le=200),
):
    """Return one page of the tenant's jobs, optionally filtered by status.

    Review progress is fetched separately for every job on the page.
    """
    tenant_id = _get_tenant(x_api_key)
    records, total = await db.list_jobs(
        tenant_id, status=status, skip=skip, limit=limit
    )

    items = []
    for record in records:
        review_progress = await db.get_review_progress(tenant_id, record["job_id"])
        response = JobResponse(
            job_id=record["job_id"],
            tenant_id=record["tenant_id"],
            status=record["status"],
            source_file=record["source_file"],
            file_hash=record.get("file_hash", ""),
            total_pages=record.get("total_pages", 0),
            total_blocks=record.get("total_blocks", 0),
            total_chunks=record.get("total_chunks", 0),
            review_progress=review_progress,
            created_at=record["created_at"],
            finalized_at=record.get("finalized_at"),
            error=record.get("error"),
        )
        items.append(response)

    return JobListResponse(jobs=items, total=total)
236
+
237
+
238
@app.get("/jobs/{job_id}", response_model=JobResponse)
async def get_job(job_id: str, x_api_key: str = Header(...)):
    """Return a single job's status, counters, and review progress.

    Raises:
        HTTPException: 404 when the tenant has no job with this id.
    """
    tenant_id = _get_tenant(x_api_key)

    record = await db.get_job(tenant_id, job_id)
    if not record:
        raise HTTPException(status_code=404, detail="Job not found")

    review_progress = await db.get_review_progress(tenant_id, job_id)

    return JobResponse(
        job_id=record["job_id"],
        tenant_id=record["tenant_id"],
        status=record["status"],
        source_file=record["source_file"],
        file_hash=record.get("file_hash", ""),
        total_pages=record.get("total_pages", 0),
        total_blocks=record.get("total_blocks", 0),
        total_chunks=record.get("total_chunks", 0),
        review_progress=review_progress,
        created_at=record["created_at"],
        finalized_at=record.get("finalized_at"),
        error=record.get("error"),
    )
262
+
263
+
264
@app.delete("/jobs/{job_id}", status_code=204)
async def delete_job(job_id: str, x_api_key: str = Header(...)):
    """Delete a job: its vectors, its database records, and its upload dir.

    Vector deletion is best-effort: a failure for one index version is
    logged as a warning and the remaining steps still run.

    Raises:
        HTTPException: 404 when the tenant has no job with this id.
    """
    tenant_id = _get_tenant(x_api_key)
    if not await db.get_job(tenant_id, job_id):
        raise HTTPException(status_code=404, detail="Job not found")

    # Delete vectors from all index versions
    for iv in await db.list_index_versions(tenant_id, job_id):
        try:
            from .vectorstores import get_vector_store
            from .embeddings import EmbeddingEngine

            # The store is addressed by the engine's fingerprint, so rebuild
            # an engine with the same settings recorded on the index version.
            engine = EmbeddingEngine(
                provider=iv.get("provider", "huggingface"),
                model_name=iv.get("model", "BAAI/bge-base-en-v1.5"),
                dimensions=iv.get("configured_dimensions"),
            )
            fingerprint = engine.get_fingerprint()
            store = get_vector_store(
                iv.get("vector_db", "chroma"),
                collection_name=iv.get("collection", "longparser"),
                index_fingerprint=fingerprint,
            )
            store.delete_by_job(job_id, tenant_id=tenant_id)
        except Exception as e:
            logger.warning(f"Vector delete failed for index {iv.get('index_version')}: {e}")

    # Delete from MongoDB
    await db.delete_job(tenant_id, job_id)

    # Delete uploaded file
    job_upload_dir = UPLOAD_DIR / tenant_id / job_id
    if job_upload_dir.exists():
        shutil.rmtree(job_upload_dir)
301
+
302
+
303
@app.post("/jobs/{job_id}/cancel", status_code=200)
async def cancel_job(job_id: str, x_api_key: str = Header(...)):
    """Mark a job as cancelled.

    This handler only writes the ``cancelled`` status to the job record;
    nothing here signals the queue.

    Raises:
        HTTPException: 404 when the job does not exist for this tenant,
            400 when the job is already ``finalized`` or ``indexed``.
    """
    tenant_id = _get_tenant(x_api_key)

    record = await db.get_job(tenant_id, job_id)
    if not record:
        raise HTTPException(status_code=404, detail="Job not found")

    completed_states = ("finalized", "indexed")
    if record["status"] in completed_states:
        raise HTTPException(status_code=400, detail="Cannot cancel a completed job")

    await db.update_job(tenant_id, job_id, {"status": "cancelled"})
    return {"status": "cancelled", "job_id": job_id}
315
+
316
+
317
+ # ---------------------------------------------------------------------------
318
+ # Helpers for formatting API responses to show edited text
319
+ # ---------------------------------------------------------------------------
320
+
321
def _format_block(b: dict) -> BlockResponse:
    """Build the API view of a stored block, surfacing reviewer edits.

    When ``edited_text`` / ``edited_type`` are set (not None) they replace
    ``text`` / ``type`` in the response.  The input dict is left untouched;
    overrides are applied to a shallow copy.
    """
    view = dict(b)
    if view.get("edited_text") is not None:
        view["text"] = view["edited_text"]
    if view.get("edited_type") is not None:
        view["type"] = view["edited_type"]
    return BlockResponse(**view)
327
+
328
def _format_chunk(c: dict) -> ChunkResponse:
    """Build the API view of a stored chunk, surfacing reviewer edits.

    When ``edited_text`` is set (not None) it replaces ``text`` in the
    response.  The input dict is left untouched; the override is applied to
    a shallow copy.
    """
    view = dict(c)
    if view.get("edited_text") is not None:
        view["text"] = view["edited_text"]
    return ChunkResponse(**view)
332
+
333
+
334
+ # ---------------------------------------------------------------------------
335
+ # Routes: Blocks (HITL review)
336
+ # ---------------------------------------------------------------------------
337
+
338
@app.get("/jobs/{job_id}/blocks", response_model=list[BlockResponse])
async def list_blocks(
    job_id: str,
    x_api_key: str = Header(...),
    status: Optional[str] = Query(None),
    type: Optional[str] = Query(None),
    page: Optional[int] = Query(None),
    skip: int = Query(0, ge=0),
    limit: int = Query(100, ge=1, le=500),
):
    """Return one page of a job's blocks, with reviewer edits applied.

    Optional filters: review ``status``, block ``type``, and ``page``.
    """
    tenant_id = _get_tenant(x_api_key)
    rows = await db.get_blocks(
        tenant_id,
        job_id,
        status=status,
        block_type=type,
        page=page,
        skip=skip,
        limit=limit,
    )

    formatted = []
    for row in rows:
        formatted.append(_format_block(row))
    return formatted
356
+
357
+
358
@app.patch("/jobs/{job_id}/blocks/{block_id}", response_model=BlockResponse)
async def update_block(
    job_id: str, block_id: str,
    body: BlockReviewUpdate,
    x_api_key: str = Header(...),
):
    """Edit/approve/reject a block, record a revision, and auto-rechunk.

    The block is updated under an optimistic lock keyed on ``body.version``.
    The revision is persisted only after that update succeeds, so a version
    conflict leaves no audit entry for an edit that was never applied.

    Raises:
        HTTPException: 404 when the block is not found for this job/tenant,
            409 when the optimistic-lock update is rejected.
    """
    tenant_id = _get_tenant(x_api_key)

    # Get current block for revision record
    blocks = await db.get_blocks(tenant_id, job_id)
    current = next((b for b in blocks if b.get("block_id") == block_id), None)
    if not current:
        raise HTTPException(status_code=404, detail="Block not found")

    # Build the revision up front: its id is needed by the block update.
    revision = Revision(
        entity_type="block",
        entity_id=block_id,
        previous_revision_id=current.get("current_revision_id"),
        action=body.status,
        original_text=current.get("text", ""),
        edited_text=body.edited_text,
        edited_type=body.edited_type,
        reviewer_note=body.reviewer_note,
    )

    # Update block with optimistic lock
    updated = await db.update_block_review(
        tenant_id, job_id, block_id,
        review_status=body.status.value,
        version=body.version,
        edited_text=body.edited_text,
        edited_type=body.edited_type.value if body.edited_type else None,
        revision_id=revision.revision_id,
    )

    if not updated:
        raise HTTPException(
            status_code=409,
            detail="Version conflict — block was modified by another reviewer",
        )

    # Persist the revision (append-only) now that the edit has been applied.
    await db.create_revision(tenant_id, job_id, revision)

    # Auto-rechunk after block edit
    new_count = await _rechunk_job(tenant_id, job_id)
    logger.info(f"Auto-rechunked job {job_id}: {new_count} chunks")

    return _format_block(updated)
407
+
408
+
409
+ # ---------------------------------------------------------------------------
410
+ # Routes: Chunks (HITL review)
411
+ # ---------------------------------------------------------------------------
412
+
413
@app.get("/jobs/{job_id}/chunks", response_model=list[ChunkResponse])
async def list_chunks(
    job_id: str,
    x_api_key: str = Header(...),
    status: Optional[str] = Query(None),
    chunk_type: Optional[str] = Query(None),
    skip: int = Query(0, ge=0),
    limit: int = Query(100, ge=1, le=500),
):
    """Return one page of a job's chunks, with reviewer edits applied.

    Optional filters: review ``status`` and ``chunk_type``.
    """
    tenant_id = _get_tenant(x_api_key)
    rows = await db.get_chunks(
        tenant_id,
        job_id,
        status=status,
        chunk_type=chunk_type,
        skip=skip,
        limit=limit,
    )

    formatted = []
    for row in rows:
        formatted.append(_format_chunk(row))
    return formatted
430
+
431
+
432
@app.patch("/jobs/{job_id}/chunks/{chunk_id}", response_model=ChunkResponse)
async def update_chunk(
    job_id: str, chunk_id: str,
    body: ChunkReviewUpdate,
    x_api_key: str = Header(...),
):
    """Edit/approve/reject a chunk and record a revision.

    The chunk is updated under an optimistic lock keyed on ``body.version``.
    The revision is persisted only after that update succeeds, so a version
    conflict leaves no audit entry for an edit that was never applied.

    Raises:
        HTTPException: 404 when the chunk is not found for this job/tenant,
            409 when the optimistic-lock update is rejected.
    """
    tenant_id = _get_tenant(x_api_key)

    # Get current chunk
    chunks = await db.get_chunks(tenant_id, job_id)
    current = next((c for c in chunks if c.get("chunk_id") == chunk_id), None)
    if not current:
        raise HTTPException(status_code=404, detail="Chunk not found")

    # Build the revision up front: its id is needed by the chunk update.
    revision = Revision(
        entity_type="chunk",
        entity_id=chunk_id,
        previous_revision_id=current.get("current_revision_id"),
        action=body.status,
        original_text=current.get("text", ""),
        edited_text=body.edited_text,
        reviewer_note=body.reviewer_note,
    )

    # Update with optimistic lock
    updated = await db.update_chunk_review(
        tenant_id, job_id, chunk_id,
        review_status=body.status.value,
        version=body.version,
        edited_text=body.edited_text,
        revision_id=revision.revision_id,
    )

    if not updated:
        raise HTTPException(status_code=409, detail="Version conflict")

    # Persist the revision now that the edit has been applied.
    await db.create_revision(tenant_id, job_id, revision)

    return _format_chunk(updated)
472
+
473
+
474
+ # ---------------------------------------------------------------------------
475
+ # Routes: Audit
476
+ # ---------------------------------------------------------------------------
477
+
478
@app.get("/jobs/{job_id}/audit")
async def get_audit(
    job_id: str,
    x_api_key: str = Header(...),
    skip: int = Query(0, ge=0),
    limit: int = Query(200, ge=1, le=1000),
):
    """Return one page of the job's audit trail for the calling tenant."""
    tenant_id = _get_tenant(x_api_key)
    return await db.get_audit_trail(tenant_id, job_id, skip=skip, limit=limit)
489
+
490
+
491
+ # ---------------------------------------------------------------------------
492
+ # Routes: Admin Purge (hard delete with tombstone audit)
493
+ # ---------------------------------------------------------------------------
494
+
495
@app.post("/jobs/{job_id}/blocks/{block_id}/purge")
async def purge_block(
    job_id: str, block_id: str,
    x_api_key: str = Header(...),
):
    """Permanently delete a block, leaving a tombstone revision behind.

    The tombstone keeps a truncated SHA-256 of the text plus the block's
    type and page number, but not the text itself.  Afterwards the job's
    block count is refreshed and the job is re-chunked.

    NOTE(review): intended as admin-only, but this handler performs no check
    beyond resolving the tenant from the API key.

    Raises:
        HTTPException: 404 when the block is not found for this job/tenant.
    """
    tenant_id = _get_tenant(x_api_key)

    # Look the block up before deleting it; the tombstone needs its metadata.
    current = None
    for candidate in await db.get_blocks(tenant_id, job_id):
        if candidate.get("block_id") == block_id:
            current = candidate
            break
    if not current:
        raise HTTPException(status_code=404, detail="Block not found")

    # Tombstone: keep hash/metadata, drop the sensitive text.
    full_digest = hashlib.sha256(current.get("text", "").encode()).hexdigest()
    text_hash = full_digest[:16]
    tombstone = Revision(
        entity_type="block",
        entity_id=block_id,
        previous_revision_id=current.get("current_revision_id"),
        action=ReviewStatus.REJECTED,
        original_text=f"[PURGED] text_hash={text_hash} type={current.get('type')} page={current.get('page_number')}",
        edited_text=None,
        reviewer_note="ADMIN PURGE — content permanently deleted",
    )
    await db.create_revision(tenant_id, job_id, tombstone)

    # Remove the block itself.
    await db.blocks.delete_one(
        {"tenant_id": tenant_id, "job_id": job_id, "block_id": block_id}
    )

    # Refresh the job's block counter from what is left.
    blocks_left = await db.blocks.count_documents(
        {"tenant_id": tenant_id, "job_id": job_id}
    )
    await db.update_job(tenant_id, job_id, {"total_blocks": blocks_left})

    # Chunks were built from the old block set, so rebuild them.
    chunk_total = await _rechunk_job(tenant_id, job_id)

    return {
        "status": "purged",
        "block_id": block_id,
        "tombstone_revision_id": tombstone.revision_id,
        "chunks_after_rechunk": chunk_total,
    }
540
+
541
+
542
@app.post("/jobs/{job_id}/chunks/{chunk_id}/purge")
async def purge_chunk(
    job_id: str, chunk_id: str,
    x_api_key: str = Header(...),
):
    """Permanently delete a chunk, leaving a tombstone revision behind.

    The tombstone keeps a truncated SHA-256 of the text plus the chunk type,
    but not the text itself.  Afterwards the job's chunk count is refreshed.

    NOTE(review): intended as admin-only, but this handler performs no check
    beyond resolving the tenant from the API key.

    Raises:
        HTTPException: 404 when the chunk is not found for this job/tenant.
    """
    tenant_id = _get_tenant(x_api_key)

    # Look the chunk up before deleting it; the tombstone needs its metadata.
    current = None
    for candidate in await db.get_chunks(tenant_id, job_id):
        if candidate.get("chunk_id") == chunk_id:
            current = candidate
            break
    if not current:
        raise HTTPException(status_code=404, detail="Chunk not found")

    # Tombstone: keep hash/metadata, drop the text.
    full_digest = hashlib.sha256(current.get("text", "").encode()).hexdigest()
    text_hash = full_digest[:16]
    tombstone = Revision(
        entity_type="chunk",
        entity_id=chunk_id,
        previous_revision_id=current.get("current_revision_id"),
        action=ReviewStatus.REJECTED,
        original_text=f"[PURGED] text_hash={text_hash} type={current.get('chunk_type')}",
        edited_text=None,
        reviewer_note="ADMIN PURGE — content permanently deleted",
    )
    await db.create_revision(tenant_id, job_id, tombstone)

    # Remove the chunk itself.
    await db.chunks.delete_one(
        {"tenant_id": tenant_id, "job_id": job_id, "chunk_id": chunk_id}
    )

    # Refresh the job's chunk counter from what is left.
    chunks_left = await db.chunks.count_documents(
        {"tenant_id": tenant_id, "job_id": job_id}
    )
    await db.update_job(tenant_id, job_id, {"total_chunks": chunks_left})

    return {
        "status": "purged",
        "chunk_id": chunk_id,
        "tombstone_revision_id": tombstone.revision_id,
    }
583
+
584
+
585
+ # ---------------------------------------------------------------------------
586
+ # Rechunk helper (shared by block edit + explicit rechunk)
587
+ # ---------------------------------------------------------------------------
588
+
589
async def _rechunk_job(tenant_id: str, job_id: str) -> int:
    """Rebuild a job's chunks from its current blocks.

    Stored blocks are converted to ``Block`` models, run through
    ``HybridChunker`` with a default ``ChunkingConfig``, and the job's
    existing chunks are replaced by the result.  Reviewer edits are honoured
    the same way the API presents them: ``edited_text`` (when not None,
    including an intentionally empty string) overrides ``text`` and
    ``edited_type`` overrides ``type``.

    Returns:
        The number of chunks after re-chunking.
    """
    from ..schemas import Block, Provenance, BoundingBox, Confidence
    from ..chunkers import HybridChunker
    from ..schemas import ChunkingConfig

    blocks_data = await db.get_blocks(tenant_id, job_id)
    blocks = []
    for b in blocks_data:
        edited_text = b.get("edited_text")
        text = edited_text if edited_text is not None else b.get("text", "")
        block_type = b.get("edited_type") or b.get("type", "paragraph")
        # Tolerate a missing or null provenance / bbox in the stored document.
        prov = b.get("provenance") or {}
        bbox = prov.get("bbox") or {"x0": 0, "y0": 0, "x1": 0, "y1": 0}
        blocks.append(Block(
            block_id=b["block_id"],
            type=block_type,
            text=text,
            order_index=b.get("order_index", 0),
            heading_level=b.get("heading_level"),
            indent_level=b.get("indent_level", 0),
            hierarchy_path=b.get("hierarchy_path", []),
            provenance=Provenance(
                source_file=prov.get("source_file", ""),
                page_number=b.get("page_number", 0),
                bbox=BoundingBox(**bbox),
                extractor=prov.get("extractor", "docling"),
            ),
            # Rebuilt blocks are given full confidence.
            confidence=Confidence(overall=1.0),
        ))

    chunker = HybridChunker(ChunkingConfig())
    new_chunks = chunker.chunk(blocks)

    # Replace the previous chunk set wholesale.
    await db.chunks.delete_many({"tenant_id": tenant_id, "job_id": job_id})
    for chunk in new_chunks:
        chunk_doc = chunk.model_dump(mode="json")
        chunk_doc["text_hash"] = hashlib.sha256(chunk.text.encode()).hexdigest()[:16]
        await db.upsert_chunk(tenant_id, job_id, chunk_doc)

    await db.update_job(tenant_id, job_id, {"total_chunks": len(new_chunks)})
    return len(new_chunks)
627
+
628
+
629
@app.post("/jobs/{job_id}/rechunk")
async def rechunk(job_id: str, x_api_key: str = Header(...)):
    """Re-chunk a job on request (block edits already trigger this).

    Raises:
        HTTPException: 404 when the job does not exist for this tenant,
            400 unless the job's status is ``ready_for_review``.
    """
    tenant_id = _get_tenant(x_api_key)

    record = await db.get_job(tenant_id, job_id)
    if not record:
        raise HTTPException(status_code=404, detail="Job not found")

    allowed_states = ("ready_for_review",)
    if record["status"] not in allowed_states:
        raise HTTPException(
            status_code=400,
            detail="Can only rechunk jobs in 'ready_for_review' status",
        )

    chunk_total = await _rechunk_job(tenant_id, job_id)
    return {"status": "rechunked", "total_chunks": chunk_total}
644
+
645
+
646
+ # ---------------------------------------------------------------------------
647
+ # Routes: Finalize
648
+ # ---------------------------------------------------------------------------
649
+
650
@app.post("/jobs/{job_id}/finalize")
async def finalize_job(
    job_id: str,
    body: FinalizeRequest,
    x_api_key: str = Header(...),
):
    """Apply the requested finalize policy and mark the job ``finalized``.

    Under ``REQUIRE_ALL_APPROVED`` the value returned by
    ``db.apply_finalize_policy`` is treated as the number of items still
    pending; any non-zero count aborts finalization.  For other policies the
    returned value is ignored.

    Raises:
        HTTPException: 404 when the job does not exist for this tenant,
            400 when the job is not ``ready_for_review`` or items are still
            pending under ``REQUIRE_ALL_APPROVED``.
    """
    tenant_id = _get_tenant(x_api_key)

    record = await db.get_job(tenant_id, job_id)
    if not record:
        raise HTTPException(status_code=404, detail="Job not found")
    if record["status"] not in ("ready_for_review",):
        raise HTTPException(status_code=400, detail="Job not in reviewable state")

    # The policy is applied in every case; only the strict policy inspects
    # the result.
    policy = body.finalize_policy
    pending = await db.apply_finalize_policy(tenant_id, job_id, policy)
    if policy == FinalizePolicy.REQUIRE_ALL_APPROVED and pending > 0:
        raise HTTPException(
            status_code=400,
            detail=f"{pending} item(s) still pending — approve or reject all before finalizing",
        )

    finalized_fields = {
        "status": "finalized",
        "finalized_at": datetime.now(timezone.utc),
    }
    await db.update_job(tenant_id, job_id, finalized_fields)

    return {"status": "finalized", "job_id": job_id, "policy": body.finalize_policy.value}
685
+
686
+
687
+ # ---------------------------------------------------------------------------
688
+ # Routes: Export (streaming zip)
689
+ # ---------------------------------------------------------------------------
690
+
691
@app.get("/jobs/{job_id}/export")
async def export_job(job_id: str, x_api_key: str = Header(...)):
    """Download a job's blocks and chunks as a .zip archive.

    The archive contains ``blocks.json``, ``chunks.json`` and a
    ``document.md`` rendered from all of the job's blocks in ``order_index``
    order.  The archive is assembled in an in-memory buffer and then
    returned.  Reviewer edits are honoured in ``document.md``:
    ``edited_text`` (when not None) overrides ``text`` and ``edited_type``
    overrides ``type``.

    NOTE(review): no status check is made here, so non-finalized jobs can be
    exported and blocks are not filtered by review status — confirm intended.

    Raises:
        HTTPException: 404 when the job does not exist for this tenant.
    """
    tenant_id = _get_tenant(x_api_key)
    job = await db.get_job(tenant_id, job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    blocks = await db.get_blocks(tenant_id, job_id)
    chunks = await db.get_chunks(tenant_id, job_id)

    import json

    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as zf:
        zf.writestr("blocks.json", json.dumps(blocks, default=str, indent=2))
        zf.writestr("chunks.json", json.dumps(chunks, default=str, indent=2))

        # Render document.md from the blocks in reading order.
        md_lines = []
        for b in sorted(blocks, key=lambda x: x.get("order_index", 0)):
            edited_text = b.get("edited_text")
            text = edited_text if edited_text is not None else b.get("text", "")
            block_type = b.get("edited_type") or b.get("type", "paragraph")
            if block_type == "heading":
                # A missing, null or zero level falls back to 1; Markdown
                # headings stop at level 6.
                level = min(b.get("heading_level") or 1, 6)
                md_lines.append(f"{'#' * level} {text}")
            else:
                md_lines.append(text)
            md_lines.append("")
        zf.writestr("document.md", "\n".join(md_lines))

    buffer.seek(0)
    return StreamingResponse(
        buffer,
        media_type="application/zip",
        headers={
            "Content-Disposition": f"attachment; filename=longparser_export_{job_id[:8]}.zip"
        },
    )
731
+
732
+
733
+ # ---------------------------------------------------------------------------
734
+ # Routes: Embed
735
+ # ---------------------------------------------------------------------------
736
+
737
@app.post("/jobs/{job_id}/embed")
async def embed_job_route(
    job_id: str,
    body: EmbedRequest,
    x_api_key: str = Header(...),
):
    """Queue embedding of a finalized job's chunks into a vector DB.

    The job is moved to ``embedding`` *before* the task is enqueued, so a
    status written by the worker cannot be overwritten by this handler
    afterwards.  If enqueueing fails the job is put back to ``finalized``.

    Raises:
        HTTPException: 404 when the job does not exist for this tenant,
            400 when the job is not ``finalized``.
    """
    tenant_id = _get_tenant(x_api_key)
    job = await db.get_job(tenant_id, job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")
    if job["status"] not in ("finalized",):
        raise HTTPException(status_code=400, detail="Job must be finalized before embedding")

    index_version = str(uuid.uuid4())[:8]
    collection_name = body.collection_name or f"longparser_{job_id[:8]}"

    await db.update_job(tenant_id, job_id, {"status": "embedding"})

    # Enqueue embedding task
    try:
        await queue.enqueue("embed_job", {
            "tenant_id": tenant_id,
            "job_id": job_id,
            "provider": body.provider,
            "model": body.model,
            "vector_db": body.vector_db,
            "collection_name": collection_name,
            "index_version": index_version,
        })
    except Exception:
        # Nothing was queued: restore the previous status so the job is not
        # stuck in "embedding", then surface the original error.
        await db.update_job(tenant_id, job_id, {"status": "finalized"})
        raise

    return {
        "status": "embedding",
        "job_id": job_id,
        "index_version": index_version,
        "provider": body.provider,
        "model": body.model,
        "vector_db": body.vector_db,
        "collection": collection_name,
    }
776
+
777
+
778
+ # ---------------------------------------------------------------------------
779
+ # Routes: Search
780
+ # ---------------------------------------------------------------------------
781
+
782
@app.post("/search", response_model=SearchResponse)
async def search(body: SearchRequest, x_api_key: str = Header(...)):
    """Similarity-search a job's embedded chunks.

    The query is embedded with the provider/model recorded on the selected
    index version (``body.index_version`` or, when omitted, the latest one),
    then matched against that index's vector store.  Results are always
    scoped to the calling tenant and the requested job; caller-supplied
    filters cannot override that scoping.

    Raises:
        HTTPException: 404 when the job or a matching index version is not
            found for this tenant.
    """
    tenant_id = _get_tenant(x_api_key)
    job = await db.get_job(tenant_id, body.job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    # Get index version
    if body.index_version:
        iv_doc = await db.index_versions.find_one(
            {"tenant_id": tenant_id, "job_id": body.job_id, "index_version": body.index_version},
            {"_id": 0},
        )
    else:
        iv_doc = await db.get_latest_index_version(tenant_id, body.job_id)

    if not iv_doc:
        raise HTTPException(status_code=404, detail="No embedding index found for this job")

    # Embed query using identical model configurations
    from .embeddings import EmbeddingEngine
    engine = EmbeddingEngine(
        provider=iv_doc.get("provider", "huggingface"),
        model_name=iv_doc["model"],
        dimensions=iv_doc.get("configured_dimensions")
    )
    query_embedding = engine.embed_query(body.query)

    # Search in vector DB
    from .vectorstores import get_vector_store
    store = get_vector_store(
        iv_doc["vector_db"],
        collection_name=iv_doc.get("collection", "longparser"),
        index_fingerprint=engine.get_fingerprint()
    )

    # SECURITY: caller-supplied filters go first so the enforced tenant/job
    # keys below always win.  Spreading them last would let a client send
    # {"tenant_id": ...} and read another tenant's vectors.
    filters = {
        **(body.filters or {}),
        "tenant_id": tenant_id,
        "job_id": body.job_id,
    }
    raw_results = store.search(query_embedding, top_k=body.top_k, filters=filters)

    results = []
    for r in raw_results:
        meta = r.get("metadata", {})
        results.append(SearchResult(
            chunk_id=meta.get("chunk_id", ""),
            text=r.get("document", ""),
            score=r.get("score", 0.0),
            chunk_type=meta.get("chunk_type", ""),
            section_path=meta.get("section_path", []),
            page_numbers=meta.get("page_numbers", []),
            block_ids=meta.get("block_ids", []),
            metadata=meta,
        ))

    return SearchResponse(
        results=results,
        index_version=iv_doc["index_version"],
        model=iv_doc["model"],
        query=body.query,
        total=len(results),
    )
847
+
848
+
849
+ # ---------------------------------------------------------------------------
850
+ # Observability middleware
851
+ # ---------------------------------------------------------------------------
852
+
853
@app.middleware("http")
async def observability_middleware(request: Request, call_next):
    """Emit one structured ``request_completed`` log record per request.

    A short random request id is generated for the log record, along with
    the method, path, response status and latency in milliseconds.  The
    response itself is passed through unchanged.
    """
    request_id = str(uuid.uuid4())[:8]

    started_at = _time.monotonic()
    response = await call_next(request)
    elapsed_ms = (_time.monotonic() - started_at) * 1000

    log_fields = {
        "request_id": request_id,
        "method": request.method,
        "path": request.url.path,
        "status": response.status_code,
        "latency_ms": round(elapsed_ms, 2),
    }
    logger.info("request_completed", extra=log_fields)
    return response
871
+
872
+
873
+ # ---------------------------------------------------------------------------
874
+ # Routes: Chat Sessions
875
+ # ---------------------------------------------------------------------------
876
+
877
@app.post("/chat/sessions", status_code=201)
async def create_chat_session(
    body: dict,
    x_api_key: str = Header(...),
):
    """Create a new chat session bound to one of the tenant's jobs.

    The session id is generated server-side.  The raw JSON body is validated
    here against ``CreateSessionRequest``; a malformed body is reported as
    422 instead of surfacing as an unhandled server error.

    Raises:
        HTTPException: 401 for an invalid API key, 422 for an invalid body,
            404 when the referenced job does not exist for this tenant.
    """
    from .chat.schemas import CreateSessionRequest

    tenant_id = _get_tenant(x_api_key)

    # Validation errors from the request model are ValueError subclasses;
    # TypeError covers bad keyword arguments.
    try:
        req = CreateSessionRequest(**body)
    except (TypeError, ValueError) as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc

    # Verify job belongs to tenant
    job = await db.get_job(tenant_id, req.job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    session_id = str(uuid.uuid4())
    await db.create_chat_session(tenant_id, session_id, req.job_id)

    return {"session_id": session_id, "job_id": req.job_id}
896
+
897
+
898
@app.get("/chat/sessions/{session_id}")
async def get_chat_session(
    session_id: str,
    x_api_key: str = Header(...),
):
    """Return one chat session together with all of its turns.

    Responds with 404 when the session lookup for this tenant comes back
    empty; otherwise the session record is returned with a ``turns`` key
    holding the result of ``db.get_all_turns``.
    """
    tenant_id = _get_tenant(x_api_key)

    session = await db.get_chat_session(tenant_id, session_id)
    if session:
        # Turns are fetched only once the session is known to exist.
        session["turns"] = await db.get_all_turns(tenant_id, session_id)
        return session

    raise HTTPException(status_code=404, detail="Session not found")
912
+
913
+
914
@app.delete("/chat/sessions/{session_id}")
async def delete_chat_session(
    session_id: str,
    x_api_key: str = Header(...),
):
    """Soft-delete a chat session owned by the calling tenant.

    Responds with 404 when ``db.soft_delete_chat_session`` reports a falsy
    result; otherwise returns a confirmation carrying the session id.
    """
    tenant_id = _get_tenant(x_api_key)

    was_deleted = await db.soft_delete_chat_session(tenant_id, session_id)
    if was_deleted:
        return {"status": "deleted", "session_id": session_id}

    raise HTTPException(status_code=404, detail="Session not found")
925
+
926
+
927
+ # ---------------------------------------------------------------------------
928
+ # Routes: Chat
929
+ # ---------------------------------------------------------------------------
930
+
931
@app.post("/chat")
async def chat(
    body: dict,
    x_api_key: str = Header(...),
):
    """Ask a question — RAG chatbot with 3-layer memory.

    Set require_approval=true for Human-in-the-Loop review.

    Args:
        body: Raw JSON object, validated here against ``ChatRequest``.
        x_api_key: API key header, resolved to a tenant via ``_get_tenant``.

    Returns:
        The chat response serialised with ``model_dump(mode="json")``. When
        approval was requested and the engine reported ``"complete"``, the
        status is changed to ``"pending_review"`` and ``thread_id`` is set.

    Raises:
        HTTPException: 422 if ``body`` fails validation; 404 if the session
            or job is not found for the tenant; 400 if the request's
            ``job_id`` differs from the session's.
    """
    from .chat.schemas import ChatRequest, ChatConfig
    from .chat.engine import ChatEngine

    # The model is built by hand rather than by FastAPI, so a bad payload
    # would otherwise escape as an unhandled exception (HTTP 500).
    # NOTE(review): assumes ChatRequest is a pydantic model, whose
    # ValidationError subclasses ValueError -- confirm.
    try:
        req = ChatRequest(**body)
    except ValueError as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    tenant_id = _get_tenant(x_api_key)

    # Session <-> job binding validation
    session = await db.get_chat_session(tenant_id, req.session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")
    if session["job_id"] != req.job_id:
        raise HTTPException(
            status_code=400,
            detail="job_id does not match session's job_id",
        )
    job = await db.get_job(tenant_id, req.job_id)
    if not job:
        raise HTTPException(status_code=404, detail="Job not found")

    # One ChatEngine is cached on app.state and reused by later requests;
    # the config is only built when that engine is first created.
    if not hasattr(app.state, "chat_engine"):
        app.state.chat_engine = ChatEngine(
            db=db, queue=queue, config=ChatConfig()
        )

    response = await app.state.chat_engine.ask(tenant_id, req)

    # HITL: if require_approval, pause the completed answer for human review
    if req.require_approval and response.status == "complete":
        from .chat.schemas import LLMAnswer
        from .chat.graph import start_hitl_review

        answer_obj = LLMAnswer(
            answer=response.answer,
            cited_chunk_ids=[s.chunk_id for s in response.sources],
        )
        hitl_result = await start_hitl_review(
            tenant_id=tenant_id,
            session_id=req.session_id,
            job_id=req.job_id,
            question=req.question,
            answer=answer_obj,
            sources=response.sources,
        )
        response.status = "pending_review"
        response.thread_id = hitl_result["thread_id"]

    return response.model_dump(mode="json")
987
+
988
+
989
@app.post("/chat/resume")
async def resume_chat(
    body: dict,
    x_api_key: str = Header(...),
):
    """Resume a paused HITL chat with human decision (approve/edit/reject).

    Args:
        body: Raw JSON object, validated here against ``HITLResumeRequest``.
        x_api_key: API key header, resolved to a tenant via ``_get_tenant``.

    Returns:
        A ``ChatResponse`` built from the resumed flow's result, serialised
        with ``model_dump(mode="json")``.

    Raises:
        HTTPException: 422 if ``body`` fails validation; 404 if the session
            is not found for the tenant.
    """
    from .chat.schemas import HITLResumeRequest, ChatResponse, SourceRef
    from .chat.graph import resume_hitl_review

    # The model is built by hand rather than by FastAPI, so a bad payload
    # would otherwise escape as an unhandled exception (HTTP 500).
    # NOTE(review): assumes HITLResumeRequest is a pydantic model, whose
    # ValidationError subclasses ValueError -- confirm.
    try:
        req = HITLResumeRequest(**body)
    except ValueError as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    tenant_id = _get_tenant(x_api_key)

    # Validate session belongs to tenant
    session = await db.get_chat_session(tenant_id, req.session_id)
    if not session:
        raise HTTPException(status_code=404, detail="Session not found")

    # Resume the LangGraph flow.
    # NOTE(review): thread_id is passed through without being tied to the
    # session validated above -- confirm resume_hitl_review enforces ownership.
    result = await resume_hitl_review(
        thread_id=req.thread_id,
        action=req.action,
        edited_answer=req.edited_answer,
    )

    # On a completed "edit", overwrite the answer of the session's newest turn.
    if (
        result.get("status") == "complete"
        and req.action == "edit"
        and req.edited_answer
    ):
        # find_one_and_update is used because it accepts `sort` to pick the
        # latest turn; update_one only accepts `sort` in very recent
        # driver/server versions.
        await db.chat_turns.find_one_and_update(
            {
                "tenant_id": tenant_id,
                "session_id": req.session_id,
            },
            {"$set": {"answer": req.edited_answer}},
            sort=[("created_at", -1)],
        )

    sources = [SourceRef(**s) for s in result.get("sources", [])]

    return ChatResponse(
        session_id=req.session_id,
        turn_id=result.get("turn_id", ""),
        answer=result.get("answer", ""),
        sources=sources,
        status=result.get("status", "complete"),
    ).model_dump(mode="json")
1035
+
1036
+
1037
+ # ---------------------------------------------------------------------------
1038
+ # Health check
1039
+ # ---------------------------------------------------------------------------
1040
+
1041
@app.get("/health")
async def health():
    """Health probe: return a constant payload naming the service."""
    payload = {"status": "ok", "service": "cleanrag-api"}
    return payload
1045
+