longparser 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. {longparser-0.1.1 → longparser-0.1.3}/PKG-INFO +15 -11
  2. {longparser-0.1.1 → longparser-0.1.3}/README.md +13 -10
  3. {longparser-0.1.1 → longparser-0.1.3}/pyproject.toml +2 -1
  4. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/__init__.py +8 -4
  5. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/chunkers/hybrid_chunker.py +2 -2
  6. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/extractors/docling_extractor.py +16 -15
  7. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/pipeline/__init__.py +4 -0
  8. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/app.py +139 -19
  9. longparser-0.1.3/src/longparser/server/chat/checkpointer.py +45 -0
  10. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/engine.py +2 -2
  11. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/graph.py +10 -9
  12. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/llm_chain.py +6 -4
  13. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/schemas.py +1 -1
  14. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/db.py +5 -5
  15. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/embeddings.py +4 -4
  16. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/queue.py +1 -6
  17. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/vectorstores.py +8 -8
  18. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/worker.py +5 -5
  19. {longparser-0.1.1 → longparser-0.1.3}/src/longparser.egg-info/PKG-INFO +15 -11
  20. {longparser-0.1.1 → longparser-0.1.3}/src/longparser.egg-info/SOURCES.txt +1 -0
  21. {longparser-0.1.1 → longparser-0.1.3}/src/longparser.egg-info/requires.txt +1 -0
  22. {longparser-0.1.1 → longparser-0.1.3}/setup.cfg +0 -0
  23. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/chunkers/__init__.py +0 -0
  24. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/extractors/__init__.py +0 -0
  25. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/extractors/base.py +0 -0
  26. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/extractors/latex_ocr.py +0 -0
  27. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/integrations/__init__.py +0 -0
  28. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/integrations/langchain.py +0 -0
  29. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/integrations/llamaindex.py +0 -0
  30. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/pipeline/orchestrator.py +0 -0
  31. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/py.typed +0 -0
  32. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/schemas.py +0 -0
  33. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/__init__.py +0 -0
  34. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/__init__.py +0 -0
  35. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/callbacks.py +0 -0
  36. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/retriever.py +0 -0
  37. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/routers/__init__.py +0 -0
  38. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/schemas.py +0 -0
  39. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/utils/__init__.py +0 -0
  40. {longparser-0.1.1 → longparser-0.1.3}/src/longparser/utils/rtl_detector.py +0 -0
  41. {longparser-0.1.1 → longparser-0.1.3}/src/longparser.egg-info/dependency_links.txt +0 -0
  42. {longparser-0.1.1 → longparser-0.1.3}/src/longparser.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: longparser
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines.
5
5
  Author-email: ENDEVSOLS Team <technology@endevsols.com>
6
6
  License-Expression: MIT
@@ -27,6 +27,7 @@ Description-Content-Type: text/markdown
27
27
  Requires-Dist: pydantic<3,>=2.0
28
28
  Requires-Dist: docling>=2.14
29
29
  Requires-Dist: docling-core>=2.13
30
+ Requires-Dist: langgraph-checkpoint-mongodb>=0.3.1
30
31
  Provides-Extra: pptx
31
32
  Requires-Dist: python-pptx>=1.0; extra == "pptx"
32
33
  Provides-Extra: langchain
@@ -109,8 +110,7 @@ Requires-Dist: httpx>=0.27; extra == "dev"
109
110
  Requires-Dist: anyio>=4.0; extra == "dev"
110
111
 
111
112
  <p align="center">
112
- <!-- Logo goes here once ready -->
113
- <h1 align="center">LongParser</h1>
113
+ <img src="https://raw.githubusercontent.com/ENDEVSOLS/LongParser/main/docs/assets/logo.png" alt="LongParser" width="320">
114
114
  <p align="center"><strong>Privacy-first document intelligence engine for production RAG pipelines.</strong></p>
115
115
  <p align="center">
116
116
  Parse PDFs, DOCX, PPTX, XLSX &amp; CSV → validated, AI-ready chunks with HITL review.
@@ -129,7 +129,7 @@ Requires-Dist: anyio>=4.0; extra == "dev"
129
129
  <img src="https://static.pepy.tech/badge/longparser/month" alt="Monthly Downloads">
130
130
  </a>
131
131
  <a href="https://www.python.org/">
132
- <img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue.svg" alt="Python">
132
+ <img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python">
133
133
  </a>
134
134
  <a href="LICENSE">
135
135
  <img src="https://img.shields.io/badge/License-MIT-brightgreen.svg" alt="MIT License">
@@ -150,11 +150,12 @@ Requires-Dist: anyio>=4.0; extra == "dev"
150
150
  | **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
151
151
  | **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
152
152
  | **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
153
- | **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` |
153
+ | **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
154
154
  | **3-layer memory** | Short-term turns + rolling summary + long-term facts |
155
155
  | **Multi-provider LLM** | OpenAI, Gemini, Groq, OpenRouter |
156
156
  | **Multi-backend vectors** | Chroma, FAISS, Qdrant |
157
- | **Async-first API** | FastAPI + Motor (MongoDB) + ARQ (Redis) |
157
+ | **Production-ready API** | FastAPI + Motor (MongoDB) + ARQ + Redis (Queue & Rate Limiting) |
158
+ | **Enterprise Security** | Tenant isolation, Role-Based Access Control (RBAC), and CORS |
158
159
  | **LangChain adapters** | Drop-in `BaseRetriever` and LlamaIndex `QueryEngine` |
159
160
  | **Privacy-first** | All processing runs locally; no data leaves your infra |
160
161
 
@@ -215,9 +216,9 @@ pip install "longparser[cpu]"
215
216
  ### Python SDK
216
217
 
217
218
  ```python
218
- from longparser import PipelineOrchestrator, ProcessingConfig
219
+ from longparser import DocumentPipeline, ProcessingConfig
219
220
 
220
- pipeline = PipelineOrchestrator()
221
+ pipeline = DocumentPipeline(ProcessingConfig())
221
222
  result = pipeline.process_file("document.pdf")
222
223
 
223
224
  print(f"Pages: {result.document.metadata.total_pages}")
@@ -296,7 +297,7 @@ src/longparser/
296
297
  ├── schemas.py ← core Pydantic models (Document, Block, Chunk, …)
297
298
  ├── extractors/ ← Docling, LaTeX OCR backends
298
299
  ├── chunkers/ ← HybridChunker
299
- ├── pipeline/ ← PipelineOrchestrator
300
+ ├── pipeline/ ← DocumentPipeline
300
301
  ├── integrations/ ← LangChain loader & LlamaIndex reader
301
302
  ├── utils/ ← shared helpers (RTL detection, …)
302
303
  └── server/ ← REST API layer
@@ -344,11 +345,14 @@ Copy `.env.example` to `.env` and set:
344
345
  | Variable | Default | Description |
345
346
  |----------|---------|-------------|
346
347
  | `LONGPARSER_MONGO_URL` | `mongodb://localhost:27017` | MongoDB connection |
347
- | `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue |
348
+ | `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue & rate limits |
348
349
  | `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider |
349
- | `LONGPARSER_LLM_MODEL` | `gpt-4o` | Model name |
350
+ | `LONGPARSER_LLM_MODEL` | `gpt-5.3` | Model name |
350
351
  | `LONGPARSER_EMBED_PROVIDER` | `huggingface` | Embedding provider |
351
352
  | `LONGPARSER_VECTOR_DB` | `chroma` | Vector store backend |
353
+ | `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins |
354
+ | `LONGPARSER_RATE_LIMIT` | `60` | Max RPM per tenant |
355
+ | `LONGPARSER_ADMIN_KEYS` | (empty) | Comma-separated admin API keys |
352
356
 
353
357
  ---
354
358
 
@@ -1,6 +1,5 @@
1
1
  <p align="center">
2
- <!-- Logo goes here once ready -->
3
- <h1 align="center">LongParser</h1>
2
+ <img src="https://raw.githubusercontent.com/ENDEVSOLS/LongParser/main/docs/assets/logo.png" alt="LongParser" width="320">
4
3
  <p align="center"><strong>Privacy-first document intelligence engine for production RAG pipelines.</strong></p>
5
4
  <p align="center">
6
5
  Parse PDFs, DOCX, PPTX, XLSX &amp; CSV → validated, AI-ready chunks with HITL review.
@@ -19,7 +18,7 @@
19
18
  <img src="https://static.pepy.tech/badge/longparser/month" alt="Monthly Downloads">
20
19
  </a>
21
20
  <a href="https://www.python.org/">
22
- <img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue.svg" alt="Python">
21
+ <img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python">
23
22
  </a>
24
23
  <a href="LICENSE">
25
24
  <img src="https://img.shields.io/badge/License-MIT-brightgreen.svg" alt="MIT License">
@@ -40,11 +39,12 @@
40
39
  | **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
41
40
  | **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
42
41
  | **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
43
- | **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` |
42
+ | **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
44
43
  | **3-layer memory** | Short-term turns + rolling summary + long-term facts |
45
44
  | **Multi-provider LLM** | OpenAI, Gemini, Groq, OpenRouter |
46
45
  | **Multi-backend vectors** | Chroma, FAISS, Qdrant |
47
- | **Async-first API** | FastAPI + Motor (MongoDB) + ARQ (Redis) |
46
+ | **Production-ready API** | FastAPI + Motor (MongoDB) + ARQ + Redis (Queue & Rate Limiting) |
47
+ | **Enterprise Security** | Tenant isolation, Role-Based Access Control (RBAC), and CORS |
48
48
  | **LangChain adapters** | Drop-in `BaseRetriever` and LlamaIndex `QueryEngine` |
49
49
  | **Privacy-first** | All processing runs locally; no data leaves your infra |
50
50
 
@@ -105,9 +105,9 @@ pip install "longparser[cpu]"
105
105
  ### Python SDK
106
106
 
107
107
  ```python
108
- from longparser import PipelineOrchestrator, ProcessingConfig
108
+ from longparser import DocumentPipeline, ProcessingConfig
109
109
 
110
- pipeline = PipelineOrchestrator()
110
+ pipeline = DocumentPipeline(ProcessingConfig())
111
111
  result = pipeline.process_file("document.pdf")
112
112
 
113
113
  print(f"Pages: {result.document.metadata.total_pages}")
@@ -186,7 +186,7 @@ src/longparser/
186
186
  ├── schemas.py ← core Pydantic models (Document, Block, Chunk, …)
187
187
  ├── extractors/ ← Docling, LaTeX OCR backends
188
188
  ├── chunkers/ ← HybridChunker
189
- ├── pipeline/ ← PipelineOrchestrator
189
+ ├── pipeline/ ← DocumentPipeline
190
190
  ├── integrations/ ← LangChain loader & LlamaIndex reader
191
191
  ├── utils/ ← shared helpers (RTL detection, …)
192
192
  └── server/ ← REST API layer
@@ -234,11 +234,14 @@ Copy `.env.example` to `.env` and set:
234
234
  | Variable | Default | Description |
235
235
  |----------|---------|-------------|
236
236
  | `LONGPARSER_MONGO_URL` | `mongodb://localhost:27017` | MongoDB connection |
237
- | `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue |
237
+ | `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue & rate limits |
238
238
  | `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider |
239
- | `LONGPARSER_LLM_MODEL` | `gpt-4o` | Model name |
239
+ | `LONGPARSER_LLM_MODEL` | `gpt-5.3` | Model name |
240
240
  | `LONGPARSER_EMBED_PROVIDER` | `huggingface` | Embedding provider |
241
241
  | `LONGPARSER_VECTOR_DB` | `chroma` | Vector store backend |
242
+ | `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins |
243
+ | `LONGPARSER_RATE_LIMIT` | `60` | Max RPM per tenant |
244
+ | `LONGPARSER_ADMIN_KEYS` | (empty) | Comma-separated admin API keys |
242
245
 
243
246
  ---
244
247
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "longparser"
7
- version = "0.1.1"
7
+ version = "0.1.3"
8
8
  description = "Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines."
9
9
  readme = {file = "README.md", content-type = "text/markdown"}
10
10
  requires-python = ">=3.10"
@@ -35,6 +35,7 @@ dependencies = [
35
35
  "pydantic>=2.0,<3",
36
36
  "docling>=2.14",
37
37
  "docling-core>=2.13",
38
+ "langgraph-checkpoint-mongodb>=0.3.1",
38
39
  ]
39
40
 
40
41
  [project.optional-dependencies]
@@ -9,9 +9,9 @@ Built by ENDEVSOLS for production RAG pipelines.
9
9
 
10
10
  Quick start::
11
11
 
12
- from longparser import PipelineOrchestrator, ProcessingConfig
12
+ from longparser import DocumentPipeline, ProcessingConfig
13
13
 
14
- pipeline = PipelineOrchestrator()
14
+ pipeline = DocumentPipeline(ProcessingConfig())
15
15
  result = pipeline.process_file("document.pdf")
16
16
  print(result.chunks[0].text)
17
17
 
@@ -19,13 +19,13 @@ For the full REST API server::
19
19
 
20
20
  uv run uvicorn longparser.server.app:app --reload --port 8000
21
21
 
22
- See :class:`~longparser.pipeline.PipelineOrchestrator` for the main SDK entry
22
+ See :class:`~longparser.pipeline.DocumentPipeline` for the main SDK entry
23
23
  point and :mod:`longparser.server` for the REST API layer.
24
24
  """
25
25
 
26
26
  from __future__ import annotations
27
27
 
28
- __version__ = "0.1.1"
28
+ __version__ = "0.1.3"
29
29
  __author__ = "ENDEVSOLS Team"
30
30
  __license__ = "MIT"
31
31
 
@@ -62,6 +62,9 @@ def __getattr__(name: str):
62
62
  if name == "PipelineOrchestrator":
63
63
  from .pipeline import PipelineOrchestrator
64
64
  return PipelineOrchestrator
65
+ if name == "DocumentPipeline":
66
+ from .pipeline import DocumentPipeline
67
+ return DocumentPipeline
65
68
  if name == "PipelineResult":
66
69
  from .pipeline import PipelineResult
67
70
  return PipelineResult
@@ -99,6 +102,7 @@ __all__ = [
99
102
  # Lazily imported (require extras)
100
103
  "DoclingExtractor",
101
104
  "PipelineOrchestrator",
105
+ "DocumentPipeline",
102
106
  "PipelineResult",
103
107
  "HybridChunker",
104
108
  ]
@@ -345,10 +345,10 @@ def _generate_schema_chunk(
345
345
  sample_rows.append(f" Row {r_idx}: " + "; ".join(parts))
346
346
 
347
347
  lines = [
348
- f"[TABLE SCHEMA]",
348
+ "[TABLE SCHEMA]",
349
349
  f"Table ID: {block.block_id}",
350
350
  f"Rows: {n_data} (data rows), Columns: {n_cols}",
351
- f"Columns:",
351
+ "Columns:",
352
352
  ]
353
353
  lines.extend(col_profiles)
354
354
  lines.append(f"Sample Rows ({sample_count}):")
@@ -254,7 +254,7 @@ class DoclingExtractor(BaseExtractor):
254
254
  # Order-based substitution with alignment gate
255
255
  injected = 0
256
256
  _non_omml = 0
257
- for block, latex in zip(formula_blocks, latex_eqs):
257
+ for block, latex in zip(formula_blocks, latex_eqs, strict=False):
258
258
  orig_len = len(block.text.strip()) if block.text else 0
259
259
  latex_len = len(latex.strip())
260
260
 
@@ -431,7 +431,8 @@ class DoclingExtractor(BaseExtractor):
431
431
  page_img = None
432
432
  try:
433
433
  page_img = page_obj.image.pil_image
434
- except Exception:
434
+ except Exception as e:
435
+ logger.warning("Failed to extract image for formula scanning: %s", e)
435
436
  continue
436
437
  if page_img is None:
437
438
  continue
@@ -527,8 +528,8 @@ class DoclingExtractor(BaseExtractor):
527
528
  # Update label to formula so downstream sees it correctly
528
529
  try:
529
530
  item.label = type(item.label)("formula")
530
- except Exception:
531
- pass
531
+ except Exception as e:
532
+ logger.debug(f"Failed to update formula label: {e}")
532
533
  replaced = True
533
534
  logger.debug(f"MFD: replaced garbled block on page {page_no}")
534
535
  break
@@ -1023,15 +1024,15 @@ class DoclingExtractor(BaseExtractor):
1023
1024
  if isinstance(item, TableItem) and hasattr(item, 'export_to_markdown'):
1024
1025
  try:
1025
1026
  return item.export_to_markdown(doc=docling_doc)
1026
- except Exception:
1027
- pass
1027
+ except Exception as e:
1028
+ logger.debug(f"Failed to export table item to markdown: {e}")
1028
1029
  if hasattr(item, 'text') and item.text:
1029
1030
  return item.text
1030
1031
  if hasattr(item, 'export_to_markdown'):
1031
1032
  try:
1032
1033
  return item.export_to_markdown()
1033
- except Exception:
1034
- pass
1034
+ except Exception as e:
1035
+ logger.debug(f"Failed to export item to markdown: {e}")
1035
1036
  return ""
1036
1037
 
1037
1038
  def _get_item_confidence(self, item) -> float:
@@ -1080,10 +1081,10 @@ class DoclingExtractor(BaseExtractor):
1080
1081
  if s.placeholder_format.type == PP_PH.SUBTITLE:
1081
1082
  has_subtitle_placeholder = True
1082
1083
  break
1083
- except Exception:
1084
- pass
1085
- except ImportError:
1086
- pass
1084
+ except Exception as e:
1085
+ logger.debug(f"Failed to check PPTX subtitle placeholder format: {e}")
1086
+ except ImportError as e:
1087
+ logger.debug(f"Failed to import python-pptx: {e}")
1087
1088
 
1088
1089
  for shape in slide.shapes:
1089
1090
  found_title = self._extract_pptx_shape_info(
@@ -1160,8 +1161,8 @@ class DoclingExtractor(BaseExtractor):
1160
1161
  is_subtitle_shape = True
1161
1162
  elif ph_type in (PP_PLACEHOLDER.DATE, PP_PLACEHOLDER.FOOTER, PP_PLACEHOLDER.SLIDE_NUMBER):
1162
1163
  is_footer_shape = True
1163
- except Exception:
1164
- pass
1164
+ except Exception as e:
1165
+ logger.debug(f"Failed to check PPTX placeholder format type: {e}")
1165
1166
 
1166
1167
  # Skip footer/date/slide-number shapes entirely
1167
1168
  if is_footer_shape:
@@ -1267,7 +1268,7 @@ class DoclingExtractor(BaseExtractor):
1267
1268
 
1268
1269
  # Calculate file hash
1269
1270
  with open(file_path, "rb") as f:
1270
- file_hash = hashlib.md5(f.read()).hexdigest()
1271
+ file_hash = hashlib.sha256(f.read()).hexdigest()
1271
1272
 
1272
1273
  # Get conversion result (cached or new)
1273
1274
  result = self._run_docling(file_path, config)
@@ -2,7 +2,11 @@
2
2
 
3
3
  from .orchestrator import PipelineOrchestrator, PipelineResult
4
4
 
5
+ # Public alias — docs and quickstart use this name
6
+ DocumentPipeline = PipelineOrchestrator
7
+
5
8
  __all__ = [
6
9
  "PipelineOrchestrator",
10
+ "DocumentPipeline",
7
11
  "PipelineResult",
8
12
  ]
@@ -13,6 +13,7 @@ try:
13
13
  except ImportError:
14
14
  pass
15
15
 
16
+ from collections import defaultdict
16
17
  import hashlib
17
18
  import io
18
19
  import logging
@@ -25,6 +26,7 @@ from datetime import datetime, timezone
25
26
  from pathlib import Path
26
27
  from typing import Optional
27
28
  import time as _time
29
+ import redis.asyncio as redis
28
30
 
29
31
  from fastapi import (
30
32
  FastAPI,
@@ -35,6 +37,7 @@ from fastapi import (
35
37
  Request,
36
38
  UploadFile,
37
39
  )
40
+ from fastapi.middleware.cors import CORSMiddleware
38
41
  from fastapi.responses import JSONResponse, StreamingResponse
39
42
 
40
43
  from .db import Database
@@ -57,6 +60,15 @@ from .schemas import (
57
60
  SearchResponse,
58
61
  SearchResult,
59
62
  )
63
+ from .chat.schemas import (
64
+ ChatConfig,
65
+ ChatRequest,
66
+ ChatResponse,
67
+ CreateSessionRequest,
68
+ HITLResumeRequest,
69
+ LLMAnswer,
70
+ SourceRef,
71
+ )
60
72
 
61
73
  logger = logging.getLogger(__name__)
62
74
 
@@ -92,8 +104,18 @@ queue = ARQBackend(
92
104
  async def lifespan(app: FastAPI):
93
105
  """Startup/shutdown hooks."""
94
106
  await db.create_indexes()
107
+
108
+ from .chat.checkpointer import init_checkpointer, close_checkpointer
109
+ await init_checkpointer(
110
+ mongo_uri=os.getenv("LONGPARSER_MONGO_URL", "mongodb://localhost:27017"),
111
+ db_name=os.getenv("LONGPARSER_DB_NAME", "longparser"),
112
+ )
113
+
95
114
  logger.info("LongParser API started")
96
115
  yield
116
+
117
+ await close_checkpointer()
118
+
97
119
  await queue.close()
98
120
  await db.close()
99
121
  if hasattr(app.state, "chat_engine"):
@@ -104,11 +126,69 @@ async def lifespan(app: FastAPI):
104
126
  app = FastAPI(
105
127
  title="LongParser API",
106
128
  description="Document intelligence engine with HITL review, embedding, and vector search.",
107
- version="0.3.0",
129
+ version=__import__("longparser").__version__,
108
130
  lifespan=lifespan,
109
131
  )
110
132
 
111
133
 
134
+ # ---------------------------------------------------------------------------
135
+ # CORS middleware
136
+ # ---------------------------------------------------------------------------
137
+
138
+ app.add_middleware(
139
+ CORSMiddleware,
140
+ allow_origins=os.getenv("LONGPARSER_CORS_ORIGINS", "*").split(","),
141
+ allow_credentials=True,
142
+ allow_methods=["*"],
143
+ allow_headers=["*"],
144
+ )
145
+
146
+
147
+ # ---------------------------------------------------------------------------
148
+ # Global exception handler
149
+ # ---------------------------------------------------------------------------
150
+
151
+ @app.exception_handler(Exception)
152
+ async def global_exception_handler(request: Request, exc: Exception):
153
+ """Catch unhandled exceptions — return sanitized error, log full trace."""
154
+ logger.exception("Unhandled exception", exc_info=exc)
155
+ return JSONResponse(
156
+ status_code=500,
157
+ content={"detail": "Internal server error"},
158
+ )
159
+
160
+
161
+ # ---------------------------------------------------------------------------
162
+ # Rate limiter (Redis sliding window)
163
+ # ---------------------------------------------------------------------------
164
+
165
+ class RedisRateLimiter:
166
+ """Redis-backed sliding-window rate limiter (per-tenant) for multi-worker scale."""
167
+
168
+ def __init__(self, redis_url: str, max_requests: int = 60, window_seconds: int = 60):
169
+ self.max_requests = max_requests
170
+ self.window = window_seconds
171
+ self.redis = redis.from_url(redis_url)
172
+
173
+ async def check(self, key: str) -> bool:
174
+ now = _time.time()
175
+ redis_key = f"rate_limit:{key}"
176
+ pipeline = self.redis.pipeline()
177
+ pipeline.zremrangebyscore(redis_key, 0, now - self.window)
178
+ pipeline.zadd(redis_key, {str(now): now})
179
+ pipeline.zcard(redis_key)
180
+ pipeline.expire(redis_key, self.window)
181
+ results = await pipeline.execute()
182
+ return results[2] <= self.max_requests
183
+
184
+
185
+ _rate_limiter = RedisRateLimiter(
186
+ redis_url=os.getenv("LONGPARSER_REDIS_URL", "redis://localhost:6379/0"),
187
+ max_requests=int(os.getenv("LONGPARSER_RATE_LIMIT", "60")),
188
+ window_seconds=60,
189
+ )
190
+
191
+
112
192
  # ---------------------------------------------------------------------------
113
193
  # Auth middleware (API key — v1)
114
194
  # ---------------------------------------------------------------------------
@@ -121,8 +201,33 @@ def _get_tenant(x_api_key: str = Header(...)) -> str:
121
201
  """
122
202
  if not x_api_key or len(x_api_key) < 8:
123
203
  raise HTTPException(status_code=401, detail="Invalid API key")
124
- # For v1, use a hash of the key as tenant_id
125
- return hashlib.sha256(x_api_key.encode()).hexdigest()[:16]
204
+ # Use 32 hex chars (128-bit) to resist brute-force collision attacks
205
+ return hashlib.sha256(x_api_key.encode()).hexdigest()[:32]
206
+
207
+
208
+ # ---------------------------------------------------------------------------
209
+ # RBAC (role-based access control)
210
+ # ---------------------------------------------------------------------------
211
+
212
+ _ADMIN_KEYS: set[str] = set(
213
+ k.strip() for k in os.getenv("LONGPARSER_ADMIN_KEYS", "").split(",") if k.strip()
214
+ )
215
+
216
+
217
+ def _get_role(x_api_key: str) -> str:
218
+ """Resolve user role from API key.
219
+
220
+ If LONGPARSER_ADMIN_KEYS is not set, all users are admins (backward compatible).
221
+ """
222
+ if not _ADMIN_KEYS:
223
+ return "admin"
224
+ return "admin" if x_api_key in _ADMIN_KEYS else "reviewer"
225
+
226
+
227
+ def _require_admin(x_api_key: str) -> None:
228
+ """Raise 403 if the API key does not have admin role."""
229
+ if _get_role(x_api_key) != "admin":
230
+ raise HTTPException(status_code=403, detail="Admin access required")
126
231
 
127
232
 
128
233
  # ---------------------------------------------------------------------------
@@ -175,14 +280,23 @@ async def create_job(
175
280
 
176
281
  # Generate job ID and save file
177
282
  job_id = str(uuid.uuid4())
178
- dest = UPLOAD_DIR / tenant_id / job_id / (file.filename or "document")
283
+
284
+ # --- Path Traversal Protection ---
285
+ # Strip all directory components from the user-provided filename
286
+ # to prevent payloads like "../../../etc/passwd" from escaping UPLOAD_DIR.
287
+ raw_name = file.filename or "document"
288
+ safe_name = Path(raw_name).name # keeps only the final component
289
+ if not safe_name or safe_name in (".", ".."):
290
+ safe_name = "document"
291
+
292
+ dest = UPLOAD_DIR / tenant_id / job_id / safe_name
179
293
  file_hash, file_size = await _stream_upload(file, dest)
180
294
 
181
295
  # Create job in MongoDB
182
296
  job_doc = await db.create_job(
183
297
  tenant_id=tenant_id,
184
298
  job_id=job_id,
185
- source_file=file.filename or "document",
299
+ source_file=safe_name,
186
300
  file_hash=file_hash,
187
301
  )
188
302
 
@@ -197,7 +311,7 @@ async def create_job(
197
311
  job_id=job_id,
198
312
  tenant_id=tenant_id,
199
313
  status=JobStatus.QUEUED,
200
- source_file=file.filename or "document",
314
+ source_file=safe_name,
201
315
  file_hash=file_hash,
202
316
  created_at=job_doc["created_at"],
203
317
  )
@@ -498,6 +612,7 @@ async def purge_block(
498
612
  x_api_key: str = Header(...),
499
613
  ):
500
614
  """Admin-only: permanently delete a block. Writes a tombstone revision."""
615
+ _require_admin(x_api_key)
501
616
  tenant_id = _get_tenant(x_api_key)
502
617
 
503
618
  # Get block before deletion (for tombstone)
@@ -545,6 +660,7 @@ async def purge_chunk(
545
660
  x_api_key: str = Header(...),
546
661
  ):
547
662
  """Admin-only: permanently delete a chunk. Writes a tombstone revision."""
663
+ _require_admin(x_api_key)
548
664
  tenant_id = _get_tenant(x_api_key)
549
665
 
550
666
  # Get chunk before deletion
@@ -852,8 +968,19 @@ async def search(body: SearchRequest, x_api_key: str = Header(...)):
852
968
 
853
969
  @app.middleware("http")
854
970
  async def observability_middleware(request: Request, call_next):
855
- """Attach request_id and log structured request data."""
971
+ """Attach request_id, enforce rate limits, and log structured request data."""
856
972
  request_id = str(uuid.uuid4())[:8]
973
+
974
+ # ── Rate limiting (skip unauthenticated endpoints) ──
975
+ api_key = request.headers.get("x-api-key")
976
+ if api_key and len(api_key) >= 8:
977
+ tenant_key = hashlib.sha256(api_key.encode()).hexdigest()[:32]
978
+ if not await _rate_limiter.check(tenant_key):
979
+ return JSONResponse(
980
+ status_code=429,
981
+ content={"detail": "Rate limit exceeded. Try again later."},
982
+ )
983
+
857
984
  start = _time.monotonic()
858
985
  response = await call_next(request)
859
986
  latency_ms = (_time.monotonic() - start) * 1000
@@ -876,12 +1003,10 @@ async def observability_middleware(request: Request, call_next):
876
1003
 
877
1004
  @app.post("/chat/sessions", status_code=201)
878
1005
  async def create_chat_session(
879
- body: dict,
1006
+ req: CreateSessionRequest,
880
1007
  x_api_key: str = Header(...),
881
1008
  ):
882
1009
  """Create a new chat session (server-generated session_id)."""
883
- from .chat.schemas import CreateSessionRequest
884
- req = CreateSessionRequest(**body)
885
1010
  tenant_id = _get_tenant(x_api_key)
886
1011
 
887
1012
  # Verify job belongs to tenant
@@ -930,17 +1055,15 @@ async def delete_chat_session(
930
1055
 
931
1056
  @app.post("/chat")
932
1057
  async def chat(
933
- body: dict,
1058
+ req: ChatRequest,
934
1059
  x_api_key: str = Header(...),
935
1060
  ):
936
1061
  """Ask a question — RAG chatbot with 3-layer memory.
937
1062
 
938
1063
  Set require_approval=true for Human-in-the-Loop review.
939
1064
  """
940
- from .chat.schemas import ChatRequest, ChatResponse, ChatConfig
941
1065
  from .chat.engine import ChatEngine
942
1066
 
943
- req = ChatRequest(**body)
944
1067
  tenant_id = _get_tenant(x_api_key)
945
1068
 
946
1069
  # ── Session ↔ Job binding validation ──
@@ -965,7 +1088,6 @@ async def chat(
965
1088
 
966
1089
  # ── HITL: if require_approval, pause for human review ──
967
1090
  if req.require_approval and response.status == "complete":
968
- from .chat.schemas import LLMAnswer, SourceRef
969
1091
  from .chat.graph import start_hitl_review
970
1092
 
971
1093
  answer_obj = LLMAnswer(
@@ -988,14 +1110,12 @@ async def chat(
988
1110
 
989
1111
  @app.post("/chat/resume")
990
1112
  async def resume_chat(
991
- body: dict,
1113
+ req: HITLResumeRequest,
992
1114
  x_api_key: str = Header(...),
993
1115
  ):
994
1116
  """Resume a paused HITL chat with human decision (approve/edit/reject)."""
995
- from .chat.schemas import HITLResumeRequest, ChatResponse, SourceRef, Turn
996
1117
  from .chat.graph import resume_hitl_review
997
1118
 
998
- req = HITLResumeRequest(**body)
999
1119
  tenant_id = _get_tenant(x_api_key)
1000
1120
 
1001
1121
  # Validate session belongs to tenant
@@ -1014,7 +1134,7 @@ async def resume_chat(
1014
1134
  if result.get("status") == "complete":
1015
1135
  # Update the last turn's answer if edited
1016
1136
  if req.action == "edit" and req.edited_answer:
1017
- await db.chat_turns.update_one(
1137
+ await db.chat_turns.find_one_and_update(
1018
1138
  {
1019
1139
  "tenant_id": tenant_id,
1020
1140
  "session_id": req.session_id,
@@ -1041,5 +1161,5 @@ async def resume_chat(
1041
1161
  @app.get("/health")
1042
1162
  async def health():
1043
1163
  """Health check endpoint."""
1044
- return {"status": "ok", "service": "cleanrag-api"}
1164
+ return {"status": "ok", "service": "longparser-api"}
1045
1165
 
@@ -0,0 +1,45 @@
1
+ """LangGraph MongoDB Checkpointer singleton.
2
+
3
+ Holds the global per-worker instance of the MongoDBSaver.
4
+ """
5
+ import logging
6
+ from typing import Optional
7
+ from pymongo import MongoClient
8
+ from langgraph.checkpoint.mongodb import MongoDBSaver
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ _mongo_client: Optional[MongoClient] = None
13
+ _checkpointer: Optional[MongoDBSaver] = None
14
+
15
+
16
+ async def init_checkpointer(mongo_uri: str, db_name: str) -> None:
17
+ """Initialize the MongoDB checkpointer on app startup."""
18
+ global _mongo_client, _checkpointer
19
+ if _checkpointer is not None:
20
+ return
21
+
22
+ logger.info("Initializing LangGraph MongoDB checkpointer...")
23
+ # Initialize the sync MongoClient
24
+ _mongo_client = MongoClient(mongo_uri)
25
+
26
+ # Initialize the saver
27
+ _checkpointer = MongoDBSaver(_mongo_client, db_name=db_name)
28
+
29
+
30
+ def get_checkpointer() -> MongoDBSaver:
31
+ """Get the active checkpointer instance."""
32
+ global _checkpointer
33
+ if _checkpointer is None:
34
+ raise RuntimeError("Checkpointer not initialized. Call init_checkpointer first.")
35
+ return _checkpointer
36
+
37
+
38
+ async def close_checkpointer() -> None:
39
+ """Close the database checkpointer on app shutdown."""
40
+ global _mongo_client, _checkpointer
41
+ if _mongo_client is not None:
42
+ _mongo_client.close()
43
+ _mongo_client = None
44
+ _checkpointer = None
45
+ logger.info("LangGraph MongoDB checkpointer closed.")
@@ -76,7 +76,7 @@ RAG_PROMPT = ChatPromptTemplate.from_messages([
76
76
  # Token Counting (model-aware) — kept as custom logic
77
77
  # ---------------------------------------------------------------------------
78
78
 
79
- def count_tokens(text: str, model: str = "gpt-4o") -> int:
79
+ def count_tokens(text: str, model: str = "gpt-5.3") -> int:
80
80
  """Count tokens — exact for OpenAI models, conservative approx for others."""
81
81
  try:
82
82
  import tiktoken
@@ -96,7 +96,7 @@ def budget_trim(
96
96
  recent_turns: list[dict],
97
97
  rolling_summary: str,
98
98
  long_term_facts: list[dict],
99
- model: str = "gpt-4o",
99
+ model: str = "gpt-5.3",
100
100
  max_prompt_tokens: int = 6000,
101
101
  ) -> dict:
102
102
  """Priority-ordered truncation of prompt variables to fit token budget.
@@ -17,16 +17,14 @@ import logging
17
17
  import uuid
18
18
  from typing import TypedDict, Optional, Any
19
19
 
20
- from langgraph.checkpoint.memory import InMemorySaver
21
20
  from langgraph.graph import StateGraph, END
22
21
  from langgraph.types import interrupt, Command
23
22
 
24
23
  from .schemas import ChatConfig, ChatRequest, ChatResponse, SourceRef, Turn, LLMAnswer
24
+ from .checkpointer import get_checkpointer
25
25
 
26
26
  logger = logging.getLogger(__name__)
27
27
 
28
- # Shared checkpointer for all HITL flows
29
- _checkpointer = InMemorySaver()
30
28
 
31
29
 
32
30
  # ---------------------------------------------------------------------------
@@ -103,7 +101,7 @@ async def process_decision(state: HITLState) -> HITLState:
103
101
  # Build Graph
104
102
  # ---------------------------------------------------------------------------
105
103
 
106
- def build_hitl_graph() -> Any:
104
+ def build_hitl_graph(checkpointer) -> Any:
107
105
  """Build and compile the HITL state graph."""
108
106
  graph = StateGraph(HITLState)
109
107
 
@@ -116,11 +114,7 @@ def build_hitl_graph() -> Any:
116
114
  graph.add_edge("review", "decide")
117
115
  graph.add_edge("decide", END)
118
116
 
119
- return graph.compile(checkpointer=_checkpointer)
120
-
121
-
122
- # Module-level compiled graph
123
- hitl_graph = build_hitl_graph()
117
+ return graph.compile(checkpointer=checkpointer)
124
118
 
125
119
 
126
120
  # ---------------------------------------------------------------------------
@@ -152,6 +146,10 @@ async def start_hitl_review(
152
146
  }
153
147
 
154
148
  config = {"configurable": {"thread_id": thread_id}}
149
+
150
+ checkpointer = get_checkpointer()
151
+ hitl_graph = build_hitl_graph(checkpointer)
152
+
155
153
  _result = await hitl_graph.ainvoke(initial_state, config=config)
156
154
 
157
155
  return {
@@ -170,6 +168,9 @@ async def resume_hitl_review(
170
168
  """Resume a paused HITL flow with the human's decision."""
171
169
  config = {"configurable": {"thread_id": thread_id}}
172
170
 
171
+ checkpointer = get_checkpointer()
172
+ hitl_graph = build_hitl_graph(checkpointer)
173
+
173
174
  return await hitl_graph.ainvoke(
174
175
  Command(resume={"action": action, "edited_answer": edited_answer}),
175
176
  config=config,
@@ -16,14 +16,16 @@ from .schemas import ChatConfig
16
16
  logger = logging.getLogger(__name__)
17
17
 
18
18
 
19
- # Default models per provider (updated Feb 2026)
19
+ # Default models per provider
20
20
  DEFAULT_MODELS: dict[str, str] = {
21
- "openai": "gpt-5.3-codex",
21
+ "openai": "gpt-5.3",
22
22
  "gemini": "gemini-2.5-flash",
23
23
  "groq": "openai/gpt-oss-120b",
24
- "openrouter": "openai/gpt-5.3-codex",
24
+ "openrouter": "openai/gpt-5.3",
25
25
  }
26
26
 
27
+ SUPPORTED_PROVIDERS = list(DEFAULT_MODELS.keys())
28
+
27
29
 
28
30
  def _create_openai(model: str, temperature: float, max_tokens: int,
29
31
  max_retries: int, callbacks: Optional[list] = None):
@@ -113,7 +115,7 @@ def get_chat_model(
113
115
  """
114
116
  config = config or ChatConfig()
115
117
  provider = provider or config.llm_provider
116
- model = model or config.llm_model or DEFAULT_MODELS.get(provider, "gpt-4o")
118
+ model = model or config.llm_model or DEFAULT_MODELS.get(provider, "gpt-5.3")
117
119
  max_tokens = max_tokens or config.max_output_tokens
118
120
 
119
121
  creator = _CREATORS.get(provider)
@@ -33,7 +33,7 @@ class ChatConfig(BaseModel):
33
33
  default_factory=lambda: os.getenv("LONGPARSER_LLM_PROVIDER", "openai")
34
34
  )
35
35
  llm_model: str = Field(
36
- default_factory=lambda: os.getenv("LONGPARSER_LLM_MODEL", "gpt-4o")
36
+ default_factory=lambda: os.getenv("LONGPARSER_LLM_MODEL", "gpt-5.3")
37
37
  )
38
38
  max_input_tokens: int = Field(
39
39
  default_factory=lambda: int(os.getenv("LONGPARSER_CHAT_MAX_INPUT_TOKENS", "1000"))
@@ -411,7 +411,7 @@ class Database:
411
411
  ]},
412
412
  },
413
413
  {"_id": 0},
414
- ).to_list(length=None)
414
+ ).to_list(length=10000) # Cap: embedding batches
415
415
 
416
416
  # -----------------------------------------------------------------------
417
417
  # Index versions
@@ -450,7 +450,7 @@ class Database:
450
450
  """List all index versions for a job (for cleanup on delete)."""
451
451
  return await self.index_versions.find(
452
452
  {"tenant_id": tenant_id, "job_id": job_id}, {"_id": 0}
453
- ).to_list(length=None)
453
+ ).to_list(length=100) # Cap: index versions per job
454
454
 
455
455
  # -----------------------------------------------------------------------
456
456
  # Chat Sessions
@@ -597,7 +597,7 @@ class Database:
597
597
  {"tenant_id": tenant_id, "session_id": session_id},
598
598
  {"_id": 0},
599
599
  ).sort("created_at", 1)
600
- return await cursor.to_list(length=None)
600
+ return await cursor.to_list(length=5000) # Cap: session history
601
601
 
602
602
  async def get_unarchived_turns(
603
603
  self, tenant_id: str, session_id: str
@@ -611,7 +611,7 @@ class Database:
611
611
  },
612
612
  {"_id": 0},
613
613
  ).sort("created_at", 1)
614
- return await cursor.to_list(length=None)
614
+ return await cursor.to_list(length=5000) # Cap: summarization batch
615
615
 
616
616
  async def archive_turns(
617
617
  self, tenant_id: str, session_id: str, turn_ids: list[str]
@@ -645,7 +645,7 @@ class Database:
645
645
  {"deleted_at": {"$lte": cutoff}},
646
646
  {"session_id": 1, "tenant_id": 1, "_id": 0},
647
647
  )
648
- return await cursor.to_list(length=None)
648
+ return await cursor.to_list(length=1000) # Cap: purge batch
649
649
 
650
650
  # -----------------------------------------------------------------------
651
651
  # Lifecycle
@@ -93,7 +93,7 @@ class EmbeddingEngine:
93
93
 
94
94
  # Stable json dump
95
95
  cfg_str = json.dumps(config, sort_keys=True)
96
- return hashlib.sha1(cfg_str.encode("utf-8")).hexdigest()[:10]
96
+ return hashlib.sha256(cfg_str.encode("utf-8")).hexdigest()[:10]
97
97
 
98
98
  @property
99
99
  def dim(self) -> int:
@@ -108,7 +108,7 @@ class EmbeddingEngine:
108
108
  return self._dim
109
109
 
110
110
  fp = self.get_fingerprint()
111
- cache_key = f"cleanrag:embed_dim:{fp}"
111
+ cache_key = f"longparser:embed_dim:{fp}"
112
112
 
113
113
  # 1) Try Redis cross-process cache if available
114
114
  try:
@@ -145,8 +145,8 @@ class EmbeddingEngine:
145
145
  try:
146
146
  if 'r' in locals():
147
147
  r.set(cache_key, self._dim)
148
- except Exception:
149
- pass
148
+ except Exception as e:
149
+ logger.debug(f"Failed to set Redis cache: {e}")
150
150
 
151
151
  return self._dim
152
152
 
@@ -45,12 +45,7 @@ class ARQBackend(QueueBackend):
45
45
  from arq import create_pool
46
46
  from arq.connections import RedisSettings
47
47
 
48
- url = self.redis_url.replace("redis://", "")
49
- # Strip database number (e.g., /0) if present
50
- url = url.split("/")[0]
51
- host, _, port_str = url.partition(":")
52
- port = int(port_str) if port_str else 6379
53
- self._pool = await create_pool(RedisSettings(host=host, port=port))
48
+ self._pool = await create_pool(RedisSettings.from_dsn(self.redis_url))
54
49
  return self._pool
55
50
 
56
51
  async def enqueue(self, task_name: str, payload: dict) -> str:
@@ -64,7 +64,7 @@ class ChromaStore(BaseVectorStore):
64
64
  import chromadb
65
65
  except ImportError:
66
66
  raise ImportError(
67
- "chromadb is required. Install: pip install clean_rag[chroma]"
67
+ "chromadb is required. Install: pip install longparser[chroma]"
68
68
  )
69
69
 
70
70
  # Securely isolate vector spaces based on model config
@@ -125,8 +125,8 @@ class ChromaStore(BaseVectorStore):
125
125
  if isinstance(v, str) and v.startswith("["):
126
126
  try:
127
127
  meta[k] = json.loads(v)
128
- except (json.JSONDecodeError, ValueError):
129
- pass
128
+ except (json.JSONDecodeError, ValueError) as e:
129
+ logger.debug(f"Failed to decode JSON list from Chroma metadata: {e}")
130
130
  output.append({
131
131
  "id": vid,
132
132
  "score": 1.0 - (results["distances"][0][i] if results["distances"] else 0),
@@ -165,7 +165,7 @@ class FAISSStore(BaseVectorStore):
165
165
  import faiss # noqa: F401
166
166
  except ImportError:
167
167
  raise ImportError(
168
- "faiss-cpu is required. Install: pip install clean_rag[faiss]"
168
+ "faiss-cpu is required. Install: pip install longparser[faiss-cpu]"
169
169
  )
170
170
 
171
171
  self.base_dir = Path(base_dir)
@@ -297,7 +297,7 @@ class QdrantStore(BaseVectorStore):
297
297
  from qdrant_client.models import Distance, VectorParams
298
298
  except ImportError:
299
299
  raise ImportError(
300
- "qdrant-client is required. Install: pip install clean_rag[qdrant]"
300
+ "qdrant-client is required. Install: pip install longparser[qdrant]"
301
301
  )
302
302
 
303
303
  self.client = QdrantClient(url=url)
@@ -319,7 +319,7 @@ class QdrantStore(BaseVectorStore):
319
319
  if existing_dim != dim:
320
320
  # Mismatch — create new collection with hash suffix
321
321
  import hashlib
322
- suffix = hashlib.md5(f"{dim}".encode()).hexdigest()[:8]
322
+ suffix = hashlib.sha256(f"{dim}".encode()).hexdigest()[:8]
323
323
  self.collection_name = f"{self.collection_name}_{suffix}"
324
324
  logger.warning(
325
325
  f"QdrantStore: dim mismatch, using collection: {self.collection_name}"
@@ -382,8 +382,8 @@ class QdrantStore(BaseVectorStore):
382
382
  if isinstance(v, str) and v.startswith("["):
383
383
  try:
384
384
  payload[k] = json.loads(v)
385
- except (json.JSONDecodeError, ValueError):
386
- pass
385
+ except (json.JSONDecodeError, ValueError) as e:
386
+ logger.debug(f"Failed to decode JSON list from Qdrant metadata: {e}")
387
387
  output.append({
388
388
  "id": payload.get("vector_id", ""),
389
389
  "score": hit.score,
@@ -258,8 +258,8 @@ async def summarize_session(ctx: dict, tenant_id: str, session_id: str) -> dict:
258
258
  4. Archive summarized turns
259
259
  """
260
260
  from .db import Database
261
- from .schemas import ChatConfig
262
- from .llm_chain import get_plain_chat_model
261
+ from .chat.schemas import ChatConfig
262
+ from .chat.llm_chain import get_plain_chat_model
263
263
  from langchain_core.messages import SystemMessage, HumanMessage
264
264
 
265
265
  db = Database()
@@ -324,8 +324,8 @@ async def extract_facts(
324
324
  Only persists facts from allowlisted types with chunk provenance.
325
325
  """
326
326
  from .db import Database
327
- from .schemas import ChatConfig, FactSourceType
328
- from .llm_chain import get_chat_model
327
+ from .chat.schemas import ChatConfig, FactSourceType
328
+ from .chat.llm_chain import get_chat_model
329
329
  from langchain_core.messages import SystemMessage, HumanMessage
330
330
 
331
331
  db = Database()
@@ -407,7 +407,7 @@ async def extract_facts(
407
407
  async def purge_expired_sessions(ctx: dict) -> dict:
408
408
  """Scheduled task: hard-delete turns for soft-deleted sessions past TTL."""
409
409
  from .db import Database
410
- from .schemas import ChatConfig
410
+ from .chat.schemas import ChatConfig
411
411
 
412
412
  db = Database()
413
413
  config = ChatConfig()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: longparser
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines.
5
5
  Author-email: ENDEVSOLS Team <technology@endevsols.com>
6
6
  License-Expression: MIT
@@ -27,6 +27,7 @@ Description-Content-Type: text/markdown
27
27
  Requires-Dist: pydantic<3,>=2.0
28
28
  Requires-Dist: docling>=2.14
29
29
  Requires-Dist: docling-core>=2.13
30
+ Requires-Dist: langgraph-checkpoint-mongodb>=0.3.1
30
31
  Provides-Extra: pptx
31
32
  Requires-Dist: python-pptx>=1.0; extra == "pptx"
32
33
  Provides-Extra: langchain
@@ -109,8 +110,7 @@ Requires-Dist: httpx>=0.27; extra == "dev"
109
110
  Requires-Dist: anyio>=4.0; extra == "dev"
110
111
 
111
112
  <p align="center">
112
- <!-- Logo goes here once ready -->
113
- <h1 align="center">LongParser</h1>
113
+ <img src="https://raw.githubusercontent.com/ENDEVSOLS/LongParser/main/docs/assets/logo.png" alt="LongParser" width="320">
114
114
  <p align="center"><strong>Privacy-first document intelligence engine for production RAG pipelines.</strong></p>
115
115
  <p align="center">
116
116
  Parse PDFs, DOCX, PPTX, XLSX &amp; CSV → validated, AI-ready chunks with HITL review.
@@ -129,7 +129,7 @@ Requires-Dist: anyio>=4.0; extra == "dev"
129
129
  <img src="https://static.pepy.tech/badge/longparser/month" alt="Monthly Downloads">
130
130
  </a>
131
131
  <a href="https://www.python.org/">
132
- <img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue.svg" alt="Python">
132
+ <img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python">
133
133
  </a>
134
134
  <a href="LICENSE">
135
135
  <img src="https://img.shields.io/badge/License-MIT-brightgreen.svg" alt="MIT License">
@@ -150,11 +150,12 @@ Requires-Dist: anyio>=4.0; extra == "dev"
150
150
  | **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
151
151
  | **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
152
152
  | **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
153
- | **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` |
153
+ | **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
154
154
  | **3-layer memory** | Short-term turns + rolling summary + long-term facts |
155
155
  | **Multi-provider LLM** | OpenAI, Gemini, Groq, OpenRouter |
156
156
  | **Multi-backend vectors** | Chroma, FAISS, Qdrant |
157
- | **Async-first API** | FastAPI + Motor (MongoDB) + ARQ (Redis) |
157
+ | **Production-ready API** | FastAPI + Motor (MongoDB) + ARQ + Redis (Queue & Rate Limiting) |
158
+ | **Enterprise Security** | Tenant isolation, Role-Based Access Control (RBAC), and CORS |
158
159
  | **LangChain adapters** | Drop-in `BaseRetriever` and LlamaIndex `QueryEngine` |
159
160
  | **Privacy-first** | All processing runs locally; no data leaves your infra |
160
161
 
@@ -215,9 +216,9 @@ pip install "longparser[cpu]"
215
216
  ### Python SDK
216
217
 
217
218
  ```python
218
- from longparser import PipelineOrchestrator, ProcessingConfig
219
+ from longparser import DocumentPipeline, ProcessingConfig
219
220
 
220
- pipeline = PipelineOrchestrator()
221
+ pipeline = DocumentPipeline(ProcessingConfig())
221
222
  result = pipeline.process_file("document.pdf")
222
223
 
223
224
  print(f"Pages: {result.document.metadata.total_pages}")
@@ -296,7 +297,7 @@ src/longparser/
296
297
  ├── schemas.py ← core Pydantic models (Document, Block, Chunk, …)
297
298
  ├── extractors/ ← Docling, LaTeX OCR backends
298
299
  ├── chunkers/ ← HybridChunker
299
- ├── pipeline/ ← PipelineOrchestrator
300
+ ├── pipeline/ ← DocumentPipeline
300
301
  ├── integrations/ ← LangChain loader & LlamaIndex reader
301
302
  ├── utils/ ← shared helpers (RTL detection, …)
302
303
  └── server/ ← REST API layer
@@ -344,11 +345,14 @@ Copy `.env.example` to `.env` and set:
344
345
  | Variable | Default | Description |
345
346
  |----------|---------|-------------|
346
347
  | `LONGPARSER_MONGO_URL` | `mongodb://localhost:27017` | MongoDB connection |
347
- | `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue |
348
+ | `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue & rate limits |
348
349
  | `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider |
349
- | `LONGPARSER_LLM_MODEL` | `gpt-4o` | Model name |
350
+ | `LONGPARSER_LLM_MODEL` | `gpt-5.3` | Model name |
350
351
  | `LONGPARSER_EMBED_PROVIDER` | `huggingface` | Embedding provider |
351
352
  | `LONGPARSER_VECTOR_DB` | `chroma` | Vector store backend |
353
+ | `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins |
354
+ | `LONGPARSER_RATE_LIMIT` | `60` | Max RPM per tenant |
355
+ | `LONGPARSER_ADMIN_KEYS` | (empty) | Comma-separated admin API keys |
352
356
 
353
357
  ---
354
358
 
@@ -29,6 +29,7 @@ src/longparser/server/vectorstores.py
29
29
  src/longparser/server/worker.py
30
30
  src/longparser/server/chat/__init__.py
31
31
  src/longparser/server/chat/callbacks.py
32
+ src/longparser/server/chat/checkpointer.py
32
33
  src/longparser/server/chat/engine.py
33
34
  src/longparser/server/chat/graph.py
34
35
  src/longparser/server/chat/llm_chain.py
@@ -1,6 +1,7 @@
1
1
  pydantic<3,>=2.0
2
2
  docling>=2.14
3
3
  docling-core>=2.13
4
+ langgraph-checkpoint-mongodb>=0.3.1
4
5
 
5
6
  [all]
6
7
  longparser[cpu]
File without changes