longparser 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {longparser-0.1.1 → longparser-0.1.3}/PKG-INFO +15 -11
- {longparser-0.1.1 → longparser-0.1.3}/README.md +13 -10
- {longparser-0.1.1 → longparser-0.1.3}/pyproject.toml +2 -1
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/__init__.py +8 -4
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/chunkers/hybrid_chunker.py +2 -2
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/extractors/docling_extractor.py +16 -15
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/pipeline/__init__.py +4 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/app.py +139 -19
- longparser-0.1.3/src/longparser/server/chat/checkpointer.py +45 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/engine.py +2 -2
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/graph.py +10 -9
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/llm_chain.py +6 -4
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/schemas.py +1 -1
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/db.py +5 -5
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/embeddings.py +4 -4
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/queue.py +1 -6
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/vectorstores.py +8 -8
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/worker.py +5 -5
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser.egg-info/PKG-INFO +15 -11
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser.egg-info/SOURCES.txt +1 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser.egg-info/requires.txt +1 -0
- {longparser-0.1.1 → longparser-0.1.3}/setup.cfg +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/chunkers/__init__.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/extractors/__init__.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/extractors/base.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/extractors/latex_ocr.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/integrations/__init__.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/integrations/langchain.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/integrations/llamaindex.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/pipeline/orchestrator.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/py.typed +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/schemas.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/__init__.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/__init__.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/callbacks.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/chat/retriever.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/routers/__init__.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/server/schemas.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/utils/__init__.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser/utils/rtl_detector.py +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser.egg-info/dependency_links.txt +0 -0
- {longparser-0.1.1 → longparser-0.1.3}/src/longparser.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: longparser
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines.
|
|
5
5
|
Author-email: ENDEVSOLS Team <technology@endevsols.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -27,6 +27,7 @@ Description-Content-Type: text/markdown
|
|
|
27
27
|
Requires-Dist: pydantic<3,>=2.0
|
|
28
28
|
Requires-Dist: docling>=2.14
|
|
29
29
|
Requires-Dist: docling-core>=2.13
|
|
30
|
+
Requires-Dist: langgraph-checkpoint-mongodb>=0.3.1
|
|
30
31
|
Provides-Extra: pptx
|
|
31
32
|
Requires-Dist: python-pptx>=1.0; extra == "pptx"
|
|
32
33
|
Provides-Extra: langchain
|
|
@@ -109,8 +110,7 @@ Requires-Dist: httpx>=0.27; extra == "dev"
|
|
|
109
110
|
Requires-Dist: anyio>=4.0; extra == "dev"
|
|
110
111
|
|
|
111
112
|
<p align="center">
|
|
112
|
-
|
|
113
|
-
<h1 align="center">LongParser</h1>
|
|
113
|
+
<img src="https://raw.githubusercontent.com/ENDEVSOLS/LongParser/main/docs/assets/logo.png" alt="LongParser" width="320">
|
|
114
114
|
<p align="center"><strong>Privacy-first document intelligence engine for production RAG pipelines.</strong></p>
|
|
115
115
|
<p align="center">
|
|
116
116
|
Parse PDFs, DOCX, PPTX, XLSX & CSV → validated, AI-ready chunks with HITL review.
|
|
@@ -129,7 +129,7 @@ Requires-Dist: anyio>=4.0; extra == "dev"
|
|
|
129
129
|
<img src="https://static.pepy.tech/badge/longparser/month" alt="Monthly Downloads">
|
|
130
130
|
</a>
|
|
131
131
|
<a href="https://www.python.org/">
|
|
132
|
-
<img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue.svg" alt="Python">
|
|
132
|
+
<img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python">
|
|
133
133
|
</a>
|
|
134
134
|
<a href="LICENSE">
|
|
135
135
|
<img src="https://img.shields.io/badge/License-MIT-brightgreen.svg" alt="MIT License">
|
|
@@ -150,11 +150,12 @@ Requires-Dist: anyio>=4.0; extra == "dev"
|
|
|
150
150
|
| **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
|
|
151
151
|
| **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
|
|
152
152
|
| **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
|
|
153
|
-
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` |
|
|
153
|
+
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
|
|
154
154
|
| **3-layer memory** | Short-term turns + rolling summary + long-term facts |
|
|
155
155
|
| **Multi-provider LLM** | OpenAI, Gemini, Groq, OpenRouter |
|
|
156
156
|
| **Multi-backend vectors** | Chroma, FAISS, Qdrant |
|
|
157
|
-
| **
|
|
157
|
+
| **Production-ready API** | FastAPI + Motor (MongoDB) + ARQ + Redis (Queue & Rate Limiting) |
|
|
158
|
+
| **Enterprise Security** | Tenant isolation, Role-Based Access Control (RBAC), and CORS |
|
|
158
159
|
| **LangChain adapters** | Drop-in `BaseRetriever` and LlamaIndex `QueryEngine` |
|
|
159
160
|
| **Privacy-first** | All processing runs locally; no data leaves your infra |
|
|
160
161
|
|
|
@@ -215,9 +216,9 @@ pip install "longparser[cpu]"
|
|
|
215
216
|
### Python SDK
|
|
216
217
|
|
|
217
218
|
```python
|
|
218
|
-
from longparser import
|
|
219
|
+
from longparser import DocumentPipeline, ProcessingConfig
|
|
219
220
|
|
|
220
|
-
pipeline =
|
|
221
|
+
pipeline = DocumentPipeline(ProcessingConfig())
|
|
221
222
|
result = pipeline.process_file("document.pdf")
|
|
222
223
|
|
|
223
224
|
print(f"Pages: {result.document.metadata.total_pages}")
|
|
@@ -296,7 +297,7 @@ src/longparser/
|
|
|
296
297
|
├── schemas.py ← core Pydantic models (Document, Block, Chunk, …)
|
|
297
298
|
├── extractors/ ← Docling, LaTeX OCR backends
|
|
298
299
|
├── chunkers/ ← HybridChunker
|
|
299
|
-
├── pipeline/ ←
|
|
300
|
+
├── pipeline/ ← DocumentPipeline
|
|
300
301
|
├── integrations/ ← LangChain loader & LlamaIndex reader
|
|
301
302
|
├── utils/ ← shared helpers (RTL detection, …)
|
|
302
303
|
└── server/ ← REST API layer
|
|
@@ -344,11 +345,14 @@ Copy `.env.example` to `.env` and set:
|
|
|
344
345
|
| Variable | Default | Description |
|
|
345
346
|
|----------|---------|-------------|
|
|
346
347
|
| `LONGPARSER_MONGO_URL` | `mongodb://localhost:27017` | MongoDB connection |
|
|
347
|
-
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue |
|
|
348
|
+
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue & rate limits |
|
|
348
349
|
| `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider |
|
|
349
|
-
| `LONGPARSER_LLM_MODEL` | `gpt-
|
|
350
|
+
| `LONGPARSER_LLM_MODEL` | `gpt-5.3` | Model name |
|
|
350
351
|
| `LONGPARSER_EMBED_PROVIDER` | `huggingface` | Embedding provider |
|
|
351
352
|
| `LONGPARSER_VECTOR_DB` | `chroma` | Vector store backend |
|
|
353
|
+
| `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins |
|
|
354
|
+
| `LONGPARSER_RATE_LIMIT` | `60` | Max RPM per tenant |
|
|
355
|
+
| `LONGPARSER_ADMIN_KEYS` | (empty) | Comma-separated admin API keys |
|
|
352
356
|
|
|
353
357
|
---
|
|
354
358
|
|
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
<p align="center">
|
|
2
|
-
|
|
3
|
-
<h1 align="center">LongParser</h1>
|
|
2
|
+
<img src="https://raw.githubusercontent.com/ENDEVSOLS/LongParser/main/docs/assets/logo.png" alt="LongParser" width="320">
|
|
4
3
|
<p align="center"><strong>Privacy-first document intelligence engine for production RAG pipelines.</strong></p>
|
|
5
4
|
<p align="center">
|
|
6
5
|
Parse PDFs, DOCX, PPTX, XLSX & CSV → validated, AI-ready chunks with HITL review.
|
|
@@ -19,7 +18,7 @@
|
|
|
19
18
|
<img src="https://static.pepy.tech/badge/longparser/month" alt="Monthly Downloads">
|
|
20
19
|
</a>
|
|
21
20
|
<a href="https://www.python.org/">
|
|
22
|
-
<img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue.svg" alt="Python">
|
|
21
|
+
<img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python">
|
|
23
22
|
</a>
|
|
24
23
|
<a href="LICENSE">
|
|
25
24
|
<img src="https://img.shields.io/badge/License-MIT-brightgreen.svg" alt="MIT License">
|
|
@@ -40,11 +39,12 @@
|
|
|
40
39
|
| **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
|
|
41
40
|
| **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
|
|
42
41
|
| **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
|
|
43
|
-
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` |
|
|
42
|
+
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
|
|
44
43
|
| **3-layer memory** | Short-term turns + rolling summary + long-term facts |
|
|
45
44
|
| **Multi-provider LLM** | OpenAI, Gemini, Groq, OpenRouter |
|
|
46
45
|
| **Multi-backend vectors** | Chroma, FAISS, Qdrant |
|
|
47
|
-
| **
|
|
46
|
+
| **Production-ready API** | FastAPI + Motor (MongoDB) + ARQ + Redis (Queue & Rate Limiting) |
|
|
47
|
+
| **Enterprise Security** | Tenant isolation, Role-Based Access Control (RBAC), and CORS |
|
|
48
48
|
| **LangChain adapters** | Drop-in `BaseRetriever` and LlamaIndex `QueryEngine` |
|
|
49
49
|
| **Privacy-first** | All processing runs locally; no data leaves your infra |
|
|
50
50
|
|
|
@@ -105,9 +105,9 @@ pip install "longparser[cpu]"
|
|
|
105
105
|
### Python SDK
|
|
106
106
|
|
|
107
107
|
```python
|
|
108
|
-
from longparser import
|
|
108
|
+
from longparser import DocumentPipeline, ProcessingConfig
|
|
109
109
|
|
|
110
|
-
pipeline =
|
|
110
|
+
pipeline = DocumentPipeline(ProcessingConfig())
|
|
111
111
|
result = pipeline.process_file("document.pdf")
|
|
112
112
|
|
|
113
113
|
print(f"Pages: {result.document.metadata.total_pages}")
|
|
@@ -186,7 +186,7 @@ src/longparser/
|
|
|
186
186
|
├── schemas.py ← core Pydantic models (Document, Block, Chunk, …)
|
|
187
187
|
├── extractors/ ← Docling, LaTeX OCR backends
|
|
188
188
|
├── chunkers/ ← HybridChunker
|
|
189
|
-
├── pipeline/ ←
|
|
189
|
+
├── pipeline/ ← DocumentPipeline
|
|
190
190
|
├── integrations/ ← LangChain loader & LlamaIndex reader
|
|
191
191
|
├── utils/ ← shared helpers (RTL detection, …)
|
|
192
192
|
└── server/ ← REST API layer
|
|
@@ -234,11 +234,14 @@ Copy `.env.example` to `.env` and set:
|
|
|
234
234
|
| Variable | Default | Description |
|
|
235
235
|
|----------|---------|-------------|
|
|
236
236
|
| `LONGPARSER_MONGO_URL` | `mongodb://localhost:27017` | MongoDB connection |
|
|
237
|
-
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue |
|
|
237
|
+
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue & rate limits |
|
|
238
238
|
| `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider |
|
|
239
|
-
| `LONGPARSER_LLM_MODEL` | `gpt-
|
|
239
|
+
| `LONGPARSER_LLM_MODEL` | `gpt-5.3` | Model name |
|
|
240
240
|
| `LONGPARSER_EMBED_PROVIDER` | `huggingface` | Embedding provider |
|
|
241
241
|
| `LONGPARSER_VECTOR_DB` | `chroma` | Vector store backend |
|
|
242
|
+
| `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins |
|
|
243
|
+
| `LONGPARSER_RATE_LIMIT` | `60` | Max RPM per tenant |
|
|
244
|
+
| `LONGPARSER_ADMIN_KEYS` | (empty) | Comma-separated admin API keys |
|
|
242
245
|
|
|
243
246
|
---
|
|
244
247
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "longparser"
|
|
7
|
-
version = "0.1.1"
|
|
7
|
+
version = "0.1.3"
|
|
8
8
|
description = "Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines."
|
|
9
9
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -35,6 +35,7 @@ dependencies = [
|
|
|
35
35
|
"pydantic>=2.0,<3",
|
|
36
36
|
"docling>=2.14",
|
|
37
37
|
"docling-core>=2.13",
|
|
38
|
+
"langgraph-checkpoint-mongodb>=0.3.1",
|
|
38
39
|
]
|
|
39
40
|
|
|
40
41
|
[project.optional-dependencies]
|
|
@@ -9,9 +9,9 @@ Built by ENDEVSOLS for production RAG pipelines.
|
|
|
9
9
|
|
|
10
10
|
Quick start::
|
|
11
11
|
|
|
12
|
-
from longparser import
|
|
12
|
+
from longparser import DocumentPipeline, ProcessingConfig
|
|
13
13
|
|
|
14
|
-
pipeline =
|
|
14
|
+
pipeline = DocumentPipeline(ProcessingConfig())
|
|
15
15
|
result = pipeline.process_file("document.pdf")
|
|
16
16
|
print(result.chunks[0].text)
|
|
17
17
|
|
|
@@ -19,13 +19,13 @@ For the full REST API server::
|
|
|
19
19
|
|
|
20
20
|
uv run uvicorn longparser.server.app:app --reload --port 8000
|
|
21
21
|
|
|
22
|
-
See :class:`~longparser.pipeline.
|
|
22
|
+
See :class:`~longparser.pipeline.DocumentPipeline` for the main SDK entry
|
|
23
23
|
point and :mod:`longparser.server` for the REST API layer.
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
26
|
from __future__ import annotations
|
|
27
27
|
|
|
28
|
-
__version__ = "0.1.1"
|
|
28
|
+
__version__ = "0.1.3"
|
|
29
29
|
__author__ = "ENDEVSOLS Team"
|
|
30
30
|
__license__ = "MIT"
|
|
31
31
|
|
|
@@ -62,6 +62,9 @@ def __getattr__(name: str):
|
|
|
62
62
|
if name == "PipelineOrchestrator":
|
|
63
63
|
from .pipeline import PipelineOrchestrator
|
|
64
64
|
return PipelineOrchestrator
|
|
65
|
+
if name == "DocumentPipeline":
|
|
66
|
+
from .pipeline import DocumentPipeline
|
|
67
|
+
return DocumentPipeline
|
|
65
68
|
if name == "PipelineResult":
|
|
66
69
|
from .pipeline import PipelineResult
|
|
67
70
|
return PipelineResult
|
|
@@ -99,6 +102,7 @@ __all__ = [
|
|
|
99
102
|
# Lazily imported (require extras)
|
|
100
103
|
"DoclingExtractor",
|
|
101
104
|
"PipelineOrchestrator",
|
|
105
|
+
"DocumentPipeline",
|
|
102
106
|
"PipelineResult",
|
|
103
107
|
"HybridChunker",
|
|
104
108
|
]
|
|
@@ -345,10 +345,10 @@ def _generate_schema_chunk(
|
|
|
345
345
|
sample_rows.append(f" Row {r_idx}: " + "; ".join(parts))
|
|
346
346
|
|
|
347
347
|
lines = [
|
|
348
|
-
|
|
348
|
+
"[TABLE SCHEMA]",
|
|
349
349
|
f"Table ID: {block.block_id}",
|
|
350
350
|
f"Rows: {n_data} (data rows), Columns: {n_cols}",
|
|
351
|
-
|
|
351
|
+
"Columns:",
|
|
352
352
|
]
|
|
353
353
|
lines.extend(col_profiles)
|
|
354
354
|
lines.append(f"Sample Rows ({sample_count}):")
|
|
@@ -254,7 +254,7 @@ class DoclingExtractor(BaseExtractor):
|
|
|
254
254
|
# Order-based substitution with alignment gate
|
|
255
255
|
injected = 0
|
|
256
256
|
_non_omml = 0
|
|
257
|
-
for block, latex in zip(formula_blocks, latex_eqs):
|
|
257
|
+
for block, latex in zip(formula_blocks, latex_eqs, strict=False):
|
|
258
258
|
orig_len = len(block.text.strip()) if block.text else 0
|
|
259
259
|
latex_len = len(latex.strip())
|
|
260
260
|
|
|
@@ -431,7 +431,8 @@ class DoclingExtractor(BaseExtractor):
|
|
|
431
431
|
page_img = None
|
|
432
432
|
try:
|
|
433
433
|
page_img = page_obj.image.pil_image
|
|
434
|
-
except Exception:
|
|
434
|
+
except Exception as e:
|
|
435
|
+
logger.warning("Failed to extract image for formula scanning: %s", e)
|
|
435
436
|
continue
|
|
436
437
|
if page_img is None:
|
|
437
438
|
continue
|
|
@@ -527,8 +528,8 @@ class DoclingExtractor(BaseExtractor):
|
|
|
527
528
|
# Update label to formula so downstream sees it correctly
|
|
528
529
|
try:
|
|
529
530
|
item.label = type(item.label)("formula")
|
|
530
|
-
except Exception:
|
|
531
|
-
|
|
531
|
+
except Exception as e:
|
|
532
|
+
logger.debug(f"Failed to update formula label: {e}")
|
|
532
533
|
replaced = True
|
|
533
534
|
logger.debug(f"MFD: replaced garbled block on page {page_no}")
|
|
534
535
|
break
|
|
@@ -1023,15 +1024,15 @@ class DoclingExtractor(BaseExtractor):
|
|
|
1023
1024
|
if isinstance(item, TableItem) and hasattr(item, 'export_to_markdown'):
|
|
1024
1025
|
try:
|
|
1025
1026
|
return item.export_to_markdown(doc=docling_doc)
|
|
1026
|
-
except Exception:
|
|
1027
|
-
|
|
1027
|
+
except Exception as e:
|
|
1028
|
+
logger.debug(f"Failed to export table item to markdown: {e}")
|
|
1028
1029
|
if hasattr(item, 'text') and item.text:
|
|
1029
1030
|
return item.text
|
|
1030
1031
|
if hasattr(item, 'export_to_markdown'):
|
|
1031
1032
|
try:
|
|
1032
1033
|
return item.export_to_markdown()
|
|
1033
|
-
except Exception:
|
|
1034
|
-
|
|
1034
|
+
except Exception as e:
|
|
1035
|
+
logger.debug(f"Failed to export item to markdown: {e}")
|
|
1035
1036
|
return ""
|
|
1036
1037
|
|
|
1037
1038
|
def _get_item_confidence(self, item) -> float:
|
|
@@ -1080,10 +1081,10 @@ class DoclingExtractor(BaseExtractor):
|
|
|
1080
1081
|
if s.placeholder_format.type == PP_PH.SUBTITLE:
|
|
1081
1082
|
has_subtitle_placeholder = True
|
|
1082
1083
|
break
|
|
1083
|
-
except Exception:
|
|
1084
|
-
|
|
1085
|
-
except ImportError:
|
|
1086
|
-
|
|
1084
|
+
except Exception as e:
|
|
1085
|
+
logger.debug(f"Failed to check PPTX subtitle placeholder format: {e}")
|
|
1086
|
+
except ImportError as e:
|
|
1087
|
+
logger.debug(f"Failed to import python-pptx: {e}")
|
|
1087
1088
|
|
|
1088
1089
|
for shape in slide.shapes:
|
|
1089
1090
|
found_title = self._extract_pptx_shape_info(
|
|
@@ -1160,8 +1161,8 @@ class DoclingExtractor(BaseExtractor):
|
|
|
1160
1161
|
is_subtitle_shape = True
|
|
1161
1162
|
elif ph_type in (PP_PLACEHOLDER.DATE, PP_PLACEHOLDER.FOOTER, PP_PLACEHOLDER.SLIDE_NUMBER):
|
|
1162
1163
|
is_footer_shape = True
|
|
1163
|
-
except Exception:
|
|
1164
|
-
|
|
1164
|
+
except Exception as e:
|
|
1165
|
+
logger.debug(f"Failed to check PPTX placeholder format type: {e}")
|
|
1165
1166
|
|
|
1166
1167
|
# Skip footer/date/slide-number shapes entirely
|
|
1167
1168
|
if is_footer_shape:
|
|
@@ -1267,7 +1268,7 @@ class DoclingExtractor(BaseExtractor):
|
|
|
1267
1268
|
|
|
1268
1269
|
# Calculate file hash
|
|
1269
1270
|
with open(file_path, "rb") as f:
|
|
1270
|
-
file_hash = hashlib.
|
|
1271
|
+
file_hash = hashlib.sha256(f.read()).hexdigest()
|
|
1271
1272
|
|
|
1272
1273
|
# Get conversion result (cached or new)
|
|
1273
1274
|
result = self._run_docling(file_path, config)
|
|
@@ -13,6 +13,7 @@ try:
|
|
|
13
13
|
except ImportError:
|
|
14
14
|
pass
|
|
15
15
|
|
|
16
|
+
from collections import defaultdict
|
|
16
17
|
import hashlib
|
|
17
18
|
import io
|
|
18
19
|
import logging
|
|
@@ -25,6 +26,7 @@ from datetime import datetime, timezone
|
|
|
25
26
|
from pathlib import Path
|
|
26
27
|
from typing import Optional
|
|
27
28
|
import time as _time
|
|
29
|
+
import redis.asyncio as redis
|
|
28
30
|
|
|
29
31
|
from fastapi import (
|
|
30
32
|
FastAPI,
|
|
@@ -35,6 +37,7 @@ from fastapi import (
|
|
|
35
37
|
Request,
|
|
36
38
|
UploadFile,
|
|
37
39
|
)
|
|
40
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
38
41
|
from fastapi.responses import JSONResponse, StreamingResponse
|
|
39
42
|
|
|
40
43
|
from .db import Database
|
|
@@ -57,6 +60,15 @@ from .schemas import (
|
|
|
57
60
|
SearchResponse,
|
|
58
61
|
SearchResult,
|
|
59
62
|
)
|
|
63
|
+
from .chat.schemas import (
|
|
64
|
+
ChatConfig,
|
|
65
|
+
ChatRequest,
|
|
66
|
+
ChatResponse,
|
|
67
|
+
CreateSessionRequest,
|
|
68
|
+
HITLResumeRequest,
|
|
69
|
+
LLMAnswer,
|
|
70
|
+
SourceRef,
|
|
71
|
+
)
|
|
60
72
|
|
|
61
73
|
logger = logging.getLogger(__name__)
|
|
62
74
|
|
|
@@ -92,8 +104,18 @@ queue = ARQBackend(
|
|
|
92
104
|
async def lifespan(app: FastAPI):
|
|
93
105
|
"""Startup/shutdown hooks."""
|
|
94
106
|
await db.create_indexes()
|
|
107
|
+
|
|
108
|
+
from .chat.checkpointer import init_checkpointer, close_checkpointer
|
|
109
|
+
await init_checkpointer(
|
|
110
|
+
mongo_uri=os.getenv("LONGPARSER_MONGO_URL", "mongodb://localhost:27017"),
|
|
111
|
+
db_name=os.getenv("LONGPARSER_DB_NAME", "longparser"),
|
|
112
|
+
)
|
|
113
|
+
|
|
95
114
|
logger.info("LongParser API started")
|
|
96
115
|
yield
|
|
116
|
+
|
|
117
|
+
await close_checkpointer()
|
|
118
|
+
|
|
97
119
|
await queue.close()
|
|
98
120
|
await db.close()
|
|
99
121
|
if hasattr(app.state, "chat_engine"):
|
|
@@ -104,11 +126,69 @@ async def lifespan(app: FastAPI):
|
|
|
104
126
|
app = FastAPI(
|
|
105
127
|
title="LongParser API",
|
|
106
128
|
description="Document intelligence engine with HITL review, embedding, and vector search.",
|
|
107
|
-
version="
|
|
129
|
+
version=__import__("longparser").__version__,
|
|
108
130
|
lifespan=lifespan,
|
|
109
131
|
)
|
|
110
132
|
|
|
111
133
|
|
|
134
|
+
# ---------------------------------------------------------------------------
|
|
135
|
+
# CORS middleware
|
|
136
|
+
# ---------------------------------------------------------------------------
|
|
137
|
+
|
|
138
|
+
app.add_middleware(
|
|
139
|
+
CORSMiddleware,
|
|
140
|
+
allow_origins=os.getenv("LONGPARSER_CORS_ORIGINS", "*").split(","),
|
|
141
|
+
allow_credentials=True,
|
|
142
|
+
allow_methods=["*"],
|
|
143
|
+
allow_headers=["*"],
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ---------------------------------------------------------------------------
|
|
148
|
+
# Global exception handler
|
|
149
|
+
# ---------------------------------------------------------------------------
|
|
150
|
+
|
|
151
|
+
@app.exception_handler(Exception)
|
|
152
|
+
async def global_exception_handler(request: Request, exc: Exception):
|
|
153
|
+
"""Catch unhandled exceptions — return sanitized error, log full trace."""
|
|
154
|
+
logger.exception("Unhandled exception", exc_info=exc)
|
|
155
|
+
return JSONResponse(
|
|
156
|
+
status_code=500,
|
|
157
|
+
content={"detail": "Internal server error"},
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
# Rate limiter (Redis sliding window)
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
class RedisRateLimiter:
|
|
166
|
+
"""Redis-backed sliding-window rate limiter (per-tenant) for multi-worker scale."""
|
|
167
|
+
|
|
168
|
+
def __init__(self, redis_url: str, max_requests: int = 60, window_seconds: int = 60):
|
|
169
|
+
self.max_requests = max_requests
|
|
170
|
+
self.window = window_seconds
|
|
171
|
+
self.redis = redis.from_url(redis_url)
|
|
172
|
+
|
|
173
|
+
async def check(self, key: str) -> bool:
|
|
174
|
+
now = _time.time()
|
|
175
|
+
redis_key = f"rate_limit:{key}"
|
|
176
|
+
pipeline = self.redis.pipeline()
|
|
177
|
+
pipeline.zremrangebyscore(redis_key, 0, now - self.window)
|
|
178
|
+
pipeline.zadd(redis_key, {str(now): now})
|
|
179
|
+
pipeline.zcard(redis_key)
|
|
180
|
+
pipeline.expire(redis_key, self.window)
|
|
181
|
+
results = await pipeline.execute()
|
|
182
|
+
return results[2] <= self.max_requests
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
_rate_limiter = RedisRateLimiter(
|
|
186
|
+
redis_url=os.getenv("LONGPARSER_REDIS_URL", "redis://localhost:6379/0"),
|
|
187
|
+
max_requests=int(os.getenv("LONGPARSER_RATE_LIMIT", "60")),
|
|
188
|
+
window_seconds=60,
|
|
189
|
+
)
|
|
190
|
+
|
|
191
|
+
|
|
112
192
|
# ---------------------------------------------------------------------------
|
|
113
193
|
# Auth middleware (API key — v1)
|
|
114
194
|
# ---------------------------------------------------------------------------
|
|
@@ -121,8 +201,33 @@ def _get_tenant(x_api_key: str = Header(...)) -> str:
|
|
|
121
201
|
"""
|
|
122
202
|
if not x_api_key or len(x_api_key) < 8:
|
|
123
203
|
raise HTTPException(status_code=401, detail="Invalid API key")
|
|
124
|
-
#
|
|
125
|
-
return hashlib.sha256(x_api_key.encode()).hexdigest()[:
|
|
204
|
+
# Use 32 hex chars (128-bit) to resist brute-force collision attacks
|
|
205
|
+
return hashlib.sha256(x_api_key.encode()).hexdigest()[:32]
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# ---------------------------------------------------------------------------
|
|
209
|
+
# RBAC (role-based access control)
|
|
210
|
+
# ---------------------------------------------------------------------------
|
|
211
|
+
|
|
212
|
+
_ADMIN_KEYS: set[str] = set(
|
|
213
|
+
k.strip() for k in os.getenv("LONGPARSER_ADMIN_KEYS", "").split(",") if k.strip()
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _get_role(x_api_key: str) -> str:
|
|
218
|
+
"""Resolve user role from API key.
|
|
219
|
+
|
|
220
|
+
If LONGPARSER_ADMIN_KEYS is not set, all users are admins (backward compatible).
|
|
221
|
+
"""
|
|
222
|
+
if not _ADMIN_KEYS:
|
|
223
|
+
return "admin"
|
|
224
|
+
return "admin" if x_api_key in _ADMIN_KEYS else "reviewer"
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def _require_admin(x_api_key: str) -> None:
|
|
228
|
+
"""Raise 403 if the API key does not have admin role."""
|
|
229
|
+
if _get_role(x_api_key) != "admin":
|
|
230
|
+
raise HTTPException(status_code=403, detail="Admin access required")
|
|
126
231
|
|
|
127
232
|
|
|
128
233
|
# ---------------------------------------------------------------------------
|
|
@@ -175,14 +280,23 @@ async def create_job(
|
|
|
175
280
|
|
|
176
281
|
# Generate job ID and save file
|
|
177
282
|
job_id = str(uuid.uuid4())
|
|
178
|
-
|
|
283
|
+
|
|
284
|
+
# --- Path Traversal Protection ---
|
|
285
|
+
# Strip all directory components from the user-provided filename
|
|
286
|
+
# to prevent payloads like "../../../etc/passwd" from escaping UPLOAD_DIR.
|
|
287
|
+
raw_name = file.filename or "document"
|
|
288
|
+
safe_name = Path(raw_name).name # keeps only the final component
|
|
289
|
+
if not safe_name or safe_name in (".", ".."):
|
|
290
|
+
safe_name = "document"
|
|
291
|
+
|
|
292
|
+
dest = UPLOAD_DIR / tenant_id / job_id / safe_name
|
|
179
293
|
file_hash, file_size = await _stream_upload(file, dest)
|
|
180
294
|
|
|
181
295
|
# Create job in MongoDB
|
|
182
296
|
job_doc = await db.create_job(
|
|
183
297
|
tenant_id=tenant_id,
|
|
184
298
|
job_id=job_id,
|
|
185
|
-
source_file=
|
|
299
|
+
source_file=safe_name,
|
|
186
300
|
file_hash=file_hash,
|
|
187
301
|
)
|
|
188
302
|
|
|
@@ -197,7 +311,7 @@ async def create_job(
|
|
|
197
311
|
job_id=job_id,
|
|
198
312
|
tenant_id=tenant_id,
|
|
199
313
|
status=JobStatus.QUEUED,
|
|
200
|
-
source_file=
|
|
314
|
+
source_file=safe_name,
|
|
201
315
|
file_hash=file_hash,
|
|
202
316
|
created_at=job_doc["created_at"],
|
|
203
317
|
)
|
|
@@ -498,6 +612,7 @@ async def purge_block(
|
|
|
498
612
|
x_api_key: str = Header(...),
|
|
499
613
|
):
|
|
500
614
|
"""Admin-only: permanently delete a block. Writes a tombstone revision."""
|
|
615
|
+
_require_admin(x_api_key)
|
|
501
616
|
tenant_id = _get_tenant(x_api_key)
|
|
502
617
|
|
|
503
618
|
# Get block before deletion (for tombstone)
|
|
@@ -545,6 +660,7 @@ async def purge_chunk(
|
|
|
545
660
|
x_api_key: str = Header(...),
|
|
546
661
|
):
|
|
547
662
|
"""Admin-only: permanently delete a chunk. Writes a tombstone revision."""
|
|
663
|
+
_require_admin(x_api_key)
|
|
548
664
|
tenant_id = _get_tenant(x_api_key)
|
|
549
665
|
|
|
550
666
|
# Get chunk before deletion
|
|
@@ -852,8 +968,19 @@ async def search(body: SearchRequest, x_api_key: str = Header(...)):
|
|
|
852
968
|
|
|
853
969
|
@app.middleware("http")
|
|
854
970
|
async def observability_middleware(request: Request, call_next):
|
|
855
|
-
"""Attach request_id and log structured request data."""
|
|
971
|
+
"""Attach request_id, enforce rate limits, and log structured request data."""
|
|
856
972
|
request_id = str(uuid.uuid4())[:8]
|
|
973
|
+
|
|
974
|
+
# ── Rate limiting (skip unauthenticated endpoints) ──
|
|
975
|
+
api_key = request.headers.get("x-api-key")
|
|
976
|
+
if api_key and len(api_key) >= 8:
|
|
977
|
+
tenant_key = hashlib.sha256(api_key.encode()).hexdigest()[:32]
|
|
978
|
+
if not await _rate_limiter.check(tenant_key):
|
|
979
|
+
return JSONResponse(
|
|
980
|
+
status_code=429,
|
|
981
|
+
content={"detail": "Rate limit exceeded. Try again later."},
|
|
982
|
+
)
|
|
983
|
+
|
|
857
984
|
start = _time.monotonic()
|
|
858
985
|
response = await call_next(request)
|
|
859
986
|
latency_ms = (_time.monotonic() - start) * 1000
|
|
@@ -876,12 +1003,10 @@ async def observability_middleware(request: Request, call_next):
|
|
|
876
1003
|
|
|
877
1004
|
@app.post("/chat/sessions", status_code=201)
|
|
878
1005
|
async def create_chat_session(
|
|
879
|
-
|
|
1006
|
+
req: CreateSessionRequest,
|
|
880
1007
|
x_api_key: str = Header(...),
|
|
881
1008
|
):
|
|
882
1009
|
"""Create a new chat session (server-generated session_id)."""
|
|
883
|
-
from .chat.schemas import CreateSessionRequest
|
|
884
|
-
req = CreateSessionRequest(**body)
|
|
885
1010
|
tenant_id = _get_tenant(x_api_key)
|
|
886
1011
|
|
|
887
1012
|
# Verify job belongs to tenant
|
|
@@ -930,17 +1055,15 @@ async def delete_chat_session(
|
|
|
930
1055
|
|
|
931
1056
|
@app.post("/chat")
|
|
932
1057
|
async def chat(
|
|
933
|
-
|
|
1058
|
+
req: ChatRequest,
|
|
934
1059
|
x_api_key: str = Header(...),
|
|
935
1060
|
):
|
|
936
1061
|
"""Ask a question — RAG chatbot with 3-layer memory.
|
|
937
1062
|
|
|
938
1063
|
Set require_approval=true for Human-in-the-Loop review.
|
|
939
1064
|
"""
|
|
940
|
-
from .chat.schemas import ChatRequest, ChatResponse, ChatConfig
|
|
941
1065
|
from .chat.engine import ChatEngine
|
|
942
1066
|
|
|
943
|
-
req = ChatRequest(**body)
|
|
944
1067
|
tenant_id = _get_tenant(x_api_key)
|
|
945
1068
|
|
|
946
1069
|
# ── Session ↔ Job binding validation ──
|
|
@@ -965,7 +1088,6 @@ async def chat(
|
|
|
965
1088
|
|
|
966
1089
|
# ── HITL: if require_approval, pause for human review ──
|
|
967
1090
|
if req.require_approval and response.status == "complete":
|
|
968
|
-
from .chat.schemas import LLMAnswer, SourceRef
|
|
969
1091
|
from .chat.graph import start_hitl_review
|
|
970
1092
|
|
|
971
1093
|
answer_obj = LLMAnswer(
|
|
@@ -988,14 +1110,12 @@ async def chat(
|
|
|
988
1110
|
|
|
989
1111
|
@app.post("/chat/resume")
|
|
990
1112
|
async def resume_chat(
|
|
991
|
-
|
|
1113
|
+
req: HITLResumeRequest,
|
|
992
1114
|
x_api_key: str = Header(...),
|
|
993
1115
|
):
|
|
994
1116
|
"""Resume a paused HITL chat with human decision (approve/edit/reject)."""
|
|
995
|
-
from .chat.schemas import HITLResumeRequest, ChatResponse, SourceRef, Turn
|
|
996
1117
|
from .chat.graph import resume_hitl_review
|
|
997
1118
|
|
|
998
|
-
req = HITLResumeRequest(**body)
|
|
999
1119
|
tenant_id = _get_tenant(x_api_key)
|
|
1000
1120
|
|
|
1001
1121
|
# Validate session belongs to tenant
|
|
@@ -1014,7 +1134,7 @@ async def resume_chat(
|
|
|
1014
1134
|
if result.get("status") == "complete":
|
|
1015
1135
|
# Update the last turn's answer if edited
|
|
1016
1136
|
if req.action == "edit" and req.edited_answer:
|
|
1017
|
-
await db.chat_turns.
|
|
1137
|
+
await db.chat_turns.find_one_and_update(
|
|
1018
1138
|
{
|
|
1019
1139
|
"tenant_id": tenant_id,
|
|
1020
1140
|
"session_id": req.session_id,
|
|
@@ -1041,5 +1161,5 @@ async def resume_chat(
|
|
|
1041
1161
|
@app.get("/health")
|
|
1042
1162
|
async def health():
|
|
1043
1163
|
"""Health check endpoint."""
|
|
1044
|
-
return {"status": "ok", "service": "
|
|
1164
|
+
return {"status": "ok", "service": "longparser-api"}
|
|
1045
1165
|
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""LangGraph MongoDB Checkpointer singleton.
|
|
2
|
+
|
|
3
|
+
Holds the global per-worker instance of the MongoDBSaver.
|
|
4
|
+
"""
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Optional
|
|
7
|
+
from pymongo import MongoClient
|
|
8
|
+
from langgraph.checkpoint.mongodb import MongoDBSaver
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
_mongo_client: Optional[MongoClient] = None
|
|
13
|
+
_checkpointer: Optional[MongoDBSaver] = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
async def init_checkpointer(mongo_uri: str, db_name: str) -> None:
|
|
17
|
+
"""Initialize the MongoDB checkpointer on app startup."""
|
|
18
|
+
global _mongo_client, _checkpointer
|
|
19
|
+
if _checkpointer is not None:
|
|
20
|
+
return
|
|
21
|
+
|
|
22
|
+
logger.info("Initializing LangGraph MongoDB checkpointer...")
|
|
23
|
+
# Initialize the sync MongoClient
|
|
24
|
+
_mongo_client = MongoClient(mongo_uri)
|
|
25
|
+
|
|
26
|
+
# Initialize the saver
|
|
27
|
+
_checkpointer = MongoDBSaver(_mongo_client, db_name=db_name)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_checkpointer() -> MongoDBSaver:
|
|
31
|
+
"""Get the active checkpointer instance."""
|
|
32
|
+
global _checkpointer
|
|
33
|
+
if _checkpointer is None:
|
|
34
|
+
raise RuntimeError("Checkpointer not initialized. Call init_checkpointer first.")
|
|
35
|
+
return _checkpointer
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
async def close_checkpointer() -> None:
|
|
39
|
+
"""Close the database checkpointer on app shutdown."""
|
|
40
|
+
global _mongo_client, _checkpointer
|
|
41
|
+
if _mongo_client is not None:
|
|
42
|
+
_mongo_client.close()
|
|
43
|
+
_mongo_client = None
|
|
44
|
+
_checkpointer = None
|
|
45
|
+
logger.info("LangGraph MongoDB checkpointer closed.")
|
|
@@ -76,7 +76,7 @@ RAG_PROMPT = ChatPromptTemplate.from_messages([
|
|
|
76
76
|
# Token Counting (model-aware) — kept as custom logic
|
|
77
77
|
# ---------------------------------------------------------------------------
|
|
78
78
|
|
|
79
|
-
def count_tokens(text: str, model: str = "gpt-
|
|
79
|
+
def count_tokens(text: str, model: str = "gpt-5.3") -> int:
|
|
80
80
|
"""Count tokens — exact for OpenAI models, conservative approx for others."""
|
|
81
81
|
try:
|
|
82
82
|
import tiktoken
|
|
@@ -96,7 +96,7 @@ def budget_trim(
|
|
|
96
96
|
recent_turns: list[dict],
|
|
97
97
|
rolling_summary: str,
|
|
98
98
|
long_term_facts: list[dict],
|
|
99
|
-
model: str = "gpt-
|
|
99
|
+
model: str = "gpt-5.3",
|
|
100
100
|
max_prompt_tokens: int = 6000,
|
|
101
101
|
) -> dict:
|
|
102
102
|
"""Priority-ordered truncation of prompt variables to fit token budget.
|
|
@@ -17,16 +17,14 @@ import logging
|
|
|
17
17
|
import uuid
|
|
18
18
|
from typing import TypedDict, Optional, Any
|
|
19
19
|
|
|
20
|
-
from langgraph.checkpoint.memory import InMemorySaver
|
|
21
20
|
from langgraph.graph import StateGraph, END
|
|
22
21
|
from langgraph.types import interrupt, Command
|
|
23
22
|
|
|
24
23
|
from .schemas import ChatConfig, ChatRequest, ChatResponse, SourceRef, Turn, LLMAnswer
|
|
24
|
+
from .checkpointer import get_checkpointer
|
|
25
25
|
|
|
26
26
|
logger = logging.getLogger(__name__)
|
|
27
27
|
|
|
28
|
-
# Shared checkpointer for all HITL flows
|
|
29
|
-
_checkpointer = InMemorySaver()
|
|
30
28
|
|
|
31
29
|
|
|
32
30
|
# ---------------------------------------------------------------------------
|
|
@@ -103,7 +101,7 @@ async def process_decision(state: HITLState) -> HITLState:
|
|
|
103
101
|
# Build Graph
|
|
104
102
|
# ---------------------------------------------------------------------------
|
|
105
103
|
|
|
106
|
-
def build_hitl_graph() -> Any:
|
|
104
|
+
def build_hitl_graph(checkpointer) -> Any:
|
|
107
105
|
"""Build and compile the HITL state graph."""
|
|
108
106
|
graph = StateGraph(HITLState)
|
|
109
107
|
|
|
@@ -116,11 +114,7 @@ def build_hitl_graph() -> Any:
|
|
|
116
114
|
graph.add_edge("review", "decide")
|
|
117
115
|
graph.add_edge("decide", END)
|
|
118
116
|
|
|
119
|
-
return graph.compile(checkpointer=
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
# Module-level compiled graph
|
|
123
|
-
hitl_graph = build_hitl_graph()
|
|
117
|
+
return graph.compile(checkpointer=checkpointer)
|
|
124
118
|
|
|
125
119
|
|
|
126
120
|
# ---------------------------------------------------------------------------
|
|
@@ -152,6 +146,10 @@ async def start_hitl_review(
|
|
|
152
146
|
}
|
|
153
147
|
|
|
154
148
|
config = {"configurable": {"thread_id": thread_id}}
|
|
149
|
+
|
|
150
|
+
checkpointer = get_checkpointer()
|
|
151
|
+
hitl_graph = build_hitl_graph(checkpointer)
|
|
152
|
+
|
|
155
153
|
_result = await hitl_graph.ainvoke(initial_state, config=config)
|
|
156
154
|
|
|
157
155
|
return {
|
|
@@ -170,6 +168,9 @@ async def resume_hitl_review(
|
|
|
170
168
|
"""Resume a paused HITL flow with the human's decision."""
|
|
171
169
|
config = {"configurable": {"thread_id": thread_id}}
|
|
172
170
|
|
|
171
|
+
checkpointer = get_checkpointer()
|
|
172
|
+
hitl_graph = build_hitl_graph(checkpointer)
|
|
173
|
+
|
|
173
174
|
return await hitl_graph.ainvoke(
|
|
174
175
|
Command(resume={"action": action, "edited_answer": edited_answer}),
|
|
175
176
|
config=config,
|
|
@@ -16,14 +16,16 @@ from .schemas import ChatConfig
|
|
|
16
16
|
logger = logging.getLogger(__name__)
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
# Default models per provider
|
|
19
|
+
# Default models per provider
|
|
20
20
|
DEFAULT_MODELS: dict[str, str] = {
|
|
21
|
-
"openai": "gpt-5.3
|
|
21
|
+
"openai": "gpt-5.3",
|
|
22
22
|
"gemini": "gemini-2.5-flash",
|
|
23
23
|
"groq": "openai/gpt-oss-120b",
|
|
24
|
-
"openrouter": "openai/gpt-5.3
|
|
24
|
+
"openrouter": "openai/gpt-5.3",
|
|
25
25
|
}
|
|
26
26
|
|
|
27
|
+
SUPPORTED_PROVIDERS = list(DEFAULT_MODELS.keys())
|
|
28
|
+
|
|
27
29
|
|
|
28
30
|
def _create_openai(model: str, temperature: float, max_tokens: int,
|
|
29
31
|
max_retries: int, callbacks: Optional[list] = None):
|
|
@@ -113,7 +115,7 @@ def get_chat_model(
|
|
|
113
115
|
"""
|
|
114
116
|
config = config or ChatConfig()
|
|
115
117
|
provider = provider or config.llm_provider
|
|
116
|
-
model = model or config.llm_model or DEFAULT_MODELS.get(provider, "gpt-
|
|
118
|
+
model = model or config.llm_model or DEFAULT_MODELS.get(provider, "gpt-5.3")
|
|
117
119
|
max_tokens = max_tokens or config.max_output_tokens
|
|
118
120
|
|
|
119
121
|
creator = _CREATORS.get(provider)
|
|
@@ -33,7 +33,7 @@ class ChatConfig(BaseModel):
|
|
|
33
33
|
default_factory=lambda: os.getenv("LONGPARSER_LLM_PROVIDER", "openai")
|
|
34
34
|
)
|
|
35
35
|
llm_model: str = Field(
|
|
36
|
-
default_factory=lambda: os.getenv("LONGPARSER_LLM_MODEL", "gpt-
|
|
36
|
+
default_factory=lambda: os.getenv("LONGPARSER_LLM_MODEL", "gpt-5.3")
|
|
37
37
|
)
|
|
38
38
|
max_input_tokens: int = Field(
|
|
39
39
|
default_factory=lambda: int(os.getenv("LONGPARSER_CHAT_MAX_INPUT_TOKENS", "1000"))
|
|
@@ -411,7 +411,7 @@ class Database:
|
|
|
411
411
|
]},
|
|
412
412
|
},
|
|
413
413
|
{"_id": 0},
|
|
414
|
-
).to_list(length=
|
|
414
|
+
).to_list(length=10000) # Cap: embedding batches
|
|
415
415
|
|
|
416
416
|
# -----------------------------------------------------------------------
|
|
417
417
|
# Index versions
|
|
@@ -450,7 +450,7 @@ class Database:
|
|
|
450
450
|
"""List all index versions for a job (for cleanup on delete)."""
|
|
451
451
|
return await self.index_versions.find(
|
|
452
452
|
{"tenant_id": tenant_id, "job_id": job_id}, {"_id": 0}
|
|
453
|
-
).to_list(length=
|
|
453
|
+
).to_list(length=100) # Cap: index versions per job
|
|
454
454
|
|
|
455
455
|
# -----------------------------------------------------------------------
|
|
456
456
|
# Chat Sessions
|
|
@@ -597,7 +597,7 @@ class Database:
|
|
|
597
597
|
{"tenant_id": tenant_id, "session_id": session_id},
|
|
598
598
|
{"_id": 0},
|
|
599
599
|
).sort("created_at", 1)
|
|
600
|
-
return await cursor.to_list(length=
|
|
600
|
+
return await cursor.to_list(length=5000) # Cap: session history
|
|
601
601
|
|
|
602
602
|
async def get_unarchived_turns(
|
|
603
603
|
self, tenant_id: str, session_id: str
|
|
@@ -611,7 +611,7 @@ class Database:
|
|
|
611
611
|
},
|
|
612
612
|
{"_id": 0},
|
|
613
613
|
).sort("created_at", 1)
|
|
614
|
-
return await cursor.to_list(length=
|
|
614
|
+
return await cursor.to_list(length=5000) # Cap: summarization batch
|
|
615
615
|
|
|
616
616
|
async def archive_turns(
|
|
617
617
|
self, tenant_id: str, session_id: str, turn_ids: list[str]
|
|
@@ -645,7 +645,7 @@ class Database:
|
|
|
645
645
|
{"deleted_at": {"$lte": cutoff}},
|
|
646
646
|
{"session_id": 1, "tenant_id": 1, "_id": 0},
|
|
647
647
|
)
|
|
648
|
-
return await cursor.to_list(length=
|
|
648
|
+
return await cursor.to_list(length=1000) # Cap: purge batch
|
|
649
649
|
|
|
650
650
|
# -----------------------------------------------------------------------
|
|
651
651
|
# Lifecycle
|
|
@@ -93,7 +93,7 @@ class EmbeddingEngine:
|
|
|
93
93
|
|
|
94
94
|
# Stable json dump
|
|
95
95
|
cfg_str = json.dumps(config, sort_keys=True)
|
|
96
|
-
return hashlib.
|
|
96
|
+
return hashlib.sha256(cfg_str.encode("utf-8")).hexdigest()[:10]
|
|
97
97
|
|
|
98
98
|
@property
|
|
99
99
|
def dim(self) -> int:
|
|
@@ -108,7 +108,7 @@ class EmbeddingEngine:
|
|
|
108
108
|
return self._dim
|
|
109
109
|
|
|
110
110
|
fp = self.get_fingerprint()
|
|
111
|
-
cache_key = f"
|
|
111
|
+
cache_key = f"longparser:embed_dim:{fp}"
|
|
112
112
|
|
|
113
113
|
# 1) Try Redis cross-process cache if available
|
|
114
114
|
try:
|
|
@@ -145,8 +145,8 @@ class EmbeddingEngine:
|
|
|
145
145
|
try:
|
|
146
146
|
if 'r' in locals():
|
|
147
147
|
r.set(cache_key, self._dim)
|
|
148
|
-
except Exception:
|
|
149
|
-
|
|
148
|
+
except Exception as e:
|
|
149
|
+
logger.debug(f"Failed to set Redis cache: {e}")
|
|
150
150
|
|
|
151
151
|
return self._dim
|
|
152
152
|
|
|
@@ -45,12 +45,7 @@ class ARQBackend(QueueBackend):
|
|
|
45
45
|
from arq import create_pool
|
|
46
46
|
from arq.connections import RedisSettings
|
|
47
47
|
|
|
48
|
-
|
|
49
|
-
# Strip database number (e.g., /0) if present
|
|
50
|
-
url = url.split("/")[0]
|
|
51
|
-
host, _, port_str = url.partition(":")
|
|
52
|
-
port = int(port_str) if port_str else 6379
|
|
53
|
-
self._pool = await create_pool(RedisSettings(host=host, port=port))
|
|
48
|
+
self._pool = await create_pool(RedisSettings.from_dsn(self.redis_url))
|
|
54
49
|
return self._pool
|
|
55
50
|
|
|
56
51
|
async def enqueue(self, task_name: str, payload: dict) -> str:
|
|
@@ -64,7 +64,7 @@ class ChromaStore(BaseVectorStore):
|
|
|
64
64
|
import chromadb
|
|
65
65
|
except ImportError:
|
|
66
66
|
raise ImportError(
|
|
67
|
-
"chromadb is required. Install: pip install
|
|
67
|
+
"chromadb is required. Install: pip install longparser[chroma]"
|
|
68
68
|
)
|
|
69
69
|
|
|
70
70
|
# Securely isolate vector spaces based on model config
|
|
@@ -125,8 +125,8 @@ class ChromaStore(BaseVectorStore):
|
|
|
125
125
|
if isinstance(v, str) and v.startswith("["):
|
|
126
126
|
try:
|
|
127
127
|
meta[k] = json.loads(v)
|
|
128
|
-
except (json.JSONDecodeError, ValueError):
|
|
129
|
-
|
|
128
|
+
except (json.JSONDecodeError, ValueError) as e:
|
|
129
|
+
logger.debug(f"Failed to decode JSON list from Chroma metadata: {e}")
|
|
130
130
|
output.append({
|
|
131
131
|
"id": vid,
|
|
132
132
|
"score": 1.0 - (results["distances"][0][i] if results["distances"] else 0),
|
|
@@ -165,7 +165,7 @@ class FAISSStore(BaseVectorStore):
|
|
|
165
165
|
import faiss # noqa: F401
|
|
166
166
|
except ImportError:
|
|
167
167
|
raise ImportError(
|
|
168
|
-
"faiss-cpu is required. Install: pip install
|
|
168
|
+
"faiss-cpu is required. Install: pip install longparser[faiss-cpu]"
|
|
169
169
|
)
|
|
170
170
|
|
|
171
171
|
self.base_dir = Path(base_dir)
|
|
@@ -297,7 +297,7 @@ class QdrantStore(BaseVectorStore):
|
|
|
297
297
|
from qdrant_client.models import Distance, VectorParams
|
|
298
298
|
except ImportError:
|
|
299
299
|
raise ImportError(
|
|
300
|
-
"qdrant-client is required. Install: pip install
|
|
300
|
+
"qdrant-client is required. Install: pip install longparser[qdrant]"
|
|
301
301
|
)
|
|
302
302
|
|
|
303
303
|
self.client = QdrantClient(url=url)
|
|
@@ -319,7 +319,7 @@ class QdrantStore(BaseVectorStore):
|
|
|
319
319
|
if existing_dim != dim:
|
|
320
320
|
# Mismatch — create new collection with hash suffix
|
|
321
321
|
import hashlib
|
|
322
|
-
suffix = hashlib.
|
|
322
|
+
suffix = hashlib.sha256(f"{dim}".encode()).hexdigest()[:8]
|
|
323
323
|
self.collection_name = f"{self.collection_name}_{suffix}"
|
|
324
324
|
logger.warning(
|
|
325
325
|
f"QdrantStore: dim mismatch, using collection: {self.collection_name}"
|
|
@@ -382,8 +382,8 @@ class QdrantStore(BaseVectorStore):
|
|
|
382
382
|
if isinstance(v, str) and v.startswith("["):
|
|
383
383
|
try:
|
|
384
384
|
payload[k] = json.loads(v)
|
|
385
|
-
except (json.JSONDecodeError, ValueError):
|
|
386
|
-
|
|
385
|
+
except (json.JSONDecodeError, ValueError) as e:
|
|
386
|
+
logger.debug(f"Failed to decode JSON list from Qdrant metadata: {e}")
|
|
387
387
|
output.append({
|
|
388
388
|
"id": payload.get("vector_id", ""),
|
|
389
389
|
"score": hit.score,
|
|
@@ -258,8 +258,8 @@ async def summarize_session(ctx: dict, tenant_id: str, session_id: str) -> dict:
|
|
|
258
258
|
4. Archive summarized turns
|
|
259
259
|
"""
|
|
260
260
|
from .db import Database
|
|
261
|
-
from .schemas import ChatConfig
|
|
262
|
-
from .llm_chain import get_plain_chat_model
|
|
261
|
+
from .chat.schemas import ChatConfig
|
|
262
|
+
from .chat.llm_chain import get_plain_chat_model
|
|
263
263
|
from langchain_core.messages import SystemMessage, HumanMessage
|
|
264
264
|
|
|
265
265
|
db = Database()
|
|
@@ -324,8 +324,8 @@ async def extract_facts(
|
|
|
324
324
|
Only persists facts from allowlisted types with chunk provenance.
|
|
325
325
|
"""
|
|
326
326
|
from .db import Database
|
|
327
|
-
from .schemas import ChatConfig, FactSourceType
|
|
328
|
-
from .llm_chain import get_chat_model
|
|
327
|
+
from .chat.schemas import ChatConfig, FactSourceType
|
|
328
|
+
from .chat.llm_chain import get_chat_model
|
|
329
329
|
from langchain_core.messages import SystemMessage, HumanMessage
|
|
330
330
|
|
|
331
331
|
db = Database()
|
|
@@ -407,7 +407,7 @@ async def extract_facts(
|
|
|
407
407
|
async def purge_expired_sessions(ctx: dict) -> dict:
|
|
408
408
|
"""Scheduled task: hard-delete turns for soft-deleted sessions past TTL."""
|
|
409
409
|
from .db import Database
|
|
410
|
-
from .schemas import ChatConfig
|
|
410
|
+
from .chat.schemas import ChatConfig
|
|
411
411
|
|
|
412
412
|
db = Database()
|
|
413
413
|
config = ChatConfig()
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: longparser
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines.
|
|
5
5
|
Author-email: ENDEVSOLS Team <technology@endevsols.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -27,6 +27,7 @@ Description-Content-Type: text/markdown
|
|
|
27
27
|
Requires-Dist: pydantic<3,>=2.0
|
|
28
28
|
Requires-Dist: docling>=2.14
|
|
29
29
|
Requires-Dist: docling-core>=2.13
|
|
30
|
+
Requires-Dist: langgraph-checkpoint-mongodb>=0.3.1
|
|
30
31
|
Provides-Extra: pptx
|
|
31
32
|
Requires-Dist: python-pptx>=1.0; extra == "pptx"
|
|
32
33
|
Provides-Extra: langchain
|
|
@@ -109,8 +110,7 @@ Requires-Dist: httpx>=0.27; extra == "dev"
|
|
|
109
110
|
Requires-Dist: anyio>=4.0; extra == "dev"
|
|
110
111
|
|
|
111
112
|
<p align="center">
|
|
112
|
-
|
|
113
|
-
<h1 align="center">LongParser</h1>
|
|
113
|
+
<img src="https://raw.githubusercontent.com/ENDEVSOLS/LongParser/main/docs/assets/logo.png" alt="LongParser" width="320">
|
|
114
114
|
<p align="center"><strong>Privacy-first document intelligence engine for production RAG pipelines.</strong></p>
|
|
115
115
|
<p align="center">
|
|
116
116
|
Parse PDFs, DOCX, PPTX, XLSX & CSV → validated, AI-ready chunks with HITL review.
|
|
@@ -129,7 +129,7 @@ Requires-Dist: anyio>=4.0; extra == "dev"
|
|
|
129
129
|
<img src="https://static.pepy.tech/badge/longparser/month" alt="Monthly Downloads">
|
|
130
130
|
</a>
|
|
131
131
|
<a href="https://www.python.org/">
|
|
132
|
-
<img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12-blue.svg" alt="Python">
|
|
132
|
+
<img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python">
|
|
133
133
|
</a>
|
|
134
134
|
<a href="LICENSE">
|
|
135
135
|
<img src="https://img.shields.io/badge/License-MIT-brightgreen.svg" alt="MIT License">
|
|
@@ -150,11 +150,12 @@ Requires-Dist: anyio>=4.0; extra == "dev"
|
|
|
150
150
|
| **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
|
|
151
151
|
| **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
|
|
152
152
|
| **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
|
|
153
|
-
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` |
|
|
153
|
+
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
|
|
154
154
|
| **3-layer memory** | Short-term turns + rolling summary + long-term facts |
|
|
155
155
|
| **Multi-provider LLM** | OpenAI, Gemini, Groq, OpenRouter |
|
|
156
156
|
| **Multi-backend vectors** | Chroma, FAISS, Qdrant |
|
|
157
|
-
| **
|
|
157
|
+
| **Production-ready API** | FastAPI + Motor (MongoDB) + ARQ + Redis (Queue & Rate Limiting) |
|
|
158
|
+
| **Enterprise Security** | Tenant isolation, Role-Based Access Control (RBAC), and CORS |
|
|
158
159
|
| **LangChain adapters** | Drop-in `BaseRetriever` and LlamaIndex `QueryEngine` |
|
|
159
160
|
| **Privacy-first** | All processing runs locally; no data leaves your infra |
|
|
160
161
|
|
|
@@ -215,9 +216,9 @@ pip install "longparser[cpu]"
|
|
|
215
216
|
### Python SDK
|
|
216
217
|
|
|
217
218
|
```python
|
|
218
|
-
from longparser import
|
|
219
|
+
from longparser import DocumentPipeline, ProcessingConfig
|
|
219
220
|
|
|
220
|
-
pipeline =
|
|
221
|
+
pipeline = DocumentPipeline(ProcessingConfig())
|
|
221
222
|
result = pipeline.process_file("document.pdf")
|
|
222
223
|
|
|
223
224
|
print(f"Pages: {result.document.metadata.total_pages}")
|
|
@@ -296,7 +297,7 @@ src/longparser/
|
|
|
296
297
|
├── schemas.py ← core Pydantic models (Document, Block, Chunk, …)
|
|
297
298
|
├── extractors/ ← Docling, LaTeX OCR backends
|
|
298
299
|
├── chunkers/ ← HybridChunker
|
|
299
|
-
├── pipeline/ ←
|
|
300
|
+
├── pipeline/ ← DocumentPipeline
|
|
300
301
|
├── integrations/ ← LangChain loader & LlamaIndex reader
|
|
301
302
|
├── utils/ ← shared helpers (RTL detection, …)
|
|
302
303
|
└── server/ ← REST API layer
|
|
@@ -344,11 +345,14 @@ Copy `.env.example` to `.env` and set:
|
|
|
344
345
|
| Variable | Default | Description |
|
|
345
346
|
|----------|---------|-------------|
|
|
346
347
|
| `LONGPARSER_MONGO_URL` | `mongodb://localhost:27017` | MongoDB connection |
|
|
347
|
-
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue |
|
|
348
|
+
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue & rate limits |
|
|
348
349
|
| `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider |
|
|
349
|
-
| `LONGPARSER_LLM_MODEL` | `gpt-
|
|
350
|
+
| `LONGPARSER_LLM_MODEL` | `gpt-5.3` | Model name |
|
|
350
351
|
| `LONGPARSER_EMBED_PROVIDER` | `huggingface` | Embedding provider |
|
|
351
352
|
| `LONGPARSER_VECTOR_DB` | `chroma` | Vector store backend |
|
|
353
|
+
| `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins |
|
|
354
|
+
| `LONGPARSER_RATE_LIMIT` | `60` | Max RPM per tenant |
|
|
355
|
+
| `LONGPARSER_ADMIN_KEYS` | (empty) | Comma-separated admin API keys |
|
|
352
356
|
|
|
353
357
|
---
|
|
354
358
|
|
|
@@ -29,6 +29,7 @@ src/longparser/server/vectorstores.py
|
|
|
29
29
|
src/longparser/server/worker.py
|
|
30
30
|
src/longparser/server/chat/__init__.py
|
|
31
31
|
src/longparser/server/chat/callbacks.py
|
|
32
|
+
src/longparser/server/chat/checkpointer.py
|
|
32
33
|
src/longparser/server/chat/engine.py
|
|
33
34
|
src/longparser/server/chat/graph.py
|
|
34
35
|
src/longparser/server/chat/llm_chain.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|