longparser 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. longparser-0.1.0/PKG-INFO +337 -0
  2. longparser-0.1.0/README.md +249 -0
  3. longparser-0.1.0/pyproject.toml +183 -0
  4. longparser-0.1.0/setup.cfg +4 -0
  5. longparser-0.1.0/src/longparser/__init__.py +104 -0
  6. longparser-0.1.0/src/longparser/chunkers/__init__.py +5 -0
  7. longparser-0.1.0/src/longparser/chunkers/hybrid_chunker.py +1046 -0
  8. longparser-0.1.0/src/longparser/extractors/__init__.py +9 -0
  9. longparser-0.1.0/src/longparser/extractors/base.py +62 -0
  10. longparser-0.1.0/src/longparser/extractors/docling_extractor.py +2065 -0
  11. longparser-0.1.0/src/longparser/extractors/latex_ocr.py +404 -0
  12. longparser-0.1.0/src/longparser/integrations/__init__.py +31 -0
  13. longparser-0.1.0/src/longparser/integrations/langchain.py +138 -0
  14. longparser-0.1.0/src/longparser/integrations/llamaindex.py +157 -0
  15. longparser-0.1.0/src/longparser/pipeline/__init__.py +8 -0
  16. longparser-0.1.0/src/longparser/pipeline/orchestrator.py +230 -0
  17. longparser-0.1.0/src/longparser/py.typed +0 -0
  18. longparser-0.1.0/src/longparser/schemas.py +247 -0
  19. longparser-0.1.0/src/longparser/server/__init__.py +22 -0
  20. longparser-0.1.0/src/longparser/server/app.py +1045 -0
  21. longparser-0.1.0/src/longparser/server/chat/__init__.py +39 -0
  22. longparser-0.1.0/src/longparser/server/chat/callbacks.py +110 -0
  23. longparser-0.1.0/src/longparser/server/chat/engine.py +341 -0
  24. longparser-0.1.0/src/longparser/server/chat/graph.py +176 -0
  25. longparser-0.1.0/src/longparser/server/chat/llm_chain.py +153 -0
  26. longparser-0.1.0/src/longparser/server/chat/retriever.py +111 -0
  27. longparser-0.1.0/src/longparser/server/chat/schemas.py +164 -0
  28. longparser-0.1.0/src/longparser/server/db.py +656 -0
  29. longparser-0.1.0/src/longparser/server/embeddings.py +181 -0
  30. longparser-0.1.0/src/longparser/server/queue.py +97 -0
  31. longparser-0.1.0/src/longparser/server/routers/__init__.py +0 -0
  32. longparser-0.1.0/src/longparser/server/schemas.py +204 -0
  33. longparser-0.1.0/src/longparser/server/vectorstores.py +443 -0
  34. longparser-0.1.0/src/longparser/server/worker.py +480 -0
  35. longparser-0.1.0/src/longparser/utils/__init__.py +5 -0
  36. longparser-0.1.0/src/longparser/utils/rtl_detector.py +93 -0
  37. longparser-0.1.0/src/longparser.egg-info/PKG-INFO +337 -0
  38. longparser-0.1.0/src/longparser.egg-info/SOURCES.txt +39 -0
  39. longparser-0.1.0/src/longparser.egg-info/dependency_links.txt +1 -0
  40. longparser-0.1.0/src/longparser.egg-info/requires.txt +73 -0
  41. longparser-0.1.0/src/longparser.egg-info/top_level.txt +1 -0
@@ -0,0 +1,337 @@
1
+ Metadata-Version: 2.4
2
+ Name: longparser
3
+ Version: 0.1.0
4
+ Summary: Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines.
5
+ Author-email: ENDEVSOLS Team <dev@endevsols.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ENDEVSOLS/LongParser
8
+ Project-URL: Repository, https://github.com/ENDEVSOLS/LongParser
9
+ Project-URL: Issues, https://github.com/ENDEVSOLS/LongParser/issues
10
+ Project-URL: Documentation, https://endevsols.github.io/LongParser/
11
+ Project-URL: Changelog, https://github.com/ENDEVSOLS/LongParser/blob/main/CHANGELOG.md
12
+ Keywords: pdf,document,parsing,ocr,rag,ai,docling,chunking,extraction,retrieval-augmented-generation,longparser
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Text Processing :: General
25
+ Classifier: Typing :: Typed
26
+ Requires-Python: >=3.10
27
+ Description-Content-Type: text/markdown
28
+ Requires-Dist: pydantic<3,>=2.0
29
+ Requires-Dist: docling>=2.14
30
+ Requires-Dist: docling-core>=2.13
31
+ Provides-Extra: pptx
32
+ Requires-Dist: python-pptx>=1.0; extra == "pptx"
33
+ Provides-Extra: langchain
34
+ Requires-Dist: langchain-core>=0.2; extra == "langchain"
35
+ Provides-Extra: llamaindex
36
+ Requires-Dist: llama-index-core>=0.10; extra == "llamaindex"
37
+ Provides-Extra: server
38
+ Requires-Dist: fastapi>=0.115; extra == "server"
39
+ Requires-Dist: uvicorn[standard]>=0.34; extra == "server"
40
+ Requires-Dist: python-multipart>=0.0.9; extra == "server"
41
+ Requires-Dist: motor>=3.6; extra == "server"
42
+ Requires-Dist: arq>=0.26; extra == "server"
43
+ Requires-Dist: python-magic>=0.4.27; extra == "server"
44
+ Requires-Dist: python-dotenv>=1.0; extra == "server"
45
+ Requires-Dist: langchain>=0.3; extra == "server"
46
+ Requires-Dist: langchain-openai>=0.3; extra == "server"
47
+ Requires-Dist: langchain-google-genai>=2.0; extra == "server"
48
+ Requires-Dist: langchain-groq>=0.3; extra == "server"
49
+ Requires-Dist: langchain-mongodb>=0.3; extra == "server"
50
+ Requires-Dist: langchain-huggingface>=0.1; extra == "server"
51
+ Requires-Dist: langchain-chroma>=0.2; extra == "server"
52
+ Requires-Dist: langgraph>=0.2; extra == "server"
53
+ Requires-Dist: langgraph-checkpoint>=2.0; extra == "server"
54
+ Requires-Dist: tiktoken>=0.7; extra == "server"
55
+ Requires-Dist: redis>=5.0; extra == "server"
56
+ Provides-Extra: embeddings
57
+ Requires-Dist: sentence-transformers>=3.0; extra == "embeddings"
58
+ Provides-Extra: chroma
59
+ Requires-Dist: chromadb>=0.5; extra == "chroma"
60
+ Provides-Extra: faiss
61
+ Requires-Dist: faiss-cpu>=1.8; extra == "faiss"
62
+ Provides-Extra: qdrant
63
+ Requires-Dist: qdrant-client>=1.12; extra == "qdrant"
64
+ Provides-Extra: latex-ocr
65
+ Requires-Dist: pix2tex>=0.1.4; extra == "latex-ocr"
66
+ Provides-Extra: docx-equations
67
+ Requires-Dist: docxlatex>=0.3.0; extra == "docx-equations"
68
+ Requires-Dist: defusedxml>=0.7.0; extra == "docx-equations"
69
+ Provides-Extra: mfd
70
+ Requires-Dist: pix2text<1.2,>=1.1.1; extra == "mfd"
71
+ Provides-Extra: all
72
+ Requires-Dist: longparser[pptx]; extra == "all"
73
+ Requires-Dist: longparser[langchain]; extra == "all"
74
+ Requires-Dist: longparser[llamaindex]; extra == "all"
75
+ Requires-Dist: longparser[server]; extra == "all"
76
+ Requires-Dist: longparser[embeddings]; extra == "all"
77
+ Requires-Dist: longparser[chroma]; extra == "all"
78
+ Provides-Extra: dev
79
+ Requires-Dist: pytest>=8.0; extra == "dev"
80
+ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
81
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
82
+ Requires-Dist: ruff>=0.4; extra == "dev"
83
+ Requires-Dist: mypy>=1.10; extra == "dev"
84
+ Requires-Dist: build>=1.0; extra == "dev"
85
+ Requires-Dist: twine>=5.0; extra == "dev"
86
+ Requires-Dist: httpx>=0.27; extra == "dev"
87
+ Requires-Dist: anyio>=4.0; extra == "dev"
88
+
89
+ <p align="center">
90
+ <!-- Logo goes here once ready -->
91
+ <h1 align="center">LongParser</h1>
92
+ <p align="center"><strong>Privacy-first document intelligence engine for production RAG pipelines.</strong></p>
93
+ <p align="center">
94
+ Parse PDFs, DOCX, PPTX, XLSX &amp; CSV → validated, AI-ready chunks with HITL review.
95
+ </p>
96
+ <p align="center">
97
+ <a href="https://github.com/ENDEVSOLS/LongParser/actions/workflows/ci.yml">
98
+ <img src="https://github.com/ENDEVSOLS/LongParser/actions/workflows/ci.yml/badge.svg" alt="CI">
99
+ </a>
100
+ <a href="https://pypi.org/project/longparser/">
101
+ <img src="https://img.shields.io/pypi/v/longparser.svg?label=pypi&color=0078d4" alt="PyPI">
102
+ </a>
103
+ <a href="https://pypi.org/project/longparser/">
104
+ <img src="https://img.shields.io/pypi/dm/longparser.svg?label=downloads&color=28a745" alt="Downloads">
105
+ </a>
106
+ <a href="https://www.python.org/">
107
+ <img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python">
108
+ </a>
109
+ <a href="LICENSE">
110
+ <img src="https://img.shields.io/badge/License-MIT-brightgreen.svg" alt="MIT License">
111
+ </a>
112
+ <a href="https://endevsols.github.io/LongParser/">
113
+ <img src="https://img.shields.io/badge/docs-online-indigo.svg" alt="Docs">
114
+ </a>
115
+ </p>
116
+ </p>
117
+
118
+ ---
119
+
120
+
121
+ ## Features
122
+
123
+ | Feature | Detail |
124
+ |---------|--------|
125
+ | **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
126
+ | **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
127
+ | **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
128
+ | **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` |
129
+ | **3-layer memory** | Short-term turns + rolling summary + long-term facts |
130
+ | **Multi-provider LLM** | OpenAI, Gemini, Groq, OpenRouter |
131
+ | **Multi-backend vectors** | Chroma, FAISS, Qdrant |
132
+ | **Async-first API** | FastAPI + Motor (MongoDB) + ARQ (Redis) |
133
+ | **LangChain adapters** | Drop-in `BaseRetriever` and LlamaIndex `QueryEngine` |
134
+ | **Privacy-first** | All processing runs locally; no data leaves your infra |
135
+
136
+ ---
137
+
138
+ ## Installation
139
+
140
+ ### Core (SDK only — no API server)
141
+
142
+ ```bash
143
+ pip install longparser
144
+ ```
145
+
146
+ ### With REST API server (FastAPI + MongoDB + LLM)
147
+
148
+ ```bash
149
+ pip install "longparser[server]"
150
+ ```
151
+
152
+ ### All extras
153
+
154
+ ```bash
155
+ pip install "longparser[all]"
156
+ ```
157
+
158
+ ---
159
+
160
+ ## Quick Start
161
+
162
+ ### Python SDK
163
+
164
+ ```python
165
+ from longparser import PipelineOrchestrator, ProcessingConfig
166
+
167
+ pipeline = PipelineOrchestrator()
168
+ result = pipeline.process_file("document.pdf")
169
+
170
+ print(f"Pages: {result.document.metadata.total_pages}")
171
+ print(f"Chunks: {len(result.chunks)}")
172
+ print(result.chunks[0].text)
173
+ ```
174
+
175
+ ### REST API
176
+
177
+ ```bash
178
+ # 1. Copy and edit configuration
179
+ cp .env.example .env
180
+
181
+ # 2. Start services (MongoDB + Redis)
182
+ docker-compose up -d mongo redis
183
+
184
+ # 3. Start the API
185
+ uv run uvicorn longparser.server.app:app --reload --port 8000
186
+
187
+ # 4. Upload a document
188
+ curl -X POST http://localhost:8000/jobs \
189
+ -H "X-API-Key: your-key" \
190
+ -F "file=@document.pdf"
191
+
192
+ # 5. Check job status
193
+ curl http://localhost:8000/jobs/{job_id} -H "X-API-Key: your-key"
194
+
195
+ # 6. Finalize and embed
196
+ curl -X POST http://localhost:8000/jobs/{job_id}/finalize \
197
+ -H "X-API-Key: your-key" \
198
+ -H "Content-Type: application/json" \
199
+ -d '{"finalize_policy": "approve_all_pending"}'
200
+
201
+ curl -X POST http://localhost:8000/jobs/{job_id}/embed \
202
+ -H "X-API-Key: your-key" \
203
+ -H "Content-Type: application/json" \
204
+ -d '{"provider": "huggingface", "model": "BAAI/bge-base-en-v1.5", "vector_db": "chroma"}'
205
+
206
+ # 7. Chat with the document
207
+ curl -X POST http://localhost:8000/chat/sessions \
208
+ -H "X-API-Key: your-key" \
209
+ -H "Content-Type: application/json" \
210
+ -d '{"job_id": "your-job-id"}'
211
+
212
+ curl -X POST http://localhost:8000/chat \
213
+ -H "X-API-Key: your-key" \
214
+ -H "Content-Type: application/json" \
215
+ -d '{"session_id": "...", "job_id": "...", "question": "What is the refund policy?"}'
216
+ ```
217
+
218
+ ---
219
+
220
+ ## Architecture
221
+
222
+ ```
223
+ Document → Extract → Validate → HITL Review → Chunk → Embed → Index
224
+
225
+ Chat → RAG → LLM → Answer
226
+ ```
227
+
228
+ ### Pipeline Stages
229
+
230
+ 1. **Extract** — Docling converts PDF/DOCX/etc. into structured `Block` objects
231
+ 2. **Validate** — Per-page confidence scoring and RTL detection
232
+ 3. **HITL Review** — Human approves/edits/rejects blocks and chunks via the API
233
+ 4. **Chunk** — `HybridChunker` builds token-aware RAG chunks with section hierarchy
234
+ 5. **Embed** — Embedding engine (HuggingFace / OpenAI) vectors stored in Chroma/FAISS/Qdrant
235
+ 6. **Chat** — LCEL chain with 3-layer memory and citation validation
236
+
237
+ ---
238
+
239
+ ## Project Structure
240
+
241
+ ```
242
+ src/longparser/
243
+ ├── schemas.py ← core Pydantic models (Document, Block, Chunk, …)
244
+ ├── extractors/ ← Docling, LaTeX OCR backends
245
+ ├── chunkers/ ← HybridChunker
246
+ ├── pipeline/ ← PipelineOrchestrator
247
+ ├── integrations/ ← LangChain loader & LlamaIndex reader
248
+ ├── utils/ ← shared helpers (RTL detection, …)
249
+ └── server/ ← REST API layer
250
+ ├── app.py ← FastAPI application (all routes)
251
+ ├── db.py ← Motor async MongoDB
252
+ ├── queue.py ← ARQ/Redis job queue
253
+ ├── worker.py ← ARQ background worker
254
+ ├── embeddings.py ← HuggingFace / OpenAI embedding engine
255
+ ├── vectorstores.py ← Chroma / FAISS / Qdrant adapters
256
+ └── chat/ ← RAG chat engine
257
+ ├── engine.py ← ChatEngine (LCEL + 3-layer memory)
258
+ ├── graph.py ← LangGraph HITL workflow
259
+ ├── schemas.py ← chat Pydantic models
260
+ ├── retriever.py ← LangChain BaseRetriever adapter
261
+ ├── llm_chain.py ← multi-provider LLM factory
262
+ └── callbacks.py ← observability callbacks
263
+ ```
264
+
265
+ ---
266
+
267
+ ## LangChain Integration
268
+
269
+ ```python
270
+ from longparser.integrations.langchain import LongParserLoader
271
+
272
+ loader = LongParserLoader("report.pdf")
273
+ docs = loader.load() # list[langchain_core.documents.Document]
274
+ ```
275
+
276
+ ## LlamaIndex Integration
277
+
278
+ ```python
279
+ from longparser.integrations.llamaindex import LongParserReader
280
+
281
+ reader = LongParserReader()
282
+ docs = reader.load_data("report.pdf")
283
+ ```
284
+
285
+ ---
286
+
287
+ ## Configuration
288
+
289
+ Copy `.env.example` to `.env` and set:
290
+
291
+ | Variable | Default | Description |
292
+ |----------|---------|-------------|
293
+ | `LONGPARSER_MONGO_URL` | `mongodb://localhost:27017` | MongoDB connection |
294
+ | `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue |
295
+ | `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider |
296
+ | `LONGPARSER_LLM_MODEL` | `gpt-4o` | Model name |
297
+ | `LONGPARSER_EMBED_PROVIDER` | `huggingface` | Embedding provider |
298
+ | `LONGPARSER_VECTOR_DB` | `chroma` | Vector store backend |
299
+
300
+ ---
301
+
302
+ ## Running with Docker
303
+
304
+ ```bash
305
+ docker-compose up
306
+ ```
307
+
308
+ API available at `http://localhost:8000` · Docs at `http://localhost:8000/docs`
309
+
310
+ ---
311
+
312
+ ## Testing
313
+
314
+ ```bash
315
+ # Install dev dependencies
316
+ uv sync --extra dev
317
+
318
+ # Run unit tests
319
+ uv run pytest tests/unit/ -v
320
+
321
+ # Run with coverage
322
+ uv run pytest tests/ --cov=src/longparser --cov-report=term-missing
323
+ ```
324
+
325
+ ---
326
+
327
+ ## Contributing
328
+
329
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and PR guidelines.
330
+
331
+ ## Security
332
+
333
+ See [SECURITY.md](SECURITY.md) for vulnerability reporting.
334
+
335
+ ## License
336
+
337
+ [MIT](LICENSE) — Copyright © 2026 ENDEVSOLS
@@ -0,0 +1,249 @@
1
+ <p align="center">
2
+ <!-- Logo goes here once ready -->
3
+ <h1 align="center">LongParser</h1>
4
+ <p align="center"><strong>Privacy-first document intelligence engine for production RAG pipelines.</strong></p>
5
+ <p align="center">
6
+ Parse PDFs, DOCX, PPTX, XLSX &amp; CSV → validated, AI-ready chunks with HITL review.
7
+ </p>
8
+ <p align="center">
9
+ <a href="https://github.com/ENDEVSOLS/LongParser/actions/workflows/ci.yml">
10
+ <img src="https://github.com/ENDEVSOLS/LongParser/actions/workflows/ci.yml/badge.svg" alt="CI">
11
+ </a>
12
+ <a href="https://pypi.org/project/longparser/">
13
+ <img src="https://img.shields.io/pypi/v/longparser.svg?label=pypi&color=0078d4" alt="PyPI">
14
+ </a>
15
+ <a href="https://pypi.org/project/longparser/">
16
+ <img src="https://img.shields.io/pypi/dm/longparser.svg?label=downloads&color=28a745" alt="Downloads">
17
+ </a>
18
+ <a href="https://www.python.org/">
19
+ <img src="https://img.shields.io/badge/python-3.10%20%7C%203.11%20%7C%203.12%20%7C%203.13-blue.svg" alt="Python">
20
+ </a>
21
+ <a href="LICENSE">
22
+ <img src="https://img.shields.io/badge/License-MIT-brightgreen.svg" alt="MIT License">
23
+ </a>
24
+ <a href="https://endevsols.github.io/LongParser/">
25
+ <img src="https://img.shields.io/badge/docs-online-indigo.svg" alt="Docs">
26
+ </a>
27
+ </p>
28
+ </p>
29
+
30
+ ---
31
+
32
+
33
+ ## Features
34
+
35
+ | Feature | Detail |
36
+ |---------|--------|
37
+ | **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
38
+ | **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
39
+ | **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
40
+ | **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` |
41
+ | **3-layer memory** | Short-term turns + rolling summary + long-term facts |
42
+ | **Multi-provider LLM** | OpenAI, Gemini, Groq, OpenRouter |
43
+ | **Multi-backend vectors** | Chroma, FAISS, Qdrant |
44
+ | **Async-first API** | FastAPI + Motor (MongoDB) + ARQ (Redis) |
45
+ | **LangChain adapters** | Drop-in `BaseRetriever` and LlamaIndex `QueryEngine` |
46
+ | **Privacy-first** | All processing runs locally; no data leaves your infra |
47
+
48
+ ---
49
+
50
+ ## Installation
51
+
52
+ ### Core (SDK only — no API server)
53
+
54
+ ```bash
55
+ pip install longparser
56
+ ```
57
+
58
+ ### With REST API server (FastAPI + MongoDB + LLM)
59
+
60
+ ```bash
61
+ pip install "longparser[server]"
62
+ ```
63
+
64
+ ### All extras
65
+
66
+ ```bash
67
+ pip install "longparser[all]"
68
+ ```
69
+
70
+ ---
71
+
72
+ ## Quick Start
73
+
74
+ ### Python SDK
75
+
76
+ ```python
77
+ from longparser import PipelineOrchestrator, ProcessingConfig
78
+
79
+ pipeline = PipelineOrchestrator()
80
+ result = pipeline.process_file("document.pdf")
81
+
82
+ print(f"Pages: {result.document.metadata.total_pages}")
83
+ print(f"Chunks: {len(result.chunks)}")
84
+ print(result.chunks[0].text)
85
+ ```
86
+
87
+ ### REST API
88
+
89
+ ```bash
90
+ # 1. Copy and edit configuration
91
+ cp .env.example .env
92
+
93
+ # 2. Start services (MongoDB + Redis)
94
+ docker-compose up -d mongo redis
95
+
96
+ # 3. Start the API
97
+ uv run uvicorn longparser.server.app:app --reload --port 8000
98
+
99
+ # 4. Upload a document
100
+ curl -X POST http://localhost:8000/jobs \
101
+ -H "X-API-Key: your-key" \
102
+ -F "file=@document.pdf"
103
+
104
+ # 5. Check job status
105
+ curl http://localhost:8000/jobs/{job_id} -H "X-API-Key: your-key"
106
+
107
+ # 6. Finalize and embed
108
+ curl -X POST http://localhost:8000/jobs/{job_id}/finalize \
109
+ -H "X-API-Key: your-key" \
110
+ -H "Content-Type: application/json" \
111
+ -d '{"finalize_policy": "approve_all_pending"}'
112
+
113
+ curl -X POST http://localhost:8000/jobs/{job_id}/embed \
114
+ -H "X-API-Key: your-key" \
115
+ -H "Content-Type: application/json" \
116
+ -d '{"provider": "huggingface", "model": "BAAI/bge-base-en-v1.5", "vector_db": "chroma"}'
117
+
118
+ # 7. Chat with the document
119
+ curl -X POST http://localhost:8000/chat/sessions \
120
+ -H "X-API-Key: your-key" \
121
+ -H "Content-Type: application/json" \
122
+ -d '{"job_id": "your-job-id"}'
123
+
124
+ curl -X POST http://localhost:8000/chat \
125
+ -H "X-API-Key: your-key" \
126
+ -H "Content-Type: application/json" \
127
+ -d '{"session_id": "...", "job_id": "...", "question": "What is the refund policy?"}'
128
+ ```
129
+
130
+ ---
131
+
132
+ ## Architecture
133
+
134
+ ```
135
+ Document → Extract → Validate → HITL Review → Chunk → Embed → Index
136
+
137
+ Chat → RAG → LLM → Answer
138
+ ```
139
+
140
+ ### Pipeline Stages
141
+
142
+ 1. **Extract** — Docling converts PDF/DOCX/etc. into structured `Block` objects
143
+ 2. **Validate** — Per-page confidence scoring and RTL detection
144
+ 3. **HITL Review** — Human approves/edits/rejects blocks and chunks via the API
145
+ 4. **Chunk** — `HybridChunker` builds token-aware RAG chunks with section hierarchy
146
+ 5. **Embed** — Embedding engine (HuggingFace / OpenAI) vectors stored in Chroma/FAISS/Qdrant
147
+ 6. **Chat** — LCEL chain with 3-layer memory and citation validation
148
+
149
+ ---
150
+
151
+ ## Project Structure
152
+
153
+ ```
154
+ src/longparser/
155
+ ├── schemas.py ← core Pydantic models (Document, Block, Chunk, …)
156
+ ├── extractors/ ← Docling, LaTeX OCR backends
157
+ ├── chunkers/ ← HybridChunker
158
+ ├── pipeline/ ← PipelineOrchestrator
159
+ ├── integrations/ ← LangChain loader & LlamaIndex reader
160
+ ├── utils/ ← shared helpers (RTL detection, …)
161
+ └── server/ ← REST API layer
162
+ ├── app.py ← FastAPI application (all routes)
163
+ ├── db.py ← Motor async MongoDB
164
+ ├── queue.py ← ARQ/Redis job queue
165
+ ├── worker.py ← ARQ background worker
166
+ ├── embeddings.py ← HuggingFace / OpenAI embedding engine
167
+ ├── vectorstores.py ← Chroma / FAISS / Qdrant adapters
168
+ └── chat/ ← RAG chat engine
169
+ ├── engine.py ← ChatEngine (LCEL + 3-layer memory)
170
+ ├── graph.py ← LangGraph HITL workflow
171
+ ├── schemas.py ← chat Pydantic models
172
+ ├── retriever.py ← LangChain BaseRetriever adapter
173
+ ├── llm_chain.py ← multi-provider LLM factory
174
+ └── callbacks.py ← observability callbacks
175
+ ```
176
+
177
+ ---
178
+
179
+ ## LangChain Integration
180
+
181
+ ```python
182
+ from longparser.integrations.langchain import LongParserLoader
183
+
184
+ loader = LongParserLoader("report.pdf")
185
+ docs = loader.load() # list[langchain_core.documents.Document]
186
+ ```
187
+
188
+ ## LlamaIndex Integration
189
+
190
+ ```python
191
+ from longparser.integrations.llamaindex import LongParserReader
192
+
193
+ reader = LongParserReader()
194
+ docs = reader.load_data("report.pdf")
195
+ ```
196
+
197
+ ---
198
+
199
+ ## Configuration
200
+
201
+ Copy `.env.example` to `.env` and set:
202
+
203
+ | Variable | Default | Description |
204
+ |----------|---------|-------------|
205
+ | `LONGPARSER_MONGO_URL` | `mongodb://localhost:27017` | MongoDB connection |
206
+ | `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue |
207
+ | `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider |
208
+ | `LONGPARSER_LLM_MODEL` | `gpt-4o` | Model name |
209
+ | `LONGPARSER_EMBED_PROVIDER` | `huggingface` | Embedding provider |
210
+ | `LONGPARSER_VECTOR_DB` | `chroma` | Vector store backend |
211
+
212
+ ---
213
+
214
+ ## Running with Docker
215
+
216
+ ```bash
217
+ docker-compose up
218
+ ```
219
+
220
+ API available at `http://localhost:8000` · Docs at `http://localhost:8000/docs`
221
+
222
+ ---
223
+
224
+ ## Testing
225
+
226
+ ```bash
227
+ # Install dev dependencies
228
+ uv sync --extra dev
229
+
230
+ # Run unit tests
231
+ uv run pytest tests/unit/ -v
232
+
233
+ # Run with coverage
234
+ uv run pytest tests/ --cov=src/longparser --cov-report=term-missing
235
+ ```
236
+
237
+ ---
238
+
239
+ ## Contributing
240
+
241
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and PR guidelines.
242
+
243
+ ## Security
244
+
245
+ See [SECURITY.md](SECURITY.md) for vulnerability reporting.
246
+
247
+ ## License
248
+
249
+ [MIT](LICENSE) — Copyright © 2026 ENDEVSOLS