longparser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- longparser/__init__.py +104 -0
- longparser/chunkers/__init__.py +5 -0
- longparser/chunkers/hybrid_chunker.py +1046 -0
- longparser/extractors/__init__.py +9 -0
- longparser/extractors/base.py +62 -0
- longparser/extractors/docling_extractor.py +2065 -0
- longparser/extractors/latex_ocr.py +404 -0
- longparser/integrations/__init__.py +31 -0
- longparser/integrations/langchain.py +138 -0
- longparser/integrations/llamaindex.py +157 -0
- longparser/pipeline/__init__.py +8 -0
- longparser/pipeline/orchestrator.py +230 -0
- longparser/py.typed +0 -0
- longparser/schemas.py +247 -0
- longparser/server/__init__.py +22 -0
- longparser/server/app.py +1045 -0
- longparser/server/chat/__init__.py +39 -0
- longparser/server/chat/callbacks.py +110 -0
- longparser/server/chat/engine.py +341 -0
- longparser/server/chat/graph.py +176 -0
- longparser/server/chat/llm_chain.py +153 -0
- longparser/server/chat/retriever.py +111 -0
- longparser/server/chat/schemas.py +164 -0
- longparser/server/db.py +656 -0
- longparser/server/embeddings.py +181 -0
- longparser/server/queue.py +97 -0
- longparser/server/routers/__init__.py +0 -0
- longparser/server/schemas.py +204 -0
- longparser/server/vectorstores.py +443 -0
- longparser/server/worker.py +480 -0
- longparser/utils/__init__.py +5 -0
- longparser/utils/rtl_detector.py +93 -0
- longparser-0.1.0.dist-info/METADATA +337 -0
- longparser-0.1.0.dist-info/RECORD +36 -0
- longparser-0.1.0.dist-info/WHEEL +5 -0
- longparser-0.1.0.dist-info/top_level.txt +1 -0
longparser/__init__.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""LongParser — Privacy-first document intelligence engine for RAG.
|
|
2
|
+
|
|
3
|
+
LongParser converts complex documents (PDFs, DOCX, PPTX, XLSX, CSV) into
|
|
4
|
+
AI-ready structured output via a 6-stage extraction pipeline::
|
|
5
|
+
|
|
6
|
+
Extract → Validate → HITL Review → Chunk → Embed → Index
|
|
7
|
+
|
|
8
|
+
Built by ENDEVSOLS for production RAG pipelines.
|
|
9
|
+
|
|
10
|
+
Quick start::
|
|
11
|
+
|
|
12
|
+
from longparser import PipelineOrchestrator, ProcessingConfig
|
|
13
|
+
|
|
14
|
+
pipeline = PipelineOrchestrator()
|
|
15
|
+
result = pipeline.process_file("document.pdf")
|
|
16
|
+
print(result.chunks[0].text)
|
|
17
|
+
|
|
18
|
+
For the full REST API server::
|
|
19
|
+
|
|
20
|
+
uv run uvicorn longparser.server.app:app --reload --port 8000
|
|
21
|
+
|
|
22
|
+
See :class:`~longparser.pipeline.PipelineOrchestrator` for the main SDK entry
|
|
23
|
+
point and :mod:`longparser.server` for the REST API layer.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
__version__ = "0.1.0"
|
|
29
|
+
__author__ = "ENDEVSOLS Team"
|
|
30
|
+
__license__ = "MIT"
|
|
31
|
+
|
|
32
|
+
from .schemas import (
|
|
33
|
+
Block,
|
|
34
|
+
BlockFlags,
|
|
35
|
+
BlockType,
|
|
36
|
+
BoundingBox,
|
|
37
|
+
Chunk,
|
|
38
|
+
ChunkingConfig,
|
|
39
|
+
Confidence,
|
|
40
|
+
Document,
|
|
41
|
+
DocumentMetadata,
|
|
42
|
+
ExtractionMetadata,
|
|
43
|
+
ExtractorType,
|
|
44
|
+
JobRequest,
|
|
45
|
+
JobResult,
|
|
46
|
+
Page,
|
|
47
|
+
PageProfile,
|
|
48
|
+
ProcessingConfig,
|
|
49
|
+
Provenance,
|
|
50
|
+
Table,
|
|
51
|
+
TableCell,
|
|
52
|
+
)
|
|
53
|
+
|
|
54
|
+
# Heavy dependencies (docling, motor, etc.) are imported lazily so that
|
|
55
|
+
# ``import longparser`` and ``from longparser.schemas import ...`` work
|
|
56
|
+
# in environments where optional extras are not installed.
|
|
57
|
+
# Module-level ``__getattr__`` hook (PEP 562): Python calls it only when normal
# attribute lookup on the package fails, so the names handled here are imported
# on first use instead of at ``import longparser`` time.
#
# name: attribute being looked up on the package.
# Returns the object imported from the matching subpackage
# (``.extractors``, ``.pipeline`` or ``.chunkers``).
# Raises AttributeError for any name that is not one of the four lazy exports;
# an ImportError from the subpackage import itself propagates unchanged.
#
# NOTE: the resolved object is returned but not stored in this module's
# globals, so every access to a lazy name passes through this function again
# (the import system still caches the already-loaded submodule).
def __getattr__(name: str):
|
|
58
|
+
"""Lazy import shim for optional heavy dependencies."""
|
|
59
|
+
if name == "DoclingExtractor":
|
|
60
|
+
from .extractors import DoclingExtractor
|
|
61
|
+
return DoclingExtractor
|
|
62
|
+
if name == "PipelineOrchestrator":
|
|
63
|
+
from .pipeline import PipelineOrchestrator
|
|
64
|
+
return PipelineOrchestrator
|
|
65
|
+
if name == "PipelineResult":
|
|
66
|
+
from .pipeline import PipelineResult
|
|
67
|
+
return PipelineResult
|
|
68
|
+
if name == "HybridChunker":
|
|
69
|
+
from .chunkers import HybridChunker
|
|
70
|
+
return HybridChunker
|
|
71
|
+
# Unknown names must raise AttributeError (not return None) so that
# ``hasattr()`` and ``getattr(module, name, default)`` keep working.
# The module name in the message is hard-coded to 'longparser'.
raise AttributeError(f"module 'longparser' has no attribute {name!r}")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
__all__ = [
|
|
75
|
+
# Meta
|
|
76
|
+
"__version__",
|
|
77
|
+
"__author__",
|
|
78
|
+
"__license__",
|
|
79
|
+
# Schemas — always available (no heavy deps)
|
|
80
|
+
"Document",
|
|
81
|
+
"Page",
|
|
82
|
+
"Block",
|
|
83
|
+
"Table",
|
|
84
|
+
"TableCell",
|
|
85
|
+
"BlockType",
|
|
86
|
+
"ExtractorType",
|
|
87
|
+
"ProcessingConfig",
|
|
88
|
+
"BoundingBox",
|
|
89
|
+
"Provenance",
|
|
90
|
+
"Confidence",
|
|
91
|
+
"BlockFlags",
|
|
92
|
+
"DocumentMetadata",
|
|
93
|
+
"PageProfile",
|
|
94
|
+
"ExtractionMetadata",
|
|
95
|
+
"ChunkingConfig",
|
|
96
|
+
"Chunk",
|
|
97
|
+
"JobRequest",
|
|
98
|
+
"JobResult",
|
|
99
|
+
# Lazily imported (require extras)
|
|
100
|
+
"DoclingExtractor",
|
|
101
|
+
"PipelineOrchestrator",
|
|
102
|
+
"PipelineResult",
|
|
103
|
+
"HybridChunker",
|
|
104
|
+
]
|