longparser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
longparser/__init__.py ADDED
@@ -0,0 +1,104 @@
1
+ """LongParser — Privacy-first document intelligence engine for RAG.
2
+
3
+ LongParser converts complex documents (PDFs, DOCX, PPTX, XLSX, CSV) into
4
+ AI-ready structured output via a 6-stage extraction pipeline::
5
+
6
+ Extract → Validate → HITL Review → Chunk → Embed → Index
7
+
8
+ Built by ENDEVSOLS for production RAG pipelines.
9
+
10
+ Quick start::
11
+
12
+ from longparser import PipelineOrchestrator, ProcessingConfig
13
+
14
+ pipeline = PipelineOrchestrator()
15
+ result = pipeline.process_file("document.pdf")
16
+ print(result.chunks[0].text)
17
+
18
+ For the full REST API server::
19
+
20
+ uv run uvicorn longparser.server.app:app --reload --port 8000
21
+
22
+ See :class:`~longparser.pipeline.PipelineOrchestrator` for the main SDK entry
23
+ point and :mod:`longparser.server` for the REST API layer.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
# Package metadata, exposed as module attributes and re-exported via ``__all__``.
__version__ = "0.1.0"
__author__ = "ENDEVSOLS Team"
__license__ = "MIT"
31
+
32
+ from .schemas import (
33
+ Block,
34
+ BlockFlags,
35
+ BlockType,
36
+ BoundingBox,
37
+ Chunk,
38
+ ChunkingConfig,
39
+ Confidence,
40
+ Document,
41
+ DocumentMetadata,
42
+ ExtractionMetadata,
43
+ ExtractorType,
44
+ JobRequest,
45
+ JobResult,
46
+ Page,
47
+ PageProfile,
48
+ ProcessingConfig,
49
+ Provenance,
50
+ Table,
51
+ TableCell,
52
+ )
53
+
54
+ # Heavy dependencies (docling, motor, etc.) are imported lazily so that
55
+ # ``import longparser`` and ``from longparser.schemas import ...`` work
56
+ # in environments where optional extras are not installed.
57
def __getattr__(name: str):
    """Resolve a lazily exported name on first access (PEP 562 module hook).

    Python invokes this hook only when ``name`` is not found among the
    module's regular attributes. The submodule that defines the requested
    object is imported at that moment instead of at package import time, and
    the resolved object is then stored in the module namespace so subsequent
    lookups no longer go through this hook.

    Args:
        name: Attribute being looked up on the package.

    Returns:
        The object bound to ``name`` in the submodule that defines it.

    Raises:
        AttributeError: If ``name`` is not one of the lazily exported names.
        ImportError: Propagated unchanged when the defining submodule (or
            anything it imports) cannot be imported.
    """
    if name == "DoclingExtractor":
        from .extractors import DoclingExtractor as resolved
    elif name == "PipelineOrchestrator":
        from .pipeline import PipelineOrchestrator as resolved
    elif name == "PipelineResult":
        from .pipeline import PipelineResult as resolved
    elif name == "HybridChunker":
        from .chunkers import HybridChunker as resolved
    else:
        raise AttributeError(f"module 'longparser' has no attribute {name!r}")

    # Cache on the module: once the name lives in globals(), normal attribute
    # lookup succeeds and this hook is not called again for it.
    globals()[name] = resolved
    return resolved
72
+
73
+
74
# Public API of the package. The first two groups are bound eagerly at import
# time; the last group is resolved on demand by the module-level
# ``__getattr__`` hook.
# NOTE(review): ``from longparser import *`` looks up every name listed here,
# so it will trigger the lazy imports as well — confirm this is acceptable in
# environments where the optional extras are not installed.
__all__ = [
    # Package metadata
    "__version__",
    "__author__",
    "__license__",
    # Schemas — imported eagerly from ``.schemas`` (no heavy deps)
    "Document",
    "Page",
    "Block",
    "Table",
    "TableCell",
    "BlockType",
    "ExtractorType",
    "ProcessingConfig",
    "BoundingBox",
    "Provenance",
    "Confidence",
    "BlockFlags",
    "DocumentMetadata",
    "PageProfile",
    "ExtractionMetadata",
    "ChunkingConfig",
    "Chunk",
    "JobRequest",
    "JobResult",
    # Lazily imported via ``__getattr__`` (require extras)
    "DoclingExtractor",
    "PipelineOrchestrator",
    "PipelineResult",
    "HybridChunker",
]
@@ -0,0 +1,5 @@
1
+ """Chunking strategies for LongParser RAG-optimized document splitting."""
2
+
3
# Re-export so HybridChunker can be imported from this package directly,
# without referencing the ``hybrid_chunker`` submodule.
from .hybrid_chunker import HybridChunker

__all__ = ["HybridChunker"]