longparser 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- longparser-0.1.5/LICENSE-THIRD-PARTY.md +50 -0
- {longparser-0.1.3 → longparser-0.1.5}/PKG-INFO +16 -2
- {longparser-0.1.3 → longparser-0.1.5}/README.md +6 -1
- {longparser-0.1.3 → longparser-0.1.5}/pyproject.toml +20 -1
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/__init__.py +6 -1
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/chunkers/hybrid_chunker.py +26 -4
- longparser-0.1.5/src/longparser/chunkers/quality_scorer.py +110 -0
- longparser-0.1.5/src/longparser/chunkers/semantic_boundary.py +67 -0
- longparser-0.1.5/src/longparser/extractors/marker_extractor.py +219 -0
- longparser-0.1.5/src/longparser/extractors/pymupdf_extractor.py +493 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/integrations/__init__.py +3 -3
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/integrations/langchain.py +3 -2
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/integrations/llamaindex.py +3 -2
- longparser-0.1.5/src/longparser/pipeline/cross_reference.py +227 -0
- longparser-0.1.5/src/longparser/pipeline/orchestrator.py +411 -0
- longparser-0.1.5/src/longparser/pipeline/pii_redactor.py +198 -0
- longparser-0.1.5/src/longparser/pipeline/summary_enricher.py +117 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/schemas.py +35 -1
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/worker.py +52 -0
- longparser-0.1.5/src/longparser/utils/__init__.py +14 -0
- longparser-0.1.5/src/longparser/utils/lang_detect.py +193 -0
- longparser-0.1.5/src/longparser/utils/ocr_router.py +148 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser.egg-info/PKG-INFO +16 -2
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser.egg-info/SOURCES.txt +10 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser.egg-info/requires.txt +10 -0
- longparser-0.1.3/src/longparser/pipeline/orchestrator.py +0 -230
- longparser-0.1.3/src/longparser/utils/__init__.py +0 -5
- {longparser-0.1.3 → longparser-0.1.5}/setup.cfg +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/chunkers/__init__.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/extractors/__init__.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/extractors/base.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/extractors/docling_extractor.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/extractors/latex_ocr.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/pipeline/__init__.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/py.typed +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/__init__.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/app.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/__init__.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/callbacks.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/checkpointer.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/engine.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/graph.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/llm_chain.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/retriever.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/schemas.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/db.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/embeddings.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/queue.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/routers/__init__.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/schemas.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/vectorstores.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser/utils/rtl_detector.py +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser.egg-info/dependency_links.txt +0 -0
- {longparser-0.1.3 → longparser-0.1.5}/src/longparser.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Third-Party Licenses
|
|
2
|
+
|
|
3
|
+
LongParser core is licensed under the **MIT License**.
|
|
4
|
+
|
|
5
|
+
Some **optional** backends and integrations use different licenses.
|
|
6
|
+
These packages are **never loaded by default** — they are only imported
|
|
7
|
+
when you explicitly install them and select them in your configuration.
|
|
8
|
+
|
|
9
|
+
## Optional Backend Licenses
|
|
10
|
+
|
|
11
|
+
| Package | License | Install Command | When Loaded |
|
|
12
|
+
|---------|---------|-----------------|-------------|
|
|
13
|
+
| `pymupdf4llm` | AGPL-3.0 or Artifex Commercial | `pip install "longparser[pymupdf]"` | Only when you set `backend="pymupdf"` |
|
|
14
|
+
| `marker-pdf` | GPL-3.0-or-later | `pip install "longparser[marker]"` | Only when you set `backend="marker"` *(future)* |
|
|
15
|
+
| `surya-ocr` | GPL-3.0-or-later | `pip install "longparser[surya]"` | Only when explicitly imported *(future)* |
|
|
16
|
+
|
|
17
|
+
## Core Dependency Licenses (always installed)
|
|
18
|
+
|
|
19
|
+
| Package | License | Purpose |
|
|
20
|
+
|---------|---------|---------|
|
|
21
|
+
| `pydantic` | MIT | Schema validation |
|
|
22
|
+
| `docling` | MIT | Default PDF extraction engine |
|
|
23
|
+
| `docling-core` | MIT | Docling data models |
|
|
24
|
+
| `fast-langdetect` | Apache-2.0 | Document language detection |
|
|
25
|
+
|
|
26
|
+
## What This Means for You
|
|
27
|
+
|
|
28
|
+
- **If you only use `pip install longparser`** — everything is MIT or Apache-2.0.
|
|
29
|
+
You can use LongParser in any project (commercial, proprietary, open source).
|
|
30
|
+
|
|
31
|
+
- **If you install `longparser[pymupdf]`** — the `pymupdf4llm` library is
|
|
32
|
+
AGPL-3.0 licensed. You must comply with AGPL terms for the PyMuPDF component,
|
|
33
|
+
OR purchase a commercial license from [Artifex](https://artifex.com).
|
|
34
|
+
LongParser core code remains MIT.
|
|
35
|
+
|
|
36
|
+
- **If you install `longparser[marker]`** *(future)* — the `marker-pdf` library
|
|
37
|
+
is GPL-3.0 licensed. You must comply with GPL terms for the Marker component.
|
|
38
|
+
LongParser core code remains MIT.
|
|
39
|
+
|
|
40
|
+
## License Isolation Guarantee
|
|
41
|
+
|
|
42
|
+
LongParser uses **lazy imports** to ensure GPL/AGPL packages are never loaded
|
|
43
|
+
unless explicitly requested. The following guarantees hold:
|
|
44
|
+
|
|
45
|
+
1. `import longparser` does NOT import any GPL/AGPL package
|
|
46
|
+
2. `from longparser import DocumentPipeline` does NOT import any GPL/AGPL package
|
|
47
|
+
3. `DocumentPipeline().process_file("doc.pdf")` does NOT import any GPL/AGPL
|
|
48
|
+
package (uses Docling, which is MIT)
|
|
49
|
+
4. GPL/AGPL code is only loaded when you explicitly set `backend="pymupdf"` or
|
|
50
|
+
`backend="marker"` in `ProcessingConfig`
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: longparser
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines.
|
|
5
5
|
Author-email: ENDEVSOLS Team <technology@endevsols.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -24,16 +24,24 @@ Classifier: Topic :: Text Processing :: General
|
|
|
24
24
|
Classifier: Typing :: Typed
|
|
25
25
|
Requires-Python: >=3.10
|
|
26
26
|
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE-THIRD-PARTY.md
|
|
27
28
|
Requires-Dist: pydantic<3,>=2.0
|
|
28
29
|
Requires-Dist: docling>=2.14
|
|
29
30
|
Requires-Dist: docling-core>=2.13
|
|
30
31
|
Requires-Dist: langgraph-checkpoint-mongodb>=0.3.1
|
|
32
|
+
Requires-Dist: fast-langdetect<1.0,>=0.3
|
|
31
33
|
Provides-Extra: pptx
|
|
32
34
|
Requires-Dist: python-pptx>=1.0; extra == "pptx"
|
|
33
35
|
Provides-Extra: langchain
|
|
34
36
|
Requires-Dist: langchain-core>=0.2; extra == "langchain"
|
|
35
37
|
Provides-Extra: llamaindex
|
|
36
38
|
Requires-Dist: llama-index-core>=0.10; extra == "llamaindex"
|
|
39
|
+
Provides-Extra: pymupdf
|
|
40
|
+
Requires-Dist: pymupdf4llm>=1.27; extra == "pymupdf"
|
|
41
|
+
Provides-Extra: ner
|
|
42
|
+
Requires-Dist: spacy>=3.7.0; extra == "ner"
|
|
43
|
+
Provides-Extra: marker
|
|
44
|
+
Requires-Dist: marker-pdf>=0.3.0; extra == "marker"
|
|
37
45
|
Provides-Extra: server
|
|
38
46
|
Requires-Dist: fastapi>=0.115; extra == "server"
|
|
39
47
|
Requires-Dist: uvicorn[standard]>=0.34; extra == "server"
|
|
@@ -108,6 +116,7 @@ Requires-Dist: build>=1.0; extra == "dev"
|
|
|
108
116
|
Requires-Dist: twine>=5.0; extra == "dev"
|
|
109
117
|
Requires-Dist: httpx>=0.27; extra == "dev"
|
|
110
118
|
Requires-Dist: anyio>=4.0; extra == "dev"
|
|
119
|
+
Dynamic: license-file
|
|
111
120
|
|
|
112
121
|
<p align="center">
|
|
113
122
|
<img src="https://raw.githubusercontent.com/ENDEVSOLS/LongParser/main/docs/assets/logo.png" alt="LongParser" width="320">
|
|
@@ -147,8 +156,13 @@ Requires-Dist: anyio>=4.0; extra == "dev"
|
|
|
147
156
|
|
|
148
157
|
| Feature | Detail |
|
|
149
158
|
|---------|--------|
|
|
150
|
-
| **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
|
|
159
|
+
| **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling & Marker |
|
|
151
160
|
| **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
|
|
161
|
+
| **Semantic chunking** | Embedding-based boundaries using `all-MiniLM-L6-v2` |
|
|
162
|
+
| **Cross-referencing** | Deterministic linking of explicit and implicit charts/figures |
|
|
163
|
+
| **Quality scoring** | Zero-ML heuristic scoring with dictionary & fastText validation |
|
|
164
|
+
| **PII redaction** | Hybrid Regex + NER (spaCy) redaction with secure HITL preservation |
|
|
165
|
+
| **Summary chunks** | Async ARQ worker generating hierarchical LLM section summaries |
|
|
152
166
|
| **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
|
|
153
167
|
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
|
|
154
168
|
| **3-layer memory** | Short-term turns + rolling summary + long-term facts |
|
|
@@ -36,8 +36,13 @@
|
|
|
36
36
|
|
|
37
37
|
| Feature | Detail |
|
|
38
38
|
|---------|--------|
|
|
39
|
-
| **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
|
|
39
|
+
| **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling & Marker |
|
|
40
40
|
| **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
|
|
41
|
+
| **Semantic chunking** | Embedding-based boundaries using `all-MiniLM-L6-v2` |
|
|
42
|
+
| **Cross-referencing** | Deterministic linking of explicit and implicit charts/figures |
|
|
43
|
+
| **Quality scoring** | Zero-ML heuristic scoring with dictionary & fastText validation |
|
|
44
|
+
| **PII redaction** | Hybrid Regex + NER (spaCy) redaction with secure HITL preservation |
|
|
45
|
+
| **Summary chunks** | Async ARQ worker generating hierarchical LLM section summaries |
|
|
41
46
|
| **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
|
|
42
47
|
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
|
|
43
48
|
| **3-layer memory** | Short-term turns + rolling summary + long-term facts |
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "longparser"
|
|
7
|
-
version = "0.1.3"
|
|
7
|
+
version = "0.1.5"
|
|
8
8
|
description = "Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines."
|
|
9
9
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -36,6 +36,7 @@ dependencies = [
|
|
|
36
36
|
"docling>=2.14",
|
|
37
37
|
"docling-core>=2.13",
|
|
38
38
|
"langgraph-checkpoint-mongodb>=0.3.1",
|
|
39
|
+
"fast-langdetect>=0.3,<1.0", # Apache-2.0 — document language detection
|
|
39
40
|
]
|
|
40
41
|
|
|
41
42
|
[project.optional-dependencies]
|
|
@@ -51,6 +52,24 @@ langchain = [
|
|
|
51
52
|
llamaindex = [
|
|
52
53
|
"llama-index-core>=0.10",
|
|
53
54
|
]
|
|
55
|
+
# ----------- v0.1.4: Optional extraction backends -----------
|
|
56
|
+
# ⚠️ pymupdf4llm is AGPL-3.0 licensed. See LICENSE-THIRD-PARTY.md.
|
|
57
|
+
# Only loaded when user sets backend="pymupdf".
|
|
58
|
+
pymupdf = [
|
|
59
|
+
"pymupdf4llm>=1.27",
|
|
60
|
+
]
|
|
61
|
+
# NER redaction (spaCy) for advanced PII detection
|
|
62
|
+
ner = [
|
|
63
|
+
"spacy>=3.7.0",
|
|
64
|
+
]
|
|
65
|
+
# ⚠️ marker-pdf is GPL-3.0. GPU recommended. Future release.
|
|
66
|
+
marker = [
|
|
67
|
+
"marker-pdf>=0.3.0",
|
|
68
|
+
]
|
|
69
|
+
# ⚠️ surya-ocr is GPL-3.0. GPU recommended. Future release.
|
|
70
|
+
# surya = [
|
|
71
|
+
# "surya-ocr>=0.17",
|
|
72
|
+
# ]
|
|
54
73
|
# FastAPI REST server + MongoDB + job queue + LangChain chat engine
|
|
55
74
|
server = [
|
|
56
75
|
"fastapi>=0.115",
|
|
@@ -25,7 +25,7 @@ point and :mod:`longparser.server` for the REST API layer.
|
|
|
25
25
|
|
|
26
26
|
from __future__ import annotations
|
|
27
27
|
|
|
28
|
-
__version__ = "0.1.3"
|
|
28
|
+
__version__ = "0.1.5"
|
|
29
29
|
__author__ = "ENDEVSOLS Team"
|
|
30
30
|
__license__ = "MIT"
|
|
31
31
|
|
|
@@ -59,6 +59,10 @@ def __getattr__(name: str):
|
|
|
59
59
|
if name == "DoclingExtractor":
|
|
60
60
|
from .extractors import DoclingExtractor
|
|
61
61
|
return DoclingExtractor
|
|
62
|
+
if name == "PyMuPDFExtractor":
|
|
63
|
+
# AGPL-isolated — only loaded when explicitly requested
|
|
64
|
+
from .extractors.pymupdf_extractor import PyMuPDFExtractor
|
|
65
|
+
return PyMuPDFExtractor
|
|
62
66
|
if name == "PipelineOrchestrator":
|
|
63
67
|
from .pipeline import PipelineOrchestrator
|
|
64
68
|
return PipelineOrchestrator
|
|
@@ -101,6 +105,7 @@ __all__ = [
|
|
|
101
105
|
"JobResult",
|
|
102
106
|
# Lazily imported (require extras)
|
|
103
107
|
"DoclingExtractor",
|
|
108
|
+
"PyMuPDFExtractor",
|
|
104
109
|
"PipelineOrchestrator",
|
|
105
110
|
"DocumentPipeline",
|
|
106
111
|
"PipelineResult",
|
|
@@ -620,6 +620,10 @@ class HybridChunker:
|
|
|
620
620
|
# --- Apply overlap ---
|
|
621
621
|
all_chunks = self._apply_overlap(all_chunks)
|
|
622
622
|
|
|
623
|
+
# --- Quality score ---
|
|
624
|
+
from .quality_scorer import score_chunks
|
|
625
|
+
all_chunks = score_chunks(all_chunks, blocks)
|
|
626
|
+
|
|
623
627
|
logger.info(f"[HybridChunker] Done — {len(all_chunks)} chunks produced")
|
|
624
628
|
return all_chunks
|
|
625
629
|
|
|
@@ -749,11 +753,25 @@ class HybridChunker:
|
|
|
749
753
|
Equations are kept with their surrounding context.
|
|
750
754
|
"""
|
|
751
755
|
chunks: list[Chunk] = []
|
|
756
|
+
|
|
757
|
+
# Pre-compute semantic boundaries if enabled
|
|
758
|
+
semantic_boundaries = set()
|
|
759
|
+
if self.config.use_semantic_chunking:
|
|
760
|
+
from .semantic_boundary import find_semantic_boundaries
|
|
761
|
+
semantic_boundaries = set(find_semantic_boundaries(
|
|
762
|
+
[b.text.strip() for b in blocks if b.text.strip()],
|
|
763
|
+
threshold=self.config.semantic_threshold,
|
|
764
|
+
model_name=self.config.semantic_model,
|
|
765
|
+
))
|
|
766
|
+
|
|
752
767
|
current_texts: list[str] = []
|
|
753
768
|
current_ids: list[str] = []
|
|
754
769
|
current_pages: set[int] = set()
|
|
755
770
|
current_tokens = 0
|
|
756
771
|
has_equation = False
|
|
772
|
+
|
|
773
|
+
# We need an index over valid blocks to match semantic_boundaries
|
|
774
|
+
block_idx = 0
|
|
757
775
|
|
|
758
776
|
for block in blocks:
|
|
759
777
|
text = block.text.strip()
|
|
@@ -761,10 +779,12 @@ class HybridChunker:
|
|
|
761
779
|
continue
|
|
762
780
|
|
|
763
781
|
block_tokens = _count_tokens(text)
|
|
764
|
-
|
|
765
|
-
#
|
|
766
|
-
|
|
767
|
-
|
|
782
|
+
|
|
783
|
+
# Flush condition: Token limit reached OR semantic boundary hit
|
|
784
|
+
hit_limit = current_tokens + block_tokens > self.config.max_tokens
|
|
785
|
+
hit_semantic = block_idx in semantic_boundaries
|
|
786
|
+
|
|
787
|
+
if (hit_limit or hit_semantic) and current_texts:
|
|
768
788
|
|
|
769
789
|
carry_text = None
|
|
770
790
|
carry_id = None
|
|
@@ -811,6 +831,8 @@ class HybridChunker:
|
|
|
811
831
|
|
|
812
832
|
if block.type == BlockType.EQUATION:
|
|
813
833
|
has_equation = True
|
|
834
|
+
|
|
835
|
+
block_idx += 1
|
|
814
836
|
|
|
815
837
|
# Flush remaining
|
|
816
838
|
if current_texts:
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Chunk quality scorer based on token-weighted confidence and noise penalties."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import re
|
|
7
|
+
from typing import Dict, Set
|
|
8
|
+
|
|
9
|
+
from ..schemas import Block, Chunk
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# --- Lazy-loaded resources ---
|
|
14
|
+
_english_words: Set[str] | None = None
|
|
15
|
+
|
|
16
|
+
def _get_english_words() -> Set[str]:
    """Return the cached, lowercased word set read from the system dictionary.

    The file ``/usr/share/dict/words`` is read at most once per process; the
    result is stored in the module-level ``_english_words`` cache. When the
    file cannot be read the cache is left as an empty set, which callers can
    treat as "no dictionary available".

    Returns:
        The cached set of lowercase dictionary entries (possibly empty).
    """
    global _english_words

    # Fast path: a previous call already populated (or emptied) the cache.
    if _english_words is not None:
        return _english_words

    # Start from an empty set so a failed read still leaves a usable cache.
    _english_words = set()
    try:
        # Common Unix location of the word list.
        with open("/usr/share/dict/words", "r", encoding="utf-8") as handle:
            _english_words = {entry.strip().lower() for entry in handle}
        logger.info(f"Loaded {len(_english_words)} words for quality scoring")
    except Exception:
        # Best-effort: any failure simply disables the coverage metric.
        logger.debug("System dictionary not found. Word coverage metric will be skipped.")

    return _english_words
|
|
29
|
+
|
|
30
|
+
def _get_lang_confidence(text: str) -> float:
|
|
31
|
+
"""Get fastText language detection confidence (0.0 to 1.0)."""
|
|
32
|
+
text = text.strip().replace("\n", " ")
|
|
33
|
+
if len(text) < 10:
|
|
34
|
+
return 1.0 # Too short to reliably detect, assume okay
|
|
35
|
+
|
|
36
|
+
try:
|
|
37
|
+
from fast_langdetect import detect
|
|
38
|
+
res = detect(text)
|
|
39
|
+
return res.get("score", 1.0)
|
|
40
|
+
except Exception:
|
|
41
|
+
return 1.0
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def score_chunks(chunks: list[Chunk], blocks: list[Block]) -> list[Chunk]:
    """Attach a heuristic ``quality_score`` in ``[0.0, 1.0]`` to every chunk.

    The score starts from the average confidence of the chunk's source
    blocks, weighted by each block's text length in characters (a block
    without a confidence counts as 1.0). Penalties are subtracted for symbol
    density, low system-dictionary word coverage and low language-detection
    confidence; a small bonus is added when the chunk ends with
    sentence-final punctuation.

    Args:
        chunks: Chunks to score; each one is updated in place.
        blocks: Blocks the chunks were built from, matched through
            ``block_id``.

    Returns:
        The same ``chunks`` list. It is returned untouched when either
        argument is empty.
    """
    if not (chunks and blocks):
        return chunks

    # Index blocks by id so each chunk can resolve its members directly.
    blocks_by_id: Dict[str, Block] = {}
    for candidate in blocks:
        blocks_by_id[candidate.block_id] = candidate

    for chunk in chunks:
        members = [
            blocks_by_id[block_id]
            for block_id in chunk.block_ids
            if block_id in blocks_by_id
        ]

        # None of the referenced blocks are known: use a neutral fallback.
        if not members:
            chunk.quality_score = 0.5
            continue

        # 1. Base score: block confidence averaged by text length.
        total_weight = sum(len(member.text) for member in members)
        weighted_sum = sum(
            (member.confidence.overall if member.confidence else 1.0) * len(member.text)
            for member in members
        )
        if total_weight > 0:
            base_score = weighted_sum / total_weight
        else:
            base_score = 0.5

        # 2. Noise penalty: share of characters that are neither
        #    alphanumeric nor ordinary prose punctuation/whitespace.
        body = chunk.text
        noise_chars = sum(
            1
            for ch in body
            if not ch.isalnum() and ch not in ' .,;:!?()-"\'\n\t'
        )
        noise_ratio = noise_chars / max(len(body), 1)
        # The noise component alone is capped at 0.5.
        penalty = min(noise_ratio * 2.0, 0.5)

        # 3. Dictionary coverage penalty (skipped when no dictionary loaded).
        vocabulary = _get_english_words()
        if vocabulary:
            # Alphabetic tokens of at least two letters, lowercased.
            tokens = [tok.lower() for tok in re.findall(r'\b[a-zA-Z]{2,}\b', body)]
            if tokens:
                known = sum(1 for tok in tokens if tok in vocabulary)
                coverage = known / len(tokens)
                # Below 60% coverage, add the shortfall, at most 0.3.
                if coverage < 0.6:
                    penalty += min((0.6 - coverage), 0.3)

        # 4. Language-confidence penalty: 0 at 0.8 confidence, 0.4 at 0.0.
        lang_score = _get_lang_confidence(body)
        if lang_score < 0.8:
            penalty += (0.8 - lang_score) * 0.5

        # 5. Completeness bonus for text ending like a finished sentence.
        if body.rstrip().endswith(('.', '!', '?', ':', '"')):
            bonus = 0.05
        else:
            bonus = 0.0

        # Cap the combined penalty at 0.8, then clamp the result to [0, 1].
        total_penalty = min(penalty, 0.8)
        raw_score = base_score - total_penalty + bonus
        chunk.quality_score = max(0.0, min(1.0, raw_score))

    return chunks
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Semantic boundary detection using SentenceTransformers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import List
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
_models: dict = {}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _get_model(model_name: str = "all-MiniLM-L6-v2"):
    """Return the SentenceTransformer for ``model_name``, loading it on first use.

    Loaded models are kept in the module-level ``_models`` dict, keyed by
    name, so each model is constructed at most once per process.

    Args:
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        The cached or newly loaded model, or ``None`` when an ``ImportError``
        is raised while loading (a warning is logged in that case).
    """
    # Already loaded: hand back the cached instance.
    if model_name in _models:
        return _models[model_name]

    try:
        # Deferred import keeps sentence-transformers optional.
        from sentence_transformers import SentenceTransformer

        encoder = SentenceTransformer(model_name)
        _models[model_name] = encoder
        logger.info("Loaded semantic chunking model: %s", model_name)
    except ImportError:
        logger.warning("sentence-transformers not installed. Semantic chunking disabled.")
        return None

    return encoder
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def find_semantic_boundaries(
    texts: List[str],
    threshold: float = 0.3,
    model_name: str = "all-MiniLM-L6-v2",
) -> List[int]:
    """Find semantic boundaries in a list of texts.

    Every text is embedded, and the cosine similarity of each consecutive
    pair is compared against ``threshold``.

    Args:
        texts: List of block texts in reading order.
        threshold: Cosine similarity threshold. A pair scoring strictly below
            this value is treated as a topic shift.
        model_name: Name of the SentenceTransformer model used for the
            embeddings.

    Returns:
        Indices into ``texts`` where a semantic shift occurs (the boundary is
        *before* the index). Empty when fewer than two texts are given or
        when the embedding model could not be loaded.
    """
    if not texts or len(texts) < 2:
        return []

    model = _get_model(model_name)
    # ``_get_model`` signals "unavailable" with ``None``. Compare by identity:
    # truthiness of a model object depends on its own ``__len__``/``__bool__``
    # and is not a reliable "was it loaded" check.
    if model is None:
        return []

    # Batch encode all texts in one call.
    embeddings = model.encode(texts, batch_size=64, show_progress_bar=False)

    # Imported lazily so numpy is only needed once a model is available.
    import numpy as np

    def cosine_sim(a, b) -> float:
        """Cosine similarity of two vectors; 0.0 if either has zero norm."""
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(a, b) / (norm_a * norm_b))

    # Pair each embedding with its successor; ``start=1`` makes ``idx`` the
    # position of the *second* element, i.e. the block the boundary precedes.
    return [
        idx
        for idx, (previous, current) in enumerate(
            zip(embeddings, embeddings[1:]), start=1
        )
        if cosine_sim(previous, current) < threshold
    ]
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""Marker-based extractor for high-fidelity extraction on complex PDFs.
|
|
2
|
+
|
|
3
|
+
⚠️ LICENSE NOTICE — GPL-3.0
|
|
4
|
+
marker-pdf is licensed under GPL-3.0.
|
|
5
|
+
By using this backend, you agree to the terms of the GPL-3.0 license.
|
|
6
|
+
|
|
7
|
+
This module is NOT imported by default — users must explicitly opt in
|
|
8
|
+
via ``pip install longparser[marker]`` and ``backend='marker'``.
|
|
9
|
+
|
|
10
|
+
⚠️ ISOLATION RULES (do NOT violate)
|
|
11
|
+
1. This file must NEVER be imported by ``extractors/__init__.py``
|
|
12
|
+
2. This file must NEVER be imported at module level by ``orchestrator.py``
|
|
13
|
+
3. This file must ONLY be imported behind ``if backend == "marker":``
|
|
14
|
+
4. ``import longparser`` must NEVER trigger loading this file
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import hashlib
|
|
20
|
+
import logging
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Optional, List, Tuple
|
|
23
|
+
|
|
24
|
+
from ..schemas import (
|
|
25
|
+
Document, Page, Block, BlockType, ExtractorType, ProcessingConfig,
|
|
26
|
+
BoundingBox, Provenance, Confidence, DocumentMetadata, PageProfile, ExtractionMetadata
|
|
27
|
+
)
|
|
28
|
+
from .base import BaseExtractor
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _require_marker():
|
|
34
|
+
"""Check that marker-pdf is installed; raise clear error if not."""
|
|
35
|
+
try:
|
|
36
|
+
import marker
|
|
37
|
+
return marker
|
|
38
|
+
except ImportError:
|
|
39
|
+
raise ImportError(
|
|
40
|
+
"\n"
|
|
41
|
+
"╔══════════════════════════════════════════════════════════╗\n"
|
|
42
|
+
"║ marker-pdf is not installed. ║\n"
|
|
43
|
+
"║ ║\n"
|
|
44
|
+
"║ Install: pip install 'longparser[marker]' ║\n"
|
|
45
|
+
"║ ║\n"
|
|
46
|
+
"║ ⚠️ marker-pdf is licensed under GPL-3.0. ║\n"
|
|
47
|
+
"║ By installing it, you agree to GPL terms for that ║\n"
|
|
48
|
+
"║ component. LongParser core remains MIT-licensed. ║\n"
|
|
49
|
+
"╚══════════════════════════════════════════════════════════╝\n"
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class MarkerExtractor(BaseExtractor):
    """Extractor that converts a PDF with Marker and maps its Markdown to blocks.

    Marker returns one flat Markdown string, so the resulting ``Document``
    holds a single ``Page`` whose blocks are the non-empty Markdown lines.
    On machines without CUDA or MPS, documents longer than 10 pages are
    rejected unless ``config.force_marker_cpu`` is set.
    """

    extractor_type = ExtractorType.MARKER
    version = "1.0.0"

    def __init__(self):
        """Verify marker-pdf is importable and warn when no GPU is available."""
        _require_marker()

        # Check for GPU. torch being absent is not an error here: the check
        # is advisory only and the hard limit is enforced in ``extract``.
        try:
            import torch
            if not torch.cuda.is_available() and not torch.backends.mps.is_available():
                logger.warning(
                    "⚠️ Marker is running on CPU — expect 5-10× slower extraction. "
                    "A soft cap of 10 pages is enforced by default. "
                    "Set `force_marker_cpu=True` to bypass this."
                )
        except ImportError:
            pass

        logger.info("Marker backend initialized")

    def extract(
        self,
        file_path: Path,
        config: ProcessingConfig,
        page_numbers: Optional[List[int]] = None,
    ) -> Tuple[Document, ExtractionMetadata]:
        """Extract a PDF using Marker.

        Args:
            file_path: Path to the PDF file.
            config: Processing configuration; ``force_marker_cpu`` and
                ``languages`` are read here.
            page_numbers: Optional pages to restrict extraction to. Only the
                first entry and the list length are forwarded to Marker.

        Returns:
            The mapped ``Document`` and the ``ExtractionMetadata`` describing
            the Marker run.

        Raises:
            ValueError: If ``file_path`` does not have a ``.pdf`` suffix.
            RuntimeError: If running on CPU, the document has more than 10
                pages, more than 10 pages are requested (or no restriction is
                given), and ``config.force_marker_cpu`` is false.
        """
        # Lazy imports keep the GPL-licensed backend out of ``import longparser``.
        # NOTE(review): these entry points look like the marker-pdf 0.x API;
        # confirm they exist in every version allowed by the `marker` extra.
        from marker.convert import convert_single_pdf
        from marker.models import load_all_models
        from marker.settings import settings
        import fitz  # PyMuPDF is a marker dependency anyway

        file_path = Path(file_path)
        logger.info("Extracting with Marker: %s", file_path.name)

        if file_path.suffix.lower() != ".pdf":
            raise ValueError(f"Marker backend only supports PDF files, got: {file_path.suffix}")

        # Only the page count is needed; always release the file handle,
        # even if counting fails.
        pdf_doc = fitz.open(str(file_path))
        try:
            total_pages = len(pdf_doc)
        finally:
            pdf_doc.close()

        # Soft cap logic for CPU. Without torch we cannot detect a GPU, so
        # assume CPU.
        try:
            import torch
            is_cpu = not torch.cuda.is_available() and not torch.backends.mps.is_available()
        except ImportError:
            is_cpu = True

        if is_cpu and not config.force_marker_cpu and total_pages > 10:
            if page_numbers is None or len(page_numbers) > 10:
                raise RuntimeError(
                    f"Marker CPU Soft Cap exceeded. Document has {total_pages} pages "
                    f"(limit: 10). Extraction will take too long on CPU. "
                    f"Set config.force_marker_cpu=True to override."
                )

        # Short content hash stored in the document metadata.
        file_hash = hashlib.sha256(file_path.read_bytes()).hexdigest()[:16]

        # Load models (cached internally by Marker)
        model_lst = load_all_models()

        # Convert.
        # NOTE(review): ``page_numbers`` is forwarded as "start at the first
        # entry, take len(page_numbers) pages", which assumes a contiguous
        # range. Also confirm whether Marker's ``start_page`` uses the same
        # index base as LongParser page numbers.
        full_text, images, out_meta = convert_single_pdf(
            str(file_path),
            model_lst,
            max_pages=settings.MAX_PAGES if not page_numbers else len(page_numbers),
            langs=config.languages if config.languages else None,
            batch_multiplier=settings.BATCH_MULTIPLIER,
            start_page=page_numbers[0] if page_numbers else None
        )

        # When exactly one page was requested, the Markdown belongs to that
        # page, so label it accordingly instead of always reporting page 1.
        if page_numbers and len(page_numbers) == 1:
            labelled_page = page_numbers[0]
        else:
            labelled_page = 1

        # Map to LongParser Document
        # Note: Marker's output is flat markdown, so we do a fast mapping
        # similar to PyMuPDFExtractor.
        document = self._markdown_to_document(
            md_text=full_text,
            file_path=file_path,
            file_hash=file_hash,
            total_pages=total_pages,
            page_number=labelled_page,
        )

        meta = ExtractionMetadata(
            strategy_used="marker",
            ocr_backend_used="surya (marker)",
        )

        return document, meta

    def _markdown_to_document(
        self,
        md_text: str,
        file_path: Path,
        file_hash: str,
        total_pages: int,
        page_number: int = 1,
    ) -> Document:
        """Convert Marker's markdown into a LongParser Document.

        Each non-empty line becomes one block: lines starting with ``#`` are
        headings, lines starting with ``- `` or ``* `` are list items, and
        everything else is a paragraph. All blocks go onto one page.

        Args:
            md_text: Markdown text returned by Marker.
            file_path: Source file, recorded in metadata and provenance.
            file_hash: Content hash recorded in the document metadata.
            total_pages: Page count of the source PDF.
            page_number: Page number assigned to the single page and to every
                block's provenance. Defaults to 1 because Marker's Markdown
                string carries no page boundaries.

        Returns:
            A ``Document`` with one ``Page`` containing all blocks.
        """
        metadata = DocumentMetadata(
            source_file=str(file_path),
            file_hash=file_hash,
            total_pages=total_pages,
        )

        pages: list[Page] = []
        blocks: list[Block] = []

        lines = md_text.strip().split("\n")
        order_idx = 0

        # Fast parse
        for line in lines:
            stripped = line.strip()
            if not stripped:
                continue

            block_type = BlockType.PARAGRAPH
            heading_level = None

            if stripped.startswith("#"):
                block_type = BlockType.HEADING
                # Heading level is the count of leading '#', capped at 6.
                heading_level = min(len(stripped) - len(stripped.lstrip("#")), 6)
                stripped = stripped.lstrip("#").strip()
            elif stripped.startswith(("- ", "* ")):
                block_type = BlockType.LIST_ITEM
                # Drop only the two-character bullet. ``lstrip("-* ")`` would
                # strip a character set and also eat leading emphasis markers
                # or minus signs, e.g. "- **bold** x" -> "bold** x".
                stripped = stripped[2:].strip()

            blocks.append(Block(
                type=block_type,
                text=stripped,
                order_index=order_idx,
                heading_level=heading_level,
                provenance=Provenance(
                    source_file=str(file_path),
                    page_number=page_number,  # Marker loses page boundaries in its markdown string
                    bbox=BoundingBox(x0=0, y0=0, x1=0, y1=0),
                    extractor=self.extractor_type,
                    extractor_version=self.version,
                ),
                confidence=Confidence(overall=0.9),
            ))
            order_idx += 1

        # Page geometry is a fixed placeholder (612 x 792); the real page
        # size is not read from the PDF here.
        pages.append(Page(
            page_number=page_number,
            width=612.0,
            height=792.0,
            blocks=blocks,
            profile=PageProfile(page_number=page_number, layout_confidence=0.9)
        ))

        return Document(metadata=metadata, pages=pages)

    def extract_page(
        self,
        file_path: Path,
        page_number: int,
        config: ProcessingConfig,
    ) -> Page:
        """Extract a single page by delegating to ``extract``.

        Args:
            file_path: Path to the PDF file.
            page_number: Page to extract.
            config: Processing configuration forwarded to ``extract``.

        Returns:
            The first page of the extracted document, or an empty ``Page``
            (zero width and height) when extraction produced no pages.
        """
        doc, _ = self.extract(file_path, config, page_numbers=[page_number])
        return doc.pages[0] if doc.pages else Page(page_number=page_number, width=0, height=0)
|