longparser 0.1.3__tar.gz → 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. longparser-0.1.5/LICENSE-THIRD-PARTY.md +50 -0
  2. {longparser-0.1.3 → longparser-0.1.5}/PKG-INFO +16 -2
  3. {longparser-0.1.3 → longparser-0.1.5}/README.md +6 -1
  4. {longparser-0.1.3 → longparser-0.1.5}/pyproject.toml +20 -1
  5. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/__init__.py +6 -1
  6. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/chunkers/hybrid_chunker.py +26 -4
  7. longparser-0.1.5/src/longparser/chunkers/quality_scorer.py +110 -0
  8. longparser-0.1.5/src/longparser/chunkers/semantic_boundary.py +67 -0
  9. longparser-0.1.5/src/longparser/extractors/marker_extractor.py +219 -0
  10. longparser-0.1.5/src/longparser/extractors/pymupdf_extractor.py +493 -0
  11. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/integrations/__init__.py +3 -3
  12. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/integrations/langchain.py +3 -2
  13. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/integrations/llamaindex.py +3 -2
  14. longparser-0.1.5/src/longparser/pipeline/cross_reference.py +227 -0
  15. longparser-0.1.5/src/longparser/pipeline/orchestrator.py +411 -0
  16. longparser-0.1.5/src/longparser/pipeline/pii_redactor.py +198 -0
  17. longparser-0.1.5/src/longparser/pipeline/summary_enricher.py +117 -0
  18. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/schemas.py +35 -1
  19. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/worker.py +52 -0
  20. longparser-0.1.5/src/longparser/utils/__init__.py +14 -0
  21. longparser-0.1.5/src/longparser/utils/lang_detect.py +193 -0
  22. longparser-0.1.5/src/longparser/utils/ocr_router.py +148 -0
  23. {longparser-0.1.3 → longparser-0.1.5}/src/longparser.egg-info/PKG-INFO +16 -2
  24. {longparser-0.1.3 → longparser-0.1.5}/src/longparser.egg-info/SOURCES.txt +10 -0
  25. {longparser-0.1.3 → longparser-0.1.5}/src/longparser.egg-info/requires.txt +10 -0
  26. longparser-0.1.3/src/longparser/pipeline/orchestrator.py +0 -230
  27. longparser-0.1.3/src/longparser/utils/__init__.py +0 -5
  28. {longparser-0.1.3 → longparser-0.1.5}/setup.cfg +0 -0
  29. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/chunkers/__init__.py +0 -0
  30. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/extractors/__init__.py +0 -0
  31. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/extractors/base.py +0 -0
  32. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/extractors/docling_extractor.py +0 -0
  33. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/extractors/latex_ocr.py +0 -0
  34. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/pipeline/__init__.py +0 -0
  35. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/py.typed +0 -0
  36. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/__init__.py +0 -0
  37. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/app.py +0 -0
  38. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/__init__.py +0 -0
  39. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/callbacks.py +0 -0
  40. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/checkpointer.py +0 -0
  41. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/engine.py +0 -0
  42. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/graph.py +0 -0
  43. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/llm_chain.py +0 -0
  44. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/retriever.py +0 -0
  45. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/chat/schemas.py +0 -0
  46. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/db.py +0 -0
  47. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/embeddings.py +0 -0
  48. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/queue.py +0 -0
  49. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/routers/__init__.py +0 -0
  50. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/schemas.py +0 -0
  51. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/server/vectorstores.py +0 -0
  52. {longparser-0.1.3 → longparser-0.1.5}/src/longparser/utils/rtl_detector.py +0 -0
  53. {longparser-0.1.3 → longparser-0.1.5}/src/longparser.egg-info/dependency_links.txt +0 -0
  54. {longparser-0.1.3 → longparser-0.1.5}/src/longparser.egg-info/top_level.txt +0 -0
@@ -0,0 +1,50 @@
1
+ # Third-Party Licenses
2
+
3
+ LongParser core is licensed under the **MIT License**.
4
+
5
+ Some **optional** backends and integrations use different licenses.
6
+ These packages are **never loaded by default** — they are only imported
7
+ when you explicitly install them and select them in your configuration.
8
+
9
+ ## Optional Backend Licenses
10
+
11
+ | Package | License | Install Command | When Loaded |
12
+ |---------|---------|-----------------|-------------|
13
+ | `pymupdf4llm` | AGPL-3.0 or Artifex Commercial | `pip install "longparser[pymupdf]"` | Only when you set `backend="pymupdf"` |
14
+ | `marker-pdf` | GPL-3.0-or-later | `pip install "longparser[marker]"` | Only when you set `backend="marker"` *(future)* |
15
+ | `surya-ocr` | GPL-3.0-or-later | `pip install "longparser[surya]"` | Only when explicitly imported *(future)* |
16
+
17
+ ## Core Dependency Licenses (always installed)
18
+
19
+ | Package | License | Purpose |
20
+ |---------|---------|---------|
21
+ | `pydantic` | MIT | Schema validation |
22
+ | `docling` | MIT | Default PDF extraction engine |
23
+ | `docling-core` | MIT | Docling data models |
24
+ | `fast-langdetect` | Apache-2.0 | Document language detection |
25
+
26
+ ## What This Means for You
27
+
28
+ - **If you only use `pip install longparser`** — everything is MIT or Apache-2.0.
29
+ You can use LongParser in any project (commercial, proprietary, open source).
30
+
31
+ - **If you install `longparser[pymupdf]`** — the `pymupdf4llm` library is
32
+ AGPL-3.0 licensed. You must comply with AGPL terms for the PyMuPDF component,
33
+ OR purchase a commercial license from [Artifex](https://artifex.com).
34
+ LongParser core code remains MIT.
35
+
36
+ - **If you install `longparser[marker]`** *(future)* — the `marker-pdf` library
37
+ is GPL-3.0 licensed. You must comply with GPL terms for the Marker component.
38
+ LongParser core code remains MIT.
39
+
40
+ ## License Isolation Guarantee
41
+
42
+ LongParser uses **lazy imports** to ensure GPL/AGPL packages are never loaded
43
+ unless explicitly requested. The following guarantees hold:
44
+
45
+ 1. `import longparser` does NOT import any GPL/AGPL package
46
+ 2. `from longparser import DocumentPipeline` does NOT import any GPL/AGPL package
47
+ 3. `DocumentPipeline().process_file("doc.pdf")` does NOT import any GPL/AGPL
48
+ package (uses Docling, which is MIT)
49
+ 4. GPL/AGPL code is only loaded when you explicitly set `backend="pymupdf"` or
50
+ `backend="marker"` in `ProcessingConfig`
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: longparser
3
- Version: 0.1.3
3
+ Version: 0.1.5
4
4
  Summary: Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines.
5
5
  Author-email: ENDEVSOLS Team <technology@endevsols.com>
6
6
  License-Expression: MIT
@@ -24,16 +24,24 @@ Classifier: Topic :: Text Processing :: General
24
24
  Classifier: Typing :: Typed
25
25
  Requires-Python: >=3.10
26
26
  Description-Content-Type: text/markdown
27
+ License-File: LICENSE-THIRD-PARTY.md
27
28
  Requires-Dist: pydantic<3,>=2.0
28
29
  Requires-Dist: docling>=2.14
29
30
  Requires-Dist: docling-core>=2.13
30
31
  Requires-Dist: langgraph-checkpoint-mongodb>=0.3.1
32
+ Requires-Dist: fast-langdetect<1.0,>=0.3
31
33
  Provides-Extra: pptx
32
34
  Requires-Dist: python-pptx>=1.0; extra == "pptx"
33
35
  Provides-Extra: langchain
34
36
  Requires-Dist: langchain-core>=0.2; extra == "langchain"
35
37
  Provides-Extra: llamaindex
36
38
  Requires-Dist: llama-index-core>=0.10; extra == "llamaindex"
39
+ Provides-Extra: pymupdf
40
+ Requires-Dist: pymupdf4llm>=1.27; extra == "pymupdf"
41
+ Provides-Extra: ner
42
+ Requires-Dist: spacy>=3.7.0; extra == "ner"
43
+ Provides-Extra: marker
44
+ Requires-Dist: marker-pdf>=0.3.0; extra == "marker"
37
45
  Provides-Extra: server
38
46
  Requires-Dist: fastapi>=0.115; extra == "server"
39
47
  Requires-Dist: uvicorn[standard]>=0.34; extra == "server"
@@ -108,6 +116,7 @@ Requires-Dist: build>=1.0; extra == "dev"
108
116
  Requires-Dist: twine>=5.0; extra == "dev"
109
117
  Requires-Dist: httpx>=0.27; extra == "dev"
110
118
  Requires-Dist: anyio>=4.0; extra == "dev"
119
+ Dynamic: license-file
111
120
 
112
121
  <p align="center">
113
122
  <img src="https://raw.githubusercontent.com/ENDEVSOLS/LongParser/main/docs/assets/logo.png" alt="LongParser" width="320">
@@ -147,8 +156,13 @@ Requires-Dist: anyio>=4.0; extra == "dev"
147
156
 
148
157
  | Feature | Detail |
149
158
  |---------|--------|
150
- | **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
159
+ | **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling & Marker |
151
160
  | **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
161
+ | **Semantic chunking** | Embedding-based boundaries using `all-MiniLM-L6-v2` |
162
+ | **Cross-referencing** | Deterministic linking of explicit and implicit charts/figures |
163
+ | **Quality scoring** | Zero-ML heuristic scoring with dictionary & fastText validation |
164
+ | **PII redaction** | Hybrid Regex + NER (spaCy) redaction with secure HITL preservation |
165
+ | **Summary chunks** | Async ARQ worker generating hierarchical LLM section summaries |
152
166
  | **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
153
167
  | **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
154
168
  | **3-layer memory** | Short-term turns + rolling summary + long-term facts |
@@ -36,8 +36,13 @@
36
36
 
37
37
  | Feature | Detail |
38
38
  |---------|--------|
39
- | **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
39
+ | **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling & Marker |
40
40
  | **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
41
+ | **Semantic chunking** | Embedding-based boundaries using `all-MiniLM-L6-v2` |
42
+ | **Cross-referencing** | Deterministic linking of explicit and implicit charts/figures |
43
+ | **Quality scoring** | Zero-ML heuristic scoring with dictionary & fastText validation |
44
+ | **PII redaction** | Hybrid Regex + NER (spaCy) redaction with secure HITL preservation |
45
+ | **Summary chunks** | Async ARQ worker generating hierarchical LLM section summaries |
41
46
  | **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
42
47
  | **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
43
48
  | **3-layer memory** | Short-term turns + rolling summary + long-term facts |
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "longparser"
7
- version = "0.1.3"
7
+ version = "0.1.5"
8
8
  description = "Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines."
9
9
  readme = {file = "README.md", content-type = "text/markdown"}
10
10
  requires-python = ">=3.10"
@@ -36,6 +36,7 @@ dependencies = [
36
36
  "docling>=2.14",
37
37
  "docling-core>=2.13",
38
38
  "langgraph-checkpoint-mongodb>=0.3.1",
39
+ "fast-langdetect>=0.3,<1.0", # Apache-2.0 — document language detection
39
40
  ]
40
41
 
41
42
  [project.optional-dependencies]
@@ -51,6 +52,24 @@ langchain = [
51
52
  llamaindex = [
52
53
  "llama-index-core>=0.10",
53
54
  ]
55
+ # ----------- v0.1.4: Optional extraction backends -----------
56
+ # ⚠️ pymupdf4llm is AGPL-3.0 licensed. See LICENSE-THIRD-PARTY.md.
57
+ # Only loaded when user sets backend="pymupdf".
58
+ pymupdf = [
59
+ "pymupdf4llm>=1.27",
60
+ ]
61
+ # NER redaction (spaCy) for advanced PII detection
62
+ ner = [
63
+ "spacy>=3.7.0",
64
+ ]
65
+ # ⚠️ marker-pdf is GPL-3.0. GPU recommended. Future release.
66
+ marker = [
67
+ "marker-pdf>=0.3.0",
68
+ ]
69
+ # ⚠️ surya-ocr is GPL-3.0. GPU recommended. Future release.
70
+ # surya = [
71
+ # "surya-ocr>=0.17",
72
+ # ]
54
73
  # FastAPI REST server + MongoDB + job queue + LangChain chat engine
55
74
  server = [
56
75
  "fastapi>=0.115",
@@ -25,7 +25,7 @@ point and :mod:`longparser.server` for the REST API layer.
25
25
 
26
26
  from __future__ import annotations
27
27
 
28
- __version__ = "0.1.3"
28
+ __version__ = "0.1.5"
29
29
  __author__ = "ENDEVSOLS Team"
30
30
  __license__ = "MIT"
31
31
 
@@ -59,6 +59,10 @@ def __getattr__(name: str):
59
59
  if name == "DoclingExtractor":
60
60
  from .extractors import DoclingExtractor
61
61
  return DoclingExtractor
62
+ if name == "PyMuPDFExtractor":
63
+ # AGPL-isolated — only loaded when explicitly requested
64
+ from .extractors.pymupdf_extractor import PyMuPDFExtractor
65
+ return PyMuPDFExtractor
62
66
  if name == "PipelineOrchestrator":
63
67
  from .pipeline import PipelineOrchestrator
64
68
  return PipelineOrchestrator
@@ -101,6 +105,7 @@ __all__ = [
101
105
  "JobResult",
102
106
  # Lazily imported (require extras)
103
107
  "DoclingExtractor",
108
+ "PyMuPDFExtractor",
104
109
  "PipelineOrchestrator",
105
110
  "DocumentPipeline",
106
111
  "PipelineResult",
@@ -620,6 +620,10 @@ class HybridChunker:
620
620
  # --- Apply overlap ---
621
621
  all_chunks = self._apply_overlap(all_chunks)
622
622
 
623
+ # --- Quality score ---
624
+ from .quality_scorer import score_chunks
625
+ all_chunks = score_chunks(all_chunks, blocks)
626
+
623
627
  logger.info(f"[HybridChunker] Done — {len(all_chunks)} chunks produced")
624
628
  return all_chunks
625
629
 
@@ -749,11 +753,25 @@ class HybridChunker:
749
753
  Equations are kept with their surrounding context.
750
754
  """
751
755
  chunks: list[Chunk] = []
756
+
757
+ # Pre-compute semantic boundaries if enabled
758
+ semantic_boundaries = set()
759
+ if self.config.use_semantic_chunking:
760
+ from .semantic_boundary import find_semantic_boundaries
761
+ semantic_boundaries = set(find_semantic_boundaries(
762
+ [b.text.strip() for b in blocks if b.text.strip()],
763
+ threshold=self.config.semantic_threshold,
764
+ model_name=self.config.semantic_model,
765
+ ))
766
+
752
767
  current_texts: list[str] = []
753
768
  current_ids: list[str] = []
754
769
  current_pages: set[int] = set()
755
770
  current_tokens = 0
756
771
  has_equation = False
772
+
773
+ # We need an index over valid blocks to match semantic_boundaries
774
+ block_idx = 0
757
775
 
758
776
  for block in blocks:
759
777
  text = block.text.strip()
@@ -761,10 +779,12 @@ class HybridChunker:
761
779
  continue
762
780
 
763
781
  block_tokens = _count_tokens(text)
764
-
765
- # If adding this block would exceed the limit, flush
766
- if (current_tokens + block_tokens > self.config.max_tokens
767
- and current_texts):
782
+
783
+ # Flush condition: Token limit reached OR semantic boundary hit
784
+ hit_limit = current_tokens + block_tokens > self.config.max_tokens
785
+ hit_semantic = block_idx in semantic_boundaries
786
+
787
+ if (hit_limit or hit_semantic) and current_texts:
768
788
 
769
789
  carry_text = None
770
790
  carry_id = None
@@ -811,6 +831,8 @@ class HybridChunker:
811
831
 
812
832
  if block.type == BlockType.EQUATION:
813
833
  has_equation = True
834
+
835
+ block_idx += 1
814
836
 
815
837
  # Flush remaining
816
838
  if current_texts:
@@ -0,0 +1,110 @@
1
+ """Chunk quality scorer based on length-weighted block confidence and noise penalties."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import re
7
+ from typing import Dict, Set
8
+
9
+ from ..schemas import Block, Chunk
10
+
11
logger = logging.getLogger(__name__)

# --- Lazy-loaded resources ---
# Process-wide cache of the system word list. ``None`` means "not loaded yet";
# an empty set means "load was attempted and no dictionary was usable".
_english_words: Set[str] | None = None


def _get_english_words() -> Set[str]:
    """Load the standard OS dictionary for word coverage checks.

    The word list is read once from ``/usr/share/dict/words`` and cached in
    the module-level ``_english_words``. Entries are stripped and lower-cased;
    blank lines are ignored.

    Returns:
        The cached set of words. The set is empty when the dictionary file
        cannot be opened or decoded, in which case callers skip the
        word-coverage metric.
    """
    global _english_words
    if _english_words is None:
        # Cache the empty result up front so a failed load is not retried
        # on every call.
        _english_words = set()
        # Try common unix dictionary path
        try:
            with open("/usr/share/dict/words", "r", encoding="utf-8") as f:
                _english_words = {
                    word for line in f if (word := line.strip().lower())
                }
        except (OSError, UnicodeDecodeError) as exc:
            # Only I/O and decoding problems are expected here; anything else
            # is a programming error and should surface.
            logger.debug(
                "System dictionary unavailable (%s). "
                "Word coverage metric will be skipped.",
                exc,
            )
        else:
            logger.info(
                "Loaded %d words for quality scoring", len(_english_words)
            )
    return _english_words
29
+
30
def _get_lang_confidence(text: str) -> float:
    """Get fastText language detection confidence (0.0 to 1.0).

    Args:
        text: Chunk text. Newlines are replaced with spaces before detection.

    Returns:
        The detector's ``score`` clamped to ``[0.0, 1.0]``. Returns ``1.0``
        (i.e. "no penalty") when the text is shorter than 10 characters, when
        ``fast_langdetect`` is unavailable or fails, or when the reported
        score is not a usable number.
    """
    text = text.strip().replace("\n", " ")
    if len(text) < 10:
        return 1.0  # Too short to reliably detect, assume okay

    try:
        from fast_langdetect import detect
        res = detect(text)
        score = float(res.get("score", 1.0))
    except Exception:
        # Deliberately best-effort: a missing or failing detector must never
        # break chunk scoring, so fall back to "no penalty".
        return 1.0

    if score != score:  # NaN never compares equal to itself
        return 1.0
    # Clamp so an out-of-range score cannot break the documented 0..1 range.
    return max(0.0, min(1.0, score))
42
+
43
+
44
+
45
def score_chunks(chunks: list[Chunk], blocks: list[Block]) -> list[Chunk]:
    """Score chunks based on block confidence and text noise.

    Assigns a ``quality_score`` (0.0 to 1.0) to each chunk, in place.

    The score is ``base - penalty + bonus``, clamped to ``[0.0, 1.0]``:

    * base: average ``confidence.overall`` of the chunk's blocks, weighted by
      each block's text length in characters (blocks without a confidence
      count as 1.0).
    * penalty: noise-character density, low dictionary word coverage and low
      language-detection confidence, capped at 0.8 in total.
    * bonus: 0.05 when the text ends with sentence-final punctuation.

    Args:
        chunks: Chunks to score. Each is mutated by setting ``quality_score``.
        blocks: Blocks the chunks were built from, matched via
            ``chunk.block_ids`` / ``block.block_id``.

    Returns:
        The same ``chunks`` list. It is returned untouched when either
        argument is empty. A chunk whose block ids match no block gets the
        neutral score 0.5.
    """
    if not chunks or not blocks:
        return chunks

    # Build block lookup for fast access
    block_lookup: Dict[str, Block] = {b.block_id: b for b in blocks}

    # Loop-invariant resources, resolved once per call instead of per chunk.
    words = _get_english_words()
    word_pattern = re.compile(r'\b[a-zA-Z]{2,}\b')
    clean_chars = ' .,;:!?()-"\'\n\t'
    sentence_endings = ('.', '!', '?', ':', '"')

    for chunk in chunks:
        chunk_blocks = [
            block_lookup[bid] for bid in chunk.block_ids if bid in block_lookup
        ]

        if not chunk_blocks:
            chunk.quality_score = 0.5  # Fallback
            continue

        # 1. Base score: text-length-weighted average of block confidence
        weighted_sum = sum(
            (b.confidence.overall if b.confidence else 1.0) * len(b.text)
            for b in chunk_blocks
        )
        total_weight = sum(len(b.text) for b in chunk_blocks)

        base_score = weighted_sum / total_weight if total_weight > 0 else 0.5

        # 2. Noise penalty: density of characters that are neither
        #    alphanumeric nor common prose punctuation/whitespace
        text = chunk.text
        noise_chars = sum(
            1 for c in text if not (c.isalnum() or c in clean_chars)
        )
        noise_ratio = noise_chars / max(len(text), 1)
        # Cap penalty at 50%
        penalty = min(noise_ratio * 2.0, 0.5)

        # 3. Dictionary word coverage penalty (skipped when no dictionary
        #    could be loaded)
        if words:
            # Extract alphabetic (ASCII) tokens of two or more letters
            tokens = [t.lower() for t in word_pattern.findall(text)]
            if tokens:
                coverage = sum(1 for t in tokens if t in words) / len(tokens)
                # If less than 60% of tokens are real words, apply up to 30% penalty
                if coverage < 0.6:
                    penalty += min((0.6 - coverage), 0.3)

        # 4. Language confidence penalty
        # Garbled text often confuses the language ID model, resulting in low confidence
        lang_score = _get_lang_confidence(text)
        if lang_score < 0.8:
            # Scale penalty: 0.8 confidence = 0 penalty, 0.0 confidence = 0.4 penalty
            penalty += (0.8 - lang_score) * 0.5

        # 5. Completeness bonus: full sentences score higher
        ends_properly = text.rstrip().endswith(sentence_endings)
        bonus = 0.05 if ends_properly else 0.0

        # Cap the total penalty at 80% so unusual formatting alone cannot
        # drive a high-confidence chunk to zero.
        total_penalty = min(penalty, 0.8)
        chunk.quality_score = max(0.0, min(1.0, base_score - total_penalty + bonus))

    return chunks
@@ -0,0 +1,67 @@
1
+ """Semantic boundary detection using SentenceTransformers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import List
7
+
8
logger = logging.getLogger(__name__)

# Cache of models keyed by model name. A value of ``None`` records that a load
# was attempted but failed with ImportError, so the import is not retried (and
# the warning not repeated) on every call.
_models: dict = {}


def _get_model(model_name: str = "all-MiniLM-L6-v2"):
    """Lazily load the SentenceTransformer model (cached by name).

    Args:
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        The cached model instance, or ``None`` when loading raised
        ``ImportError`` (e.g. ``sentence-transformers`` is not installed).
        Other exceptions raised while constructing the model propagate to
        the caller and are not cached.
    """
    if model_name not in _models:
        try:
            from sentence_transformers import SentenceTransformer
            _models[model_name] = SentenceTransformer(model_name)
            logger.info("Loaded semantic chunking model: %s", model_name)
        except ImportError:
            logger.warning("sentence-transformers not installed. Semantic chunking disabled.")
            # Remember the failure so later calls return immediately.
            _models[model_name] = None
    return _models[model_name]
24
+
25
+
26
def find_semantic_boundaries(
    texts: List[str],
    threshold: float = 0.3,
    model_name: str = "all-MiniLM-L6-v2",
) -> List[int]:
    """Find semantic boundaries in a list of texts.

    Each text is embedded and the cosine similarity of every adjacent pair is
    compared against ``threshold``.

    Args:
        texts: List of block texts in reading order.
        threshold: Cosine similarity threshold. Drops below this indicate a shift.
        model_name: SentenceTransformer model used to embed the texts.

    Returns:
        List of block indices where a semantic shift occurs (the boundary is
        *before* the index). Empty when fewer than two texts are given or when
        the embedding model is unavailable.
    """
    if not texts or len(texts) < 2:
        return []

    model = _get_model(model_name)
    # Test for "unavailable" explicitly rather than relying on the model
    # object's truthiness.
    if model is None:
        return []

    import numpy as np

    # Batch encode all texts in one call
    embeddings = model.encode(texts, batch_size=64, show_progress_bar=False)

    def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        # A zero vector has no direction; treat it as dissimilar to anything.
        if norm_a == 0 or norm_b == 0:
            return 0.0
        return float(np.dot(a, b) / (norm_a * norm_b))

    # A shift between blocks i and i+1 is reported as index i+1.
    return [
        i + 1
        for i in range(len(embeddings) - 1)
        if cosine_sim(embeddings[i], embeddings[i + 1]) < threshold
    ]
@@ -0,0 +1,219 @@
1
+ """Marker-based extractor for high-fidelity extraction on complex PDFs.
2
+
3
+ ⚠️ LICENSE NOTICE — GPL-3.0
4
+ marker-pdf is licensed under GPL-3.0.
5
+ By using this backend, you agree to the terms of the GPL-3.0 license.
6
+
7
+ This module is NOT imported by default — users must explicitly opt in
8
+ via ``pip install longparser[marker]`` and ``backend='marker'``.
9
+
10
+ ⚠️ ISOLATION RULES (do NOT violate)
11
+ 1. This file must NEVER be imported by ``extractors/__init__.py``
12
+ 2. This file must NEVER be imported at module level by ``orchestrator.py``
13
+ 3. This file must ONLY be imported behind ``if backend == "marker":``
14
+ 4. ``import longparser`` must NEVER trigger loading this file
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import hashlib
20
+ import logging
21
+ from pathlib import Path
22
+ from typing import Optional, List, Tuple
23
+
24
+ from ..schemas import (
25
+ Document, Page, Block, BlockType, ExtractorType, ProcessingConfig,
26
+ BoundingBox, Provenance, Confidence, DocumentMetadata, PageProfile, ExtractionMetadata
27
+ )
28
+ from .base import BaseExtractor
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
def _require_marker():
    """Check that marker-pdf is installed; raise clear error if not.

    Returns:
        The imported ``marker`` module.

    Raises:
        ImportError: If ``marker`` cannot be imported. The message explains
            how to install the extra and notes the GPL-3.0 license; the
            original import failure is attached as ``__cause__``.
    """
    try:
        import marker
    except ImportError as exc:
        # Chain explicitly so the traceback shows the underlying import
        # failure as the cause rather than as an error during handling.
        raise ImportError(
            "\n"
            "╔══════════════════════════════════════════════════════════╗\n"
            "║ marker-pdf is not installed. ║\n"
            "║ ║\n"
            "║ Install: pip install 'longparser[marker]' ║\n"
            "║ ║\n"
            "║ ⚠️ marker-pdf is licensed under GPL-3.0. ║\n"
            "║ By installing it, you agree to GPL terms for that ║\n"
            "║ component. LongParser core remains MIT-licensed. ║\n"
            "╚══════════════════════════════════════════════════════════╝\n"
        ) from exc
    return marker
51
+
52
+
53
class MarkerExtractor(BaseExtractor):
    """Extractor using Marker for high-fidelity output.

    Marker returns one flat markdown string, which is mapped line by line to
    LongParser blocks on a single synthetic page. On machines without a CUDA
    or MPS accelerator a soft cap of 10 pages is enforced unless
    ``config.force_marker_cpu`` is set.
    """

    extractor_type = ExtractorType.MARKER
    version = "1.0.0"

    def __init__(self):
        """Initialize and verify marker-pdf is available.

        Raises:
            ImportError: If marker-pdf is not installed.
        """
        _require_marker()

        # Warn only when torch is present and reports no accelerator; when
        # torch itself is missing there is nothing to report here.
        if self._accelerator_available() is False:
            logger.warning(
                "⚠️ Marker is running on CPU — expect 5-10× slower extraction. "
                "A soft cap of 10 pages is enforced by default. "
                "Set `force_marker_cpu=True` to bypass this."
            )

        logger.info("Marker backend initialized")

    @staticmethod
    def _accelerator_available() -> Optional[bool]:
        """Return True/False for CUDA-or-MPS availability, None if torch is missing."""
        try:
            import torch
        except ImportError:
            return None
        if torch.cuda.is_available():
            return True
        # Guard the attribute so a torch build without ``backends.mps`` is
        # treated as "no MPS" instead of raising AttributeError.
        mps = getattr(torch.backends, "mps", None)
        return bool(mps is not None and mps.is_available())

    def extract(
        self,
        file_path: Path,
        config: ProcessingConfig,
        page_numbers: Optional[List[int]] = None,
    ) -> Tuple[Document, ExtractionMetadata]:
        """Extract a PDF using Marker.

        Args:
            file_path: Path to the PDF file.
            config: Processing configuration; ``languages`` and
                ``force_marker_cpu`` are read here.
            page_numbers: Optional pages to restrict extraction to. Marker is
                given one contiguous range, so the span from the smallest to
                the largest requested page is processed. ``None`` or an empty
                list means the whole document.

        Returns:
            The extracted document and its extraction metadata.

        Raises:
            ValueError: If ``file_path`` is not a ``.pdf`` file.
            RuntimeError: If no accelerator is available, the document has
                more than 10 pages, more than 10 pages would be processed and
                ``config.force_marker_cpu`` is not set.
        """
        from marker.convert import convert_single_pdf
        from marker.models import load_all_models
        from marker.settings import settings
        import fitz  # PyMuPDF is a marker dependency anyway

        file_path = Path(file_path)
        logger.info("Extracting with Marker: %s", file_path.name)

        if file_path.suffix.lower() != ".pdf":
            raise ValueError(f"Marker backend only supports PDF files, got: {file_path.suffix}")

        # Context manager guarantees the handle is closed even if counting
        # pages fails.
        with fitz.open(str(file_path)) as pdf_doc:
            total_pages = len(pdf_doc)

        # Marker takes (start_page, max_pages), i.e. one contiguous range.
        # Use the min..max span so unsorted or non-contiguous requests are
        # fully covered; for a sorted contiguous list this equals
        # (page_numbers[0], len(page_numbers)).
        if page_numbers:
            first_page: Optional[int] = min(page_numbers)
            page_span: Optional[int] = max(page_numbers) - first_page + 1
        else:
            first_page = None
            page_span = None

        # Soft cap logic for CPU. A missing torch is treated as CPU.
        is_cpu = self._accelerator_available() is not True
        # An empty page list means "whole document", so it counts in full.
        pages_to_process = page_span if page_span is not None else total_pages

        if (
            is_cpu
            and not config.force_marker_cpu
            and total_pages > 10
            and pages_to_process > 10
        ):
            raise RuntimeError(
                f"Marker CPU Soft Cap exceeded. Document has {total_pages} pages "
                f"(limit: 10). Extraction will take too long on CPU. "
                f"Set config.force_marker_cpu=True to override."
            )

        file_hash = hashlib.sha256(file_path.read_bytes()).hexdigest()[:16]

        # Load models (presumably cached by Marker between calls — TODO confirm)
        model_lst = load_all_models()

        # Convert. The returned images and metadata are not used.
        # NOTE(review): page_numbers are forwarded unchanged as Marker's
        # start_page; confirm both sides use the same page-index base.
        full_text, _images, _out_meta = convert_single_pdf(
            str(file_path),
            model_lst,
            max_pages=page_span if page_span is not None else settings.MAX_PAGES,
            langs=config.languages if config.languages else None,
            batch_multiplier=settings.BATCH_MULTIPLIER,
            start_page=first_page,
        )

        # Marker's output is flat markdown with no page boundaries. When
        # exactly one page was requested its number is known, so label the
        # result with it; otherwise everything goes on a synthetic page 1.
        single_page = page_numbers[0] if page_numbers and len(page_numbers) == 1 else 1

        document = self._markdown_to_document(
            md_text=full_text,
            file_path=file_path,
            file_hash=file_hash,
            total_pages=total_pages,
            page_number=single_page,
        )

        meta = ExtractionMetadata(
            strategy_used="marker",
            ocr_backend_used="surya (marker)",
        )

        return document, meta

    def _markdown_to_document(
        self,
        md_text: str,
        file_path: Path,
        file_hash: str,
        total_pages: int,
        page_number: int = 1,
    ) -> Document:
        """Convert Marker's markdown into a LongParser Document.

        Every non-blank line becomes one block: lines starting with ``#`` are
        headings (level = number of leading ``#``, capped at 6), lines
        starting with ``- `` or ``* `` are list items, everything else is a
        paragraph.

        Args:
            md_text: Markdown produced by Marker.
            file_path: Source file, recorded in metadata and provenance.
            file_hash: Short content hash recorded in metadata.
            total_pages: Page count of the source PDF.
            page_number: Page number assigned to the single output page and to
                every block's provenance. Defaults to 1.

        Returns:
            A document holding one page that contains all blocks.
        """
        metadata = DocumentMetadata(
            source_file=str(file_path),
            file_hash=file_hash,
            total_pages=total_pages,
        )

        blocks: list[Block] = []

        # Fast parse
        for line in md_text.strip().split("\n"):
            stripped = line.strip()
            if not stripped:
                continue

            block_type = BlockType.PARAGRAPH
            heading_level = None

            if stripped.startswith("#"):
                block_type = BlockType.HEADING
                heading_level = min(len(stripped) - len(stripped.lstrip("#")), 6)
                stripped = stripped.lstrip("#").strip()
            elif stripped.startswith(("- ", "* ")):
                block_type = BlockType.LIST_ITEM
                # Drop only the two-character bullet marker. Stripping the
                # character set "-* " would also eat leading emphasis markers
                # or a minus sign that belong to the item text.
                stripped = stripped[2:].strip()

            blocks.append(Block(
                type=block_type,
                text=stripped,
                order_index=len(blocks),
                heading_level=heading_level,
                provenance=Provenance(
                    source_file=str(file_path),
                    page_number=page_number,  # Marker loses page boundaries in its markdown string
                    bbox=BoundingBox(x0=0, y0=0, x1=0, y1=0),
                    extractor=self.extractor_type,
                    extractor_version=self.version,
                ),
                confidence=Confidence(overall=0.9),
            ))

        # Page size is a placeholder (612 x 792), since the markdown carries
        # no geometry.
        pages: list[Page] = [Page(
            page_number=page_number,
            width=612.0,
            height=792.0,
            blocks=blocks,
            profile=PageProfile(page_number=page_number, layout_confidence=0.9)
        )]

        return Document(metadata=metadata, pages=pages)

    def extract_page(
        self,
        file_path: Path,
        page_number: int,
        config: ProcessingConfig,
    ) -> Page:
        """Extract a single page via :meth:`extract`.

        Returns:
            The first page of the resulting document, or an empty page with
            the requested number if the document has no pages.
        """
        doc, _ = self.extract(file_path, config, page_numbers=[page_number])
        return doc.pages[0] if doc.pages else Page(page_number=page_number, width=0, height=0)