longparser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,404 @@
1
+ """LaTeX OCR module — surgical equation-image → LaTeX conversion.
2
+
3
+ Production-hardened with:
4
+ - Pluggable backends (pix2tex / UniMERNet)
5
+ - Thread-safe singleton with lazy loading
6
+ - LaTeX validation (braces, left/right parity, length, repeated tokens)
7
+ - Forced CPU inference (no GPU surprise)
8
+ - Graceful degradation when weights unavailable
9
+ """
10
+
11
+ import os
12
+ import re
13
+ import logging
14
+ import threading
15
+ from abc import ABC, abstractmethod
16
+ from typing import Optional
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # LaTeX validation utilities
23
+ # ---------------------------------------------------------------------------
24
+
25
def validate_latex(s: str) -> bool:
    r"""Check whether a LaTeX string is well-formed enough to use.

    These are cheap heuristics, not a parser. The string is accepted when:

    1. it is non-empty after stripping whitespace;
    2. it is at most 2000 characters long;
    3. grouping braces ``{`` / ``}`` are balanced and never close before they
       open. A character preceded by a backslash is skipped, so the literal
       glyphs ``\{`` and ``\}`` (as in ``\left\{ ... \right.``) do not count;
    4. ``\left`` and ``\right`` (each followed by a non-letter) occur the same
       number of times;
    5. when more than five control words are present, no single one accounts
       for more than 60% of them (e.g. ``\frac\frac\frac`` junk).

    Args:
        s: Candidate LaTeX source, without surrounding math delimiters.

    Returns:
        True if every check passes, False otherwise.
    """
    if not s or not s.strip():
        return False
    s = s.strip()

    # Max length
    if len(s) > 2000:
        logger.debug(f"LaTeX too long ({len(s)} chars), rejecting")
        return False

    # Balanced grouping braces. The character right after a backslash is part
    # of a control symbol (\{, \}, \\ ...) and must not affect the depth;
    # skipping it also keeps "\\{" (line break + real brace) counted correctly.
    depth = 0
    escaped = False
    for ch in s:
        if escaped:
            escaped = False
            continue
        if ch == '\\':
            escaped = True
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth < 0:
                return False
    if depth != 0:
        logger.debug("LaTeX has unbalanced braces")
        return False

    # \left / \right parity (the trailing class excludes \leftarrow etc.)
    lefts = len(re.findall(r'\\left[^a-zA-Z]', s))
    rights = len(re.findall(r'\\right[^a-zA-Z]', s))
    if lefts != rights:
        logger.debug(f"LaTeX \\left/{lefts} != \\right/{rights}")
        return False

    # Repeated token check (e.g., "\frac\frac\frac" junk)
    tokens = re.findall(r'\\[a-zA-Z]+', s)
    if len(tokens) > 5:
        from collections import Counter
        counts = Counter(tokens)
        most_common_count = counts.most_common(1)[0][1]
        if most_common_count > len(tokens) * 0.6:
            logger.debug("LaTeX has repeated junk tokens")
            return False

    return True
67
+
68
+
69
def strip_delimiters(s: str) -> str:
    r"""Peel math-mode delimiters off raw OCR output.

    Two passes are made over the whitespace-trimmed input. The first removes
    at most one dollar wrapper (``$$...$$`` is tried before ``$...$``); the
    second removes at most one backslash wrapper (``\[...\]`` is tried before
    ``\(...\)``). The text is trimmed again after every removal.
    """
    text = s.strip()
    wrapper_groups = (
        (("$$", "$$"), ("$", "$")),
        (("\\[", "\\]"), ("\\(", "\\)")),
    )
    for group in wrapper_groups:
        for opener, closer in group:
            if text.startswith(opener) and text.endswith(closer):
                text = text[len(opener):-len(closer)].strip()
                # Only the first matching wrapper of each group is removed.
                break
    return text
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # Backend ABC
87
+ # ---------------------------------------------------------------------------
88
+
89
class LaTeXOCRBackend(ABC):
    """Interface that every LaTeX OCR engine in this module implements."""

    @abstractmethod
    def load(self) -> bool:
        """Prepare the underlying model and report success as a boolean."""
        ...

    @abstractmethod
    def recognize(self, image) -> Optional[str]:
        """Transcribe an equation image (a PIL Image) into raw LaTeX.

        Implementations return None when no result can be produced.
        """
        ...
101
+
102
+
103
+ # ---------------------------------------------------------------------------
104
+ # pix2tex backend (CC BY-NC-SA weights — non-commercial only)
105
+ # ---------------------------------------------------------------------------
106
+
107
class Pix2TexBackend(LaTeXOCRBackend):
    """Backend that wraps the ``LatexOCR`` callable from ``pix2tex.cli``."""

    def __init__(self):
        # Stays None until load() has built the model successfully.
        self._model = None

    def _warm_up(self) -> None:
        """Run one blank 64x64 image through the model.

        This is best-effort: any failure (including a missing PIL) is logged
        at debug level and otherwise ignored.
        """
        try:
            from PIL import Image
            blank = Image.new("RGB", (64, 64), color="white")
            self._model(blank)
        except Exception as e:
            logger.debug(f"Pix2Tex pre-warm dummy inference skipped: {e}")

    def load(self) -> bool:
        """Import torch and pix2tex, build the model and warm it up.

        The torch thread count comes from ``LONGPARSER_LATEX_OCR_THREADS``
        (default ``"2"``).

        Returns:
            True once the model object exists; False if an import fails or
            any other exception is raised while loading.
        """
        try:
            import torch
            raw_threads = os.getenv("LONGPARSER_LATEX_OCR_THREADS", "2")
            torch.set_num_threads(int(raw_threads))

            from pix2tex.cli import LatexOCR
            self._model = LatexOCR()

            self._warm_up()

            logger.info("Pix2TexBackend loaded and pre-warmed")
            return True
        except ImportError:
            logger.warning("pix2tex not installed. Install: pip install 'pix2tex>=0.1.4'")
            return False
        except Exception as e:
            logger.warning(f"Pix2TexBackend failed to load: {e}")
            return False

    def recognize(self, image) -> Optional[str]:
        """Return the model's output for ``image`` as a string.

        Returns None when the model is not loaded or inference raises.
        """
        model = self._model
        if model is None:
            return None
        try:
            output = model(image)
            if isinstance(output, str):
                return output
            return str(output)
        except Exception as e:
            logger.debug(f"pix2tex inference failed: {e}")
            return None
147
+
148
+
149
+ # ---------------------------------------------------------------------------
150
+ # UniMERNet backend (Apache 2.0 — commercial safe)
151
+ # ---------------------------------------------------------------------------
152
+
153
class UniMERNetBackend(LaTeXOCRBackend):
    """Backend built on the ``unimernet`` package (``unimernet_tiny`` arch).

    The checkpoint directory is read from ``LONGPARSER_UNIMERNET_MODEL_DIR``.
    """

    def __init__(self):
        # Both stay None until load() has built the model AND the processor,
        # so recognize() never runs on a half-initialised backend.
        self._model = None
        self._processor = None

    def load(self) -> bool:
        """Import unimernet and build the model and its input processor.

        The torch thread count comes from ``LONGPARSER_LATEX_OCR_THREADS``
        (default ``"2"``).

        Returns:
            True when both the model and the processor were created. False
            when unimernet/torch cannot be imported, the model directory is
            unset or not a directory, or any other exception is raised.
        """
        try:
            import torch
            torch.set_num_threads(int(os.getenv("LONGPARSER_LATEX_OCR_THREADS", "2")))

            from unimernet.common.config import Config
            from unimernet.processors import load_processor
            from unimernet.models import load_model

            model_dir = os.getenv("LONGPARSER_UNIMERNET_MODEL_DIR", "")
            if not model_dir or not os.path.isdir(model_dir):
                logger.warning(
                    "UniMERNet model dir not found. "
                    "Set LONGPARSER_UNIMERNET_MODEL_DIR to the checkpoint directory."
                )
                return False

            cfg = Config({"model": {"arch": "unimernet_tiny", "model_path": model_dir}})
            model = load_model(cfg)
            processor = load_processor(cfg)
            # Publish both together: if load_processor() raises, self._model
            # must not be left set while self._processor is missing.
            self._model, self._processor = model, processor

            logger.info("UniMERNetBackend loaded")
            return True
        except ImportError:
            logger.warning("unimernet not installed. Install: pip install 'unimernet>=0.2.0'")
            return False
        except Exception as e:
            logger.warning(f"UniMERNetBackend failed to load: {e}")
            return False

    def recognize(self, image) -> Optional[str]:
        """Run the processor and model on ``image``.

        Returns:
            The first element of the generated result, or None when the
            backend is not fully loaded, the result is empty, or inference
            raises.
        """
        if self._model is None or self._processor is None:
            return None
        try:
            inputs = self._processor(image)
            result = self._model.generate(inputs)
            return result[0] if result else None
        except Exception as e:
            logger.debug(f"UniMERNet inference failed: {e}")
            return None
199
+
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Main singleton
203
+ # ---------------------------------------------------------------------------
204
+
205
class LaTeXOCR:
    """Per-backend singleton LaTeX OCR with lazy, thread-safe loading.

    ``LaTeXOCR(name)`` always returns the same instance for a given backend
    name. The backend model is loaded on first use of ``available`` or
    ``recognize``, not at construction time.

    Usage:
        ocr = LaTeXOCR(backend="pix2tex")
        if ocr.available:
            latex = ocr.recognize(pil_image)
    """

    # backend name -> singleton instance
    _instances: dict = {}
    # Guards _instances and serialises backend inference (see recognize()).
    _lock = threading.Lock()

    def __new__(cls, backend: str = "pix2tex"):
        with cls._lock:
            if backend not in cls._instances:
                instance = super().__new__(cls)
                instance._backend_name = backend
                instance._backend: Optional[LaTeXOCRBackend] = None
                instance._available = False
                instance._initialized = False
                # Separate from cls._lock so that a slow model load does not
                # block construction/lookup of other backends.
                instance._load_lock = threading.Lock()
                cls._instances[backend] = instance
            return cls._instances[backend]

    def _ensure_loaded(self):
        """Load the backend exactly once, on first use.

        Uses double-checked locking on a per-instance lock. ``_initialized``
        is set only after the load attempt has finished, so a concurrent
        caller either waits for the result or sees the final ``_available``
        value, never a transient False while the model is still loading.
        """
        if self._initialized:
            return
        with self._load_lock:
            if self._initialized:
                return
            try:
                if self._backend_name == "pix2tex":
                    self._backend = Pix2TexBackend()
                elif self._backend_name == "unimernet":
                    self._backend = UniMERNetBackend()
                else:
                    logger.error(f"Unknown LaTeX OCR backend: {self._backend_name}")
                    return

                self._available = self._backend.load()
                if not self._available:
                    logger.warning(
                        f"LaTeX OCR backend '{self._backend_name}' not available. "
                        "Formula OCR will be skipped."
                    )
            finally:
                # One attempt only, whatever the outcome.
                self._initialized = True

    @property
    def available(self) -> bool:
        """Whether the backend is loaded and ready (triggers the lazy load)."""
        self._ensure_loaded()
        return self._available

    def recognize(self, image) -> Optional[str]:
        """Recognize a formula image and return a validated LaTeX string.

        Args:
            image: Image handed unchanged to the backend's ``recognize``.

        Returns:
            The LaTeX with surrounding math delimiters stripped, or None if
            the backend is unavailable, inference yields None, or the result
            fails ``validate_latex``.
        """
        if not self.available:
            return None

        # Inference is serialised on the class-level lock, i.e. across all
        # backend instances.
        with self._lock:
            raw = self._backend.recognize(image)

        if raw is None:
            return None

        # Strip existing delimiters
        latex = strip_delimiters(raw)

        # Validate
        if not validate_latex(latex):
            logger.debug(f"LaTeX validation failed for: {latex[:100]}...")
            return None

        return latex
278
+
279
+
280
+ # ---------------------------------------------------------------------------
281
+ # MFD: Math Formula Detector (page-level, pix2text YOLO-based)
282
+ # ---------------------------------------------------------------------------
283
+
284
class MFDBackend:
    """Process-wide holder for a page-level math formula detector.

    The detector is pix2text's ``MathFormulaDetector``, created on CPU from
    the first ``*mfd*.onnx`` file (in sorted order) found under the directory
    named by ``LONGPARSER_MFD_MODEL_DIR``. When the variable is unset, the
    directory or model file is missing, or pix2text cannot be loaded, the
    instance is still returned but ``available`` stays False.

    Usage:
        mfd = MFDBackend.get()
        if mfd.available:
            boxes = mfd.detect(page_pil_image)
    """

    _instance: Optional["MFDBackend"] = None
    _lock = threading.Lock()

    def __init__(self):
        self._mfd = None
        self.available: bool = False

    @classmethod
    def get(cls) -> "MFDBackend":
        """Return the shared instance, building it under the lock if needed."""
        with cls._lock:
            existing = cls._instance
            if existing is None:
                existing = cls._load()
                cls._instance = existing
            return existing

    @classmethod
    def _load(cls) -> "MFDBackend":
        """Build an instance; every failure path returns it with available=False."""
        backend = cls()
        configured_dir = os.getenv("LONGPARSER_MFD_MODEL_DIR", "").strip()
        if not configured_dir:
            logger.debug("MFD disabled: LONGPARSER_MFD_MODEL_DIR not set")
            return backend

        from pathlib import Path as _Path
        root = _Path(configured_dir)
        if not root.exists():
            logger.warning(f"MFD model dir not found: {configured_dir}. MFD disabled.")
            return backend

        # Recursive search; the original author notes this glob mirrors the
        # one pix2text uses itself.
        onnx_files = sorted(root.rglob("*mfd*.onnx"))
        if not onnx_files:
            logger.warning(
                f"No *mfd*.onnx found in {configured_dir}. MFD disabled. "
                "Download the mfd-1.5.onnx from the pix2text model hub and place it here."
            )
            return backend

        chosen = onnx_files[0]
        try:
            from pix2text.formula_detector import MathFormulaDetector
            # An explicit model_path is passed so pix2text does not have to
            # locate model files on its own (per the original author's note).
            backend._mfd = MathFormulaDetector(model_path=chosen, device="cpu")
            backend.available = True
            logger.info(f"MFDBackend loaded from: {chosen}")
        except ImportError:
            logger.warning(
                "pix2text not installed. Install: pip install 'pix2text>=1.1.1,<1.2'. MFD disabled."
            )
        except Exception as e:
            logger.warning(f"MFDBackend failed to load: {e}. MFD disabled.")

        return backend

    def detect(
        self,
        page_img,
        threshold: float = 0.45,
        max_boxes: int = 10,
        min_area_px: int = 2048,
    ) -> list[dict]:
        """Find math formula regions on a page image.

        Args:
            page_img: Page image passed through to the detector.
            threshold: Confidence threshold forwarded to the detector.
            max_boxes: Upper bound on the number of boxes returned.
            min_area_px: Bounding boxes with a smaller pixel area are dropped.

        Returns:
            A list of dicts ``{x0, y0, x1, y1, type, score}`` where ``type``
            defaults to ``'isolated'`` and ``score`` to 1.0 when the detector
            omits them. Ordering: isolated first, then larger area, then
            higher score. An empty list is returned when the backend is
            unavailable or any exception occurs.
        """
        if not self.available:
            return []

        def _priority(box):
            # isolated > larger area > higher confidence
            width = box["x1"] - box["x0"]
            height = box["y1"] - box["y0"]
            return (
                0 if box["type"] == "isolated" else 1,
                -(width * height),
                -box["score"],
            )

        try:
            import numpy as np
            kept = []
            for det in self._mfd.detect(page_img, threshold=threshold):
                corners = det["box"]  # indexed as a 2-D array of [x, y] points
                xs = corners[:, 0]
                ys = corners[:, 1]
                left, top = int(np.min(xs)), int(np.min(ys))
                right, bottom = int(np.max(xs)), int(np.max(ys))
                if (right - left) * (bottom - top) < min_area_px:
                    continue
                kept.append({
                    "x0": left, "y0": top, "x1": right, "y1": bottom,
                    "type": det.get("type", "isolated"),
                    "score": float(det.get("score", 1.0)),
                })
            kept.sort(key=_priority)
            return kept[:max_boxes]
        except Exception as e:
            logger.warning(f"MFD detect error: {e}")
            return []
@@ -0,0 +1,31 @@
1
+ """Optional integration adapters for LangChain and LlamaIndex.
2
+
3
+ Install the extras to use these adapters::
4
+
5
+ pip install longparser[langchain]
6
+ pip install longparser[llamaindex]
7
+ pip install longparser[all]
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+
13
+ def _has_langchain() -> bool:
14
+ """Check if langchain-core is installed."""
15
+ try:
16
+ import langchain_core # noqa: F401
17
+ return True
18
+ except ImportError:
19
+ return False
20
+
21
+
22
+ def _has_llamaindex() -> bool:
23
+ """Check if llama-index-core is installed."""
24
+ try:
25
+ import llama_index.core # noqa: F401
26
+ return True
27
+ except ImportError:
28
+ return False
29
+
30
+
31
+ __all__ = ["_has_langchain", "_has_llamaindex"]
@@ -0,0 +1,138 @@
1
+ """LangChain integration adapter for LongParser.
2
+
3
+ Provides ``LongParserLoader``, a LangChain-compatible document loader that
4
+ wraps the LongParser extraction pipeline.
5
+
6
+ Install the extra to use this adapter::
7
+
8
+ pip install longparser[langchain]
9
+
10
+ Usage::
11
+
12
+ from longparser.integrations.langchain import LongParserLoader
13
+
14
+ loader = LongParserLoader("report.pdf")
15
+ docs = loader.load() # list[langchain_core.documents.Document]
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from pathlib import Path
21
+ from typing import Iterator, Optional, TYPE_CHECKING
22
+
23
+ if TYPE_CHECKING:
24
+ from langchain_core.documents import Document as LCDocument
25
+
26
+ from ..schemas import ProcessingConfig, ChunkingConfig
27
+
28
+ _INSTALL_MSG = (
29
+ "langchain-core is required for the LangChain adapter. "
30
+ "Install it with: pip install clean_rag[langchain]"
31
+ )
32
+
33
+
34
+ def _import_langchain():
35
+ """Lazy import langchain-core with a clear install message."""
36
+ try:
37
+ from langchain_core.document_loaders import BaseLoader
38
+ from langchain_core.documents import Document as LCDocument
39
+ return BaseLoader, LCDocument
40
+ except ImportError as exc:
41
+ raise ImportError(_INSTALL_MSG) from exc
42
+
43
+
44
class LongParserLoader:
    """Document loader that feeds LongParser output to LangChain.

    A file (PDF, DOCX, PPTX, XLSX, CSV) is run through the LongParser
    pipeline and emitted as LangChain ``Document`` objects: one per chunk
    when a chunking config is supplied, otherwise one per block.

    Parameters
    ----------
    file_path:
        Location of the input file.
    config:
        LongParser ``ProcessingConfig``; a default instance is created when
        omitted.
    chunking_config:
        LongParser ``ChunkingConfig``. Supplying it switches the output from
        per-block to per-chunk documents.
    tesseract_lang:
        Tesseract OCR languages (e.g. ``["eng", "urd"]``).
    tessdata_path:
        Path to the tessdata directory.
    """

    def __init__(
        self,
        file_path: str | Path,
        *,
        config: Optional[ProcessingConfig] = None,
        chunking_config: Optional[ChunkingConfig] = None,
        tesseract_lang: list[str] | None = None,
        tessdata_path: str | None = None,
    ) -> None:
        # Importing here makes a missing langchain-core fail at construction.
        base_loader_cls, _unused = _import_langchain()
        self._BaseLoader = base_loader_cls

        self.file_path = Path(file_path)
        self.config = config if config else ProcessingConfig()
        self.chunking_config = chunking_config
        self.tesseract_lang = tesseract_lang
        self.tessdata_path = tessdata_path

    # ---- LangChain interface -------------------------------------------------

    def load(self) -> list["LCDocument"]:
        """Run the pipeline and return every document as a list."""
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator["LCDocument"]:
        """Yield LangChain ``Document`` objects one at a time.

        The pipeline runs when iteration starts, not when this method is
        called.
        """
        _, document_cls = _import_langchain()

        from ..pipeline import PipelineOrchestrator

        orchestrator = PipelineOrchestrator(
            tesseract_lang=self.tesseract_lang,
            tessdata_path=self.tessdata_path,
        )
        parsed = orchestrator.process_file(self.file_path, config=self.config)

        if self.chunking_config is not None:
            pieces = orchestrator.chunk(parsed, config=self.chunking_config)
            yield from self._chunk_documents(document_cls, pieces)
        else:
            yield from self._block_documents(document_cls, parsed)

    # ---- helpers -------------------------------------------------------------

    def _chunk_documents(self, document_cls, pieces):
        """Yield one document per chunk produced by the pipeline."""
        for piece in pieces:
            meta = {
                "source": str(self.file_path),
                "chunk_id": piece.chunk_id,
                "chunk_type": piece.chunk_type,
                "section_path": piece.section_path,
                "page_numbers": piece.page_numbers,
                "token_count": piece.token_count,
                "equation_detected": piece.equation_detected,
            }
            yield document_cls(page_content=piece.text, metadata=meta)

    def _block_documents(self, document_cls, parsed):
        """Yield one document per block, page by page."""
        for page in parsed.document.pages:
            for block in page.blocks:
                meta = {
                    "source": str(self.file_path),
                    "block_id": block.block_id,
                    "block_type": block.type.value,
                    "heading_level": block.heading_level,
                    "hierarchy_path": block.hierarchy_path,
                    "page_number": page.page_number,
                    "confidence": block.confidence.overall,
                }
                yield document_cls(page_content=block.text, metadata=meta)
136
+
137
+
138
+ __all__ = ["LongParserLoader"]