PyPI - visual-parser - Versions diffs - 1.0.0__py3-none-any.whl - Mend

visual-parser 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

visual_parser/__init__.py +20 -0
visual_parser/__main__.py +8 -0
visual_parser/cli.py +230 -0
visual_parser/cli_main.py +223 -0
visual_parser/config.py +168 -0
visual_parser/figure_describer.py +218 -0
visual_parser/jsonl_writer.py +102 -0
visual_parser/metadata_extractor.py +94 -0
visual_parser/nougat_engine.py +222 -0
visual_parser/pdf_tracker.py +105 -0
visual_parser/pipeline.py +255 -0
visual_parser/prompts.py +98 -0
visual_parser/text_extractor.py +396 -0
visual_parser/vision_llm.py +269 -0
visual_parser-1.0.0.dist-info/METADATA +191 -0
visual_parser-1.0.0.dist-info/RECORD +19 -0
visual_parser-1.0.0.dist-info/WHEEL +5 -0
visual_parser-1.0.0.dist-info/entry_points.txt +2 -0
visual_parser-1.0.0.dist-info/top_level.txt +1 -0

visual_parser/figure_describer.py ADDED Viewed

@@ -0,0 +1,218 @@
+"""
+figure_describer.py — Rasterise every page of each PDF at high DPI and send
+                      each page image to a Vision LLM for figure extraction.
+Extracted and cb-decoupled from the inner ``describe_figures_for_new_pdfs``
+function in PDFAnalyser.py.
+Output
+------
+One record per figure (or per page that contains at least one figure) is
+appended to ``02_visuals_kb.jsonl`` in *output_dir*:
+    {
+        "source":        "myreport.pdf",
+        "page":          3,
+        "document_id":   "a1b2c3d4e5f60718",
+        "figure_index":  0,
+        "figure_id":     "a1b2c3d4e5f60718:p3:f0",
+        "description":   "**Subject:** ..."
+    }
+"""
+from __future__ import annotations
+import json
+import logging
+import os
+import re
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
+import fitz  # PyMuPDF
+from visual_parser.jsonl_writer import append_to_jsonl, make_document_id
+from visual_parser.prompts import FIGURE_PROMPT
+from visual_parser.vision_llm import call_vision_llm
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# JSON response parser
+# ---------------------------------------------------------------------------
+def _parse_llm_response(
+    raw: str,
+    pdf_name: str,
+    page_number: Optional[int] = None,
+) -> Optional[List[Dict]]:
+    """
+    Parse the Vision LLM's JSON list response.
+    Strips markdown fences, tries ``json.loads``, then falls back to a regex
+    search for a JSON array if the model wraps it in prose.
+    """
+    body = raw.strip()
+    for fence in ("```json", "```"):
+        body = body.replace(fence, "")
+    try:
+        return json.loads(body)
+    except json.JSONDecodeError:
+        match = re.search(r"(\[\s*\{.*?\}\s*\])", body, re.S)
+        if match:
+            try:
+                return json.loads(match.group(1))
+            except json.JSONDecodeError:
+                pass
+        label = f"{pdf_name} p{page_number}" if page_number else pdf_name
+        logger.warning("Could not parse JSON for %s: %r", label, body[:200])
+        return None
+# ---------------------------------------------------------------------------
+# Main function
+# ---------------------------------------------------------------------------
+def describe_figures_for_new_pdfs(
+    new_pdf_paths: List[str],
+    output_dir: str,
+    vision_provider: str,
+    vision_api_key: str,
+    vision_model: str,
+    vision_detail: str = "low",
+    raster_dpi: int = 200,
+    figure_prompt: str = FIGURE_PROMPT,
+    reasoning_effort: Optional[str] = "medium",
+) -> None:
+    """
+    For each PDF in *new_pdf_paths*, rasterise every page at *raster_dpi* DPI,
+    call the Vision LLM page-by-page, parse the figure descriptions, and
+    append the results to ``02_visuals_kb.jsonl`` in *output_dir*.
+    Args:
+        new_pdf_paths:    Full paths of PDFs to describe.
+        output_dir:       Directory where ``02_visuals_kb.jsonl`` is written.
+        vision_provider:  ``'gpt'`` or ``'gemini'``.
+        vision_api_key:   API key for the chosen provider.
+        vision_model:     Vision model name string.
+        vision_detail:    Image detail level (GPT only).
+        raster_dpi:       DPI used when rasterising pages.  200 DPI gives a
+                          good balance between quality and API payload size.
+        figure_prompt:    The instruction prompt sent with each page image.
+                          Override this to customise for a specific domain.
+    """
+    # -----------------------------------------------------------------------
+    # Step 1 – Rasterise every page of every new PDF
+    # -----------------------------------------------------------------------
+    page_images: List[Dict[str, Any]] = []
+    for pdf_full_path in new_pdf_paths:
+        pdf_name = os.path.basename(pdf_full_path)
+        try:
+            doc = fitz.open(pdf_full_path)
+            for page_index, page in enumerate(doc):
+                pix = page.get_pixmap(dpi=raster_dpi)
+                page_images.append({
+                    "pdf":   pdf_name,
+                    "page":  page_index + 1,
+                    "bytes": pix.tobytes("png"),
+                })
+            doc.close()
+        except Exception as exc:
+            logger.error("Error rasterising %s: %s", pdf_name, exc)
+    if not page_images:
+        logger.info("No pages to describe (all PDFs failed to rasterise).")
+        return
+    # -----------------------------------------------------------------------
+    # Step 2 – Group page images by PDF name
+    # -----------------------------------------------------------------------
+    pages_by_pdf: Dict[str, List[Dict]] = defaultdict(list)
+    for record in page_images:
+        pages_by_pdf[record["pdf"]].append(record)
+    # -----------------------------------------------------------------------
+    # Step 3 – Call Vision LLM once per page
+    # -----------------------------------------------------------------------
+    descriptions_by_page: Dict[tuple, List[str]] = {}
+    for pdf_name, image_records in pages_by_pdf.items():
+        per_pdf_count = 0
+        for record in image_records:
+            page_number  = record["page"]
+            image_bytes  = record["bytes"]
+            try:
+                raw_response = call_vision_llm(
+                    images=[image_bytes],
+                    prompt=figure_prompt,
+                    provider=vision_provider,
+                    api_key=vision_api_key,
+                    model=vision_model,
+                    detail=vision_detail,
+                    reasoning_effort=reasoning_effort,
+                )
+                captions = _parse_llm_response(raw_response, pdf_name, page_number)
+                # Normalise: the model should return a list, but sometimes
+                # returns a single dict for single-figure pages.
+                if isinstance(captions, dict):
+                    captions = [captions]
+                if not isinstance(captions, list):
+                    logger.warning(
+                        "Vision LLM returned non-list output for %s page %d",
+                        pdf_name, page_number,
+                    )
+                    continue
+                for caption in captions:
+                    if not isinstance(caption, dict):
+                        continue
+                    description = caption.get("description")
+                    if description is None:
+                        continue
+                    key = (pdf_name, page_number)
+                    descriptions_by_page.setdefault(key, []).append(description)
+                    per_pdf_count += 1
+            except Exception as exc:
+                logger.error(
+                    "Vision LLM failed for %s page %d: %s",
+                    pdf_name, page_number, exc,
+                )
+        logger.info("[FIGURES] %s: %d figure(s) extracted.", pdf_name, per_pdf_count)
+    total = sum(len(v) for v in descriptions_by_page.values())
+    logger.info("Total figures captured: %d across %d PDF(s).", total,
+                len({k[0] for k in descriptions_by_page}))
+    # -----------------------------------------------------------------------
+    # Step 4 – Write figure descriptions to 02_visuals_kb.jsonl
+    # -----------------------------------------------------------------------
+    figure_rows: List[Dict] = []
+    for (pdf_name, page_number), descriptions in descriptions_by_page.items():
+        document_id = make_document_id(pdf_name)
+        for fig_idx, description in enumerate(descriptions):
+            figure_rows.append({
+                "source":        pdf_name,
+                "page":          page_number,
+                "document_id":   document_id,
+                "figure_index":  fig_idx,
+                "figure_id":     f"{document_id}:p{page_number}:f{fig_idx}",
+                "description":   description,
+            })
+    if figure_rows:
+        figures_path = os.path.join(output_dir, "02_visuals_kb.jsonl")
+        append_to_jsonl(figures_path, figure_rows)
+        print(f"[FIGURES] Wrote {len(figure_rows)} figure record(s) to 02_visuals_kb.jsonl.")
+    else:
+        logger.info("No figures detected — 02_visuals_kb.jsonl not updated.")

visual_parser/jsonl_writer.py ADDED Viewed

@@ -0,0 +1,102 @@
+"""
+jsonl_writer.py — Fault-tolerant JSONL append/read helpers and a stable document-ID generator.
+Consolidated from the two duplicate copies that existed in
+utils/nougat_helpers.py and PDFAnalyser.py.
+"""
+from __future__ import annotations
+import hashlib
+import json
+import logging
+import os
+from typing import Dict, List
+logger = logging.getLogger(__name__)
+def make_document_id(source: str) -> str:
+    """
+    Return a 16-character hex SHA-1 digest of the PDF basename.
+    The ID is stable across runs as long as the filename doesn't change,
+    which lets downstream systems deduplicate without re-reading JSONL files.
+    """
+    try:
+        return hashlib.sha1(source.encode("utf-8")).hexdigest()[:16]
+    except Exception as exc:
+        logger.warning("Could not hash source %r: %s — using raw name as fallback.", source, exc)
+        return source
+def append_to_jsonl(jsonl_file: str, new_data: List[Dict]) -> None:
+    """
+    Safely append *new_data* to a JSON Lines file.
+    - Creates the file (and any missing parent directories) if needed.
+    - Skips individual rows that cannot be serialised without aborting the
+      entire write.
+    - Never corrupts existing content: each row is appended as a complete
+      ``\\n``-terminated JSON line.
+    Args:
+        jsonl_file: Absolute or relative path to the target ``.jsonl`` file.
+        new_data:   List of dicts to write (one per line).
+    """
+    if not isinstance(new_data, list):
+        logger.warning(
+            "append_to_jsonl: new_data must be a list, got %s — skipping.",
+            type(new_data).__name__,
+        )
+        return
+    try:
+        parent = os.path.dirname(jsonl_file)
+        if parent:
+            os.makedirs(parent, exist_ok=True)
+        with open(jsonl_file, "a", encoding="utf-8") as fh:
+            for row in new_data:
+                if not isinstance(row, dict):
+                    logger.warning("Skipping non-dict JSONL entry: %s", type(row).__name__)
+                    continue
+                try:
+                    fh.write(json.dumps(row, ensure_ascii=False) + "\n")
+                except (TypeError, ValueError) as exc:
+                    logger.warning("Failed to serialise row — skipping. Error: %s", exc)
+    except OSError as exc:
+        logger.error("File-system error writing %s: %s", jsonl_file, exc)
+    except Exception as exc:
+        logger.error("Unexpected error writing %s: %s", jsonl_file, exc)
+def read_jsonl(jsonl_path: str) -> List[Dict]:
+    """
+    Read all valid JSON lines from *jsonl_path*.
+    Corrupted lines are skipped with a warning; the rest are returned intact.
+    """
+    rows: List[Dict] = []
+    if not os.path.exists(jsonl_path):
+        logger.warning("JSONL file not found: %s", jsonl_path)
+        return rows
+    try:
+        with open(jsonl_path, "r", encoding="utf-8") as fh:
+            for line_num, line in enumerate(fh, start=1):
+                line = line.strip()
+                if not line:
+                    continue
+                try:
+                    rows.append(json.loads(line))
+                except json.JSONDecodeError as exc:
+                    logger.warning(
+                        "Skipping corrupted JSONL line %d in %s: %s",
+                        line_num, jsonl_path, exc,
+                    )
+    except Exception as exc:
+        logger.error("Error reading %s: %s", jsonl_path, exc)
+    return rows

visual_parser/metadata_extractor.py ADDED Viewed

@@ -0,0 +1,94 @@
+"""
+metadata_extractor.py — Extract document-level metadata (title, authors, DOI …)
+                         from the front pages of a PDF using a vision LLM.
+Extracted and cb-decoupled from utils/general_utilities.py.
+"""
+from __future__ import annotations
+import json
+import logging
+from typing import Any, Dict, List, Optional
+import fitz  # PyMuPDF
+from visual_parser.prompts import METADATA_PROMPT_TEMPLATE
+from visual_parser.vision_llm import call_vision_llm
+logger = logging.getLogger(__name__)
+def extract_pdf_metadata(
+    pdf_path: str,
+    vision_provider: str,
+    vision_api_key: str,
+    vision_model: str,
+    num_pages: int = 2,
+    vision_detail: str = "auto",
+    reasoning_effort: Optional[str] = "medium",
+) -> Dict[str, Any]:
+    """
+    Rasterize the first *num_pages* of *pdf_path*, send them to the Vision LLM,
+    and parse the JSON metadata response.
+    Args:
+        pdf_path:         Absolute path to the PDF file.
+        vision_provider:  ``'gpt'`` or ``'gemini'``.
+        vision_api_key:   API key for the chosen provider.
+        vision_model:     Model name string.
+        num_pages:        How many front pages to send (default: 2).
+        vision_detail:    Image detail level for GPT ('low', 'high', 'auto').
+    Returns:
+        Dict with any of: title, authors, publication_date, report_number,
+        doi, keywords — plus a ``_source`` entry with the PDF basename.
+    Raises:
+        RuntimeError on unrecoverable errors (PDF open failure, no valid JSON).
+    """
+    # 1) Rasterize front pages
+    try:
+        doc = fitz.open(pdf_path)
+    except Exception as exc:
+        raise RuntimeError(f"Failed to open PDF {pdf_path!r}: {exc}") from exc
+    images: List[bytes] = []
+    for i in range(min(num_pages, doc.page_count)):
+        try:
+            pix = doc.load_page(i).get_pixmap(dpi=200)
+            images.append(pix.tobytes("png"))
+        except Exception as exc:
+            logger.warning("Skipping page %d of %s: %s", i, pdf_path, exc)
+    doc.close()
+    if not images:
+        raise RuntimeError(f"No pages rendered from {pdf_path!r}")
+    # 2) Build prompt
+    prompt = METADATA_PROMPT_TEMPLATE.format(num_pages=num_pages)
+    # 3) Call Vision LLM
+    raw = call_vision_llm(
+        images=images,
+        prompt=prompt,
+        provider=vision_provider,
+        api_key=vision_api_key,
+        model=vision_model,
+        detail=vision_detail,
+        reasoning_effort=reasoning_effort,
+    )
+    # 4) Extract and parse JSON substring
+    start = raw.find("{")
+    end   = raw.rfind("}")
+    if start < 0 or end < 0 or end <= start:
+        raise RuntimeError(f"No JSON found in vision LLM response:\n{raw}")
+    candidate = raw[start: end + 1].strip().strip("```").strip()
+    try:
+        return json.loads(candidate)
+    except json.JSONDecodeError as exc:
+        raise RuntimeError(
+            f"Failed to parse metadata JSON:\n{candidate}\nError: {exc}"
+        ) from exc

visual_parser/nougat_engine.py ADDED Viewed

@@ -0,0 +1,222 @@
+"""
+nougat_engine.py — Nougat model initialisation, PDF rasterisation, and
+                   the stopping-criteria classes from the original Nougat paper.
+Extracted and cleaned from utils/nougat_helpers.py.
+No chatbot, no LangChain, no Dash dependencies.
+"""
+from __future__ import annotations
+import io
+import logging
+from collections import defaultdict
+from pathlib import Path
+from typing import List, Optional
+import fitz  # PyMuPDF
+import torch
+from PIL import Image
+from transformers import (
+    AutoProcessor,
+    StoppingCriteria,
+    StoppingCriteriaList,
+    VisionEncoderDecoderModel,
+)
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Model initialisation
+# ---------------------------------------------------------------------------
+def _normalize_nougat_processor(processor) -> None:
+    """
+    Apply compatibility fixes for processor configs across transformers versions.
+    Some newer processor/image-processor builds reject ``None`` for boolean
+    fields that older Nougat configs may omit. Normalize those fields to safe
+    defaults after loading the processor.
+    """
+    image_processor = getattr(processor, "image_processor", None)
+    if image_processor is None:
+        return
+    fixed_fields = []
+    for attr_name in dir(image_processor):
+        if not attr_name.startswith("do_"):
+            continue
+        try:
+            attr_value = getattr(image_processor, attr_name)
+        except Exception:
+            continue
+        if attr_value is None:
+            try:
+                setattr(image_processor, attr_name, False)
+                fixed_fields.append(attr_name)
+            except Exception:
+                continue
+    if fixed_fields:
+        logger.warning(
+            "[NOUGAT] Normalized image-processor boolean flags with None values: %s",
+            ", ".join(sorted(fixed_fields)),
+        )
+def NougatInitializer(model_name: str = "facebook/nougat-small"):
+    """
+    Load the Nougat processor and model onto the best available device.
+    If ``HF_TOKEN`` is present in the environment (e.g. loaded from .env),
+    the function authenticates with the HuggingFace Hub before downloading
+    weights, which suppresses the unauthenticated-request warning and gives
+    higher rate limits.
+    Returns:
+        (processor, model, device) tuple ready for inference.
+    """
+    import os
+    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+    if hf_token:
+        try:
+            import huggingface_hub
+            huggingface_hub.login(token=hf_token, add_to_git_credential=False)
+            logger.info("[NOUGAT] Authenticated with HuggingFace Hub.")
+        except Exception as exc:
+            logger.warning("[NOUGAT] HF login attempt failed (non-fatal): %s", exc)
+    else:
+        logger.info(
+            "[NOUGAT] No HF_TOKEN found — downloads may be rate-limited. "
+            "Add HF_TOKEN to your .env to silence this."
+        )
+    print(f"[NOUGAT] Loading model: {model_name} …")
+    processor = AutoProcessor.from_pretrained(model_name, token=hf_token)
+    _normalize_nougat_processor(processor)
+    model     = VisionEncoderDecoderModel.from_pretrained(model_name, token=hf_token)
+    device    = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device)
+    print(f"[NOUGAT] Model loaded on {device}.")
+    return processor, model, device
+# ---------------------------------------------------------------------------
+# PDF rasterisation
+# ---------------------------------------------------------------------------
+def RasterizePaper(
+    pdf: Path | str,
+    outpath: Optional[Path] = None,
+    dpi: int = 96,
+    return_pil: bool = False,
+    pages: Optional[range] = None,
+) -> Optional[List[io.BytesIO]]:
+    """
+    Rasterize each page of *pdf* to PNG.
+    Args:
+        pdf:        Path to the PDF file.
+        outpath:    Directory to write ``01.png``, ``02.png`` … files.
+                    When *None*, ``return_pil`` is forced to True.
+        dpi:        Rendering resolution (96 dpi for Nougat, 200 for figures).
+        return_pil: Return a list of :class:`io.BytesIO` objects instead of
+                    writing files.
+        pages:      Subset of page indices to process.  Defaults to all pages.
+    Returns:
+        List of :class:`io.BytesIO` objects when ``return_pil=True``,
+        otherwise *None* (files written to *outpath*).
+    """
+    if outpath is None:
+        return_pil = True
+    pillow_images: List[io.BytesIO] = []
+    try:
+        doc = fitz.open(pdf) if isinstance(pdf, (str, Path)) else pdf
+        if pages is None:
+            pages = range(len(doc))
+        for i in pages:
+            page_bytes: bytes = doc[i].get_pixmap(dpi=dpi).pil_tobytes(format="PNG")
+            if return_pil:
+                pillow_images.append(io.BytesIO(page_bytes))
+            else:
+                with (outpath / ("%02d.png" % (i + 1))).open("wb") as f:
+                    f.write(page_bytes)
+    except Exception as exc:
+        logger.error("Error rasterizing PDF %s: %s", pdf, exc)
+    return pillow_images if return_pil else None
+# ---------------------------------------------------------------------------
+# Nougat stopping criteria (from the original Nougat repository)
+# ---------------------------------------------------------------------------
+class RunningVarTorch:
+    """Maintains a sliding-window variance for a sequence of tensors."""
+    def __init__(self, L: int = 15, norm: bool = False):
+        self.values = None
+        self.L      = L
+        self.norm   = norm
+    def push(self, x: torch.Tensor) -> None:
+        assert x.dim() == 1
+        if self.values is None:
+            self.values = x[:, None]
+        elif self.values.shape[1] < self.L:
+            self.values = torch.cat((self.values, x[:, None]), 1)
+        else:
+            self.values = torch.cat((self.values[:, 1:], x[:, None]), 1)
+    def variance(self):
+        if self.values is None:
+            return None
+        if self.norm:
+            return torch.var(self.values, 1) / self.values.shape[1]
+        return torch.var(self.values, 1)
+class StoppingCriteriaScores(StoppingCriteria):
+    """
+    Stops generation when the variance of the score distribution stabilises —
+    as recommended by the Nougat authors to avoid repetition loops.
+    The criterion keeps two running windows: ``vars`` tracks the variance of
+    the per-row maximum score, and ``varvars`` tracks the variance of that
+    variance.  A row is considered for stopping when the latter drops below
+    ``threshold``.
+    Args:
+        threshold:    Upper bound on the variance-of-variance below which a
+                      row counts as stable.
+        window_size:  Window length for ``varvars``; also the minimum number
+                      of calls before the criterion can return True.
+    """
+    def __init__(self, threshold: float = 0.015, window_size: int = 200):
+        super().__init__()
+        self.threshold   = threshold
+        # Normalised variance over RunningVarTorch's default window length.
+        self.vars        = RunningVarTorch(norm=True)
+        # Variance of the values above, over a window of ``window_size``.
+        self.varvars     = RunningVarTorch(L=window_size)
+        # Per-row scheduled stop index (0 means "none scheduled").
+        self.stop_inds   = defaultdict(int)
+        # Per-row stop flag.
+        self.stopped     = defaultdict(bool)
+        # Number of times the criterion has been called.
+        self.size        = 0
+        self.window_size = window_size
+    @torch.no_grad()
+    def __call__(
+        self,
+        input_ids: torch.LongTensor,
+        scores:    torch.FloatTensor,
+    ) -> bool:
+        """
+        Update the running statistics with the latest scores and report
+        whether every tracked row is flagged as stopped.
+        ``input_ids`` is not used by this criterion.
+        """
+        # assumes ``scores`` is a sequence of per-step 2-D tensors and the
+        # last entry belongs to the current step — TODO confirm with caller
+        last_scores = scores[-1]
+        # Maximum along dim 1 of the latest scores, moved to CPU as float.
+        self.vars.push(last_scores.max(1)[0].float().cpu())
+        self.varvars.push(self.vars.variance())
+        self.size += 1
+        # Never stop before ``window_size`` calls have been observed.
+        if self.size < self.window_size:
+            return False
+        varvar = self.varvars.variance()
+        for b in range(len(last_scores)):
+            if varvar[b] < self.threshold:
+                if self.stop_inds[b] > 0 and not self.stopped[b]:
+                    # A stop index is scheduled and the row is not yet
+                    # flagged: the flag becomes True while the scheduled
+                    # index is >= the current call count.
+                    # NOTE(review): comparison direction kept as-is; verify
+                    # against the upstream Nougat implementation.
+                    self.stopped[b] = self.stop_inds[b] >= self.size
+                else:
+                    # (Re)schedule a stop index ahead of the current call
+                    # count, capped at 4095.
+                    self.stop_inds[b] = int(
+                        min(max(self.size, 1) * 1.15 + 150 + self.window_size, 4095)
+                    )
+            else:
+                # Variance rose above the threshold again: clear the schedule.
+                self.stop_inds[b] = 0
+                self.stopped[b]   = False
+        # True only when at least one row is tracked and all are flagged.
+        return all(self.stopped.values()) and len(self.stopped) > 0