PyPI - visual-parser - Versions diffs - 1.0.0__py3-none-any.whl - Mend

visual-parser 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

visual_parser/__init__.py +20 -0
visual_parser/__main__.py +8 -0
visual_parser/cli.py +230 -0
visual_parser/cli_main.py +223 -0
visual_parser/config.py +168 -0
visual_parser/figure_describer.py +218 -0
visual_parser/jsonl_writer.py +102 -0
visual_parser/metadata_extractor.py +94 -0
visual_parser/nougat_engine.py +222 -0
visual_parser/pdf_tracker.py +105 -0
visual_parser/pipeline.py +255 -0
visual_parser/prompts.py +98 -0
visual_parser/text_extractor.py +396 -0
visual_parser/vision_llm.py +269 -0
visual_parser-1.0.0.dist-info/METADATA +191 -0
visual_parser-1.0.0.dist-info/RECORD +19 -0
visual_parser-1.0.0.dist-info/WHEEL +5 -0
visual_parser-1.0.0.dist-info/entry_points.txt +2 -0
visual_parser-1.0.0.dist-info/top_level.txt +1 -0

visual_parser/pdf_tracker.py ADDED Viewed

@@ -0,0 +1,105 @@
+"""
+pdf_tracker.py — Utilities for detecting new PDFs and persisting the
+                 set of already-processed filenames across pipeline runs.
+Extracted and cleaned from PDFAnalyser.py.
+"""
+from __future__ import annotations
+import logging
+import os
+from typing import List
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# Default filename of the registry listing already-processed PDF basenames
+# (used as the default ``registry_filename`` of find_new_pdfs).
+PROCESSED_REGISTRY = "04_processed_pdfs.txt"
+# ---------------------------------------------------------------------------
+# Registry I/O
+# ---------------------------------------------------------------------------
+def sanitize_string(s: str) -> str:
+    """Re-encode *s* as UTF-8, replacing any unrepresentable characters."""
+    return s.encode("utf-8", errors="replace").decode("utf-8")
+def load_processed_pdfs(registry_path: str) -> List[str]:
+    """
+    Return the list of PDF basenames that have already been processed.
+    Falls back to latin-1 decoding when UTF-8 fails (handles legacy files).
+    """
+    if not os.path.exists(registry_path):
+        return []
+    try:
+        with open(registry_path, "r", encoding="utf-8") as fh:
+            return [line for line in fh.read().splitlines() if line.strip()]
+    except UnicodeDecodeError:
+        logger.warning("UTF-8 decoding failed for %s — retrying with latin-1.", registry_path)
+        with open(registry_path, "r", encoding="latin-1") as fh:
+            return [line for line in fh.read().splitlines() if line.strip()]
+def save_processed_pdfs(registry_path: str, processed_pdfs: List[str]) -> None:
+    """Persist the full (deduplicated) list of processed PDF basenames."""
+    with open(registry_path, "w", encoding="utf-8") as fh:
+        for name in processed_pdfs:
+            fh.write(sanitize_string(name) + "\n")
+def mark_as_processed(
+    registry_path: str,
+    newly_processed: List[str],
+) -> None:
+    """
+    Merge *newly_processed* basenames into the existing registry.
+    Safe to call even if the registry doesn't exist yet.
+    """
+    existing = set(load_processed_pdfs(registry_path))
+    existing.update(newly_processed)
+    save_processed_pdfs(registry_path, sorted(existing))
+# ---------------------------------------------------------------------------
+# PDF discovery
+# ---------------------------------------------------------------------------
+def find_new_pdfs(
+    input_dir: str,
+    registry_filename: str = PROCESSED_REGISTRY,
+    rebuild: bool = False,
+) -> List[str]:
+    """
+    Walk *input_dir* recursively and return full paths of PDFs that have NOT
+    yet been processed.
+    Args:
+        input_dir:         Root directory to search for ``.pdf`` files.
+        registry_filename: Name of the tracking file inside *input_dir*.
+        rebuild:           When True, return *all* PDFs regardless of the
+                           registry (forces a full re-parse).
+    Returns:
+        Sorted list of absolute PDF paths.
+    """
+    registry_path = os.path.join(input_dir, registry_filename)
+    processed = set() if rebuild else set(load_processed_pdfs(registry_path))
+    new_pdfs = [
+        os.path.join(root, filename)
+        for root, _, files in os.walk(input_dir)
+        for filename in files
+        if filename.lower().endswith(".pdf")
+        and os.path.basename(filename) not in processed
+    ]
+    new_pdfs.sort()
+    if new_pdfs:
+        logger.info("Found %d new PDF(s) to process.", len(new_pdfs))
+    else:
+        logger.info("No new PDFs detected in %s.", input_dir)
+    return new_pdfs

visual_parser/pipeline.py ADDED Viewed

@@ -0,0 +1,255 @@
+"""
+pipeline.py — The main Visual-RAG parsing orchestrator.
+Calls each stage in order:
+    0.   Detect new PDFs
+    0.5  Extract per-document metadata (Vision LLM on front pages)
+    1.   Extract and chunk text  (Nougat  OR  Lightweight, controlled by config)
+    2.   Describe figures        (Vision LLM, page-by-page)
+    3.   Write metadata JSONL
+    4.   Mark PDFs as processed
+No vector store, no embeddings, no retrieval — pure JSONL generation.
+"""
+from __future__ import annotations
+import logging
+import os
+from typing import Dict, List, Optional
+from visual_parser.config import ParserConfig
+from visual_parser.figure_describer import describe_figures_for_new_pdfs
+from visual_parser.jsonl_writer import append_to_jsonl, make_document_id
+from visual_parser.metadata_extractor import extract_pdf_metadata
+from visual_parser.pdf_tracker import (
+    PROCESSED_REGISTRY,
+    find_new_pdfs,
+    mark_as_processed,
+)
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+def _setup_logging(config: ParserConfig) -> None:
+    log_level = getattr(logging, config.log_level.upper(), logging.ERROR)
+    log_path  = os.path.join(config.effective_output_dir(), "05_pipeline.log")
+    logging.basicConfig(
+        filename=log_path,
+        level=log_level,
+        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+    )
+    # Also log to stdout so the CLI shows progress
+    console = logging.StreamHandler()
+    console.setLevel(logging.INFO)
+    console.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
+    logging.getLogger().addHandler(console)
+def run_pipeline(config: Optional[ParserConfig] = None) -> Dict:
+    """
+    Execute the full Visual-RAG parsing pipeline.
+    Args:
+        config: A :class:`~visual_parser.config.ParserConfig` instance.
+                When *None*, one is built from environment variables via
+                :meth:`ParserConfig.from_env`.
+    Returns:
+        A summary dict::
+            {
+                "new_pdfs_found":       int,
+                "text_chunks_written":  int,
+                "figures_written":      int,
+                "metadata_written":     int,
+                "processed_basenames":  List[str],
+            }
+    """
+    if config is None:
+        config = ParserConfig.from_env()
+    config.validate()
+    output_dir = config.effective_output_dir()
+    os.makedirs(output_dir, exist_ok=True)
+    _setup_logging(config)
+    summary = {
+        "new_pdfs_found":      0,
+        "text_chunks_written": 0,
+        "figures_written":     0,
+        "metadata_written":    0,
+        "processed_basenames": [],
+        "failed_basenames":    [],
+        "status":              "success",
+    }
+    # -----------------------------------------------------------------------
+    # Step 0 — Discover new PDFs
+    # -----------------------------------------------------------------------
+    registry_path = os.path.join(output_dir, PROCESSED_REGISTRY)
+    new_pdfs      = find_new_pdfs(config.input_dir, rebuild=config.rebuild)
+    summary["new_pdfs_found"] = len(new_pdfs)
+    if not new_pdfs:
+        print("No new PDFs found. Nothing to do.")
+        return summary
+    print(f"Found {len(new_pdfs)} new PDF(s). Starting pipeline …")
+    # -----------------------------------------------------------------------
+    # Step 0.5 — Metadata extraction (Vision LLM on front pages)
+    # -----------------------------------------------------------------------
+    _vision_api_key = (
+        config.openai_api_key if config.vision_provider == "gpt" else config.gemini_api_key
+    )
+    _vision_model = (
+        config.gpt_vision_model if config.vision_provider == "gpt" else config.gemini_vision_model
+    )
+    pdf_meta_map: Dict[str, dict] = {}
+    for pdf_path in new_pdfs:
+        try:
+            meta = extract_pdf_metadata(
+                pdf_path              = pdf_path,
+                vision_provider       = config.vision_provider,
+                vision_api_key        = _vision_api_key,
+                vision_model          = _vision_model,
+                num_pages             = config.metadata_pages,
+                vision_detail         = config.vision_detail,
+                reasoning_effort      = config.gpt_reasoning_effort,
+            )
+            pdf_meta_map[pdf_path] = meta
+        except Exception as exc:
+            logger.warning("Metadata extraction failed for %s: %s", pdf_path, exc)
+            pdf_meta_map[pdf_path] = {"_error": str(exc)}
+    # -----------------------------------------------------------------------
+    # Step 1 — Text extraction and chunking
+    # -----------------------------------------------------------------------
+    if config.text_mode == "nougat":
+        print("[Step 1] Running Nougat text extraction …")
+        from visual_parser.nougat_engine import NougatInitializer
+        from visual_parser.text_extractor import nougat_extract_pdfs
+        processor, model, device = NougatInitializer(config.nougat_model)
+        nougat_summary, processed_basenames, failed_basenames, chunk_count = nougat_extract_pdfs(
+            only_process_these = new_pdfs,
+            output_dir         = output_dir,
+            processor          = processor,
+            model              = model,
+            device             = device,
+            chunk_size         = config.chunk_size,
+            chunk_overlap      = config.chunk_overlap,
+            max_workers        = config.max_workers,
+        )
+        print(nougat_summary)
+    else:  # "lightweight"
+        print("[Step 1] Running lightweight (PyMuPDF) text extraction …")
+        from visual_parser.text_extractor import lightweight_extract_pdfs
+        lw_summary, processed_basenames, failed_basenames, chunk_count = lightweight_extract_pdfs(
+            only_process_these = new_pdfs,
+            output_dir         = output_dir,
+            chunk_size         = config.chunk_size,
+            chunk_overlap      = config.chunk_overlap,
+            max_workers        = config.max_workers,
+        )
+        print(lw_summary)
+    summary["processed_basenames"] = processed_basenames
+    summary["failed_basenames"] = failed_basenames
+    summary["text_chunks_written"] = chunk_count
+    # -----------------------------------------------------------------------
+    # Step 2 — Figure description (Vision LLM, page-by-page)
+    # -----------------------------------------------------------------------
+    figures_path = os.path.join(output_dir, "02_visuals_kb.jsonl")
+    def _count_lines(path: str) -> int:
+        if not os.path.exists(path):
+            return 0
+        with open(path, encoding="utf-8") as fh:
+            return sum(1 for line in fh if line.strip())
+    figures_before = _count_lines(figures_path)
+    pdfs_for_figures = [
+        p for p in new_pdfs
+        if os.path.basename(p) in processed_basenames
+    ]
+    if pdfs_for_figures:
+        print(f"[Step 2] Describing figures in {len(pdfs_for_figures)} PDF(s) …")
+        describe_figures_for_new_pdfs(
+            new_pdf_paths    = pdfs_for_figures,
+            output_dir       = output_dir,
+            vision_provider  = config.vision_provider,
+            vision_api_key   = _vision_api_key,
+            vision_model     = _vision_model,
+            vision_detail    = config.vision_detail,
+            reasoning_effort = config.gpt_reasoning_effort,
+        )
+        summary["figures_written"] = max(0, _count_lines(figures_path) - figures_before)
+    else:
+        print("[Step 2] No PDFs were successfully text-extracted; skipping figure description.")
+    # -----------------------------------------------------------------------
+    # Step 3 — Write metadata JSONL  (only for successfully processed PDFs)
+    # -----------------------------------------------------------------------
+    print("[Step 3] Writing document metadata …")
+    processed_set  = set(processed_basenames)
+    metadata_rows: List[dict] = []
+    for pdf_path, meta in pdf_meta_map.items():
+        source = os.path.basename(pdf_path)
+        if source not in processed_set:
+            # Text extraction failed for this PDF — skip metadata too
+            # so a failed run doesn't leave orphaned metadata records.
+            logger.warning("Skipping metadata for %s (text extraction failed).", source)
+            continue
+        document_id = make_document_id(source)
+        row         = {"source": source, "document_id": document_id}
+        if isinstance(meta, dict):
+            row.update(meta)
+        metadata_rows.append(row)
+    if metadata_rows:
+        metadata_path = os.path.join(output_dir, "03_metadata_kb.jsonl")
+        append_to_jsonl(metadata_path, metadata_rows)
+        summary["metadata_written"] = len(metadata_rows)
+        print(f"[Step 3] Wrote {len(metadata_rows)} metadata record(s).")
+    # -----------------------------------------------------------------------
+    # Step 4 — Persist the processing registry
+    # -----------------------------------------------------------------------
+    print("[Step 4] Updating processed-PDFs registry …")
+    mark_as_processed(registry_path, processed_basenames)
+    # -----------------------------------------------------------------------
+    # Final summary
+    # -----------------------------------------------------------------------
+    if summary["failed_basenames"] and summary["processed_basenames"]:
+        summary["status"] = "partial_failure"
+    elif summary["failed_basenames"]:
+        summary["status"] = "failed"
+    print("\n" + "=" * 60)
+    if summary["status"] == "success":
+        print("Visual-Parser Pipeline Complete")
+    elif summary["status"] == "partial_failure":
+        print("Visual-Parser Pipeline Completed with Errors")
+    else:
+        print("Visual-Parser Pipeline Failed")
+    print(f"  Total PDFs processed  : {len(processed_basenames)}")
+    print(f"  Total PDFs failed     : {len(failed_basenames)}")
+    print(f"  Total Text chunks     : {summary['text_chunks_written']}")
+    print(f"  Total Figure records  : {summary['figures_written']}")
+    print(f"  Total Metadata records: {summary['metadata_written']}")
+    print(f"  Output directory: {output_dir}")
+    if failed_basenames:
+        print(f"  Failed PDFs           : {', '.join(failed_basenames)}")
+    print("=" * 60)
+    return summary

visual_parser/prompts.py ADDED Viewed

@@ -0,0 +1,98 @@
+"""
+prompts.py — Vision-LLM prompt templates used by the parser.
+Keeping prompts in one place makes it easy to customise them without touching
+pipeline logic.  The figure prompt is intentionally detailed and domain-aware;
+users can swap in a shorter, domain-agnostic version for non-technical PDFs.
+"""
+from __future__ import annotations
+# ---------------------------------------------------------------------------
+# Figure / visual-element description prompt
+# ---------------------------------------------------------------------------
+# Prompt text for describing the visual elements on one document page.
+# It is a single string built from adjacent literals; it asks the model for
+# a JSON list with one {"description": ...} object per figure, or [] when
+# the page has no scientific visual.
+FIGURE_PROMPT: str = (
+    # Role and overall goal.
+    "You are a specialised Scientific Vision Analyst. "
+    "You are viewing a page from a technical document. "
+    "Your goal is to extract high-fidelity structured data from visual elements "
+    "for a Retrieval-Augmented Generation (RAG) system. "
+    "Your output must be precise, quantitative, and strictly follow the structure defined below.\n\n"
+    # Phase 1: the image takes precedence over conflicting page text.
+    "**PHASE 1: VISUAL SUPREMACY PROTOCOL (CRITICAL)**\n"
+    "- **Discrepancy Detection**: Explicitly check if the visual data matches surrounding text claims.\n"
+    "- **Trust the Pixels**: If the image shows a label (e.g. '6') but the text says '5', "
+    "record the image value and report the discrepancy.\n\n"
+    # Phase 2: what counts as one Figure, then the five required headings.
+    "**PHASE 2: STRUCTURAL ANALYSIS**\n"
+    "For each distinct scientific visual (plot, chart, schematic, diagram) generate a description "
+    "using STRICTLY the following five headings.\n\n"
+    "- A **Figure** is defined as a visual element sharing a single figure number or caption "
+    "(e.g. 'Figure 3'), even if it contains multiple panels or subplots.\n"
+    "- If a single Figure contains mixed content (e.g. a schematic and a plot), "
+    "describe all panels together as ONE Figure.\n"
+    "- If no explicit figure number is visible, treat a visually unified group of panels as ONE Figure "
+    "and identify it with the corresponding page number.\n\n"
+    "1. **Subject**: A concise title or classification "
+    "(e.g. 'Vertical Parabolic Gate Schematic', 'PWR Primary Loop P&ID', 'Decay Heat vs Time Plot').\n"
+    "2. **Geometry & Labels**:\n"
+    "   - Describe shapes, layout, and components.\n"
+    "   - List meaningful text labels found *inside* the figure VERBATIM.\n"
+    "   - For schematics: describe connectivity (e.g. 'Pump discharges to Heat Exchanger').\n"
+    "3. **Dimensions & Data (Quantitative)**:\n"
+    "   - **Schematics**: Extract all physical dimension lines, radii, diameters, lengths, "
+    "thicknesses, angles, and tolerances explicitly labelled in the figure.\n"
+    "   - **Plots/Charts (CRITICAL)**:\n"
+    "       * Extract axis variables, units, and numerical ranges (min/max).\n"
+    "       * Identify and quantify key features: peaks, minima, plateaus, inflection points, "
+    "step changes, oscillations, or discontinuities.\n"
+    "       * Describe temporal or parametric trends explicitly using quantitative language:\n"
+    "           - e.g. 'Monotonic increase from 0–20 s',\n"
+    "           - 'Exponential decay after shutdown',\n"
+    "           - 'Asymptotic stabilisation near 600 MW'.\n"
+    "       * If multiple curves are present, distinguish them by legend labels, line style, or colour.\n"
+    "       * If values are approximate, state this (e.g. '≈', 'estimated from plot').\n"
+    "4. **Context**: Summarise the scientific purpose based on the surrounding page text.\n"
+    "5. **Discrepancy Check**: State if visual labels contradict text. "
+    "If none, state 'No discrepancies detected'.\n\n"
+    # Output contract: a JSON list, one object per Figure, [] for no visuals.
+    "**OUTPUT FORMAT**\n\n"
+    "**IMPORTANT**:\n"
+    "  - Return a strictly valid JSON list.\n"
+    "  - Return ONE JSON object per Figure on the page.\n"
+    "  - If a Figure contains subplots, return ONE description per Figure — NOT per subplot.\n"
+    "  - If a page contains no scientific visual, return an EMPTY JSON LIST: [].\n"
+    "  - Do NOT skip pages.\n\n"
+    # Example of the expected JSON shape; the "\\n" sequences put a literal
+    # backslash-n into the prompt so the example stays valid JSON.
+    "[\n"
+    "  { \"description\": \"**Subject:** [Title]\\n"
+    "**Geometry & Labels:** [Detailed description]\\n"
+    "**Dimensions & Data:** [Quantitative extraction]\\n"
+    "**Context:** [Purpose]\\n"
+    "**Discrepancy Check:** [Result]\" },\n"
+    "  { \"description\": \"...\" }\n"
+    "]"
+)
+# ---------------------------------------------------------------------------
+# Metadata extraction prompt
+# ---------------------------------------------------------------------------
+METADATA_PROMPT_TEMPLATE: str = (
+    "You will be shown up to {num_pages} images (PNG) of the front pages of a technical PDF.\n"
+    "Extract as much of the following metadata as you can find, and return it as a pure JSON object "
+    "with these keys:\n"
+    "  • title (string)\n"
+    "  • authors (array of strings)\n"
+    "  • publication_date (YYYY-MM-DD if available)\n"
+    "  • report_number (string)\n"
+    "  • doi (string)\n"
+    "  • keywords (array of short terms)\n\n"
+    "Omit any field you cannot locate."
+)