visual-parser 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,105 @@
1
+ """
2
+ pdf_tracker.py — Utilities for detecting new PDFs and persisting the
3
+ set of already-processed filenames across pipeline runs.
4
+
5
+ Extracted and cleaned from PDFAnalyser.py.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ import os
12
+ from typing import List
13
+
14
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Default filename of the registry file; used as the default value of
# ``registry_filename`` in find_new_pdfs().
PROCESSED_REGISTRY = "04_processed_pdfs.txt"
17
+
18
+
19
+ # ---------------------------------------------------------------------------
20
+ # Registry I/O
21
+ # ---------------------------------------------------------------------------
22
+
23
def sanitize_string(s: str) -> str:
    """
    Return *s* after a round trip through UTF-8.

    Anything the UTF-8 encoder cannot represent is substituted by the codec's
    ``errors="replace"`` handler, so the result is always safe to write to a
    UTF-8 file.
    """
    utf8_bytes = s.encode("utf-8", errors="replace")
    return utf8_bytes.decode("utf-8")
26
+
27
+
28
def load_processed_pdfs(registry_path: str) -> List[str]:
    """
    Read the registry file and return its non-blank lines.

    The file is decoded as UTF-8 first; if that raises ``UnicodeDecodeError``
    a warning is logged and the file is re-read as latin-1 (legacy files).
    A missing registry yields an empty list.

    Args:
        registry_path: Path of the registry text file (one basename per line).

    Returns:
        The registry entries in file order, with whitespace-only lines dropped.
        Entries themselves are returned unstripped.
    """
    if not os.path.exists(registry_path):
        return []

    try:
        with open(registry_path, "r", encoding="utf-8") as registry:
            text = registry.read()
    except UnicodeDecodeError:
        logger.warning("UTF-8 decoding failed for %s — retrying with latin-1.", registry_path)
        with open(registry_path, "r", encoding="latin-1") as registry:
            text = registry.read()

    return [entry for entry in text.splitlines() if entry.strip()]
43
+
44
+
45
def save_processed_pdfs(registry_path: str, processed_pdfs: List[str]) -> None:
    """
    Overwrite the registry at *registry_path* with *processed_pdfs*.

    Each basename is passed through ``sanitize_string`` and written on its own
    line, in the order given.  No deduplication is performed here; callers are
    expected to hand in an already-deduplicated list.
    """
    with open(registry_path, "w", encoding="utf-8") as registry:
        registry.writelines(sanitize_string(entry) + "\n" for entry in processed_pdfs)
50
+
51
+
52
def mark_as_processed(
    registry_path: str,
    newly_processed: List[str],
) -> None:
    """
    Add *newly_processed* basenames to the registry at *registry_path*.

    The current registry contents (empty when the file does not exist) are
    unioned with the new names and the result is written back sorted.
    """
    merged = {*load_processed_pdfs(registry_path), *newly_processed}
    save_processed_pdfs(registry_path, sorted(merged))
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # PDF discovery
68
+ # ---------------------------------------------------------------------------
69
+
70
def find_new_pdfs(
    input_dir: str,
    registry_filename: str = PROCESSED_REGISTRY,
    rebuild: bool = False,
) -> List[str]:
    """
    Walk *input_dir* recursively and return full paths of PDFs that have NOT
    yet been processed.

    A PDF counts as processed when its basename appears in the registry, so
    two files with the same name in different sub-directories share a single
    registry entry.

    Args:
        input_dir:         Root directory to search for ``.pdf`` files
                           (extension matched case-insensitively).
        registry_filename: Name of the tracking file; it is joined onto
                           *input_dir* with ``os.path.join``.
        rebuild:           When True, return *all* PDFs regardless of the
                           registry (forces a full re-parse).

    Returns:
        Sorted list of absolute PDF paths.
    """
    registry_path = os.path.join(input_dir, registry_filename)
    processed = set() if rebuild else set(load_processed_pdfs(registry_path))

    # Resolve the root once so that every returned path is absolute, as
    # documented, even when *input_dir* is relative to the working directory.
    search_root = os.path.abspath(input_dir)

    # os.walk already yields bare file names, so they can be compared with the
    # registry's basenames directly.
    new_pdfs = sorted(
        os.path.join(root, filename)
        for root, _, files in os.walk(search_root)
        for filename in files
        if filename.lower().endswith(".pdf") and filename not in processed
    )

    if new_pdfs:
        logger.info("Found %d new PDF(s) to process.", len(new_pdfs))
    else:
        logger.info("No new PDFs detected in %s.", input_dir)

    return new_pdfs
@@ -0,0 +1,255 @@
1
+ """
2
+ pipeline.py — The main Visual-RAG parsing orchestrator.
3
+
4
+ Calls each stage in order:
5
+ 0. Detect new PDFs
6
+ 0.5 Extract per-document metadata (Vision LLM on front pages)
7
+ 1. Extract and chunk text (Nougat OR Lightweight, controlled by config)
8
+ 2. Describe figures (Vision LLM, page-by-page)
9
+ 3. Write metadata JSONL
10
+ 4. Mark PDFs as processed
11
+
12
+ No vector store, no embeddings, no retrieval — pure JSONL generation.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import logging
18
+ import os
19
+ from typing import Dict, List, Optional
20
+
21
+ from visual_parser.config import ParserConfig
22
+ from visual_parser.figure_describer import describe_figures_for_new_pdfs
23
+ from visual_parser.jsonl_writer import append_to_jsonl, make_document_id
24
+ from visual_parser.metadata_extractor import extract_pdf_metadata
25
+ from visual_parser.pdf_tracker import (
26
+ PROCESSED_REGISTRY,
27
+ find_new_pdfs,
28
+ mark_as_processed,
29
+ )
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
def _setup_logging(config: ParserConfig) -> None:
    """
    Configure root logging for a pipeline run.

    Log records go to ``05_pipeline.log`` inside the effective output
    directory, and INFO-and-above records are additionally echoed to the
    console so the CLI shows progress.

    The function is safe to call repeatedly: the console handler is attached
    to the root logger only once, so records are not echoed several times when
    the pipeline is run more than once in the same process.

    Args:
        config: Parser configuration; ``log_level`` and
                ``effective_output_dir()`` are read from it.
    """
    log_level = getattr(logging, config.log_level.upper(), logging.ERROR)
    if not isinstance(log_level, int):
        # The name matched some non-level attribute of the logging module
        # (e.g. a function); fall back to the same default as an unknown name.
        log_level = logging.ERROR

    log_path = os.path.join(config.effective_output_dir(), "05_pipeline.log")
    # NOTE: basicConfig() is a no-op when the root logger already has handlers.
    logging.basicConfig(
        filename=log_path,
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    root_logger = logging.getLogger()
    console_name = "visual_parser.console"
    if any(handler.get_name() == console_name for handler in root_logger.handlers):
        return  # console echo already installed by an earlier call

    # Also echo to the console (StreamHandler writes to stderr by default).
    console = logging.StreamHandler()
    console.set_name(console_name)
    console.setLevel(logging.INFO)
    console.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
    root_logger.addHandler(console)
47
+
48
+
49
def run_pipeline(config: Optional[ParserConfig] = None) -> Dict:
    """
    Execute the full Visual-RAG parsing pipeline.

    Stages, in order: discover new PDFs, extract per-document metadata,
    extract and chunk text, describe figures, write the metadata JSONL, and
    finally record the successfully processed PDFs in the registry.  All
    artefacts (``02_visuals_kb.jsonl``, ``03_metadata_kb.jsonl``, the registry
    and ``05_pipeline.log``) live in the effective output directory, which is
    created if missing.

    Args:
        config: A :class:`~visual_parser.config.ParserConfig` instance.
                When *None*, one is built from environment variables via
                :meth:`ParserConfig.from_env`.

    Returns:
        A summary dict::

            {
                "new_pdfs_found":      int,
                "text_chunks_written": int,
                "figures_written":     int,
                "metadata_written":    int,
                "processed_basenames": List[str],
                "failed_basenames":    List[str],
                "status":              str,  # "success", "partial_failure" or "failed"
            }
    """
    if config is None:
        config = ParserConfig.from_env()

    config.validate()
    output_dir = config.effective_output_dir()
    os.makedirs(output_dir, exist_ok=True)
    _setup_logging(config)

    summary = {
        "new_pdfs_found": 0,
        "text_chunks_written": 0,
        "figures_written": 0,
        "metadata_written": 0,
        "processed_basenames": [],
        "failed_basenames": [],
        "status": "success",
    }

    # -----------------------------------------------------------------------
    # Step 0 — Discover new PDFs
    # -----------------------------------------------------------------------
    # The registry is kept in the output directory next to the other pipeline
    # artefacts.  find_new_pdfs() joins its registry name onto the *input*
    # directory, so hand it an absolute path: os.path.join() discards earlier
    # components when a later one is absolute, which makes discovery read the
    # very same file that Step 4 writes.
    registry_path = os.path.abspath(os.path.join(output_dir, PROCESSED_REGISTRY))
    new_pdfs = find_new_pdfs(
        config.input_dir,
        registry_filename=registry_path,
        rebuild=config.rebuild,
    )

    summary["new_pdfs_found"] = len(new_pdfs)

    if not new_pdfs:
        print("No new PDFs found. Nothing to do.")
        return summary

    print(f"Found {len(new_pdfs)} new PDF(s). Starting pipeline …")

    # -----------------------------------------------------------------------
    # Step 0.5 — Metadata extraction (Vision LLM on front pages)
    # -----------------------------------------------------------------------
    _vision_api_key = (
        config.openai_api_key if config.vision_provider == "gpt" else config.gemini_api_key
    )
    _vision_model = (
        config.gpt_vision_model if config.vision_provider == "gpt" else config.gemini_vision_model
    )

    pdf_meta_map: Dict[str, dict] = {}
    for pdf_path in new_pdfs:
        try:
            meta = extract_pdf_metadata(
                pdf_path=pdf_path,
                vision_provider=config.vision_provider,
                vision_api_key=_vision_api_key,
                vision_model=_vision_model,
                num_pages=config.metadata_pages,
                vision_detail=config.vision_detail,
                reasoning_effort=config.gpt_reasoning_effort,
            )
            pdf_meta_map[pdf_path] = meta
        except Exception as exc:
            # Deliberately best-effort: one failing document must not abort
            # the whole run, so the error is recorded in place of metadata.
            logger.warning("Metadata extraction failed for %s: %s", pdf_path, exc)
            pdf_meta_map[pdf_path] = {"_error": str(exc)}

    # -----------------------------------------------------------------------
    # Step 1 — Text extraction and chunking
    # -----------------------------------------------------------------------
    if config.text_mode == "nougat":
        print("[Step 1] Running Nougat text extraction …")
        # Imported lazily so the Nougat engine is only loaded when selected.
        from visual_parser.nougat_engine import NougatInitializer
        from visual_parser.text_extractor import nougat_extract_pdfs

        processor, model, device = NougatInitializer(config.nougat_model)
        nougat_summary, processed_basenames, failed_basenames, chunk_count = nougat_extract_pdfs(
            only_process_these=new_pdfs,
            output_dir=output_dir,
            processor=processor,
            model=model,
            device=device,
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            max_workers=config.max_workers,
        )
        print(nougat_summary)

    else:  # "lightweight"
        print("[Step 1] Running lightweight (PyMuPDF) text extraction …")
        from visual_parser.text_extractor import lightweight_extract_pdfs

        lw_summary, processed_basenames, failed_basenames, chunk_count = lightweight_extract_pdfs(
            only_process_these=new_pdfs,
            output_dir=output_dir,
            chunk_size=config.chunk_size,
            chunk_overlap=config.chunk_overlap,
            max_workers=config.max_workers,
        )
        print(lw_summary)

    summary["processed_basenames"] = processed_basenames
    summary["failed_basenames"] = failed_basenames
    summary["text_chunks_written"] = chunk_count

    # Built once and shared by Steps 2 and 3 for constant-time membership tests.
    processed_set = set(processed_basenames)

    # -----------------------------------------------------------------------
    # Step 2 — Figure description (Vision LLM, page-by-page)
    # -----------------------------------------------------------------------
    figures_path = os.path.join(output_dir, "02_visuals_kb.jsonl")

    def _count_lines(path: str) -> int:
        """Return the number of non-blank lines in *path* (0 if it is missing)."""
        if not os.path.exists(path):
            return 0
        with open(path, encoding="utf-8") as fh:
            return sum(1 for line in fh if line.strip())

    # The figure count is derived from how much the JSONL file grew.
    figures_before = _count_lines(figures_path)

    pdfs_for_figures = [
        p for p in new_pdfs
        if os.path.basename(p) in processed_set
    ]

    if pdfs_for_figures:
        print(f"[Step 2] Describing figures in {len(pdfs_for_figures)} PDF(s) …")
        describe_figures_for_new_pdfs(
            new_pdf_paths=pdfs_for_figures,
            output_dir=output_dir,
            vision_provider=config.vision_provider,
            vision_api_key=_vision_api_key,
            vision_model=_vision_model,
            vision_detail=config.vision_detail,
            reasoning_effort=config.gpt_reasoning_effort,
        )
        summary["figures_written"] = max(0, _count_lines(figures_path) - figures_before)
    else:
        print("[Step 2] No PDFs were successfully text-extracted; skipping figure description.")

    # -----------------------------------------------------------------------
    # Step 3 — Write metadata JSONL (only for successfully processed PDFs)
    # -----------------------------------------------------------------------
    print("[Step 3] Writing document metadata …")
    metadata_rows: List[dict] = []
    for pdf_path, meta in pdf_meta_map.items():
        source = os.path.basename(pdf_path)
        if source not in processed_set:
            # Text extraction failed for this PDF — skip metadata too
            # so a failed run doesn't leave orphaned metadata records.
            logger.warning("Skipping metadata for %s (text extraction failed).", source)
            continue
        document_id = make_document_id(source)
        row = {"source": source, "document_id": document_id}
        if isinstance(meta, dict):
            row.update(meta)
        metadata_rows.append(row)

    if metadata_rows:
        metadata_path = os.path.join(output_dir, "03_metadata_kb.jsonl")
        append_to_jsonl(metadata_path, metadata_rows)
        summary["metadata_written"] = len(metadata_rows)
        print(f"[Step 3] Wrote {len(metadata_rows)} metadata record(s).")

    # -----------------------------------------------------------------------
    # Step 4 — Persist the processing registry
    # -----------------------------------------------------------------------
    print("[Step 4] Updating processed-PDFs registry …")
    mark_as_processed(registry_path, processed_basenames)

    # -----------------------------------------------------------------------
    # Final summary
    # -----------------------------------------------------------------------
    if summary["failed_basenames"] and summary["processed_basenames"]:
        summary["status"] = "partial_failure"
    elif summary["failed_basenames"]:
        summary["status"] = "failed"

    print("\n" + "=" * 60)
    if summary["status"] == "success":
        print("Visual-Parser Pipeline Complete")
    elif summary["status"] == "partial_failure":
        print("Visual-Parser Pipeline Completed with Errors")
    else:
        print("Visual-Parser Pipeline Failed")
    print(f" Total PDFs processed : {len(processed_basenames)}")
    print(f" Total PDFs failed : {len(failed_basenames)}")
    print(f" Total Text chunks : {summary['text_chunks_written']}")
    print(f" Total Figure records : {summary['figures_written']}")
    print(f" Total Metadata records: {summary['metadata_written']}")
    print(f" Output directory: {output_dir}")
    if failed_basenames:
        print(f" Failed PDFs : {', '.join(failed_basenames)}")
    print("=" * 60)

    return summary
@@ -0,0 +1,98 @@
1
+ """
2
+ prompts.py — Vision-LLM prompt templates used by the parser.
3
+
4
+ Keeping prompts in one place makes it easy to customise them without touching
5
+ pipeline logic. The figure prompt is intentionally detailed and domain-aware;
6
+ users can swap in a shorter, domain-agnostic version for non-technical PDFs.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ # ---------------------------------------------------------------------------
12
+ # Figure / visual-element description prompt
13
+ # ---------------------------------------------------------------------------
14
+
15
# Prompt for describing the figures on a single page.  The adjacent string
# literals below are joined by implicit concatenation into one prompt string.
FIGURE_PROMPT: str = (
    # Role and overall goal.
    "You are a specialised Scientific Vision Analyst. "
    "You are viewing a page from a technical document. "
    "Your goal is to extract high-fidelity structured data from visual elements "
    "for a Retrieval-Augmented Generation (RAG) system. "
    "Your output must be precise, quantitative, and strictly follow the structure defined below.\n\n"

    # Phase 1: the image takes precedence over surrounding text claims.
    "**PHASE 1: VISUAL SUPREMACY PROTOCOL (CRITICAL)**\n"
    "- **Discrepancy Detection**: Explicitly check if the visual data matches surrounding text claims.\n"
    "- **Trust the Pixels**: If the image shows a label (e.g. '6') but the text says '5', "
    "record the image value and report the discrepancy.\n\n"

    # Phase 2: what counts as one Figure, followed by the five headings.
    "**PHASE 2: STRUCTURAL ANALYSIS**\n"
    "For each distinct scientific visual (plot, chart, schematic, diagram) generate a description "
    "using STRICTLY the following five headings.\n\n"

    "- A **Figure** is defined as a visual element sharing a single figure number or caption "
    "(e.g. 'Figure 3'), even if it contains multiple panels or subplots.\n"
    "- If a single Figure contains mixed content (e.g. a schematic and a plot), "
    "describe all panels together as ONE Figure.\n"
    "- If no explicit figure number is visible, treat a visually unified group of panels as ONE Figure "
    "and identify it with the corresponding page number.\n\n"

    "1. **Subject**: A concise title or classification "
    "(e.g. 'Vertical Parabolic Gate Schematic', 'PWR Primary Loop P&ID', 'Decay Heat vs Time Plot').\n"

    "2. **Geometry & Labels**:\n"
    " - Describe shapes, layout, and components.\n"
    " - List meaningful text labels found *inside* the figure VERBATIM.\n"
    " - For schematics: describe connectivity (e.g. 'Pump discharges to Heat Exchanger').\n"

    "3. **Dimensions & Data (Quantitative)**:\n"
    " - **Schematics**: Extract all physical dimension lines, radii, diameters, lengths, "
    "thicknesses, angles, and tolerances explicitly labelled in the figure.\n"
    " - **Plots/Charts (CRITICAL)**:\n"
    " * Extract axis variables, units, and numerical ranges (min/max).\n"
    " * Identify and quantify key features: peaks, minima, plateaus, inflection points, "
    "step changes, oscillations, or discontinuities.\n"
    " * Describe temporal or parametric trends explicitly using quantitative language:\n"
    " - e.g. 'Monotonic increase from 0–20 s',\n"
    " - 'Exponential decay after shutdown',\n"
    " - 'Asymptotic stabilisation near 600 MW'.\n"
    " * If multiple curves are present, distinguish them by legend labels, line style, or colour.\n"
    " * If values are approximate, state this (e.g. '≈', 'estimated from plot').\n"

    "4. **Context**: Summarise the scientific purpose based on the surrounding page text.\n"

    "5. **Discrepancy Check**: State if visual labels contradict text. "
    "If none, state 'No discrepancies detected'.\n\n"

    # Output contract: a JSON list with one object per Figure ([] when none).
    "**OUTPUT FORMAT**\n\n"
    "**IMPORTANT**:\n"
    " - Return a strictly valid JSON list.\n"
    " - Return ONE JSON object per Figure on the page.\n"
    " - If a Figure contains subplots, return ONE description per Figure — NOT per subplot.\n"
    " - If a page contains no scientific visual, return an EMPTY JSON LIST: [].\n"
    " - Do NOT skip pages.\n\n"

    # Example of the expected response shape; "\\n" keeps a literal backslash-n
    # inside the example JSON string shown to the model.
    "[\n"
    " { \"description\": \"**Subject:** [Title]\\n"
    "**Geometry & Labels:** [Detailed description]\\n"
    "**Dimensions & Data:** [Quantitative extraction]\\n"
    "**Context:** [Purpose]\\n"
    "**Discrepancy Check:** [Result]\" },\n"
    " { \"description\": \"...\" }\n"
    "]"
)
82
+
83
+ # ---------------------------------------------------------------------------
84
+ # Metadata extraction prompt
85
+ # ---------------------------------------------------------------------------
86
+
87
# Prompt template for front-page metadata extraction.  It contains a single
# ``{num_pages}`` placeholder and no other braces.
# NOTE(review): presumably filled in with ``str.format(num_pages=...)`` by the
# metadata extractor — confirm against the caller.
METADATA_PROMPT_TEMPLATE: str = (
    "You will be shown up to {num_pages} images (PNG) of the front pages of a technical PDF.\n"
    "Extract as much of the following metadata as you can find, and return it as a pure JSON object "
    "with these keys:\n"
    " • title (string)\n"
    " • authors (array of strings)\n"
    " • publication_date (YYYY-MM-DD if available)\n"
    " • report_number (string)\n"
    " • doi (string)\n"
    " • keywords (array of short terms)\n\n"
    "Omit any field you cannot locate."
)