visual-parser 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,20 @@
1
+ """
2
+ visual_parser — Standalone Visual-RAG PDF Parser
3
+ =================================================
4
+ Detects new PDFs in a user-supplied directory, extracts text (via Nougat or
5
+ lightweight PyMuPDF/PyPDFLoader), describes every figure/chart/schematic using
6
+ a Vision LLM (OpenAI GPT-4o or Google Gemini), and writes three JSONL knowledge
7
+ bases ready for any downstream RAG system:
8
+
9
+ 01_chunks_kb.jsonl – text chunks with stable IDs
10
+ 02_visuals_kb.jsonl – per-figure visual descriptions
11
+ 03_metadata_kb.jsonl – document-level metadata (title, authors, DOI …)
12
+
13
+ No chatbot, no vector store, no retrieval – just a robust parser.
14
+ """
15
+
16
+ from visual_parser.config import ParserConfig
17
+ from visual_parser.pipeline import run_pipeline
18
+
19
+ __all__ = ["ParserConfig", "run_pipeline"]
20
+ __version__ = "1.0.0"
@@ -0,0 +1,8 @@
1
+ """
2
+ Enables: python -m visual_parser --input-dir ./my_pdfs ...
3
+ """
4
+ import sys
5
+ from visual_parser.cli_main import main
6
+
7
if __name__ == "__main__":
    # Hand main()'s integer status to the interpreter as the process exit code.
    raise SystemExit(main())
visual_parser/cli.py ADDED
@@ -0,0 +1,230 @@
1
+ """
2
+ cli.py — Argument parser and main() entry point for the Visual-RAG PDF Parser.
3
+
4
+ This module is the canonical home for CLI logic. It is imported by:
5
+ • visual-parser.py (top-level convenience script)
6
+ • visual_parser/__main__.py (python -m visual_parser ...; note: __main__.py currently imports visual_parser.cli_main, the ASCII-safe variant of this module)
7
+ • pyproject.toml [project.scripts] (enables: visual-parser ...)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import os
14
+ import sys
15
+
16
+
17
+ USAGE_EXAMPLES = """
18
+ Examples
19
+ --------
20
+ # Nougat (default) + GPT-5.5 vision
21
+ python visual-parser.py --input-dir ./my_pdfs
22
+
23
+ # Fast lightweight extraction + Gemini
24
+ python visual-parser.py --input-dir ./my_pdfs \\
25
+ --text-mode lightweight \\
26
+ --vision-provider gemini \\
27
+ --vision-model gemini-1.5-pro
28
+
29
+ # Write outputs to a separate directory
30
+ python visual-parser.py --input-dir ./my_pdfs --output-dir ./output_kb
31
+
32
+ # Force re-parse all PDFs (ignore tracking registry)
33
+ python visual-parser.py --input-dir ./my_pdfs --rebuild
34
+
35
+ # High-detail images for dense schematics
36
+ python visual-parser.py --input-dir ./my_pdfs --vision-detail high
37
+
38
+ # Verbose console logging
39
+ python visual-parser.py --input-dir ./my_pdfs --log-level INFO
40
+ """
41
+
42
+
43
+ def _build_arg_parser() -> argparse.ArgumentParser:
44
+ p = argparse.ArgumentParser(
45
+ prog="visual-parser",
46
+ description=(
47
+ "Visual-RAG PDF Parser — detects new PDFs, extracts text and "
48
+ "figure descriptions, and writes three JSONL knowledge bases:\n"
49
+ " 01_chunks_kb.jsonl text chunks\n"
50
+ " 02_visuals_kb.jsonl visual descriptions\n"
51
+ " 03_metadata_kb.jsonl document metadata"
52
+ ),
53
+ formatter_class=argparse.RawDescriptionHelpFormatter,
54
+ epilog=USAGE_EXAMPLES,
55
+ )
56
+
57
+ # ---- Paths --------------------------------------------------------------
58
+ io_group = p.add_argument_group("Paths")
59
+ io_group.add_argument(
60
+ "--input-dir", "-i",
61
+ required=True,
62
+ metavar="DIR",
63
+ help="Directory to scan for PDF files (searched recursively).",
64
+ )
65
+ io_group.add_argument(
66
+ "--output-dir", "-o",
67
+ default="",
68
+ metavar="DIR",
69
+ help="Directory where JSONL files are written. Defaults to --input-dir.",
70
+ )
71
+
72
+ # ---- Text extraction ----------------------------------------------------
73
+ text_group = p.add_argument_group("Text extraction")
74
+ text_group.add_argument(
75
+ "--text-mode",
76
+ choices=["nougat", "lightweight"],
77
+ default="nougat",
78
+ help=(
79
+ "nougat — Nougat OCR model (best for scanned/complex PDFs, GPU recommended).\n"
80
+ "lightweight — PyMuPDF text layer + PyPDFLoader fallback (fast, no GPU needed)."
81
+ ),
82
+ )
83
+ text_group.add_argument(
84
+ "--nougat-model",
85
+ default="facebook/nougat-small",
86
+ metavar="MODEL_ID",
87
+ help="HuggingFace model ID for Nougat (default: facebook/nougat-small).",
88
+ )
89
+ text_group.add_argument(
90
+ "--chunk-size",
91
+ type=int,
92
+ default=500,
93
+ metavar="N",
94
+ help="Target characters per text chunk (default: 500).",
95
+ )
96
+ text_group.add_argument(
97
+ "--chunk-overlap",
98
+ type=int,
99
+ default=100,
100
+ metavar="N",
101
+ help="Overlap characters between adjacent chunks (default: 100).",
102
+ )
103
+
104
+ # ---- Vision LLM ---------------------------------------------------------
105
+ vision_group = p.add_argument_group("Vision LLM (figure descriptions & metadata)")
106
+ vision_group.add_argument(
107
+ "--vision-provider",
108
+ choices=["gpt", "gemini"],
109
+ default="gpt",
110
+ help=(
111
+ "gpt — OpenAI GPT-5.5 (set OPENAI_API_KEY in .env).\n"
112
+ "gemini — Google Gemini (set GEMINI_API_KEY in .env)."
113
+ ),
114
+ )
115
+ vision_group.add_argument(
116
+ "--vision-model",
117
+ default=None,
118
+ metavar="MODEL_NAME",
119
+ help=(
120
+ "Vision model name. Omit to use the latest for each provider:\n"
121
+ " gpt → gpt-5.5 (also: gpt-5.4, gpt-5.3-chat-latest, gpt-5.2, gpt-5.1, gpt-5, gpt-4o, gpt-4.1)\n"
122
+ " gemini → gemini-3-pro-preview (also: gemini-2.5-flash, gemini-1.5-pro)"
123
+ ),
124
+ )
125
+ vision_group.add_argument(
126
+ "--vision-detail",
127
+ choices=["low", "high", "auto"],
128
+ default="low",
129
+ help=(
130
+ "Image detail level (GPT only).\n"
131
+ "low — faster/cheaper (default, recommended for most use cases).\n"
132
+ "high — better for dense schematics with small text."
133
+ ),
134
+ )
135
+ vision_group.add_argument(
136
+ "--reasoning-effort",
137
+ choices=["minimal", "none", "low", "medium", "high", "xhigh"],
138
+ default="medium",
139
+ help=(
140
+ "Reasoning effort for GPT-5.x models (ignored for Gemini and older GPT).\n"
141
+ " minimal/none — minimum reasoning, depending on model.\n"
142
+ " low — light reasoning.\n"
143
+ " medium — balanced (default).\n"
144
+ " high — deeper reasoning, slower.\n"
145
+ " xhigh — maximum depth (gpt-5.2, gpt-5.4, and gpt-5.5)."
146
+ ),
147
+ )
148
+ vision_group.add_argument(
149
+ "--metadata-pages",
150
+ type=int,
151
+ default=2,
152
+ metavar="N",
153
+ help="Number of front pages sent to the vision LLM for metadata extraction (default: 2).",
154
+ )
155
+
156
+ # ---- Performance --------------------------------------------------------
157
+ perf_group = p.add_argument_group("Performance")
158
+ perf_group.add_argument(
159
+ "--max-workers",
160
+ type=int,
161
+ default=4,
162
+ metavar="N",
163
+ help="Thread-pool size for parallel PDF processing (default: 4).",
164
+ )
165
+
166
+ # ---- Misc ---------------------------------------------------------------
167
+ misc_group = p.add_argument_group("Miscellaneous")
168
+ misc_group.add_argument(
169
+ "--rebuild",
170
+ action="store_true",
171
+ help=(
172
+ "Reprocess ALL PDFs, ignoring the 04_processed_pdfs.txt registry. "
173
+ "Use after changing prompts, chunking strategy, or switching models."
174
+ ),
175
+ )
176
+ misc_group.add_argument(
177
+ "--log-level",
178
+ choices=["DEBUG", "INFO", "WARNING", "ERROR"],
179
+ default="ERROR",
180
+ help="Verbosity level written to 05_pipeline.log (default: ERROR).",
181
+ )
182
+
183
+ return p
184
+
185
+
186
def main(argv=None) -> int:
    """
    Parse CLI arguments, build a :class:`~visual_parser.config.ParserConfig`,
    validate it, and hand off to :func:`~visual_parser.pipeline.run_pipeline`.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on success, 1 on configuration error, 2 when the pipeline summary
        reports failed PDFs under ``"failed_basenames"``.
    """
    parser = _build_arg_parser()
    args = parser.parse_args(argv)

    # Default vision model per provider when not explicitly set
    if args.vision_model is None:
        args.vision_model = (
            "gpt-5.5" if args.vision_provider == "gpt" else "gemini-3-pro-preview"
        )

    # Imported lazily, after argument parsing has succeeded.
    from visual_parser.config import ParserConfig

    config = ParserConfig(
        input_dir=os.path.abspath(args.input_dir),
        # Empty string means "fall back to input_dir" downstream.
        output_dir=os.path.abspath(args.output_dir) if args.output_dir else "",
        text_mode=args.text_mode,
        nougat_model=args.nougat_model,
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        vision_provider=args.vision_provider,
        # Only the selected provider receives the user's --vision-model.
        gpt_vision_model=args.vision_model if args.vision_provider == "gpt" else "gpt-5.5",
        gemini_vision_model=(
            args.vision_model if args.vision_provider == "gemini" else "gemini-3-pro-preview"
        ),
        gpt_reasoning_effort=args.reasoning_effort,
        vision_detail=args.vision_detail,
        metadata_pages=args.metadata_pages,
        max_workers=args.max_workers,
        rebuild=args.rebuild,
        log_level=args.log_level,
    )

    try:
        config.validate()
    except ValueError as exc:
        print(f"[ERROR] {exc}", file=sys.stderr)
        return 1

    from visual_parser.pipeline import run_pipeline

    summary = run_pipeline(config)
    # Surface per-PDF failures through the exit status instead of always
    # reporting success (visual_parser.cli_main.main reads the same key).
    # A summary without .get (e.g. None) is treated as "no failures reported".
    failed = summary.get("failed_basenames") if hasattr(summary, "get") else None
    if failed:
        return 2
    return 0
@@ -0,0 +1,223 @@
1
+ """
2
+ cli_main.py - ASCII-safe CLI entry point for the Visual-RAG PDF Parser.
3
+
4
+ This module exists to keep Windows console help output stable for the
5
+ installed ``visual-parser`` command and ``python -m visual_parser``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import os
12
+ import sys
13
+
14
+
15
+ USAGE_EXAMPLES = """
16
+ Examples
17
+ --------
18
+ # Nougat (default) + GPT-5.5 vision
19
+ python visual-parser.py --input-dir ./my_pdfs
20
+
21
+ # Fast lightweight extraction + Gemini
22
+ python visual-parser.py --input-dir ./my_pdfs \\
23
+ --text-mode lightweight \\
24
+ --vision-provider gemini \\
25
+ --vision-model gemini-1.5-pro
26
+
27
+ # Write outputs to a separate directory
28
+ python visual-parser.py --input-dir ./my_pdfs --output-dir ./output_kb
29
+
30
+ # Force re-parse all PDFs (ignore tracking registry)
31
+ python visual-parser.py --input-dir ./my_pdfs --rebuild
32
+
33
+ # High-detail images for dense schematics
34
+ python visual-parser.py --input-dir ./my_pdfs --vision-detail high
35
+
36
+ # Verbose console logging
37
+ python visual-parser.py --input-dir ./my_pdfs --log-level INFO
38
+ """
39
+
40
+
41
+ def _build_arg_parser() -> argparse.ArgumentParser:
42
+ parser = argparse.ArgumentParser(
43
+ prog="visual-parser",
44
+ description=(
45
+ "Visual-RAG PDF Parser - detects new PDFs, extracts text and "
46
+ "figure descriptions, and writes three JSONL knowledge bases:\n"
47
+ " 01_chunks_kb.jsonl text chunks\n"
48
+ " 02_visuals_kb.jsonl visual descriptions\n"
49
+ " 03_metadata_kb.jsonl document metadata"
50
+ ),
51
+ formatter_class=argparse.RawDescriptionHelpFormatter,
52
+ epilog=USAGE_EXAMPLES,
53
+ )
54
+
55
+ io_group = parser.add_argument_group("Paths")
56
+ io_group.add_argument(
57
+ "--input-dir",
58
+ "-i",
59
+ required=True,
60
+ metavar="DIR",
61
+ help="Directory to scan for PDF files (searched recursively).",
62
+ )
63
+ io_group.add_argument(
64
+ "--output-dir",
65
+ "-o",
66
+ default="",
67
+ metavar="DIR",
68
+ help="Directory where JSONL files are written. Defaults to --input-dir.",
69
+ )
70
+
71
+ text_group = parser.add_argument_group("Text extraction")
72
+ text_group.add_argument(
73
+ "--text-mode",
74
+ choices=["nougat", "lightweight"],
75
+ default="nougat",
76
+ help=(
77
+ "nougat - Nougat OCR model (best for scanned/complex PDFs, GPU recommended).\n"
78
+ "lightweight - PyMuPDF text layer + PyPDFLoader fallback (fast, no GPU needed)."
79
+ ),
80
+ )
81
+ text_group.add_argument(
82
+ "--nougat-model",
83
+ default="facebook/nougat-small",
84
+ metavar="MODEL_ID",
85
+ help="HuggingFace model ID for Nougat (default: facebook/nougat-small).",
86
+ )
87
+ text_group.add_argument(
88
+ "--chunk-size",
89
+ type=int,
90
+ default=500,
91
+ metavar="N",
92
+ help="Target characters per text chunk (default: 500).",
93
+ )
94
+ text_group.add_argument(
95
+ "--chunk-overlap",
96
+ type=int,
97
+ default=100,
98
+ metavar="N",
99
+ help="Overlap characters between adjacent chunks (default: 100).",
100
+ )
101
+
102
+ vision_group = parser.add_argument_group("Vision LLM (figure descriptions & metadata)")
103
+ vision_group.add_argument(
104
+ "--vision-provider",
105
+ choices=["gpt", "gemini"],
106
+ default="gpt",
107
+ help=(
108
+ "gpt - OpenAI GPT-5.4 (set OPENAI_API_KEY in .env).\n"
109
+ "gemini - Google Gemini (set GEMINI_API_KEY in .env)."
110
+ ),
111
+ )
112
+ vision_group.add_argument(
113
+ "--vision-model",
114
+ default=None,
115
+ metavar="MODEL_NAME",
116
+ help=(
117
+ "Vision model name. Omit to use the latest for each provider:\n"
118
+ " gpt -> gpt-5.4 (also: gpt-5.5, gpt-5.3-chat-latest, gpt-5.2, gpt-5.1, gpt-5, gpt-4o, gpt-4.1)\n"
119
+ " gemini -> gemini-3-pro-preview (also: gemini-2.5-flash, gemini-1.5-pro)"
120
+ ),
121
+ )
122
+ vision_group.add_argument(
123
+ "--vision-detail",
124
+ choices=["low", "high", "auto"],
125
+ default="low",
126
+ help=(
127
+ "Image detail level (GPT only).\n"
128
+ "low - faster/cheaper (default, recommended for most use cases).\n"
129
+ "high - better for dense schematics with small text."
130
+ ),
131
+ )
132
+ vision_group.add_argument(
133
+ "--reasoning-effort",
134
+ choices=["minimal", "none", "low", "medium", "high", "xhigh"],
135
+ default="medium",
136
+ help=(
137
+ "Reasoning effort for GPT-5.x models (ignored for Gemini and older GPT).\n"
138
+ " minimal/none - minimum reasoning, depending on model.\n"
139
+ " low - light reasoning.\n"
140
+ " medium - balanced (default).\n"
141
+ " high - deeper reasoning, slower.\n"
142
+ " xhigh - maximum depth (gpt-5.2, gpt-5.4, and gpt-5.5)."
143
+ ),
144
+ )
145
+ vision_group.add_argument(
146
+ "--metadata-pages",
147
+ type=int,
148
+ default=2,
149
+ metavar="N",
150
+ help="Number of front pages sent to the vision LLM for metadata extraction (default: 2).",
151
+ )
152
+
153
+ perf_group = parser.add_argument_group("Performance")
154
+ perf_group.add_argument(
155
+ "--max-workers",
156
+ type=int,
157
+ default=4,
158
+ metavar="N",
159
+ help="Thread-pool size for parallel PDF processing (default: 4).",
160
+ )
161
+
162
+ misc_group = parser.add_argument_group("Miscellaneous")
163
+ misc_group.add_argument(
164
+ "--rebuild",
165
+ action="store_true",
166
+ help=(
167
+ "Reprocess ALL PDFs, ignoring the 04_processed_pdfs.txt registry. "
168
+ "Use after changing prompts, chunking strategy, or switching models."
169
+ ),
170
+ )
171
+ misc_group.add_argument(
172
+ "--log-level",
173
+ choices=["DEBUG", "INFO", "WARNING", "ERROR"],
174
+ default="ERROR",
175
+ help="Verbosity level written to 05_pipeline.log (default: ERROR).",
176
+ )
177
+
178
+ return parser
179
+
180
+
181
def main(argv=None) -> int:
    """
    Run the ``visual-parser`` command line.

    Parses *argv* (``None`` lets argparse read ``sys.argv``), fills in the
    provider-specific vision model when ``--vision-model`` was omitted, builds
    and validates a ``ParserConfig``, then runs the pipeline.

    Returns:
        0 on success, 1 when ``ParserConfig.validate`` raises ``ValueError``,
        2 when the pipeline summary has a non-empty ``"failed_basenames"``.
    """
    default_gpt = "gpt-5.4"
    default_gemini = "gemini-3-pro-preview"

    cli = _build_arg_parser().parse_args(argv)
    wants_gpt = cli.vision_provider == "gpt"
    wants_gemini = cli.vision_provider == "gemini"

    # An omitted --vision-model resolves to the selected provider's default.
    model = cli.vision_model
    if model is None:
        model = default_gpt if wants_gpt else default_gemini

    # Deferred import: only needed once the arguments parsed cleanly.
    from visual_parser.config import ParserConfig

    if cli.output_dir:
        resolved_output = os.path.abspath(cli.output_dir)
    else:
        resolved_output = ""

    settings = {
        "input_dir": os.path.abspath(cli.input_dir),
        "output_dir": resolved_output,
        "text_mode": cli.text_mode,
        "nougat_model": cli.nougat_model,
        "chunk_size": cli.chunk_size,
        "chunk_overlap": cli.chunk_overlap,
        "vision_provider": cli.vision_provider,
        # The provider that was not selected keeps its built-in default.
        "gpt_vision_model": model if wants_gpt else default_gpt,
        "gemini_vision_model": model if wants_gemini else default_gemini,
        "gpt_reasoning_effort": cli.reasoning_effort,
        "vision_detail": cli.vision_detail,
        "metadata_pages": cli.metadata_pages,
        "max_workers": cli.max_workers,
        "rebuild": cli.rebuild,
        "log_level": cli.log_level,
    }
    config = ParserConfig(**settings)

    try:
        config.validate()
    except ValueError as exc:
        print(f"[ERROR] {exc}", file=sys.stderr)
        return 1

    # Deferred until the configuration is known to be valid.
    from visual_parser.pipeline import run_pipeline

    summary = run_pipeline(config)
    return 2 if summary.get("failed_basenames") else 0
@@ -0,0 +1,168 @@
1
+ """
2
+ config.py — Central configuration for visual_parser.
3
+
4
+ All settings are read from environment variables (populated from a .env file
5
+ at project root via python-dotenv). Every field has a sensible default so
6
+ the tool works out-of-the-box; the user only *needs* to supply an API key for
7
+ the chosen vision model.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ from dataclasses import dataclass, field
14
+ from pathlib import Path
15
+ from typing import Literal, Optional
16
+
17
+ from dotenv import load_dotenv
18
+
19
# Load .env files.  Precedence, lowest -> highest:
#   legacy global < current global < module-relative search < CWD < explicit file
# Variables already present in the real process environment always win.
def _load_env():
    """
    Populate ``os.environ`` from the supported ``.env`` locations.

    Files are loaded in ascending priority with ``override=True`` so a later
    file really does override an earlier one (python-dotenv's default,
    ``override=False``, makes the *first* file win).  Afterwards the variables
    that existed before loading are restored, so an ``.env`` file can never
    clobber a value set in the real environment.
    """
    # Snapshot of the environment as it was before any .env file was read.
    preexisting = dict(os.environ)

    # 1) Global: one .env for all runs (any --input-dir).
    #    Legacy directory first so the current product name takes precedence.
    global_env_candidates = [
        Path.home() / ".config" / "visual-rag" / ".env",
        Path.home() / ".config" / "visual-parser" / ".env",
    ]
    for global_env in global_env_candidates:
        if global_env.is_file():
            load_dotenv(global_env, override=True)

    # 2a) python-dotenv's default search.  In non-interactive runs it walks up
    #     from the *calling module's* directory, not the CWD; kept so source
    #     checkouts with a project-root .env behave as before.
    load_dotenv(override=True)

    # 2b) Current working directory, loaded explicitly because the default
    #     search above does not start there for an installed package.
    cwd_env = Path.cwd() / ".env"
    if cwd_env.is_file():
        load_dotenv(cwd_env, override=True)

    # 3) Explicit path (Docker /env/.env or VISUAL_PARSER_ENV_FILE).  Read after
    #    the files above so the variable may itself be defined in one of them.
    env_file = os.environ.get("VISUAL_PARSER_ENV_FILE")
    if env_file and os.path.isfile(env_file):
        load_dotenv(env_file, override=True)

    # Real environment variables outrank every .env file.
    os.environ.update(preexisting)
36
+ _load_env()
37
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Text-extraction modes
41
+ # ---------------------------------------------------------------------------
42
+ TextMode = Literal["nougat", "lightweight"]
43
+ """
44
+ nougat – Facebook Nougat transformer model (best for scanned / complex PDFs)
45
+ lightweight – PyMuPDF text layer + PyPDFLoader fallback (fast, digital PDFs)
46
+ """
47
+
48
+ # ---------------------------------------------------------------------------
49
+ # Vision-LLM providers
50
+ # ---------------------------------------------------------------------------
51
+ VisionProvider = Literal["gpt", "gemini"]
52
+
53
+
54
@dataclass
class ParserConfig:
    """
    Single source of truth for every knob in the pipeline.

    Instantiate directly or call :func:`ParserConfig.from_env` to read from
    environment variables / a .env file.  Call :meth:`validate` before use;
    it raises ``ValueError`` for obviously bad settings.
    """

    # --- Paths ---------------------------------------------------------------
    input_dir: str = ""
    """Directory that will be scanned recursively for PDF files."""

    output_dir: str = ""
    """
    Directory where JSONL knowledge bases are written.
    Defaults to *input_dir* when left empty.
    """

    # --- Text extraction -----------------------------------------------------
    text_mode: TextMode = "nougat"
    """Which text-extraction engine to use ('nougat' or 'lightweight')."""

    nougat_model: str = "facebook/nougat-small"
    """HuggingFace model identifier for Nougat."""

    chunk_size: int = 500
    """Target character count per text chunk."""

    chunk_overlap: int = 100
    """Character overlap between adjacent chunks."""

    # --- Vision LLM ----------------------------------------------------------
    vision_provider: VisionProvider = "gpt"
    """Which vision LLM to use for figure descriptions and metadata ('gpt' or 'gemini')."""

    # OpenAI
    # repr=False keeps the secret out of repr()/log output of the config.
    openai_api_key: str = field(
        default_factory=lambda: os.getenv("OPENAI_API_KEY", ""), repr=False
    )
    gpt_vision_model: str = "gpt-5.4"
    """Default GPT vision model. Also accepts: gpt-5.5, gpt-5.3-chat-latest, gpt-5.2, gpt-5.1, gpt-5, gpt-4o, gpt-4.1"""

    # GPT-5.x reasoning effort: none | low | medium | high | xhigh
    # Older gpt-5 uses minimal | low | medium | high.
    gpt_reasoning_effort: str = "medium"

    # Google Gemini
    # repr=False keeps the secret out of repr()/log output of the config.
    gemini_api_key: str = field(
        default_factory=lambda: os.getenv("GEMINI_API_KEY", ""), repr=False
    )
    gemini_vision_model: str = "gemini-3-pro-preview"
    """Latest Gemini vision model. Also accepts: gemini-2.5-flash, gemini-1.5-pro"""

    # --- Vision detail -------------------------------------------------------
    vision_detail: Literal["low", "high", "auto"] = "low"
    """
    Image detail level sent to the vision API.
    'low'  → faster & cheaper (recommended for figure detection at scale).
    'high' → higher fidelity (use for small-text schematics).
    """

    # --- Metadata extraction -------------------------------------------------
    metadata_pages: int = 2
    """Number of front pages to send to the vision LLM for metadata extraction."""

    # --- Parallelism ---------------------------------------------------------
    max_workers: int = 4
    """Thread-pool size for parallel PDF processing."""

    # --- Misc ----------------------------------------------------------------
    rebuild: bool = False
    """If True, reprocess all PDFs even if already recorded in 04_processed_pdfs.txt."""

    log_level: str = "ERROR"

    # -------------------------------------------------------------------------

    @classmethod
    def from_env(cls) -> "ParserConfig":
        """
        Construct a ParserConfig reading every setting from environment variables.

        Integer settings are converted with ``int()``, so a non-numeric value
        raises ``ValueError``.  ``VISUAL_PARSER_REBUILD`` is true for
        ``1``/``true``/``yes``/``on`` (case-insensitive) and false otherwise.
        """
        truthy = {"1", "true", "yes", "on"}
        rebuild_raw = os.getenv("VISUAL_PARSER_REBUILD", "false")
        return cls(
            input_dir=os.getenv("VISUAL_PARSER_INPUT_DIR", ""),
            output_dir=os.getenv("VISUAL_PARSER_OUTPUT_DIR", ""),
            text_mode=os.getenv("VISUAL_PARSER_TEXT_MODE", "nougat"),  # type: ignore[arg-type]
            nougat_model=os.getenv("VISUAL_PARSER_NOUGAT_MODEL", "facebook/nougat-small"),
            chunk_size=int(os.getenv("VISUAL_PARSER_CHUNK_SIZE", "500")),
            chunk_overlap=int(os.getenv("VISUAL_PARSER_CHUNK_OVERLAP", "100")),
            vision_provider=os.getenv("VISUAL_PARSER_VISION_PROVIDER", "gpt"),  # type: ignore[arg-type]
            openai_api_key=os.getenv("OPENAI_API_KEY", ""),
            gpt_vision_model=os.getenv("VISUAL_PARSER_GPT_VISION_MODEL", "gpt-5.4"),
            gpt_reasoning_effort=os.getenv("VISUAL_PARSER_GPT_REASONING_EFFORT", "medium"),
            gemini_api_key=os.getenv("GEMINI_API_KEY", ""),
            gemini_vision_model=os.getenv("VISUAL_PARSER_GEMINI_VISION_MODEL", "gemini-3-pro-preview"),
            vision_detail=os.getenv("VISUAL_PARSER_VISION_DETAIL", "low"),  # type: ignore[arg-type]
            metadata_pages=int(os.getenv("VISUAL_PARSER_METADATA_PAGES", "2")),
            max_workers=int(os.getenv("VISUAL_PARSER_MAX_WORKERS", "4")),
            rebuild=rebuild_raw.strip().lower() in truthy,
            log_level=os.getenv("VISUAL_PARSER_LOG_LEVEL", "ERROR"),
        )

    def effective_output_dir(self) -> str:
        """Return output_dir, falling back to input_dir when not set."""
        return self.output_dir if self.output_dir else self.input_dir

    def validate(self) -> None:
        """
        Raise ValueError for obviously bad configurations.

        Checks, in order: input directory, text mode, vision provider, the API
        key of the selected provider, then vision detail and the numeric knobs.
        """
        if not self.input_dir:
            raise ValueError("input_dir must be set.")
        if not Path(self.input_dir).is_dir():
            raise ValueError(f"input_dir does not exist: {self.input_dir!r}")
        if self.text_mode not in ("nougat", "lightweight"):
            raise ValueError(f"text_mode must be 'nougat' or 'lightweight', got {self.text_mode!r}")
        if self.vision_provider not in ("gpt", "gemini"):
            raise ValueError(f"vision_provider must be 'gpt' or 'gemini', got {self.vision_provider!r}")
        if self.vision_provider == "gpt" and not self.openai_api_key:
            raise ValueError("OPENAI_API_KEY must be set when vision_provider='gpt'.")
        if self.vision_provider == "gemini" and not self.gemini_api_key:
            raise ValueError("GEMINI_API_KEY must be set when vision_provider='gemini'.")
        # from_env() passes this through unchecked, so validate it here.
        if self.vision_detail not in ("low", "high", "auto"):
            raise ValueError(
                f"vision_detail must be 'low', 'high' or 'auto', got {self.vision_detail!r}"
            )
        if self.chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {self.chunk_size!r}")
        # An overlap as large as the chunk itself leaves no new text per chunk.
        if not 0 <= self.chunk_overlap < self.chunk_size:
            raise ValueError(
                "chunk_overlap must be >= 0 and smaller than chunk_size, "
                f"got chunk_overlap={self.chunk_overlap!r}, chunk_size={self.chunk_size!r}"
            )
        if self.metadata_pages < 0:
            raise ValueError(f"metadata_pages must be >= 0, got {self.metadata_pages!r}")
        if self.max_workers < 1:
            raise ValueError(f"max_workers must be >= 1, got {self.max_workers!r}")