visual-parser 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- visual_parser/__init__.py +20 -0
- visual_parser/__main__.py +8 -0
- visual_parser/cli.py +230 -0
- visual_parser/cli_main.py +223 -0
- visual_parser/config.py +168 -0
- visual_parser/figure_describer.py +218 -0
- visual_parser/jsonl_writer.py +102 -0
- visual_parser/metadata_extractor.py +94 -0
- visual_parser/nougat_engine.py +222 -0
- visual_parser/pdf_tracker.py +105 -0
- visual_parser/pipeline.py +255 -0
- visual_parser/prompts.py +98 -0
- visual_parser/text_extractor.py +396 -0
- visual_parser/vision_llm.py +269 -0
- visual_parser-1.0.0.dist-info/METADATA +191 -0
- visual_parser-1.0.0.dist-info/RECORD +19 -0
- visual_parser-1.0.0.dist-info/WHEEL +5 -0
- visual_parser-1.0.0.dist-info/entry_points.txt +2 -0
- visual_parser-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
visual_parser — Standalone Visual-RAG PDF Parser
|
|
3
|
+
=================================================
|
|
4
|
+
Detects new PDFs in a user-supplied directory, extracts text (via Nougat or
|
|
5
|
+
lightweight PyMuPDF/PyPDFLoader), describes every figure/chart/schematic using
|
|
6
|
+
a Vision LLM (OpenAI GPT-4o or Google Gemini), and writes three JSONL knowledge
|
|
7
|
+
bases ready for any downstream RAG system:
|
|
8
|
+
|
|
9
|
+
01_chunks_kb.jsonl – text chunks with stable IDs
|
|
10
|
+
02_visuals_kb.jsonl – per-figure visual descriptions
|
|
11
|
+
03_metadata_kb.jsonl – document-level metadata (title, authors, DOI …)
|
|
12
|
+
|
|
13
|
+
No chatbot, no vector store, no retrieval – just a robust parser.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from visual_parser.config import ParserConfig
|
|
17
|
+
from visual_parser.pipeline import run_pipeline
|
|
18
|
+
|
|
19
|
+
__all__ = ["ParserConfig", "run_pipeline"]
|
|
20
|
+
__version__ = "1.0.0"
|
visual_parser/cli.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli.py — Argument parser and main() entry point for the Visual-RAG PDF Parser.
|
|
3
|
+
|
|
4
|
+
This module is the canonical home for CLI logic. It is imported by:
|
|
5
|
+
• visual-parser.py (top-level convenience script)
|
|
6
|
+
• visual_parser/__main__.py (enables: python -m visual_parser ...)
|
|
7
|
+
• pyproject.toml [project.scripts] (enables: visual-parser ...)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
USAGE_EXAMPLES = """
|
|
18
|
+
Examples
|
|
19
|
+
--------
|
|
20
|
+
# Nougat (default) + GPT-5.5 vision
|
|
21
|
+
python visual-parser.py --input-dir ./my_pdfs
|
|
22
|
+
|
|
23
|
+
# Fast lightweight extraction + Gemini
|
|
24
|
+
python visual-parser.py --input-dir ./my_pdfs \\
|
|
25
|
+
--text-mode lightweight \\
|
|
26
|
+
--vision-provider gemini \\
|
|
27
|
+
--vision-model gemini-1.5-pro
|
|
28
|
+
|
|
29
|
+
# Write outputs to a separate directory
|
|
30
|
+
python visual-parser.py --input-dir ./my_pdfs --output-dir ./output_kb
|
|
31
|
+
|
|
32
|
+
# Force re-parse all PDFs (ignore tracking registry)
|
|
33
|
+
python visual-parser.py --input-dir ./my_pdfs --rebuild
|
|
34
|
+
|
|
35
|
+
# High-detail images for dense schematics
|
|
36
|
+
python visual-parser.py --input-dir ./my_pdfs --vision-detail high
|
|
37
|
+
|
|
38
|
+
# Verbose console logging
|
|
39
|
+
python visual-parser.py --input-dir ./my_pdfs --log-level INFO
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
    """
    Assemble the ``visual-parser`` command-line parser.

    The options are declared as a table of ``(group title, [(flags, kwargs)])``
    entries and registered in declaration order, so the generated ``--help``
    output lists groups and options in the same order as the table.
    """
    overview = (
        "Visual-RAG PDF Parser — detects new PDFs, extracts text and "
        "figure descriptions, and writes three JSONL knowledge bases:\n"
        " 01_chunks_kb.jsonl text chunks\n"
        " 02_visuals_kb.jsonl visual descriptions\n"
        " 03_metadata_kb.jsonl document metadata"
    )
    parser = argparse.ArgumentParser(
        prog="visual-parser",
        description=overview,
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=USAGE_EXAMPLES,
    )

    layout = [
        # ---- Paths ----------------------------------------------------------
        ("Paths", [
            (("--input-dir", "-i"), dict(
                required=True,
                metavar="DIR",
                help="Directory to scan for PDF files (searched recursively).",
            )),
            (("--output-dir", "-o"), dict(
                default="",
                metavar="DIR",
                help="Directory where JSONL files are written. Defaults to --input-dir.",
            )),
        ]),
        # ---- Text extraction ------------------------------------------------
        ("Text extraction", [
            (("--text-mode",), dict(
                choices=["nougat", "lightweight"],
                default="nougat",
                help=(
                    "nougat — Nougat OCR model (best for scanned/complex PDFs, GPU recommended).\n"
                    "lightweight — PyMuPDF text layer + PyPDFLoader fallback (fast, no GPU needed)."
                ),
            )),
            (("--nougat-model",), dict(
                default="facebook/nougat-small",
                metavar="MODEL_ID",
                help="HuggingFace model ID for Nougat (default: facebook/nougat-small).",
            )),
            (("--chunk-size",), dict(
                type=int,
                default=500,
                metavar="N",
                help="Target characters per text chunk (default: 500).",
            )),
            (("--chunk-overlap",), dict(
                type=int,
                default=100,
                metavar="N",
                help="Overlap characters between adjacent chunks (default: 100).",
            )),
        ]),
        # ---- Vision LLM -----------------------------------------------------
        ("Vision LLM (figure descriptions & metadata)", [
            (("--vision-provider",), dict(
                choices=["gpt", "gemini"],
                default="gpt",
                help=(
                    "gpt — OpenAI GPT-5.5 (set OPENAI_API_KEY in .env).\n"
                    "gemini — Google Gemini (set GEMINI_API_KEY in .env)."
                ),
            )),
            (("--vision-model",), dict(
                default=None,
                metavar="MODEL_NAME",
                help=(
                    "Vision model name. Omit to use the latest for each provider:\n"
                    " gpt → gpt-5.5 (also: gpt-5.4, gpt-5.3-chat-latest, gpt-5.2, gpt-5.1, gpt-5, gpt-4o, gpt-4.1)\n"
                    " gemini → gemini-3-pro-preview (also: gemini-2.5-flash, gemini-1.5-pro)"
                ),
            )),
            (("--vision-detail",), dict(
                choices=["low", "high", "auto"],
                default="low",
                help=(
                    "Image detail level (GPT only).\n"
                    "low — faster/cheaper (default, recommended for most use cases).\n"
                    "high — better for dense schematics with small text."
                ),
            )),
            (("--reasoning-effort",), dict(
                choices=["minimal", "none", "low", "medium", "high", "xhigh"],
                default="medium",
                help=(
                    "Reasoning effort for GPT-5.x models (ignored for Gemini and older GPT).\n"
                    " minimal/none — minimum reasoning, depending on model.\n"
                    " low — light reasoning.\n"
                    " medium — balanced (default).\n"
                    " high — deeper reasoning, slower.\n"
                    " xhigh — maximum depth (gpt-5.2, gpt-5.4, and gpt-5.5)."
                ),
            )),
            (("--metadata-pages",), dict(
                type=int,
                default=2,
                metavar="N",
                help="Number of front pages sent to the vision LLM for metadata extraction (default: 2).",
            )),
        ]),
        # ---- Performance ----------------------------------------------------
        ("Performance", [
            (("--max-workers",), dict(
                type=int,
                default=4,
                metavar="N",
                help="Thread-pool size for parallel PDF processing (default: 4).",
            )),
        ]),
        # ---- Misc -----------------------------------------------------------
        ("Miscellaneous", [
            (("--rebuild",), dict(
                action="store_true",
                help=(
                    "Reprocess ALL PDFs, ignoring the 04_processed_pdfs.txt registry. "
                    "Use after changing prompts, chunking strategy, or switching models."
                ),
            )),
            (("--log-level",), dict(
                choices=["DEBUG", "INFO", "WARNING", "ERROR"],
                default="ERROR",
                help="Verbosity level written to 05_pipeline.log (default: ERROR).",
            )),
        ]),
    ]

    # Each group is created immediately before its own options are added,
    # which keeps the help output ordering tied to the table above.
    for title, options in layout:
        group = parser.add_argument_group(title)
        for flags, settings in options:
            group.add_argument(*flags, **settings)

    return parser
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def main(argv=None) -> int:
    """
    Parse CLI arguments, build a :class:`~visual_parser.config.ParserConfig`,
    validate it, and hand off to :func:`~visual_parser.pipeline.run_pipeline`.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on success, 1 on configuration error, 2 when the pipeline summary
        reports failed PDFs (same convention as ``cli_main.main``).
    """
    parser = _build_arg_parser()
    args = parser.parse_args(argv)

    # Default vision model per provider when not explicitly set
    if args.vision_model is None:
        args.vision_model = (
            "gpt-5.5" if args.vision_provider == "gpt" else "gemini-3-pro-preview"
        )

    # Imported after argument parsing so that --help and usage errors return
    # before the package modules are loaded.
    from visual_parser.config import ParserConfig

    config = ParserConfig(
        input_dir=os.path.abspath(args.input_dir),
        # An empty output_dir is kept empty so ParserConfig can fall back to input_dir.
        output_dir=os.path.abspath(args.output_dir) if args.output_dir else "",
        text_mode=args.text_mode,
        nougat_model=args.nougat_model,
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        vision_provider=args.vision_provider,
        # --vision-model only applies to the selected provider; the other
        # provider keeps its default model name.
        gpt_vision_model=args.vision_model if args.vision_provider == "gpt" else "gpt-5.5",
        gemini_vision_model=(
            args.vision_model if args.vision_provider == "gemini" else "gemini-3-pro-preview"
        ),
        gpt_reasoning_effort=args.reasoning_effort,
        vision_detail=args.vision_detail,
        metadata_pages=args.metadata_pages,
        max_workers=args.max_workers,
        rebuild=args.rebuild,
        log_level=args.log_level,
    )

    try:
        config.validate()
    except ValueError as exc:
        print(f"[ERROR] {exc}", file=sys.stderr)
        return 1

    from visual_parser.pipeline import run_pipeline

    # Previously the return value was discarded and the process always exited
    # with 0, even when some PDFs failed. The truthiness guard tolerates a
    # pipeline that returns nothing.
    summary = run_pipeline(config)
    if summary and summary.get("failed_basenames"):
        return 2
    return 0
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli_main.py - ASCII-safe CLI entry point for the Visual-RAG PDF Parser.
|
|
3
|
+
|
|
4
|
+
This module exists to keep Windows console help output stable for the
|
|
5
|
+
installed ``visual-parser`` command and ``python -m visual_parser``.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import os
|
|
12
|
+
import sys
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
USAGE_EXAMPLES = """
|
|
16
|
+
Examples
|
|
17
|
+
--------
|
|
18
|
+
# Nougat (default) + GPT-5.5 vision
|
|
19
|
+
python visual-parser.py --input-dir ./my_pdfs
|
|
20
|
+
|
|
21
|
+
# Fast lightweight extraction + Gemini
|
|
22
|
+
python visual-parser.py --input-dir ./my_pdfs \\
|
|
23
|
+
--text-mode lightweight \\
|
|
24
|
+
--vision-provider gemini \\
|
|
25
|
+
--vision-model gemini-1.5-pro
|
|
26
|
+
|
|
27
|
+
# Write outputs to a separate directory
|
|
28
|
+
python visual-parser.py --input-dir ./my_pdfs --output-dir ./output_kb
|
|
29
|
+
|
|
30
|
+
# Force re-parse all PDFs (ignore tracking registry)
|
|
31
|
+
python visual-parser.py --input-dir ./my_pdfs --rebuild
|
|
32
|
+
|
|
33
|
+
# High-detail images for dense schematics
|
|
34
|
+
python visual-parser.py --input-dir ./my_pdfs --vision-detail high
|
|
35
|
+
|
|
36
|
+
# Verbose console logging
|
|
37
|
+
python visual-parser.py --input-dir ./my_pdfs --log-level INFO
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
    """
    Create the ASCII-only argument parser for the ``visual-parser`` command.

    Options are registered group by group (paths, text extraction, vision
    LLM, performance, miscellaneous) in the order they appear in ``--help``.
    """
    ap = argparse.ArgumentParser(
        prog="visual-parser",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=USAGE_EXAMPLES,
        description=(
            "Visual-RAG PDF Parser - detects new PDFs, extracts text and "
            "figure descriptions, and writes three JSONL knowledge bases:\n"
            " 01_chunks_kb.jsonl text chunks\n"
            " 02_visuals_kb.jsonl visual descriptions\n"
            " 03_metadata_kb.jsonl document metadata"
        ),
    )

    # Paths
    paths = ap.add_argument_group("Paths")
    paths.add_argument(
        "--input-dir", "-i", required=True, metavar="DIR",
        help="Directory to scan for PDF files (searched recursively).",
    )
    paths.add_argument(
        "--output-dir", "-o", default="", metavar="DIR",
        help="Directory where JSONL files are written. Defaults to --input-dir.",
    )

    # Text extraction
    text = ap.add_argument_group("Text extraction")
    text_mode_help = (
        "nougat - Nougat OCR model (best for scanned/complex PDFs, GPU recommended).\n"
        "lightweight - PyMuPDF text layer + PyPDFLoader fallback (fast, no GPU needed)."
    )
    text.add_argument(
        "--text-mode", choices=["nougat", "lightweight"], default="nougat",
        help=text_mode_help,
    )
    text.add_argument(
        "--nougat-model", default="facebook/nougat-small", metavar="MODEL_ID",
        help="HuggingFace model ID for Nougat (default: facebook/nougat-small).",
    )
    text.add_argument(
        "--chunk-size", type=int, default=500, metavar="N",
        help="Target characters per text chunk (default: 500).",
    )
    text.add_argument(
        "--chunk-overlap", type=int, default=100, metavar="N",
        help="Overlap characters between adjacent chunks (default: 100).",
    )

    # Vision LLM
    vision = ap.add_argument_group("Vision LLM (figure descriptions & metadata)")
    provider_help = (
        "gpt - OpenAI GPT-5.4 (set OPENAI_API_KEY in .env).\n"
        "gemini - Google Gemini (set GEMINI_API_KEY in .env)."
    )
    vision.add_argument(
        "--vision-provider", choices=["gpt", "gemini"], default="gpt",
        help=provider_help,
    )
    model_help = (
        "Vision model name. Omit to use the latest for each provider:\n"
        " gpt -> gpt-5.4 (also: gpt-5.5, gpt-5.3-chat-latest, gpt-5.2, gpt-5.1, gpt-5, gpt-4o, gpt-4.1)\n"
        " gemini -> gemini-3-pro-preview (also: gemini-2.5-flash, gemini-1.5-pro)"
    )
    vision.add_argument(
        "--vision-model", default=None, metavar="MODEL_NAME",
        help=model_help,
    )
    detail_help = (
        "Image detail level (GPT only).\n"
        "low - faster/cheaper (default, recommended for most use cases).\n"
        "high - better for dense schematics with small text."
    )
    vision.add_argument(
        "--vision-detail", choices=["low", "high", "auto"], default="low",
        help=detail_help,
    )
    effort_help = (
        "Reasoning effort for GPT-5.x models (ignored for Gemini and older GPT).\n"
        " minimal/none - minimum reasoning, depending on model.\n"
        " low - light reasoning.\n"
        " medium - balanced (default).\n"
        " high - deeper reasoning, slower.\n"
        " xhigh - maximum depth (gpt-5.2, gpt-5.4, and gpt-5.5)."
    )
    vision.add_argument(
        "--reasoning-effort",
        choices=["minimal", "none", "low", "medium", "high", "xhigh"],
        default="medium",
        help=effort_help,
    )
    vision.add_argument(
        "--metadata-pages", type=int, default=2, metavar="N",
        help="Number of front pages sent to the vision LLM for metadata extraction (default: 2).",
    )

    # Performance
    perf = ap.add_argument_group("Performance")
    perf.add_argument(
        "--max-workers", type=int, default=4, metavar="N",
        help="Thread-pool size for parallel PDF processing (default: 4).",
    )

    # Miscellaneous
    misc = ap.add_argument_group("Miscellaneous")
    rebuild_help = (
        "Reprocess ALL PDFs, ignoring the 04_processed_pdfs.txt registry. "
        "Use after changing prompts, chunking strategy, or switching models."
    )
    misc.add_argument("--rebuild", action="store_true", help=rebuild_help)
    misc.add_argument(
        "--log-level", choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="ERROR",
        help="Verbosity level written to 05_pipeline.log (default: ERROR).",
    )

    return ap
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def main(argv=None) -> int:
    """
    Run the parser from the command line.

    Parses ``argv``, builds and validates a ParserConfig, then runs the
    pipeline.

    Returns:
        0 on success, 1 when the configuration is rejected by ``validate()``,
        2 when the pipeline summary contains a non-empty ``failed_basenames``.
    """
    args = _build_arg_parser().parse_args(argv)

    provider = args.vision_provider
    # Fill in the per-provider default when --vision-model was omitted.
    if args.vision_model is None:
        if provider == "gpt":
            args.vision_model = "gpt-5.4"
        else:
            args.vision_model = "gemini-3-pro-preview"

    # --vision-model only applies to the selected provider; the other one
    # keeps its default model name.
    gpt_model = "gpt-5.4"
    gemini_model = "gemini-3-pro-preview"
    if provider == "gpt":
        gpt_model = args.vision_model
    if provider == "gemini":
        gemini_model = args.vision_model

    # An empty --output-dir stays empty rather than being resolved to the CWD.
    out_dir = ""
    if args.output_dir:
        out_dir = os.path.abspath(args.output_dir)

    from visual_parser.config import ParserConfig

    config = ParserConfig(
        input_dir=os.path.abspath(args.input_dir),
        output_dir=out_dir,
        text_mode=args.text_mode,
        nougat_model=args.nougat_model,
        chunk_size=args.chunk_size,
        chunk_overlap=args.chunk_overlap,
        vision_provider=provider,
        gpt_vision_model=gpt_model,
        gemini_vision_model=gemini_model,
        gpt_reasoning_effort=args.reasoning_effort,
        vision_detail=args.vision_detail,
        metadata_pages=args.metadata_pages,
        max_workers=args.max_workers,
        rebuild=args.rebuild,
        log_level=args.log_level,
    )

    try:
        config.validate()
    except ValueError as exc:
        print(f"[ERROR] {exc}", file=sys.stderr)
        return 1

    from visual_parser.pipeline import run_pipeline

    summary = run_pipeline(config)
    return 2 if summary.get("failed_basenames") else 0
|
visual_parser/config.py
ADDED
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""
|
|
2
|
+
config.py — Central configuration for visual_parser.
|
|
3
|
+
|
|
4
|
+
All settings are read from environment variables (populated from a .env file
|
|
5
|
+
at project root via python-dotenv). Every field has a sensible default so
|
|
6
|
+
the tool works out-of-the-box; the user only *needs* to supply an API key for
|
|
7
|
+
the chosen vision model.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
from dataclasses import dataclass, field
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Literal, Optional
|
|
16
|
+
|
|
17
|
+
from dotenv import load_dotenv
|
|
18
|
+
|
|
19
|
+
# Load .env: order = global < CWD < explicit (later overrides earlier)
|
|
20
|
+
def _load_env() -> None:
    """
    Populate ``os.environ`` from the supported .env files.

    Precedence, lowest to highest:

    1. ``~/.config/visual-rag/.env`` (legacy global file)
    2. ``~/.config/visual-parser/.env`` (current global file)
    3. the nearest ``.env`` found from the current working directory upward
    4. the file named by ``VISUAL_PARSER_ENV_FILE``

    Variables already present in the process environment are never
    overwritten.

    The previous implementation called ``load_dotenv`` once per file with its
    default ``override=False``, so the first file loaded won and the
    documented "later overrides earlier" order was inverted. It also called
    ``load_dotenv()`` without a path, which searches upward from the calling
    module's directory rather than from the working directory.
    """
    # Function-scope import: only load_dotenv is imported at module level.
    from dotenv import dotenv_values, find_dotenv

    merged = {}

    # 1) Global files, lowest priority first so the current product name
    #    overrides the legacy directory.
    global_env_candidates = [
        Path.home() / ".config" / "visual-rag" / ".env",
        Path.home() / ".config" / "visual-parser" / ".env",
    ]
    for global_env in global_env_candidates:
        if global_env.is_file():
            merged.update(dotenv_values(global_env))

    # 2) Current working directory; usecwd=True makes the search start at
    #    os.getcwd(). find_dotenv returns "" when nothing is found.
    cwd_env = find_dotenv(usecwd=True)
    if cwd_env:
        merged.update(dotenv_values(cwd_env))

    # 3) Explicit path (Docker /env/.env or VISUAL_PARSER_ENV_FILE). The
    #    variable may come from the real environment or from a file read above.
    env_file = os.environ.get("VISUAL_PARSER_ENV_FILE") or merged.get("VISUAL_PARSER_ENV_FILE")
    if env_file and os.path.isfile(env_file):
        merged.update(dotenv_values(env_file))

    # Apply the merged result without clobbering real environment variables.
    # Keys declared without a value parse to None and are skipped.
    for key, value in merged.items():
        if value is not None:
            os.environ.setdefault(key, value)
|
|
36
|
+
_load_env()
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# Text-extraction modes
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
TextMode = Literal["nougat", "lightweight"]
|
|
43
|
+
"""
|
|
44
|
+
nougat – Facebook Nougat transformer model (best for scanned / complex PDFs)
|
|
45
|
+
lightweight – PyMuPDF text layer + PyPDFLoader fallback (fast, digital PDFs)
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Vision-LLM providers
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
VisionProvider = Literal["gpt", "gemini"]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class ParserConfig:
    """
    Single source of truth for every knob in the pipeline.

    Instantiate directly or call :func:`ParserConfig.from_env` to read from
    environment variables / a .env file. Call :meth:`validate` before use;
    it raises ``ValueError`` for configurations that cannot work.
    """

    # --- Paths ---------------------------------------------------------------
    input_dir: str = ""
    """Directory that will be scanned recursively for PDF files."""

    output_dir: str = ""
    """
    Directory where JSONL knowledge bases are written.
    Defaults to *input_dir* when left empty.
    """

    # --- Text extraction -----------------------------------------------------
    text_mode: TextMode = "nougat"
    """Which text-extraction engine to use ('nougat' or 'lightweight')."""

    nougat_model: str = "facebook/nougat-small"
    """HuggingFace model identifier for Nougat."""

    chunk_size: int = 500
    """Target character count per text chunk."""

    chunk_overlap: int = 100
    """Character overlap between adjacent chunks."""

    # --- Vision LLM ----------------------------------------------------------
    vision_provider: VisionProvider = "gpt"
    """Which vision LLM to use for figure descriptions and metadata ('gpt' or 'gemini')."""

    # OpenAI
    openai_api_key: str = field(default_factory=lambda: os.getenv("OPENAI_API_KEY", ""))
    gpt_vision_model: str = "gpt-5.4"
    """Default GPT vision model. Also accepts: gpt-5.5, gpt-5.3-chat-latest, gpt-5.2, gpt-5.1, gpt-5, gpt-4o, gpt-4.1"""

    # GPT-5.x reasoning effort: none | low | medium | high | xhigh
    # Older gpt-5 uses minimal | low | medium | high.
    gpt_reasoning_effort: str = "medium"

    # Google Gemini
    gemini_api_key: str = field(default_factory=lambda: os.getenv("GEMINI_API_KEY", ""))
    gemini_vision_model: str = "gemini-3-pro-preview"
    """Latest Gemini vision model. Also accepts: gemini-2.5-flash, gemini-1.5-pro"""

    # --- Vision detail -------------------------------------------------------
    vision_detail: Literal["low", "high", "auto"] = "low"
    """
    Image detail level sent to the vision API.
    'low' → faster & cheaper (recommended for figure detection at scale).
    'high' → higher fidelity (use for small-text schematics).
    """

    # --- Metadata extraction -------------------------------------------------
    metadata_pages: int = 2
    """Number of front pages to send to the vision LLM for metadata extraction."""

    # --- Parallelism ---------------------------------------------------------
    max_workers: int = 4
    """Thread-pool size for parallel PDF processing."""

    # --- Misc ----------------------------------------------------------------
    rebuild: bool = False
    """If True, reprocess all PDFs even if already recorded in 04_processed_pdfs.txt."""

    log_level: str = "ERROR"

    # -------------------------------------------------------------------------

    @staticmethod
    def _env_int(name: str, default: str) -> int:
        """Read integer environment variable *name*, naming it in the error on bad input."""
        raw = os.getenv(name, default)
        try:
            return int(raw)
        except ValueError:
            # Same exception type as a bare int() call, but with the variable name.
            raise ValueError(f"{name} must be an integer, got {raw!r}") from None

    @classmethod
    def from_env(cls) -> "ParserConfig":
        """
        Construct a ParserConfig reading every setting from environment variables.

        Raises:
            ValueError: If a numeric variable does not contain an integer.
        """
        # "true" was the only accepted spelling before; the other common
        # truthy spellings are accepted in addition to it.
        rebuild_raw = os.getenv("VISUAL_PARSER_REBUILD", "false").strip().lower()
        return cls(
            input_dir=os.getenv("VISUAL_PARSER_INPUT_DIR", ""),
            output_dir=os.getenv("VISUAL_PARSER_OUTPUT_DIR", ""),
            text_mode=os.getenv("VISUAL_PARSER_TEXT_MODE", "nougat"),  # type: ignore[arg-type]
            nougat_model=os.getenv("VISUAL_PARSER_NOUGAT_MODEL", "facebook/nougat-small"),
            chunk_size=cls._env_int("VISUAL_PARSER_CHUNK_SIZE", "500"),
            chunk_overlap=cls._env_int("VISUAL_PARSER_CHUNK_OVERLAP", "100"),
            vision_provider=os.getenv("VISUAL_PARSER_VISION_PROVIDER", "gpt"),  # type: ignore[arg-type]
            openai_api_key=os.getenv("OPENAI_API_KEY", ""),
            gpt_vision_model=os.getenv("VISUAL_PARSER_GPT_VISION_MODEL", "gpt-5.4"),
            gpt_reasoning_effort=os.getenv("VISUAL_PARSER_GPT_REASONING_EFFORT", "medium"),
            gemini_api_key=os.getenv("GEMINI_API_KEY", ""),
            gemini_vision_model=os.getenv("VISUAL_PARSER_GEMINI_VISION_MODEL", "gemini-3-pro-preview"),
            vision_detail=os.getenv("VISUAL_PARSER_VISION_DETAIL", "low"),  # type: ignore[arg-type]
            metadata_pages=cls._env_int("VISUAL_PARSER_METADATA_PAGES", "2"),
            max_workers=cls._env_int("VISUAL_PARSER_MAX_WORKERS", "4"),
            rebuild=rebuild_raw in ("1", "true", "yes", "on"),
            log_level=os.getenv("VISUAL_PARSER_LOG_LEVEL", "ERROR"),
        )

    def effective_output_dir(self) -> str:
        """Return output_dir, falling back to input_dir when not set."""
        return self.output_dir if self.output_dir else self.input_dir

    def validate(self) -> None:
        """
        Raise ValueError for obviously bad configurations.

        Checks, in order: input_dir is set and is an existing directory,
        text_mode and vision_provider hold known values, the API key for the
        selected provider is present, and the numeric chunking / worker
        settings are in a usable range.

        Raises:
            ValueError: On the first failed check.
        """
        if not self.input_dir:
            raise ValueError("input_dir must be set.")
        if not Path(self.input_dir).is_dir():
            raise ValueError(f"input_dir does not exist: {self.input_dir!r}")
        if self.text_mode not in ("nougat", "lightweight"):
            raise ValueError(f"text_mode must be 'nougat' or 'lightweight', got {self.text_mode!r}")
        if self.vision_provider not in ("gpt", "gemini"):
            raise ValueError(f"vision_provider must be 'gpt' or 'gemini', got {self.vision_provider!r}")
        if self.vision_provider == "gpt" and not self.openai_api_key:
            raise ValueError("OPENAI_API_KEY must be set when vision_provider='gpt'.")
        if self.vision_provider == "gemini" and not self.gemini_api_key:
            raise ValueError("GEMINI_API_KEY must be set when vision_provider='gemini'.")
        # The numeric checks come last so the pre-existing errors keep their
        # precedence. They reject values that would otherwise only fail later,
        # inside the pipeline.
        if self.chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {self.chunk_size!r}")
        if not 0 <= self.chunk_overlap < self.chunk_size:
            raise ValueError(
                "chunk_overlap must be >= 0 and smaller than chunk_size "
                f"({self.chunk_size!r}), got {self.chunk_overlap!r}"
            )
        if self.max_workers < 1:
            raise ValueError(f"max_workers must be at least 1, got {self.max_workers!r}")
|