visual-parser 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- visual_parser-1.0.0/PKG-INFO +191 -0
- visual_parser-1.0.0/README.md +152 -0
- visual_parser-1.0.0/pyproject.toml +67 -0
- visual_parser-1.0.0/setup.cfg +4 -0
- visual_parser-1.0.0/visual_parser/__init__.py +20 -0
- visual_parser-1.0.0/visual_parser/__main__.py +8 -0
- visual_parser-1.0.0/visual_parser/cli.py +230 -0
- visual_parser-1.0.0/visual_parser/cli_main.py +223 -0
- visual_parser-1.0.0/visual_parser/config.py +168 -0
- visual_parser-1.0.0/visual_parser/figure_describer.py +218 -0
- visual_parser-1.0.0/visual_parser/jsonl_writer.py +102 -0
- visual_parser-1.0.0/visual_parser/metadata_extractor.py +94 -0
- visual_parser-1.0.0/visual_parser/nougat_engine.py +222 -0
- visual_parser-1.0.0/visual_parser/pdf_tracker.py +105 -0
- visual_parser-1.0.0/visual_parser/pipeline.py +255 -0
- visual_parser-1.0.0/visual_parser/prompts.py +98 -0
- visual_parser-1.0.0/visual_parser/text_extractor.py +396 -0
- visual_parser-1.0.0/visual_parser/vision_llm.py +269 -0
- visual_parser-1.0.0/visual_parser.egg-info/PKG-INFO +191 -0
- visual_parser-1.0.0/visual_parser.egg-info/SOURCES.txt +22 -0
- visual_parser-1.0.0/visual_parser.egg-info/dependency_links.txt +1 -0
- visual_parser-1.0.0/visual_parser.egg-info/entry_points.txt +2 -0
- visual_parser-1.0.0/visual_parser.egg-info/requires.txt +20 -0
- visual_parser-1.0.0/visual_parser.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: visual-parser
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Standalone Visual-RAG PDF Parser — text extraction + Vision-LLM figure descriptions → JSONL
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/SmartLabNuclear/RADIANT_LLM
|
|
7
|
+
Project-URL: Repository, https://github.com/SmartLabNuclear/RADIANT_LLM
|
|
8
|
+
Project-URL: Docker Hub, https://hub.docker.com/r/zev94/radiant-llm
|
|
9
|
+
Keywords: pdf,rag,nougat,vision-llm,ocr,document-parsing,jsonl,knowledge-base
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
18
|
+
Classifier: Intended Audience :: Science/Research
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
Requires-Dist: PyMuPDF==1.24.7
|
|
22
|
+
Requires-Dist: Pillow==11.1.0
|
|
23
|
+
Requires-Dist: torch==2.7.0
|
|
24
|
+
Requires-Dist: transformers==4.45.2
|
|
25
|
+
Requires-Dist: huggingface-hub==0.36.0
|
|
26
|
+
Requires-Dist: langchain-community==0.3.3
|
|
27
|
+
Requires-Dist: langchain==0.3.13
|
|
28
|
+
Requires-Dist: langchain-text-splitters==0.3.4
|
|
29
|
+
Requires-Dist: openai==1.78.1
|
|
30
|
+
Requires-Dist: google-generativeai==0.8.5
|
|
31
|
+
Requires-Dist: python-dotenv==1.1.0
|
|
32
|
+
Requires-Dist: tqdm==4.67.1
|
|
33
|
+
Provides-Extra: ocr
|
|
34
|
+
Requires-Dist: pytesseract==0.3.13; extra == "ocr"
|
|
35
|
+
Provides-Extra: dev
|
|
36
|
+
Requires-Dist: pytest; extra == "dev"
|
|
37
|
+
Requires-Dist: ruff; extra == "dev"
|
|
38
|
+
Requires-Dist: mypy; extra == "dev"
|
|
39
|
+
|
|
40
|
+
# visual-parser (Standalone Visual-RAG PDF Ingestion)
|
|
41
|
+
|
|
42
|
+
<!--  -->
|
|
43
|
+

|
|
44
|
+
<!--  -->
|
|
45
|
+
|
|
46
|
+
`visual-parser` is a standalone document-ingestion tool that converts PDFs into a multi-modal JSONL knowledge base (text chunks + figure descriptions + metadata). The intended workflow is:
|
|
47
|
+
|
|
48
|
+
1) Run `visual-parser` on curated PDFs to generate JSONL KB files.
|
|
49
|
+
2) Run RADIANT-LLM Visual-RAG for QA over the generated KB.
|
|
50
|
+
|
|
51
|
+
## Outputs (JSONL KB)
|
|
52
|
+
|
|
53
|
+
By default, the pipeline writes:
|
|
54
|
+
- `01_chunks_kb.jsonl`: chunked text extracted from PDFs (Nougat by default).
|
|
55
|
+
- `02_figures_kb.jsonl`: figure/page visual descriptions (Vision LLM).
|
|
56
|
+
- `03_metadata_kb.jsonl`: document metadata rows (title/author/etc.).
|
|
57
|
+
- `04_processed_pdfs.txt`: a tracker so re-runs only process new PDFs (unless `--rebuild`).
|
|
58
|
+
|
|
59
|
+
## API keys (`.env`)
|
|
60
|
+
|
|
61
|
+
Provide at least one provider:
|
|
62
|
+
- `OPENAI_API_KEY` (OpenAI)
|
|
63
|
+
- `GEMINI_API_KEY` (Gemini)
|
|
64
|
+
|
|
65
|
+
Optional:
|
|
66
|
+
- `HF_TOKEN` (if you use gated Hugging Face models)
|
|
67
|
+
|
|
68
|
+
## Run with Docker (Docker Hub)
|
|
69
|
+
|
|
70
|
+
Prebuilt images are on **[zev94/radiant-llm](https://hub.docker.com/r/zev94/radiant-llm)** under the **visual-parser** tags:
|
|
71
|
+
|
|
72
|
+
| Tag | Description |
|
|
73
|
+
|-----|-------------|
|
|
74
|
+
| `visual-parser-1.0` | Pinned release |
|
|
75
|
+
| `visual-parser-latest` | Latest visual-parser build |
|
|
76
|
+
|
|
77
|
+
### 1) Install Docker
|
|
78
|
+
- Docker Desktop (Windows/macOS) or Docker Engine (Linux)
|
|
79
|
+
|
|
80
|
+
### 2) Pull the image
|
|
81
|
+
```bash
|
|
82
|
+
docker pull zev94/radiant-llm:visual-parser-1.0
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
### 3) Run (input + output on the same mounted folder)
|
|
86
|
+
Windows PowerShell:
|
|
87
|
+
```powershell
|
|
88
|
+
docker run --rm --env-file .env `
|
|
89
|
+
-v "C:\path\to\pdfs:/data" `
|
|
90
|
+
zev94/radiant-llm:visual-parser-1.0 `
|
|
91
|
+
--input-dir /data --output-dir /data
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
Linux / WSL:
|
|
95
|
+
```bash
|
|
96
|
+
docker run --rm --env-file .env \
|
|
97
|
+
-v "/path/to/pdfs:/data" \
|
|
98
|
+
zev94/radiant-llm:visual-parser-1.0 \
|
|
99
|
+
--input-dir /data --output-dir /data
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### 4) Run (separate output directory)
|
|
103
|
+
Windows PowerShell:
|
|
104
|
+
```powershell
|
|
105
|
+
docker run --rm --env-file .env `
|
|
106
|
+
-v "C:\path\to\pdfs:/data" `
|
|
107
|
+
-v "C:\path\to\out:/out" `
|
|
108
|
+
zev94/radiant-llm:visual-parser-1.0 `
|
|
109
|
+
--input-dir /data --output-dir /out
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Offline install (legacy `.tar`)
|
|
113
|
+
|
|
114
|
+
```powershell
|
|
115
|
+
docker load -i .\visual-parser_0.1.0.tar
|
|
116
|
+
docker images # use the tag printed by Docker
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Model overrides (optional)
|
|
120
|
+
|
|
121
|
+
Default vision model is **GPT-5.5** when using `--vision-provider gpt`. Override on the command line:
|
|
122
|
+
|
|
123
|
+
```powershell
|
|
124
|
+
docker run --rm --env-file .env -v "C:\path\to\pdfs:/data" `
|
|
125
|
+
zev94/radiant-llm:visual-parser-1.0 `
|
|
126
|
+
--input-dir /data --output-dir /data --vision-model gpt-5.4
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
<!-- ## Run from source (Python)
|
|
130
|
+
|
|
131
|
+
From `codebase/Visual-Parser/`:
|
|
132
|
+
```powershell
|
|
133
|
+
python visual-parser.py --input-dir "C:\path\to\pdfs"
|
|
134
|
+
``` -->
|
|
135
|
+
|
|
136
|
+
## Common configuration flags
|
|
137
|
+
|
|
138
|
+
After pulling the image, run:
|
|
139
|
+
|
|
140
|
+
```bash
|
|
141
|
+
docker run --rm zev94/radiant-llm:visual-parser-1.0 --help
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
For copy-paste **Docker** examples (vision presets, text modes, workers, rebuild), see [`docker-usage-examples.md`](docker-usage-examples.md).
|
|
145
|
+
|
|
146
|
+
Paths:
|
|
147
|
+
- `--input-dir` / `-i` (required)
|
|
148
|
+
- `--output-dir` / `-o` (default: same as input)
|
|
149
|
+
|
|
150
|
+
Text extraction:
|
|
151
|
+
- `--text-mode nougat|lightweight` (default: `nougat`)
|
|
152
|
+
- `--nougat-model facebook/nougat-small`
|
|
153
|
+
- `--chunk-size 500`
|
|
154
|
+
- `--chunk-overlap 100`
|
|
155
|
+
|
|
156
|
+
Vision LLM:
|
|
157
|
+
- `--vision-provider gpt|gemini` (default: `gpt`)
|
|
158
|
+
- `--vision-model gpt-5.5` (default for `gpt`; also `gpt-5.2`, `gpt-4o`, `gemini-2.5-flash`, etc.)
|
|
159
|
+
- `--vision-detail low|high|auto` (default: `low`; GPT only)
|
|
160
|
+
- `--reasoning-effort minimal|none|low|medium|high|xhigh` (default: `medium`; GPT-5.x models only)
|
|
161
|
+
- `--metadata-pages 2`
|
|
162
|
+
|
|
163
|
+
Performance / misc:
|
|
164
|
+
- `--max-workers 4`
|
|
165
|
+
- `--rebuild` (reprocess everything; ignore `04_processed_pdfs.txt`)
|
|
166
|
+
- `--log-level DEBUG|INFO|WARNING|ERROR`
|
|
167
|
+
|
|
168
|
+
---
|
|
169
|
+
|
|
170
|
+
## Citation
|
|
171
|
+
|
|
172
|
+
If you use RADIANT-LLM or the accompanying evaluation materials, please cite the preprint:
|
|
173
|
+
|
|
174
|
+
```bibtex
|
|
175
|
+
@article{ndum2026radiant,
|
|
176
|
+
title={RADIANT-LLM: an Agentic Retrieval Augmented Generation Framework for Reliable Decision Support in Safety-Critical Nuclear Engineering},
|
|
177
|
+
author={Ndum, Zavier Ndum and Tao, Jian and Ford, John and Yim, Mansung and Liu, Yang},
|
|
178
|
+
journal={arXiv preprint arXiv:2604.22755},
|
|
179
|
+
year={2026}
|
|
180
|
+
}
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
Preprint: https://arxiv.org/abs/2604.22755
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## License
|
|
188
|
+
|
|
189
|
+
This repository is currently proprietary and not licensed for public use, redistribution, or modification. Licensing terms will be updated after institutional review.
|
|
190
|
+
|
|
191
|
+
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
# visual-parser (Standalone Visual-RAG PDF Ingestion)
|
|
2
|
+
|
|
3
|
+
<!--  -->
|
|
4
|
+

|
|
5
|
+
<!--  -->
|
|
6
|
+
|
|
7
|
+
`visual-parser` is a standalone document-ingestion tool that converts PDFs into a multi-modal JSONL knowledge base (text chunks + figure descriptions + metadata). The intended workflow is:
|
|
8
|
+
|
|
9
|
+
1) Run `visual-parser` on curated PDFs to generate JSONL KB files.
|
|
10
|
+
2) Run RADIANT-LLM Visual-RAG for QA over the generated KB.
|
|
11
|
+
|
|
12
|
+
## Outputs (JSONL KB)
|
|
13
|
+
|
|
14
|
+
By default, the pipeline writes:
|
|
15
|
+
- `01_chunks_kb.jsonl`: chunked text extracted from PDFs (Nougat by default).
|
|
16
|
+
- `02_figures_kb.jsonl`: figure/page visual descriptions (Vision LLM).
|
|
17
|
+
- `03_metadata_kb.jsonl`: document metadata rows (title/author/etc.).
|
|
18
|
+
- `04_processed_pdfs.txt`: a tracker so re-runs only process new PDFs (unless `--rebuild`).
|
|
19
|
+
|
|
20
|
+
## API keys (`.env`)
|
|
21
|
+
|
|
22
|
+
Provide at least one provider:
|
|
23
|
+
- `OPENAI_API_KEY` (OpenAI)
|
|
24
|
+
- `GEMINI_API_KEY` (Gemini)
|
|
25
|
+
|
|
26
|
+
Optional:
|
|
27
|
+
- `HF_TOKEN` (if you use gated Hugging Face models)
|
|
28
|
+
|
|
29
|
+
## Run with Docker (Docker Hub)
|
|
30
|
+
|
|
31
|
+
Prebuilt images are on **[zev94/radiant-llm](https://hub.docker.com/r/zev94/radiant-llm)** under the **visual-parser** tags:
|
|
32
|
+
|
|
33
|
+
| Tag | Description |
|
|
34
|
+
|-----|-------------|
|
|
35
|
+
| `visual-parser-1.0` | Pinned release |
|
|
36
|
+
| `visual-parser-latest` | Latest visual-parser build |
|
|
37
|
+
|
|
38
|
+
### 1) Install Docker
|
|
39
|
+
- Docker Desktop (Windows/macOS) or Docker Engine (Linux)
|
|
40
|
+
|
|
41
|
+
### 2) Pull the image
|
|
42
|
+
```bash
|
|
43
|
+
docker pull zev94/radiant-llm:visual-parser-1.0
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### 3) Run (input + output on the same mounted folder)
|
|
47
|
+
Windows PowerShell:
|
|
48
|
+
```powershell
|
|
49
|
+
docker run --rm --env-file .env `
|
|
50
|
+
-v "C:\path\to\pdfs:/data" `
|
|
51
|
+
zev94/radiant-llm:visual-parser-1.0 `
|
|
52
|
+
--input-dir /data --output-dir /data
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Linux / WSL:
|
|
56
|
+
```bash
|
|
57
|
+
docker run --rm --env-file .env \
|
|
58
|
+
-v "/path/to/pdfs:/data" \
|
|
59
|
+
zev94/radiant-llm:visual-parser-1.0 \
|
|
60
|
+
--input-dir /data --output-dir /data
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### 4) Run (separate output directory)
|
|
64
|
+
Windows PowerShell:
|
|
65
|
+
```powershell
|
|
66
|
+
docker run --rm --env-file .env `
|
|
67
|
+
-v "C:\path\to\pdfs:/data" `
|
|
68
|
+
-v "C:\path\to\out:/out" `
|
|
69
|
+
zev94/radiant-llm:visual-parser-1.0 `
|
|
70
|
+
--input-dir /data --output-dir /out
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Offline install (legacy `.tar`)
|
|
74
|
+
|
|
75
|
+
```powershell
|
|
76
|
+
docker load -i .\visual-parser_0.1.0.tar
|
|
77
|
+
docker images # use the tag printed by Docker
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Model overrides (optional)
|
|
81
|
+
|
|
82
|
+
Default vision model is **GPT-5.5** when using `--vision-provider gpt`. Override on the command line:
|
|
83
|
+
|
|
84
|
+
```powershell
|
|
85
|
+
docker run --rm --env-file .env -v "C:\path\to\pdfs:/data" `
|
|
86
|
+
zev94/radiant-llm:visual-parser-1.0 `
|
|
87
|
+
--input-dir /data --output-dir /data --vision-model gpt-5.4
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
<!-- ## Run from source (Python)
|
|
91
|
+
|
|
92
|
+
From `codebase/Visual-Parser/`:
|
|
93
|
+
```powershell
|
|
94
|
+
python visual-parser.py --input-dir "C:\path\to\pdfs"
|
|
95
|
+
``` -->
|
|
96
|
+
|
|
97
|
+
## Common configuration flags
|
|
98
|
+
|
|
99
|
+
After pulling the image, run:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
docker run --rm zev94/radiant-llm:visual-parser-1.0 --help
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
For copy-paste **Docker** examples (vision presets, text modes, workers, rebuild), see [`docker-usage-examples.md`](docker-usage-examples.md).
|
|
106
|
+
|
|
107
|
+
Paths:
|
|
108
|
+
- `--input-dir` / `-i` (required)
|
|
109
|
+
- `--output-dir` / `-o` (default: same as input)
|
|
110
|
+
|
|
111
|
+
Text extraction:
|
|
112
|
+
- `--text-mode nougat|lightweight` (default: `nougat`)
|
|
113
|
+
- `--nougat-model facebook/nougat-small`
|
|
114
|
+
- `--chunk-size 500`
|
|
115
|
+
- `--chunk-overlap 100`
|
|
116
|
+
|
|
117
|
+
Vision LLM:
|
|
118
|
+
- `--vision-provider gpt|gemini` (default: `gpt`)
|
|
119
|
+
- `--vision-model gpt-5.5` (default for `gpt`; also `gpt-5.2`, `gpt-4o`, `gemini-2.5-flash`, etc.)
|
|
120
|
+
- `--vision-detail low|high|auto` (default: `low`; GPT only)
|
|
121
|
+
- `--reasoning-effort minimal|none|low|medium|high|xhigh` (default: `medium`; GPT-5.x models only)
|
|
122
|
+
- `--metadata-pages 2`
|
|
123
|
+
|
|
124
|
+
Performance / misc:
|
|
125
|
+
- `--max-workers 4`
|
|
126
|
+
- `--rebuild` (reprocess everything; ignore `04_processed_pdfs.txt`)
|
|
127
|
+
- `--log-level DEBUG|INFO|WARNING|ERROR`
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Citation
|
|
132
|
+
|
|
133
|
+
If you use RADIANT-LLM or the accompanying evaluation materials, please cite the preprint:
|
|
134
|
+
|
|
135
|
+
```bibtex
|
|
136
|
+
@article{ndum2026radiant,
|
|
137
|
+
title={RADIANT-LLM: an Agentic Retrieval Augmented Generation Framework for Reliable Decision Support in Safety-Critical Nuclear Engineering},
|
|
138
|
+
author={Ndum, Zavier Ndum and Tao, Jian and Ford, John and Yim, Mansung and Liu, Yang},
|
|
139
|
+
journal={arXiv preprint arXiv:2604.22755},
|
|
140
|
+
year={2026}
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Preprint: https://arxiv.org/abs/2604.22755
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## License
|
|
149
|
+
|
|
150
|
+
This repository is currently proprietary and not licensed for public use, redistribution, or modification. Licensing terms will be updated after institutional review.
|
|
151
|
+
|
|
152
|
+
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "visual-parser"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Standalone Visual-RAG PDF Parser — text extraction + Vision-LLM figure descriptions → JSONL"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
|
|
13
|
+
keywords = [
|
|
14
|
+
"pdf", "rag", "nougat", "vision-llm", "ocr",
|
|
15
|
+
"document-parsing", "jsonl", "knowledge-base",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"License :: OSI Approved :: MIT License",
|
|
24
|
+
"Operating System :: OS Independent",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
26
|
+
"Topic :: Text Processing :: Markup",
|
|
27
|
+
"Intended Audience :: Science/Research",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
dependencies = [
|
|
31
|
+
"PyMuPDF==1.24.7",
|
|
32
|
+
"Pillow==11.1.0",
|
|
33
|
+
"torch==2.7.0",
|
|
34
|
+
"transformers==4.45.2",
|
|
35
|
+
"huggingface-hub==0.36.0",
|
|
36
|
+
"langchain-community==0.3.3",
|
|
37
|
+
"langchain==0.3.13",
|
|
38
|
+
"langchain-text-splitters==0.3.4",
|
|
39
|
+
"openai==1.78.1",
|
|
40
|
+
"google-generativeai==0.8.5",
|
|
41
|
+
"python-dotenv==1.1.0",
|
|
42
|
+
"tqdm==4.67.1",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.optional-dependencies]
|
|
46
|
+
ocr = ["pytesseract==0.3.13"] # requires Tesseract binary installed separately
|
|
47
|
+
dev = ["pytest", "ruff", "mypy"]
|
|
48
|
+
|
|
49
|
+
[project.urls]
|
|
50
|
+
Homepage = "https://github.com/SmartLabNuclear/RADIANT_LLM"
|
|
51
|
+
Repository = "https://github.com/SmartLabNuclear/RADIANT_LLM"
|
|
52
|
+
"Docker Hub" = "https://hub.docker.com/r/zev94/radiant-llm"
|
|
53
|
+
|
|
54
|
+
[project.scripts]
|
|
55
|
+
visual-parser = "visual_parser.cli_main:main"
|
|
56
|
+
|
|
57
|
+
[tool.setuptools.packages.find]
|
|
58
|
+
where = ["."]
|
|
59
|
+
include = ["visual_parser*"]
|
|
60
|
+
|
|
61
|
+
[tool.ruff]
|
|
62
|
+
line-length = 100
|
|
63
|
+
target-version = "py310"
|
|
64
|
+
|
|
65
|
+
[tool.mypy]
|
|
66
|
+
python_version = "3.10"
|
|
67
|
+
ignore_missing_imports = true
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""
|
|
2
|
+
visual_parser — Standalone Visual-RAG PDF Parser
|
|
3
|
+
=================================================
|
|
4
|
+
Detects new PDFs in a user-supplied directory, extracts text (via Nougat or
|
|
5
|
+
lightweight PyMuPDF/PyPDFLoader), describes every figure/chart/schematic using
|
|
6
|
+
a Vision LLM (OpenAI GPT or Google Gemini), and writes three JSONL knowledge
|
|
7
|
+
bases ready for any downstream RAG system:
|
|
8
|
+
|
|
9
|
+
01_chunks_kb.jsonl – text chunks with stable IDs
|
|
10
|
+
02_visuals_kb.jsonl – per-figure visual descriptions
|
|
11
|
+
03_metadata_kb.jsonl – document-level metadata (title, authors, DOI …)
|
|
12
|
+
|
|
13
|
+
No chatbot, no vector store, no retrieval – just a robust parser.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from visual_parser.config import ParserConfig
|
|
17
|
+
from visual_parser.pipeline import run_pipeline
|
|
18
|
+
|
|
19
|
+
__all__ = ["ParserConfig", "run_pipeline"]
|
|
20
|
+
__version__ = "1.0.0"
|
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli.py — Argument parser and main() entry point for the Visual-RAG PDF Parser.
|
|
3
|
+
|
|
4
|
+
This module is the canonical home for CLI logic. It is imported by:
|
|
5
|
+
• visual-parser.py (top-level convenience script)
|
|
6
|
+
• visual_parser/__main__.py (enables: python -m visual_parser ...)
|
|
7
|
+
• pyproject.toml [project.scripts] (enables: visual-parser ...)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
USAGE_EXAMPLES = """
|
|
18
|
+
Examples
|
|
19
|
+
--------
|
|
20
|
+
# Nougat (default) + GPT-5.5 vision
|
|
21
|
+
python visual-parser.py --input-dir ./my_pdfs
|
|
22
|
+
|
|
23
|
+
# Fast lightweight extraction + Gemini
|
|
24
|
+
python visual-parser.py --input-dir ./my_pdfs \\
|
|
25
|
+
--text-mode lightweight \\
|
|
26
|
+
--vision-provider gemini \\
|
|
27
|
+
--vision-model gemini-1.5-pro
|
|
28
|
+
|
|
29
|
+
# Write outputs to a separate directory
|
|
30
|
+
python visual-parser.py --input-dir ./my_pdfs --output-dir ./output_kb
|
|
31
|
+
|
|
32
|
+
# Force re-parse all PDFs (ignore tracking registry)
|
|
33
|
+
python visual-parser.py --input-dir ./my_pdfs --rebuild
|
|
34
|
+
|
|
35
|
+
# High-detail images for dense schematics
|
|
36
|
+
python visual-parser.py --input-dir ./my_pdfs --vision-detail high
|
|
37
|
+
|
|
38
|
+
# Verbose console logging
|
|
39
|
+
python visual-parser.py --input-dir ./my_pdfs --log-level INFO
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _build_arg_parser() -> argparse.ArgumentParser:
|
|
44
|
+
p = argparse.ArgumentParser(
|
|
45
|
+
prog="visual-parser",
|
|
46
|
+
description=(
|
|
47
|
+
"Visual-RAG PDF Parser — detects new PDFs, extracts text and "
|
|
48
|
+
"figure descriptions, and writes three JSONL knowledge bases:\n"
|
|
49
|
+
" 01_chunks_kb.jsonl text chunks\n"
|
|
50
|
+
" 02_visuals_kb.jsonl visual descriptions\n"
|
|
51
|
+
" 03_metadata_kb.jsonl document metadata"
|
|
52
|
+
),
|
|
53
|
+
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
54
|
+
epilog=USAGE_EXAMPLES,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
# ---- Paths --------------------------------------------------------------
|
|
58
|
+
io_group = p.add_argument_group("Paths")
|
|
59
|
+
io_group.add_argument(
|
|
60
|
+
"--input-dir", "-i",
|
|
61
|
+
required=True,
|
|
62
|
+
metavar="DIR",
|
|
63
|
+
help="Directory to scan for PDF files (searched recursively).",
|
|
64
|
+
)
|
|
65
|
+
io_group.add_argument(
|
|
66
|
+
"--output-dir", "-o",
|
|
67
|
+
default="",
|
|
68
|
+
metavar="DIR",
|
|
69
|
+
help="Directory where JSONL files are written. Defaults to --input-dir.",
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
# ---- Text extraction ----------------------------------------------------
|
|
73
|
+
text_group = p.add_argument_group("Text extraction")
|
|
74
|
+
text_group.add_argument(
|
|
75
|
+
"--text-mode",
|
|
76
|
+
choices=["nougat", "lightweight"],
|
|
77
|
+
default="nougat",
|
|
78
|
+
help=(
|
|
79
|
+
"nougat — Nougat OCR model (best for scanned/complex PDFs, GPU recommended).\n"
|
|
80
|
+
"lightweight — PyMuPDF text layer + PyPDFLoader fallback (fast, no GPU needed)."
|
|
81
|
+
),
|
|
82
|
+
)
|
|
83
|
+
text_group.add_argument(
|
|
84
|
+
"--nougat-model",
|
|
85
|
+
default="facebook/nougat-small",
|
|
86
|
+
metavar="MODEL_ID",
|
|
87
|
+
help="HuggingFace model ID for Nougat (default: facebook/nougat-small).",
|
|
88
|
+
)
|
|
89
|
+
text_group.add_argument(
|
|
90
|
+
"--chunk-size",
|
|
91
|
+
type=int,
|
|
92
|
+
default=500,
|
|
93
|
+
metavar="N",
|
|
94
|
+
help="Target characters per text chunk (default: 500).",
|
|
95
|
+
)
|
|
96
|
+
text_group.add_argument(
|
|
97
|
+
"--chunk-overlap",
|
|
98
|
+
type=int,
|
|
99
|
+
default=100,
|
|
100
|
+
metavar="N",
|
|
101
|
+
help="Overlap characters between adjacent chunks (default: 100).",
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
# ---- Vision LLM ---------------------------------------------------------
|
|
105
|
+
vision_group = p.add_argument_group("Vision LLM (figure descriptions & metadata)")
|
|
106
|
+
vision_group.add_argument(
|
|
107
|
+
"--vision-provider",
|
|
108
|
+
choices=["gpt", "gemini"],
|
|
109
|
+
default="gpt",
|
|
110
|
+
help=(
|
|
111
|
+
"gpt — OpenAI GPT-5.5 (set OPENAI_API_KEY in .env).\n"
|
|
112
|
+
"gemini — Google Gemini (set GEMINI_API_KEY in .env)."
|
|
113
|
+
),
|
|
114
|
+
)
|
|
115
|
+
vision_group.add_argument(
|
|
116
|
+
"--vision-model",
|
|
117
|
+
default=None,
|
|
118
|
+
metavar="MODEL_NAME",
|
|
119
|
+
help=(
|
|
120
|
+
"Vision model name. Omit to use the latest for each provider:\n"
|
|
121
|
+
" gpt → gpt-5.5 (also: gpt-5.4, gpt-5.3-chat-latest, gpt-5.2, gpt-5.1, gpt-5, gpt-4o, gpt-4.1)\n"
|
|
122
|
+
" gemini → gemini-3-pro-preview (also: gemini-2.5-flash, gemini-1.5-pro)"
|
|
123
|
+
),
|
|
124
|
+
)
|
|
125
|
+
vision_group.add_argument(
|
|
126
|
+
"--vision-detail",
|
|
127
|
+
choices=["low", "high", "auto"],
|
|
128
|
+
default="low",
|
|
129
|
+
help=(
|
|
130
|
+
"Image detail level (GPT only).\n"
|
|
131
|
+
"low — faster/cheaper (default, recommended for most use cases).\n"
|
|
132
|
+
"high — better for dense schematics with small text."
|
|
133
|
+
),
|
|
134
|
+
)
|
|
135
|
+
vision_group.add_argument(
|
|
136
|
+
"--reasoning-effort",
|
|
137
|
+
choices=["minimal", "none", "low", "medium", "high", "xhigh"],
|
|
138
|
+
default="medium",
|
|
139
|
+
help=(
|
|
140
|
+
"Reasoning effort for GPT-5.x models (ignored for Gemini and older GPT).\n"
|
|
141
|
+
" minimal/none — minimum reasoning, depending on model.\n"
|
|
142
|
+
" low — light reasoning.\n"
|
|
143
|
+
" medium — balanced (default).\n"
|
|
144
|
+
" high — deeper reasoning, slower.\n"
|
|
145
|
+
" xhigh — maximum depth (gpt-5.2, gpt-5.4, and gpt-5.5)."
|
|
146
|
+
),
|
|
147
|
+
)
|
|
148
|
+
vision_group.add_argument(
|
|
149
|
+
"--metadata-pages",
|
|
150
|
+
type=int,
|
|
151
|
+
default=2,
|
|
152
|
+
metavar="N",
|
|
153
|
+
help="Number of front pages sent to the vision LLM for metadata extraction (default: 2).",
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
# ---- Performance --------------------------------------------------------
|
|
157
|
+
perf_group = p.add_argument_group("Performance")
|
|
158
|
+
perf_group.add_argument(
|
|
159
|
+
"--max-workers",
|
|
160
|
+
type=int,
|
|
161
|
+
default=4,
|
|
162
|
+
metavar="N",
|
|
163
|
+
help="Thread-pool size for parallel PDF processing (default: 4).",
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
# ---- Misc ---------------------------------------------------------------
|
|
167
|
+
misc_group = p.add_argument_group("Miscellaneous")
|
|
168
|
+
misc_group.add_argument(
|
|
169
|
+
"--rebuild",
|
|
170
|
+
action="store_true",
|
|
171
|
+
help=(
|
|
172
|
+
"Reprocess ALL PDFs, ignoring the 04_processed_pdfs.txt registry. "
|
|
173
|
+
"Use after changing prompts, chunking strategy, or switching models."
|
|
174
|
+
),
|
|
175
|
+
)
|
|
176
|
+
misc_group.add_argument(
|
|
177
|
+
"--log-level",
|
|
178
|
+
choices=["DEBUG", "INFO", "WARNING", "ERROR"],
|
|
179
|
+
default="ERROR",
|
|
180
|
+
help="Verbosity level written to 05_pipeline.log (default: ERROR).",
|
|
181
|
+
)
|
|
182
|
+
|
|
183
|
+
return p
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def main(argv=None) -> int:
    """
    Command-line entry point for the Visual-RAG PDF parser.

    Parses *argv* (``None`` lets argparse fall back to ``sys.argv[1:]``),
    assembles a :class:`~visual_parser.config.ParserConfig`, validates it and
    then calls :func:`~visual_parser.pipeline.run_pipeline`.

    Returns:
        ``0`` once the pipeline call returns, ``1`` if ``config.validate()``
        raises :class:`ValueError` (the message is printed to stderr).
    """
    args = _build_arg_parser().parse_args(argv)

    # Each provider starts from its built-in default model; an explicit
    # --vision-model only overrides the provider that was actually selected.
    gpt_model = "gpt-5.5"
    gemini_model = "gemini-3-pro-preview"
    if args.vision_model is not None:
        if args.vision_provider == "gpt":
            gpt_model = args.vision_model
        elif args.vision_provider == "gemini":
            gemini_model = args.vision_model

    # An empty --output-dir is passed through as "" rather than resolved.
    if args.output_dir:
        resolved_output = os.path.abspath(args.output_dir)
    else:
        resolved_output = ""

    # Imported lazily, after argument parsing has succeeded.
    from visual_parser.config import ParserConfig

    settings = {
        "input_dir": os.path.abspath(args.input_dir),
        "output_dir": resolved_output,
        "text_mode": args.text_mode,
        "nougat_model": args.nougat_model,
        "chunk_size": args.chunk_size,
        "chunk_overlap": args.chunk_overlap,
        "vision_provider": args.vision_provider,
        "gpt_vision_model": gpt_model,
        "gemini_vision_model": gemini_model,
        "gpt_reasoning_effort": args.reasoning_effort,
        "vision_detail": args.vision_detail,
        "metadata_pages": args.metadata_pages,
        "max_workers": args.max_workers,
        "rebuild": args.rebuild,
        "log_level": args.log_level,
    }
    config = ParserConfig(**settings)

    try:
        config.validate()
    except ValueError as problem:
        print(f"[ERROR] {problem}", file=sys.stderr)
        return 1

    # The pipeline module is only imported once the configuration is valid.
    from visual_parser.pipeline import run_pipeline

    run_pipeline(config)
    return 0
|