poster2json 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- poster2json-0.1.0/LICENSE.md +22 -0
- poster2json-0.1.0/PKG-INFO +263 -0
- poster2json-0.1.0/README.md +218 -0
- poster2json-0.1.0/poster2json/__init__.py +47 -0
- poster2json-0.1.0/poster2json/__main__.py +8 -0
- poster2json-0.1.0/poster2json/cli.py +272 -0
- poster2json-0.1.0/poster2json/extract.py +933 -0
- poster2json-0.1.0/poster2json/gui.py +37 -0
- poster2json-0.1.0/poster2json/schemas/poster_schema.json +1181 -0
- poster2json-0.1.0/poster2json/standards.py +21 -0
- poster2json-0.1.0/poster2json/tests/__init__.py +1 -0
- poster2json-0.1.0/poster2json/tests/conftest.py +1 -0
- poster2json-0.1.0/poster2json/utils.py +172 -0
- poster2json-0.1.0/poster2json/validate.py +337 -0
- poster2json-0.1.0/pyproject.toml +143 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
|
|
2
|
+
MIT License
|
|
3
|
+
|
|
4
|
+
Copyright © 2022, FAIR Data Innovations Hub
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: poster2json
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE.md
|
|
7
|
+
Keywords: poster,json,metadata,extraction,llm,scientific,pdf,ocr,machine-learning,datacite,fair-data
|
|
8
|
+
Author: FAIR Data Innovations Hub
|
|
9
|
+
Author-email: contact@fairdataihub.org
|
|
10
|
+
Requires-Python: >=3.10,<4.0
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Natural Language :: English
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering
|
|
25
|
+
Classifier: Topic :: Text Processing
|
|
26
|
+
Provides-Extra: vision
|
|
27
|
+
Requires-Dist: Pillow (>=9.0.0)
|
|
28
|
+
Requires-Dist: accelerate (>=0.20.0)
|
|
29
|
+
Requires-Dist: art (>=6.0,<7.0)
|
|
30
|
+
Requires-Dist: bitsandbytes (>=0.44.0)
|
|
31
|
+
Requires-Dist: click (>=8.0,<9.0)
|
|
32
|
+
Requires-Dist: jsonschema (>=4.17.0)
|
|
33
|
+
Requires-Dist: numpy
|
|
34
|
+
Requires-Dist: pymupdf (>=1.22.0)
|
|
35
|
+
Requires-Dist: rouge-score
|
|
36
|
+
Requires-Dist: safetensors
|
|
37
|
+
Requires-Dist: sentencepiece
|
|
38
|
+
Requires-Dist: torch (>=2.0.0)
|
|
39
|
+
Requires-Dist: transformers (>=4.40.0)
|
|
40
|
+
Project-URL: Documentation, https://fairdataihub.github.io/poster2json
|
|
41
|
+
Project-URL: Homepage, https://github.com/fairdataihub/poster2json
|
|
42
|
+
Project-URL: Repository, https://github.com/fairdataihub/poster2json
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
|
|
45
|
+
<div align="center">
|
|
46
|
+
|
|
47
|
+
<img src="https://raw.githubusercontent.com/fairdataihub/poster2json/main/logo.svg" alt="logo" width="200" height="auto" />
|
|
48
|
+
|
|
49
|
+
<br />
|
|
50
|
+
|
|
51
|
+
<h1>poster2json</h1>
|
|
52
|
+
|
|
53
|
+
<p>
|
|
54
|
+
Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models.
|
|
55
|
+
</p>
|
|
56
|
+
|
|
57
|
+
<br />
|
|
58
|
+
|
|
59
|
+
<p>
|
|
60
|
+
<a href="https://github.com/fairdataihub/poster2json/graphs/contributors">
|
|
61
|
+
<img src="https://img.shields.io/github/contributors/fairdataihub/poster2json.svg?style=flat-square" alt="contributors" />
|
|
62
|
+
</a>
|
|
63
|
+
<a href="https://github.com/fairdataihub/poster2json/stargazers">
|
|
64
|
+
<img src="https://img.shields.io/github/stars/fairdataihub/poster2json.svg?style=flat-square" alt="stars" />
|
|
65
|
+
</a>
|
|
66
|
+
<a href="https://github.com/fairdataihub/poster2json/issues/">
|
|
67
|
+
<img src="https://img.shields.io/github/issues/fairdataihub/poster2json.svg?style=flat-square" alt="open issues" />
|
|
68
|
+
</a>
|
|
69
|
+
<a href="https://github.com/fairdataihub/poster2json/blob/main/LICENSE">
|
|
70
|
+
<img src="https://img.shields.io/github/license/fairdataihub/poster2json.svg?style=flat-square" alt="license" />
|
|
71
|
+
</a>
|
|
72
|
+
</p>
|
|
73
|
+
<p>
|
|
74
|
+
<a href="https://pypi.org/project/poster2json">
|
|
75
|
+
<img src="https://img.shields.io/pypi/v/poster2json.svg" alt="PyPI Version" />
|
|
76
|
+
</a>
|
|
77
|
+
<a href="https://pypistats.org/packages/poster2json">
|
|
78
|
+
<img src="https://img.shields.io/pypi/dm/poster2json.svg?color=orange" alt="PyPI Downloads" />
|
|
79
|
+
</a>
|
|
80
|
+
<a href="https://zenodo.org/badge/latestdoi/1105067405">
|
|
81
|
+
<img src="https://zenodo.org/badge/1105067405.svg" alt="DOI" />
|
|
82
|
+
</a>
|
|
83
|
+
</p>
|
|
84
|
+
|
|
85
|
+
<h4>
|
|
86
|
+
<a href="https://fairdataihub.github.io/poster2json/">Documentation</a>
|
|
87
|
+
<span> · </span>
|
|
88
|
+
<a href="https://fairdataihub.github.io/poster2json/about/changelog/">Changelog</a>
|
|
89
|
+
<span> · </span>
|
|
90
|
+
<a href="https://github.com/fairdataihub/poster2json/issues/">Report Bug</a>
|
|
91
|
+
<span> · </span>
|
|
92
|
+
<a href="https://github.com/fairdataihub/poster2json/issues/">Request Feature</a>
|
|
93
|
+
</h4>
|
|
94
|
+
</div>
|
|
95
|
+
|
|
96
|
+
<br />
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
## Description
|
|
101
|
+
|
|
102
|
+
**poster2json** extracts structured metadata from scientific conference posters (PDF or image format) into machine-actionable JSON conforming to the [poster-json-schema](https://github.com/fairdataihub/poster-json-schema).
|
|
103
|
+
|
|
104
|
+
The pipeline uses:
|
|
105
|
+
|
|
106
|
+
- **Llama 3.1 8B** (fine-tuned) for JSON structuring
|
|
107
|
+
- **Qwen2-VL-7B** for vision-based OCR of image posters
|
|
108
|
+
- **pdfalto** for layout-aware PDF text extraction
|
|
109
|
+
|
|
110
|
+
## Quick Start
|
|
111
|
+
|
|
112
|
+
### Installation
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
pip install poster2json
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### CLI Usage
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
# Extract metadata from a poster
|
|
122
|
+
poster2json extract poster.pdf -o result.json
|
|
123
|
+
|
|
124
|
+
# Validate extracted JSON
|
|
125
|
+
poster2json validate result.json
|
|
126
|
+
|
|
127
|
+
# Process multiple posters
|
|
128
|
+
poster2json batch ./posters/ -o ./output/
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Python API
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from poster2json import extract_poster, validate_poster
|
|
135
|
+
|
|
136
|
+
# Extract metadata
|
|
137
|
+
result = extract_poster("poster.pdf")
|
|
138
|
+
print(result["titles"][0]["title"])
|
|
139
|
+
|
|
140
|
+
# Validate the result
|
|
141
|
+
is_valid = validate_poster(result)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## Output Format
|
|
145
|
+
|
|
146
|
+
Output conforms to the [poster-json-schema](https://github.com/fairdataihub/poster-json-schema) (DataCite-based):
|
|
147
|
+
|
|
148
|
+
```json
|
|
149
|
+
{
|
|
150
|
+
"$schema": "https://posters.science/schema/v0.1/poster_schema.json",
|
|
151
|
+
"creators": [
|
|
152
|
+
{
|
|
153
|
+
"name": "Garcia, Sofia",
|
|
154
|
+
"givenName": "Sofia",
|
|
155
|
+
"familyName": "Garcia",
|
|
156
|
+
"affiliation": ["University"]
|
|
157
|
+
}
|
|
158
|
+
],
|
|
159
|
+
"titles": [
|
|
160
|
+
{ "title": "Machine Learning Approaches to Diabetic Retinopathy Detection" }
|
|
161
|
+
],
|
|
162
|
+
"posterContent": {
|
|
163
|
+
"sections": [
|
|
164
|
+
{ "sectionTitle": "Abstract", "sectionContent": "..." },
|
|
165
|
+
{ "sectionTitle": "Methods", "sectionContent": "..." },
|
|
166
|
+
{ "sectionTitle": "Results", "sectionContent": "..." }
|
|
167
|
+
]
|
|
168
|
+
},
|
|
169
|
+
"imageCaptions": [{ "captions": ["Figure 1.", "ROC curves showing..."] }],
|
|
170
|
+
"tableCaptions": [{ "captions": ["Table 1.", "Performance metrics"] }]
|
|
171
|
+
}
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## System Requirements
|
|
175
|
+
|
|
176
|
+
| Requirement | Specification |
|
|
177
|
+
| ----------- | -------------------------------- |
|
|
178
|
+
| GPU | NVIDIA CUDA-capable, ≥16GB VRAM |
|
|
179
|
+
| RAM | ≥32GB recommended |
|
|
180
|
+
| Python | 3.10+ |
|
|
181
|
+
| OS | Linux, macOS, Windows (via WSL2) |
|
|
182
|
+
|
|
183
|
+
## Performance
|
|
184
|
+
|
|
185
|
+
Validated on 10 manually annotated scientific posters:
|
|
186
|
+
|
|
187
|
+
| Metric | Score | Threshold |
|
|
188
|
+
| ---------------- | ----- | --------- |
|
|
189
|
+
| Word Capture | 0.96 | ≥0.75 |
|
|
190
|
+
| ROUGE-L | 0.89 | ≥0.75 |
|
|
191
|
+
| Number Capture | 0.93 | ≥0.75 |
|
|
192
|
+
| Field Proportion | 0.99 | 0.30–2.50 |
|
|
193
|
+
|
|
194
|
+
**Pass Rate**: 10/10 (100%)
|
|
195
|
+
|
|
196
|
+
## Documentation
|
|
197
|
+
|
|
198
|
+
| Document | Description |
|
|
199
|
+
| ------------------------------------ | ------------------------------- |
|
|
200
|
+
| [Architecture](docs/architecture.md) | Technical details & methodology |
|
|
201
|
+
| [Evaluation](docs/evaluation.md) | Validation metrics & results |
|
|
202
|
+
|
|
203
|
+
## Development Setup
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
# Clone the repository
|
|
207
|
+
git clone https://github.com/fairdataihub/poster2json.git
|
|
208
|
+
cd poster2json
|
|
209
|
+
|
|
210
|
+
# Create a virtual environment
|
|
211
|
+
python -m venv .venv
|
|
212
|
+
|
|
213
|
+
# Activate the virtual environment
|
|
214
|
+
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
|
215
|
+
|
|
216
|
+
# Install poetry
|
|
217
|
+
pip install poetry
|
|
218
|
+
|
|
219
|
+
# Install dependencies
|
|
220
|
+
poetry install
|
|
221
|
+
|
|
222
|
+
# Run tests
|
|
223
|
+
poe test
|
|
224
|
+
|
|
225
|
+
# Format code
|
|
226
|
+
poe format
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
If you are on Windows and have multiple Python versions installed, you can use the following commands:
|
|
230
|
+
|
|
231
|
+
```bash
|
|
232
|
+
py -0p # List all installed Python versions
|
|
233
|
+
|
|
234
|
+
py -3.12 -m venv .venv
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
## License
|
|
238
|
+
|
|
239
|
+
MIT License - see [LICENSE](LICENSE.md) for details.
|
|
240
|
+
|
|
241
|
+
## Citation
|
|
242
|
+
|
|
243
|
+
```bibtex
|
|
244
|
+
@software{poster2json2026,
|
|
245
|
+
title = {poster2json: Scientific Poster to JSON Metadata Extraction},
|
|
246
|
+
author = {O'Neill, James and Soundarajan, Sanjay and Portillo, Dorian and Patel, Bhavesh},
|
|
247
|
+
year = {2026},
|
|
248
|
+
url = {https://github.com/fairdataihub/poster2json},
|
|
249
|
+
doi = {10.5281/zenodo.18320010}
|
|
250
|
+
}
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## Acknowledgements
|
|
254
|
+
|
|
255
|
+
- [FAIR Data Innovations Hub](https://fairdataihub.org/)
|
|
256
|
+
- Meta AI for Llama 3.1
|
|
257
|
+
- Alibaba Cloud for Qwen2-VL
|
|
258
|
+
- Part of the [posters.science](https://posters.science) platform
|
|
259
|
+
|
|
260
|
+
## Contributing
|
|
261
|
+
|
|
262
|
+
Contributions welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
263
|
+
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
<img src="https://raw.githubusercontent.com/fairdataihub/poster2json/main/logo.svg" alt="logo" width="200" height="auto" />
|
|
4
|
+
|
|
5
|
+
<br />
|
|
6
|
+
|
|
7
|
+
<h1>poster2json</h1>
|
|
8
|
+
|
|
9
|
+
<p>
|
|
10
|
+
Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models.
|
|
11
|
+
</p>
|
|
12
|
+
|
|
13
|
+
<br />
|
|
14
|
+
|
|
15
|
+
<p>
|
|
16
|
+
<a href="https://github.com/fairdataihub/poster2json/graphs/contributors">
|
|
17
|
+
<img src="https://img.shields.io/github/contributors/fairdataihub/poster2json.svg?style=flat-square" alt="contributors" />
|
|
18
|
+
</a>
|
|
19
|
+
<a href="https://github.com/fairdataihub/poster2json/stargazers">
|
|
20
|
+
<img src="https://img.shields.io/github/stars/fairdataihub/poster2json.svg?style=flat-square" alt="stars" />
|
|
21
|
+
</a>
|
|
22
|
+
<a href="https://github.com/fairdataihub/poster2json/issues/">
|
|
23
|
+
<img src="https://img.shields.io/github/issues/fairdataihub/poster2json.svg?style=flat-square" alt="open issues" />
|
|
24
|
+
</a>
|
|
25
|
+
<a href="https://github.com/fairdataihub/poster2json/blob/main/LICENSE">
|
|
26
|
+
<img src="https://img.shields.io/github/license/fairdataihub/poster2json.svg?style=flat-square" alt="license" />
|
|
27
|
+
</a>
|
|
28
|
+
</p>
|
|
29
|
+
<p>
|
|
30
|
+
<a href="https://pypi.org/project/poster2json">
|
|
31
|
+
<img src="https://img.shields.io/pypi/v/poster2json.svg" alt="PyPI Version" />
|
|
32
|
+
</a>
|
|
33
|
+
<a href="https://pypistats.org/packages/poster2json">
|
|
34
|
+
<img src="https://img.shields.io/pypi/dm/poster2json.svg?color=orange" alt="PyPI Downloads" />
|
|
35
|
+
</a>
|
|
36
|
+
<a href="https://zenodo.org/badge/latestdoi/1105067405">
|
|
37
|
+
<img src="https://zenodo.org/badge/1105067405.svg" alt="DOI" />
|
|
38
|
+
</a>
|
|
39
|
+
</p>
|
|
40
|
+
|
|
41
|
+
<h4>
|
|
42
|
+
<a href="https://fairdataihub.github.io/poster2json/">Documentation</a>
|
|
43
|
+
<span> · </span>
|
|
44
|
+
<a href="https://fairdataihub.github.io/poster2json/about/changelog/">Changelog</a>
|
|
45
|
+
<span> · </span>
|
|
46
|
+
<a href="https://github.com/fairdataihub/poster2json/issues/">Report Bug</a>
|
|
47
|
+
<span> · </span>
|
|
48
|
+
<a href="https://github.com/fairdataihub/poster2json/issues/">Request Feature</a>
|
|
49
|
+
</h4>
|
|
50
|
+
</div>
|
|
51
|
+
|
|
52
|
+
<br />
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Description
|
|
57
|
+
|
|
58
|
+
**poster2json** extracts structured metadata from scientific conference posters (PDF or image format) into machine-actionable JSON conforming to the [poster-json-schema](https://github.com/fairdataihub/poster-json-schema).
|
|
59
|
+
|
|
60
|
+
The pipeline uses:
|
|
61
|
+
|
|
62
|
+
- **Llama 3.1 8B** (fine-tuned) for JSON structuring
|
|
63
|
+
- **Qwen2-VL-7B** for vision-based OCR of image posters
|
|
64
|
+
- **pdfalto** for layout-aware PDF text extraction
|
|
65
|
+
|
|
66
|
+
## Quick Start
|
|
67
|
+
|
|
68
|
+
### Installation
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
pip install poster2json
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### CLI Usage
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# Extract metadata from a poster
|
|
78
|
+
poster2json extract poster.pdf -o result.json
|
|
79
|
+
|
|
80
|
+
# Validate extracted JSON
|
|
81
|
+
poster2json validate result.json
|
|
82
|
+
|
|
83
|
+
# Process multiple posters
|
|
84
|
+
poster2json batch ./posters/ -o ./output/
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Python API
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from poster2json import extract_poster, validate_poster
|
|
91
|
+
|
|
92
|
+
# Extract metadata
|
|
93
|
+
result = extract_poster("poster.pdf")
|
|
94
|
+
print(result["titles"][0]["title"])
|
|
95
|
+
|
|
96
|
+
# Validate the result
|
|
97
|
+
is_valid = validate_poster(result)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Output Format
|
|
101
|
+
|
|
102
|
+
Output conforms to the [poster-json-schema](https://github.com/fairdataihub/poster-json-schema) (DataCite-based):
|
|
103
|
+
|
|
104
|
+
```json
|
|
105
|
+
{
|
|
106
|
+
"$schema": "https://posters.science/schema/v0.1/poster_schema.json",
|
|
107
|
+
"creators": [
|
|
108
|
+
{
|
|
109
|
+
"name": "Garcia, Sofia",
|
|
110
|
+
"givenName": "Sofia",
|
|
111
|
+
"familyName": "Garcia",
|
|
112
|
+
"affiliation": ["University"]
|
|
113
|
+
}
|
|
114
|
+
],
|
|
115
|
+
"titles": [
|
|
116
|
+
{ "title": "Machine Learning Approaches to Diabetic Retinopathy Detection" }
|
|
117
|
+
],
|
|
118
|
+
"posterContent": {
|
|
119
|
+
"sections": [
|
|
120
|
+
{ "sectionTitle": "Abstract", "sectionContent": "..." },
|
|
121
|
+
{ "sectionTitle": "Methods", "sectionContent": "..." },
|
|
122
|
+
{ "sectionTitle": "Results", "sectionContent": "..." }
|
|
123
|
+
]
|
|
124
|
+
},
|
|
125
|
+
"imageCaptions": [{ "captions": ["Figure 1.", "ROC curves showing..."] }],
|
|
126
|
+
"tableCaptions": [{ "captions": ["Table 1.", "Performance metrics"] }]
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## System Requirements
|
|
131
|
+
|
|
132
|
+
| Requirement | Specification |
|
|
133
|
+
| ----------- | -------------------------------- |
|
|
134
|
+
| GPU | NVIDIA CUDA-capable, ≥16GB VRAM |
|
|
135
|
+
| RAM | ≥32GB recommended |
|
|
136
|
+
| Python | 3.10+ |
|
|
137
|
+
| OS | Linux, macOS, Windows (via WSL2) |
|
|
138
|
+
|
|
139
|
+
## Performance
|
|
140
|
+
|
|
141
|
+
Validated on 10 manually annotated scientific posters:
|
|
142
|
+
|
|
143
|
+
| Metric | Score | Threshold |
|
|
144
|
+
| ---------------- | ----- | --------- |
|
|
145
|
+
| Word Capture | 0.96 | ≥0.75 |
|
|
146
|
+
| ROUGE-L | 0.89 | ≥0.75 |
|
|
147
|
+
| Number Capture | 0.93 | ≥0.75 |
|
|
148
|
+
| Field Proportion | 0.99 | 0.30–2.50 |
|
|
149
|
+
|
|
150
|
+
**Pass Rate**: 10/10 (100%)
|
|
151
|
+
|
|
152
|
+
## Documentation
|
|
153
|
+
|
|
154
|
+
| Document | Description |
|
|
155
|
+
| ------------------------------------ | ------------------------------- |
|
|
156
|
+
| [Architecture](docs/architecture.md) | Technical details & methodology |
|
|
157
|
+
| [Evaluation](docs/evaluation.md) | Validation metrics & results |
|
|
158
|
+
|
|
159
|
+
## Development Setup
|
|
160
|
+
|
|
161
|
+
```bash
|
|
162
|
+
# Clone the repository
|
|
163
|
+
git clone https://github.com/fairdataihub/poster2json.git
|
|
164
|
+
cd poster2json
|
|
165
|
+
|
|
166
|
+
# Create a virtual environment
|
|
167
|
+
python -m venv .venv
|
|
168
|
+
|
|
169
|
+
# Activate the virtual environment
|
|
170
|
+
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
|
171
|
+
|
|
172
|
+
# Install poetry
|
|
173
|
+
pip install poetry
|
|
174
|
+
|
|
175
|
+
# Install dependencies
|
|
176
|
+
poetry install
|
|
177
|
+
|
|
178
|
+
# Run tests
|
|
179
|
+
poe test
|
|
180
|
+
|
|
181
|
+
# Format code
|
|
182
|
+
poe format
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
If you are on Windows and have multiple Python versions installed, you can use the following commands:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
py -0p # List all installed Python versions
|
|
189
|
+
|
|
190
|
+
py -3.12 -m venv .venv
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## License
|
|
194
|
+
|
|
195
|
+
MIT License - see [LICENSE](LICENSE.md) for details.
|
|
196
|
+
|
|
197
|
+
## Citation
|
|
198
|
+
|
|
199
|
+
```bibtex
|
|
200
|
+
@software{poster2json2026,
|
|
201
|
+
title = {poster2json: Scientific Poster to JSON Metadata Extraction},
|
|
202
|
+
author = {O'Neill, James and Soundarajan, Sanjay and Portillo, Dorian and Patel, Bhavesh},
|
|
203
|
+
year = {2026},
|
|
204
|
+
url = {https://github.com/fairdataihub/poster2json},
|
|
205
|
+
doi = {10.5281/zenodo.18320010}
|
|
206
|
+
}
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
## Acknowledgements
|
|
210
|
+
|
|
211
|
+
- [FAIR Data Innovations Hub](https://fairdataihub.org/)
|
|
212
|
+
- Meta AI for Llama 3.1
|
|
213
|
+
- Alibaba Cloud for Qwen2-VL
|
|
214
|
+
- Part of the [posters.science](https://posters.science) platform
|
|
215
|
+
|
|
216
|
+
## Contributing
|
|
217
|
+
|
|
218
|
+
Contributions welcome! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
poster2json - Convert scientific posters to structured JSON metadata.
|
|
3
|
+
|
|
4
|
+
Extract structured metadata from scientific poster PDFs and images
|
|
5
|
+
using Large Language Models. Output conforms to the poster-json-schema
|
|
6
|
+
(DataCite-based format).
|
|
7
|
+
|
|
8
|
+
Basic Usage:
|
|
9
|
+
>>> from poster2json import extract_poster, validate_poster
|
|
10
|
+
>>>
|
|
11
|
+
>>> # Extract metadata from a poster
|
|
12
|
+
>>> result = extract_poster("poster.pdf")
|
|
13
|
+
>>> print(result["titles"][0]["title"])
|
|
14
|
+
|
|
15
|
+
>>> # Validate extracted JSON
|
|
16
|
+
>>> is_valid = validate_poster(result)
|
|
17
|
+
|
|
18
|
+
CLI Usage:
|
|
19
|
+
$ poster2json extract poster.pdf -o result.json
|
|
20
|
+
$ poster2json validate result.json
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
24
|
+
|
|
25
|
+
# Main functions
|
|
26
|
+
from .extract import extract_poster
|
|
27
|
+
from .validate import (
|
|
28
|
+
get_validation_errors,
|
|
29
|
+
validate_comprehensive,
|
|
30
|
+
validate_poster,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
try:
|
|
34
|
+
__version__ = version("poster2json")
|
|
35
|
+
except PackageNotFoundError:
|
|
36
|
+
__version__ = "(local)"
|
|
37
|
+
|
|
38
|
+
del PackageNotFoundError
|
|
39
|
+
del version
|
|
40
|
+
|
|
41
|
+
__all__ = [
|
|
42
|
+
"extract_poster",
|
|
43
|
+
"validate_poster",
|
|
44
|
+
"validate_comprehensive",
|
|
45
|
+
"get_validation_errors",
|
|
46
|
+
"__version__",
|
|
47
|
+
]
|