poster2json 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- poster2json/__init__.py +47 -0
- poster2json/__main__.py +8 -0
- poster2json/cli.py +272 -0
- poster2json/extract.py +933 -0
- poster2json/gui.py +37 -0
- poster2json/schemas/poster_schema.json +1181 -0
- poster2json/standards.py +21 -0
- poster2json/tests/__init__.py +1 -0
- poster2json/tests/conftest.py +1 -0
- poster2json/utils.py +172 -0
- poster2json/validate.py +337 -0
- poster2json-0.1.0.dist-info/METADATA +263 -0
- poster2json-0.1.0.dist-info/RECORD +16 -0
- poster2json-0.1.0.dist-info/WHEEL +4 -0
- poster2json-0.1.0.dist-info/entry_points.txt +3 -0
- poster2json-0.1.0.dist-info/licenses/LICENSE.md +22 -0
poster2json/__init__.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""
|
|
2
|
+
poster2json - Convert scientific posters to structured JSON metadata.
|
|
3
|
+
|
|
4
|
+
Extract structured metadata from scientific poster PDFs and images
|
|
5
|
+
using Large Language Models. Output conforms to the poster-json-schema
|
|
6
|
+
(DataCite-based format).
|
|
7
|
+
|
|
8
|
+
Basic Usage:
|
|
9
|
+
>>> from poster2json import extract_poster, validate_poster
|
|
10
|
+
>>>
|
|
11
|
+
>>> # Extract metadata from a poster
|
|
12
|
+
>>> result = extract_poster("poster.pdf")
|
|
13
|
+
>>> print(result["titles"][0]["title"])
|
|
14
|
+
|
|
15
|
+
>>> # Validate extracted JSON
|
|
16
|
+
>>> is_valid = validate_poster(result)
|
|
17
|
+
|
|
18
|
+
CLI Usage:
|
|
19
|
+
$ poster2json extract poster.pdf -o result.json
|
|
20
|
+
$ poster2json validate result.json
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
24
|
+
|
|
25
|
+
# Main functions
|
|
26
|
+
from .extract import extract_poster
|
|
27
|
+
from .validate import (
|
|
28
|
+
get_validation_errors,
|
|
29
|
+
validate_comprehensive,
|
|
30
|
+
validate_poster,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# Resolve the version from the installed distribution metadata.
try:
    __version__ = version("poster2json")
except PackageNotFoundError:
    # No installed distribution named "poster2json" was found
    # (for example when running from a plain source checkout).
    __version__ = "(local)"

# The metadata helpers were only needed above; drop them so they are not
# exposed as attributes of the package.
del PackageNotFoundError, version
|
|
40
|
+
|
|
41
|
+
# Names exported by ``from poster2json import *``: the extraction entry point,
# the validation helpers, and the package version string.
__all__ = [
    "extract_poster",
    "validate_poster",
    "validate_comprehensive",
    "get_validation_errors",
    "__version__",
]
|
poster2json/__main__.py
ADDED
poster2json/cli.py
ADDED
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
"""
|
|
2
|
+
poster2json CLI
|
|
3
|
+
|
|
4
|
+
Command-line interface for extracting structured JSON from scientific posters.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import sys
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import click
|
|
12
|
+
from art import tprint
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@click.group(invoke_without_command=True)
@click.version_option(prog_name="poster2json")
@click.pass_context
def main(ctx):
    """
    poster2json - Convert scientific posters to structured JSON metadata.

    Extract structured metadata from scientific poster PDFs and images
    using Large Language Models. Output conforms to the poster-json-schema
    (DataCite-based format).

    Examples:

    # Extract metadata from a poster PDF
    poster2json extract poster.pdf

    # Extract to a specific output file
    poster2json extract poster.pdf -o result.json

    # Validate extracted JSON
    poster2json validate result.json

    # Process multiple posters in a directory
    poster2json batch ./posters/ -o ./output/
    """
    # The docstring above doubles as the group's --help text, so its wording
    # is user-facing output rather than internal documentation.
    if ctx.invoked_subcommand is not None:
        # A subcommand was given; click dispatches to it after this returns.
        return

    # Bare `poster2json` (the group is invoke_without_command): show usage.
    click.echo(ctx.get_help())
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@main.command()
@click.argument("input_file", type=click.Path(exists=True))
@click.option(
    "-o", "--output",
    type=click.Path(),
    help="Output JSON file path. If not provided, prints to stdout."
)
@click.option(
    "--pretty/--compact",
    default=True,
    help="Pretty-print JSON output (default: pretty)"
)
def extract(input_file: str, output: str, pretty: bool):
    """
    Extract structured JSON from a scientific poster.

    INPUT_FILE: Path to the poster file (PDF, JPG, or PNG)

    Requires a CUDA-capable GPU with ≥16GB VRAM.

    Examples:

    poster2json extract poster.pdf

    poster2json extract poster.jpg -o output.json
    """
    # Imported inside the command rather than at module import time.
    from .extract import extract_poster

    # Progress and diagnostics go to stderr so stdout carries only the JSON.
    click.echo(f"Extracting metadata from: {input_file}", err=True)

    try:
        result = extract_poster(input_file)

        # The extractor reports failure through an "error" key in its result.
        if "error" in result:
            click.echo(f"Error during extraction: {result['error']}", err=True)
            sys.exit(1)

        json_output = json.dumps(
            result,
            indent=2 if pretty else None,
            ensure_ascii=False,
        )

        if not output:
            # No -o/--output given: emit the document on stdout.
            click.echo(json_output)
        else:
            destination = Path(output)
            # Create missing parent directories before writing.
            destination.parent.mkdir(parents=True, exist_ok=True)
            destination.write_text(json_output, encoding="utf-8")
            click.echo(f"Output saved to: {output}", err=True)

    except Exception as e:
        # Top-level CLI boundary: report any failure and exit non-zero.
        # SystemExit from the sys.exit(1) above is not an Exception subclass,
        # so it passes through this handler untouched.
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@main.command()
@click.argument("input_file", type=click.Path(exists=True))
@click.option(
    "-v", "--verbose",
    is_flag=True,
    help="Show detailed validation errors"
)
def validate(input_file: str, verbose: bool):
    """
    Validate a poster JSON file against the schema.

    INPUT_FILE: Path to the JSON file to validate

    Examples:

    poster2json validate result.json

    poster2json validate result.json --verbose
    """
    from .validate import validate_comprehensive, validate_poster

    # Load the document; malformed JSON gets its own message, any other
    # read problem falls through to the generic handler.
    try:
        data = json.loads(Path(input_file).read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        click.echo(f"Invalid JSON file: {e}", err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(f"Error reading file: {e}", err=True)
        sys.exit(1)

    # Quick mode: a single pass/fail line, exit status mirrors the verdict.
    if not verbose:
        if validate_poster(data, verbose=False):
            click.echo("✅ Valid")
            sys.exit(0)
        click.echo("❌ Invalid (use --verbose for details)")
        sys.exit(1)

    # Verbose mode: print the verdict, then every non-empty report section.
    report = validate_comprehensive(data)
    passed = report["valid"]

    if passed:
        click.echo("✅ Poster JSON is valid")
    else:
        click.echo("❌ Poster JSON has validation errors")

    # (heading, report key, formatter); schema errors are dicts with
    # "path"/"message", the other two sections are echoed as-is.
    sections = (
        ("\nSchema Errors:", "schema_errors",
         lambda entry: f" - {entry['path']}: {entry['message']}"),
        ("\nField Issues:", "field_issues", lambda entry: f" - {entry}"),
        ("\nWarnings:", "warnings", lambda entry: f" - {entry}"),
    )
    for heading, key, render in sections:
        entries = report[key]
        if entries:
            click.echo(heading)
            for entry in entries:
                click.echo(render(entry))

    sys.exit(0 if passed else 1)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _unique_output_name(file_path, taken):
    """Return an unused ``<stem>_extracted.json`` name and record it in *taken*.

    Inputs that share a stem (``poster.pdf`` and ``poster.png``, or equal
    names found in different sub-directories by a recursive pattern) would
    otherwise map to the same output file and overwrite each other. The first
    file keeps the plain name; later ones get a numeric suffix.
    """
    stem = file_path.stem
    candidate = f"{stem}_extracted.json"
    counter = 1
    while candidate in taken:
        counter += 1
        candidate = f"{stem}_{counter}_extracted.json"
    taken.add(candidate)
    return candidate


@main.command()
@click.argument("input_dir", type=click.Path(exists=True, file_okay=False))
@click.option(
    "-o", "--output-dir",
    type=click.Path(),
    default="./output",
    help="Output directory for extracted JSON files"
)
@click.option(
    "--pattern",
    default="*.pdf,*.jpg,*.png",
    help="File patterns to process (comma-separated)"
)
def batch(input_dir: str, output_dir: str, pattern: str):
    """
    Extract metadata from multiple posters in a directory.

    INPUT_DIR: Directory containing poster files

    Examples:

    poster2json batch ./posters/

    poster2json batch ./posters/ -o ./results/

    poster2json batch ./posters/ --pattern "*.pdf"
    """
    # Imported inside the command, as the other subcommands do.
    from .extract import extract_poster

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Split the comma-separated globs, dropping empty fragments: a trailing
    # comma or an empty --pattern would otherwise reach Path.glob(""), which
    # raises ValueError and crashes the command with a traceback.
    patterns = [part.strip() for part in pattern.split(",") if part.strip()]

    matched = set()
    for pat in patterns:
        try:
            # Only regular files are extraction candidates; a directory whose
            # name happens to match a pattern is skipped.
            matched.update(p for p in input_path.glob(pat) if p.is_file())
        except (ValueError, NotImplementedError) as e:
            # pathlib rejects malformed and non-relative (absolute) patterns.
            click.echo(f"Invalid pattern {pat!r}: {e}", err=True)
            sys.exit(1)
    # Sorted so processing order (and collision suffixes) is deterministic.
    files = sorted(matched)

    if not files:
        click.echo(f"No files found matching patterns: {pattern}", err=True)
        sys.exit(1)

    click.echo(f"Found {len(files)} files to process", err=True)

    results = []
    used_names = set()
    for i, file_path in enumerate(files, 1):
        click.echo(f"\n[{i}/{len(files)}] Processing: {file_path.name}", err=True)

        try:
            result = extract_poster(str(file_path))

            # Save output (even a result carrying "error" is written, so the
            # failure details are kept next to the successful outputs).
            output_file = output_path / _unique_output_name(file_path, used_names)
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(result, f, indent=2, ensure_ascii=False)

            success = "error" not in result
            results.append({
                "file": file_path.name,
                "output": str(output_file),
                "success": success,
                "error": result.get("error") if not success else None,
            })

            status = "✅" if success else "❌"
            click.echo(f" {status} Saved to: {output_file}", err=True)

        except Exception as e:
            # One bad poster must not abort the batch: record it and go on.
            # "output" is present (as None) so every summary record has the
            # same set of keys.
            results.append({
                "file": file_path.name,
                "output": None,
                "success": False,
                "error": str(e),
            })
            click.echo(f" ❌ Error: {e}", err=True)

    # Summary
    successful = sum(1 for r in results if r["success"])
    click.echo(f"\n{'='*50}", err=True)
    click.echo(f"Completed: {successful}/{len(results)} successful", err=True)

    # Save results summary
    # NOTE(review): the command returns normally (exit status 0) even when
    # some files failed; callers have to inspect batch_results.json.
    summary_file = output_path / "batch_results.json"
    with open(summary_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    click.echo(f"Summary saved to: {summary_file}", err=True)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
@main.command()
def info():
    """
    Show information about poster2json.
    """
    # Banner rendered by the `art` package.
    tprint("poster2json")

    # Static description, links, models and requirements, echoed one line
    # per entry in this order.
    lines = (
        "\nConvert scientific posters to structured JSON metadata.",
        "\nDocumentation: https://fairdataihub.github.io/poster2json/",
        "Repository: https://github.com/fairdataihub/poster2json",
        "\nModels used:",
        " - Llama 3.1 8B Poster Extraction (JSON structuring)",
        " - Qwen2-VL-7B-Instruct (Vision OCR for images)",
        "\nRequirements:",
        " - CUDA-capable GPU with ≥16GB VRAM",
        " - pdfalto (for PDF processing)",
    )
    for line in lines:
        click.echo(line)
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
# Run the click group when this module is executed as a script.
if __name__ == "__main__":
    main()
|