poster2json 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,47 @@
1
+ """
2
+ poster2json - Convert scientific posters to structured JSON metadata.
3
+
4
+ Extract structured metadata from scientific poster PDFs and images
5
+ using Large Language Models. Output conforms to the poster-json-schema
6
+ (DataCite-based format).
7
+
8
+ Basic Usage:
9
+ >>> from poster2json import extract_poster, validate_poster
10
+ >>>
11
+ >>> # Extract metadata from a poster
12
+ >>> result = extract_poster("poster.pdf")
13
+ >>> print(result["titles"][0]["title"])
14
+
15
+ >>> # Validate extracted JSON
16
+ >>> is_valid = validate_poster(result)
17
+
18
+ CLI Usage:
19
+ $ poster2json extract poster.pdf -o result.json
20
+ $ poster2json validate result.json
21
+ """
22
+
23
+ from importlib.metadata import PackageNotFoundError, version
24
+
25
+ # Main functions
26
+ from .extract import extract_poster
27
+ from .validate import (
28
+ get_validation_errors,
29
+ validate_comprehensive,
30
+ validate_poster,
31
+ )
32
+
33
# Look up the version of the installed "poster2json" distribution; when no
# such distribution is found, fall back to a fixed placeholder string.
try:
    __version__ = version("poster2json")
except PackageNotFoundError:
    __version__ = "(local)"

# Remove the importlib.metadata helpers from the package namespace once the
# version has been resolved.
del PackageNotFoundError, version

# Names exported by ``from poster2json import *``.
__all__ = [
    "extract_poster",
    "validate_poster",
    "validate_comprehensive",
    "get_validation_errors",
    "__version__",
]
@@ -0,0 +1,8 @@
1
#!/usr/bin/env python

"""Entry point for running the package as a script."""

from poster2json.cli import main

# Invoke the CLI group only when this file is executed directly; importing it
# has no side effects beyond the import above.
if __name__ == "__main__":  # pragma: no cover
    main()  # pylint: disable=no-value-for-parameter
poster2json/cli.py ADDED
@@ -0,0 +1,272 @@
1
+ """
2
+ poster2json CLI
3
+
4
+ Command-line interface for extracting structured JSON from scientific posters.
5
+ """
6
+
7
+ import json
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ import click
12
+ from art import tprint
13
+
14
+
15
@click.group(invoke_without_command=True)
@click.version_option(prog_name="poster2json")
@click.pass_context
def main(ctx):
    """
    poster2json - Convert scientific posters to structured JSON metadata.

    Extract structured metadata from scientific poster PDFs and images
    using Large Language Models. Output conforms to the poster-json-schema
    (DataCite-based format).

    Examples:

        # Extract metadata from a poster PDF
        poster2json extract poster.pdf

        # Extract to a specific output file
        poster2json extract poster.pdf -o result.json

        # Validate extracted JSON
        poster2json validate result.json

        # Process multiple posters in a directory
        poster2json batch ./posters/ -o ./output/
    """
    # NOTE: the docstring above is the help text click prints; keep its wording.
    # A subcommand was requested: click dispatches to it, nothing to do here.
    if ctx.invoked_subcommand is not None:
        return

    # Bare ``poster2json`` invocation: print the group help.
    click.echo(ctx.get_help())
43
+
44
+
45
@main.command()
@click.argument("input_file", type=click.Path(exists=True))
@click.option(
    "-o",
    "--output",
    type=click.Path(),
    help="Output JSON file path. If not provided, prints to stdout.",
)
@click.option(
    "--pretty/--compact",
    default=True,
    help="Pretty-print JSON output (default: pretty)",
)
def extract(input_file: str, output: str, pretty: bool):
    """
    Extract structured JSON from a scientific poster.

    INPUT_FILE: Path to the poster file (PDF, JPG, or PNG)

    Requires a CUDA-capable GPU with ≥16GB VRAM.

    Examples:

        poster2json extract poster.pdf

        poster2json extract poster.jpg -o output.json
    """
    # NOTE: the docstring above is the help text click prints; keep its wording.
    # Function-scope import: the extraction module is loaded only when this
    # command actually runs.
    from .extract import extract_poster

    # Progress and status messages go to stderr so stdout carries only JSON.
    click.echo(f"Extracting metadata from: {input_file}", err=True)

    try:
        result = extract_poster(input_file)

        # The extractor reports failures through an "error" key in its result.
        if "error" in result:
            click.echo(f"Error during extraction: {result['error']}", err=True)
            sys.exit(1)

        json_output = json.dumps(
            result,
            indent=2 if pretty else None,
            ensure_ascii=False,
        )

        if not output:
            # No destination given: emit the JSON document on stdout.
            click.echo(json_output)
        else:
            destination = Path(output)
            destination.parent.mkdir(parents=True, exist_ok=True)
            destination.write_text(json_output, encoding="utf-8")
            click.echo(f"Output saved to: {output}", err=True)

    except Exception as e:
        # Top-level CLI boundary: report the failure and exit non-zero.
        # (SystemExit raised above is not an Exception and passes through.)
        click.echo(f"Error: {e}", err=True)
        sys.exit(1)
97
+
98
+
99
@main.command()
@click.argument("input_file", type=click.Path(exists=True))
@click.option(
    "-v",
    "--verbose",
    is_flag=True,
    help="Show detailed validation errors",
)
def validate(input_file: str, verbose: bool):
    """
    Validate a poster JSON file against the schema.

    INPUT_FILE: Path to the JSON file to validate

    Examples:

        poster2json validate result.json

        poster2json validate result.json --verbose
    """
    # NOTE: the docstring above is the help text click prints; keep its wording.
    from .validate import validate_comprehensive, validate_poster

    # Load the document; malformed JSON gets its own message, any other
    # read failure is reported generically. Both exit with status 1.
    try:
        data = json.loads(Path(input_file).read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        click.echo(f"Invalid JSON file: {e}", err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(f"Error reading file: {e}", err=True)
        sys.exit(1)

    # Quiet mode: a single pass/fail line, exit status mirrors the verdict.
    if not verbose:
        if validate_poster(data, verbose=False):
            click.echo("✅ Valid")
            sys.exit(0)
        click.echo("❌ Invalid (use --verbose for details)")
        sys.exit(1)

    # Verbose mode: headline verdict followed by each non-empty report section.
    report = validate_comprehensive(data)

    if report["valid"]:
        click.echo("✅ Poster JSON is valid")
    else:
        click.echo("❌ Poster JSON has validation errors")

    # (heading, report key, per-entry formatter), in the order they are printed.
    sections = (
        (
            "\nSchema Errors:",
            "schema_errors",
            lambda entry: f"  - {entry['path']}: {entry['message']}",
        ),
        ("\nField Issues:", "field_issues", lambda entry: f"  - {entry}"),
        ("\nWarnings:", "warnings", lambda entry: f"  - {entry}"),
    )
    for heading, key, render in sections:
        entries = report[key]
        if entries:
            click.echo(heading)
            for entry in entries:
                click.echo(render(entry))

    sys.exit(0 if report["valid"] else 1)
162
+
163
+
164
@main.command()
@click.argument("input_dir", type=click.Path(exists=True, file_okay=False))
@click.option(
    "-o",
    "--output-dir",
    type=click.Path(),
    default="./output",
    help="Output directory for extracted JSON files",
)
@click.option(
    "--pattern",
    default="*.pdf,*.jpg,*.png",
    help="File patterns to process (comma-separated)",
)
def batch(input_dir: str, output_dir: str, pattern: str):
    """
    Extract metadata from multiple posters in a directory.

    INPUT_DIR: Directory containing poster files

    Examples:

        poster2json batch ./posters/

        poster2json batch ./posters/ -o ./results/

        poster2json batch ./posters/ --pattern "*.pdf"
    """
    # NOTE: the docstring above is the help text click prints; keep its wording.
    #
    # Parameters:
    #   input_dir  -- existing directory that is searched (non-recursively,
    #                 unless the pattern itself recurses) for poster files.
    #   output_dir -- directory receiving one JSON file per poster plus a
    #                 ``batch_results.json`` summary; created if missing.
    #   pattern    -- comma-separated glob patterns relative to input_dir.
    #
    # Exits with status 1 when no file matches; per-file failures are
    # recorded in the summary and do not abort the run.
    from collections import Counter

    from .extract import extract_poster

    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Ignore empty segments (e.g. a trailing comma in "*.pdf,"): an empty
    # pattern makes Path.glob() raise ValueError and would abort the command.
    patterns = [p.strip() for p in pattern.split(",") if p.strip()]

    # Collect regular files only; a directory whose name happens to match a
    # pattern is not a poster. The set removes files matched by two patterns.
    matched = set()
    for pat in patterns:
        matched.update(p for p in input_path.glob(pat) if p.is_file())
    files = sorted(matched)

    if not files:
        click.echo(f"No files found matching patterns: {pattern}", err=True)
        sys.exit(1)

    click.echo(f"Found {len(files)} files to process", err=True)

    # Inputs such as poster.pdf and poster.png share a stem and would
    # overwrite each other's output. Unique stems keep the historical
    # "<stem>_extracted.json" name; colliding ones use the full file name.
    stem_counts = Counter(p.stem for p in files)

    results = []
    for i, file_path in enumerate(files, 1):
        click.echo(f"\n[{i}/{len(files)}] Processing: {file_path.name}", err=True)

        try:
            result = extract_poster(str(file_path))

            # Save output (written even when the extractor reported an error,
            # so the error payload is available for inspection).
            if stem_counts[file_path.stem] == 1:
                base_name = file_path.stem
            else:
                base_name = file_path.name
            output_file = output_path / f"{base_name}_extracted.json"
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(result, f, indent=2, ensure_ascii=False)

            success = "error" not in result
            results.append({
                "file": file_path.name,
                "output": str(output_file),
                "success": success,
                "error": result.get("error") if not success else None,
            })

            status = "✅" if success else "❌"
            click.echo(f"  {status} Saved to: {output_file}", err=True)

        except Exception as e:
            # Best effort: record the failure and continue with the next file.
            # "output" is included so every summary record has the same keys.
            results.append({
                "file": file_path.name,
                "output": None,
                "success": False,
                "error": str(e),
            })
            click.echo(f"  ❌ Error: {e}", err=True)

    # Summary
    successful = sum(1 for r in results if r["success"])
    click.echo(f"\n{'='*50}", err=True)
    click.echo(f"Completed: {successful}/{len(results)} successful", err=True)

    # Save results summary; ensure_ascii=False matches the per-file output so
    # non-ASCII file names stay readable in the UTF-8 summary.
    summary_file = output_path / "batch_results.json"
    with open(summary_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    click.echo(f"Summary saved to: {summary_file}", err=True)
251
+
252
+
253
@main.command()
def info():
    """
    Show information about poster2json.
    """
    # NOTE: the docstring above is the help text click prints; keep its wording.
    # Banner rendered by the ``art`` package.
    tprint("poster2json")

    # Each entry is echoed as its own line, in order; a leading "\n" produces
    # the blank line that separates sections.
    info_lines = (
        "\nConvert scientific posters to structured JSON metadata.",
        "\nDocumentation: https://fairdataihub.github.io/poster2json/",
        "Repository: https://github.com/fairdataihub/poster2json",
        "\nModels used:",
        "  - Llama 3.1 8B Poster Extraction (JSON structuring)",
        "  - Qwen2-VL-7B-Instruct (Vision OCR for images)",
        "\nRequirements:",
        "  - CUDA-capable GPU with ≥16GB VRAM",
        "  - pdfalto (for PDF processing)",
    )
    for line in info_lines:
        click.echo(line)
269
+
270
+
271
# Allow running this module directly as a script; delegates to the click group.
if __name__ == "__main__":
    main()