poster2json 0.2.2__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: poster2json
3
- Version: 0.2.2
3
+ Version: 0.3.0
4
4
  Summary: Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models
5
5
  License: MIT
6
6
  License-File: LICENSE.md
@@ -44,7 +44,7 @@ Description-Content-Type: text/markdown
44
44
 
45
45
  <div align="center">
46
46
 
47
- <img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" />
47
+ <img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" title="This image was generated by AI" />
48
48
 
49
49
  <br />
50
50
 
@@ -118,9 +118,16 @@ pip install poster2json
118
118
  ### CLI Usage
119
119
 
120
120
  ```bash
121
- # Extract metadata from a poster
121
+ # Extract metadata from a poster (default: fine-tuned Llama @ 4bit)
122
122
  poster2json extract poster.pdf -o result.json
123
123
 
124
+ # Use a different instruct model (any HuggingFace repo id works)
125
+ poster2json extract poster.pdf --model google/gemma-2-9b-it --quantization 4bit
126
+
127
+ # Trade VRAM for quality
128
+ poster2json extract poster.pdf --quantization 8bit
129
+ poster2json extract poster.pdf --quantization fp16
130
+
124
131
  # Validate extracted JSON
125
132
  poster2json validate result.json
126
133
 
@@ -175,7 +182,7 @@ Output conforms to the [poster-json-schema](https://github.com/fairdataihub/post
175
182
 
176
183
  | Requirement | Specification |
177
184
  | ----------- | -------------------------------- |
178
- | GPU | NVIDIA CUDA-capable, ≥16GB VRAM |
185
+ | GPU | NVIDIA CUDA-capable, ≥8GB VRAM (default 4bit); ≥16GB for `--quantization fp16` or image/OCR posters |
179
186
  | RAM | ≥32GB recommended |
180
187
  | Python | 3.10+ |
181
188
  | OS | Linux, macOS, Windows (via WSL2) |
@@ -246,7 +253,7 @@ MIT License - see [LICENSE](LICENSE.md) for details.
246
253
  title = {poster2json: Scientific Poster to JSON Metadata Extraction},
247
254
  author = {O'Neill, James and Soundarajan, Sanjay and Portillo, Dorian and Patel, Bhavesh},
248
255
  year = {2026},
249
- version = {0.2.2},
256
+ version = {0.3.0},
250
257
  url = {https://github.com/fairdataihub/poster2json},
251
258
  doi = {10.5281/zenodo.18320010}
252
259
  }
@@ -1,6 +1,6 @@
1
1
  <div align="center">
2
2
 
3
- <img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" />
3
+ <img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" title="This image was generated by AI" />
4
4
 
5
5
  <br />
6
6
 
@@ -74,9 +74,16 @@ pip install poster2json
74
74
  ### CLI Usage
75
75
 
76
76
  ```bash
77
- # Extract metadata from a poster
77
+ # Extract metadata from a poster (default: fine-tuned Llama @ 4bit)
78
78
  poster2json extract poster.pdf -o result.json
79
79
 
80
+ # Use a different instruct model (any HuggingFace repo id works)
81
+ poster2json extract poster.pdf --model google/gemma-2-9b-it --quantization 4bit
82
+
83
+ # Trade VRAM for quality
84
+ poster2json extract poster.pdf --quantization 8bit
85
+ poster2json extract poster.pdf --quantization fp16
86
+
80
87
  # Validate extracted JSON
81
88
  poster2json validate result.json
82
89
 
@@ -131,7 +138,7 @@ Output conforms to the [poster-json-schema](https://github.com/fairdataihub/post
131
138
 
132
139
  | Requirement | Specification |
133
140
  | ----------- | -------------------------------- |
134
- | GPU | NVIDIA CUDA-capable, ≥16GB VRAM |
141
+ | GPU | NVIDIA CUDA-capable, ≥8GB VRAM (default 4bit); ≥16GB for `--quantization fp16` or image/OCR posters |
135
142
  | RAM | ≥32GB recommended |
136
143
  | Python | 3.10+ |
137
144
  | OS | Linux, macOS, Windows (via WSL2) |
@@ -202,7 +209,7 @@ MIT License - see [LICENSE](LICENSE.md) for details.
202
209
  title = {poster2json: Scientific Poster to JSON Metadata Extraction},
203
210
  author = {O'Neill, James and Soundarajan, Sanjay and Portillo, Dorian and Patel, Bhavesh},
204
211
  year = {2026},
205
- version = {0.2.2},
212
+ version = {0.3.0},
206
213
  url = {https://github.com/fairdataihub/poster2json},
207
214
  doi = {10.5281/zenodo.18320010}
208
215
  }
@@ -54,26 +54,52 @@ def main(ctx):
54
54
  default=True,
55
55
  help="Pretty-print JSON output (default: pretty)"
56
56
  )
57
- def extract(input_file: str, output: str, pretty: bool):
57
+ @click.option(
58
+ "--model",
59
+ "model_id",
60
+ type=str,
61
+ default=None,
62
+ help=(
63
+ "HuggingFace model ID to use for JSON structuring. Overrides the "
64
+ "default fine-tuned Llama. Any instruct model works "
65
+ "(e.g. google/gemma-2-9b-it, Qwen/Qwen2.5-7B-Instruct)."
66
+ )
67
+ )
68
+ @click.option(
69
+ "--quantization",
70
+ type=click.Choice(["fp16", "8bit", "4bit"], case_sensitive=False),
71
+ default=None,
72
+ help="Precision mode for the JSON model. Defaults to 4bit (NF4)."
73
+ )
74
+ def extract(input_file: str, output: str, pretty: bool, model_id: str, quantization: str):
58
75
  """
59
76
  Extract structured JSON from a scientific poster.
60
-
77
+
61
78
  INPUT_FILE: Path to the poster file (PDF, JPG, or PNG)
62
-
63
- Requires a CUDA-capable GPU with ≥16GB VRAM.
64
-
79
+
80
+ Requires a CUDA-capable GPU. The default 4bit quantization fits on
81
+ ~6GB VRAM; use --quantization 8bit or fp16 if you have headroom and
82
+ want slightly better quality. (Image/OCR posters also load a Qwen2-VL
83
+ vision model at bf16 — expect higher peak VRAM on that path.)
84
+
65
85
  Examples:
66
-
86
+
67
87
  poster2json extract poster.pdf
68
-
88
+
69
89
  poster2json extract poster.jpg -o output.json
90
+
91
+ poster2json extract poster.pdf --model google/gemma-2-9b-it --quantization 8bit
70
92
  """
71
93
  from .extract import extract_poster
72
-
94
+
73
95
  click.echo(f"Extracting metadata from: {input_file}", err=True)
74
-
96
+ if model_id:
97
+ click.echo(f"Model: {model_id}", err=True)
98
+ if quantization:
99
+ click.echo(f"Quantization: {quantization}", err=True)
100
+
75
101
  try:
76
- result = extract_poster(input_file)
102
+ result = extract_poster(input_file, model_id=model_id, quantization=quantization)
77
103
 
78
104
  if "error" in result:
79
105
  click.echo(f"Error during extraction: {result['error']}", err=True)
@@ -33,6 +33,7 @@ from transformers import (
33
33
  AutoModelForCausalLM,
34
34
  AutoProcessor,
35
35
  AutoTokenizer,
36
+ BitsAndBytesConfig,
36
37
  Qwen2VLForConditionalGeneration,
37
38
  TextStreamer,
38
39
  )
@@ -589,9 +590,21 @@ _json_model = None
589
590
  _json_tokenizer = None
590
591
 
591
592
 
592
- def load_json_model(force_full_precision: bool = False):
593
- """Load Llama 3.1 8B for JSON structuring."""
593
+ def load_json_model(
594
+ model_id: Optional[str] = None,
595
+ quantization: Optional[str] = None,
596
+ ):
597
+ """Load the JSON-structuring LLM.
598
+
599
+ Args:
600
+ model_id: override the default JSON_MODEL_ID. Accepts any HuggingFace
601
+ repo id (e.g. the default fine-tuned Llama, or a generic instruct
602
+ model like google/gemma-2-9b-it, Qwen/Qwen2.5-7B-Instruct).
603
+ quantization: precision mode — one of "fp16", "8bit", "4bit".
604
+ Defaults to "4bit" (NF4), which fits on ~6GB VRAM.
605
+ """
594
606
  global _json_model, _json_tokenizer
607
+ resolved_model_id = model_id or JSON_MODEL_ID
595
608
  if _json_model is None:
596
609
  device = get_best_gpu()
597
610
 
@@ -604,12 +617,16 @@ def load_json_model(force_full_precision: bool = False):
604
617
  free_gb = 32
605
618
  device_map_value = "cpu"
606
619
 
607
- log(f"Loading {JSON_MODEL_ID} for JSON structuring on {device}...")
620
+ log(f"Loading {resolved_model_id} for JSON structuring on {device}...")
608
621
 
609
622
  try:
610
- _json_tokenizer = AutoTokenizer.from_pretrained(JSON_MODEL_ID)
623
+ _json_tokenizer = AutoTokenizer.from_pretrained(resolved_model_id)
611
624
 
612
- use_8bit = free_gb < 16 and device != "cpu" and not force_full_precision
625
+ mode = (quantization or "4bit").lower()
626
+ if mode not in {"fp16", "8bit", "4bit"}:
627
+ raise ValueError(
628
+ f"quantization must be one of fp16|8bit|4bit, got {quantization!r}"
629
+ )
613
630
 
614
631
  # Try Flash Attention 2
615
632
  try:
@@ -621,28 +638,30 @@ def load_json_model(force_full_precision: bool = False):
621
638
  attn_impl = None
622
639
  log(" Flash Attention not available, using default attention")
623
640
 
624
- if use_8bit:
625
- log(f" Using 8-bit quantization (only {free_gb:.1f}GB free)")
626
- model_kwargs = {
627
- "load_in_8bit": True,
628
- "device_map": device_map_value,
629
- "low_cpu_mem_usage": True,
630
- }
631
- if attn_impl:
632
- model_kwargs["attn_implementation"] = attn_impl
633
- _json_model = AutoModelForCausalLM.from_pretrained(JSON_MODEL_ID, **model_kwargs)
634
- else:
635
- if force_full_precision and free_gb < 16:
636
- log(f" Forcing full precision for quality ({free_gb:.1f}GB free)")
637
- model_kwargs = {
638
- "torch_dtype": torch.bfloat16,
639
- "device_map": device_map_value,
640
- "low_cpu_mem_usage": True,
641
- }
642
- if attn_impl:
643
- model_kwargs["attn_implementation"] = attn_impl
644
- _json_model = AutoModelForCausalLM.from_pretrained(JSON_MODEL_ID, **model_kwargs)
645
- log(f" ✓ JSON model loaded on {device}")
641
+ model_kwargs = {
642
+ "device_map": device_map_value,
643
+ "low_cpu_mem_usage": True,
644
+ }
645
+ if attn_impl:
646
+ model_kwargs["attn_implementation"] = attn_impl
647
+
648
+ if mode == "8bit":
649
+ log(f" Using 8-bit quantization (free={free_gb:.1f}GB)")
650
+ model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
651
+ elif mode == "4bit":
652
+ log(f" Using 4-bit NF4 quantization (free={free_gb:.1f}GB)")
653
+ model_kwargs["quantization_config"] = BitsAndBytesConfig(
654
+ load_in_4bit=True,
655
+ bnb_4bit_quant_type="nf4",
656
+ bnb_4bit_compute_dtype=torch.bfloat16,
657
+ bnb_4bit_use_double_quant=True,
658
+ )
659
+ else: # fp16 (bfloat16)
660
+ log(f" Using bfloat16 (free={free_gb:.1f}GB)")
661
+ model_kwargs["torch_dtype"] = torch.bfloat16
662
+
663
+ _json_model = AutoModelForCausalLM.from_pretrained(resolved_model_id, **model_kwargs)
664
+ log(f" ✓ JSON model loaded on {device} ({mode})")
646
665
  except Exception as e:
647
666
  log(f" ✗ Failed to load JSON model: {e}")
648
667
  if _json_model is not None:
@@ -722,13 +741,7 @@ JSON SCHEMA (all top-level fields are REQUIRED):
722
741
  "subjects": [{{"subject": "keyword1"}}, {{"subject": "keyword2"}}, {{"subject": "keyword3"}}],
723
742
  "descriptions": [{{"description": "The abstract text from the poster...", "descriptionType": "Abstract"}}],
724
743
  "publisher": {{"name": "Conference Organizer or Institution Name"}},
725
- "conference": {{
726
- "conferenceName": "Name of Conference",
727
- "conferenceYear": 2025,
728
- "conferenceLocation": "City, Country",
729
- "conferenceStartDate": "YYYY-MM-DD",
730
- "conferenceEndDate": "YYYY-MM-DD"
731
- }},
744
+ "conference": null,
732
745
  "formats": ["PDF"],
733
746
  "content": {{
734
747
  "sections": [
@@ -747,8 +760,12 @@ EXTRACTION NOTES:
747
760
  - descriptions: Use the Abstract section content, descriptionType is REQUIRED
748
761
  - publisher: Use conference organizer, hosting institution, or repository name
749
762
  - titles: If the poster title is ALL CAPS, convert to proper Title Case preserving acronyms (e.g. "RESEARCH ON SARS-CoV-2" not "RESEARCH ON SARS-COV-2")
750
- - conference: conferenceName and conferenceYear are REQUIRED; extract from poster header/footer. If not found on the poster, omit the field entirely — do NOT guess or use placeholders
751
- - publisher: Extract from poster. If not found, omit — do NOT use placeholder text
763
+ - conference: Extract ONLY from text clearly visible on the poster (header, footer, logos).
764
+ * If conference details are NOT visible, set "conference": null — do NOT invent names, locations, dates, URLs, or acronyms.
765
+ * NEVER output generic values like "Name of Conference", "City, Country", "Conference Name", or made-up URLs.
766
+ * If only SOME fields are visible (e.g. name and year but not location), include only those: {{"conferenceName": "ACL 2024", "conferenceYear": 2024}}
767
+ * If no conference information is found at all, output "conference": null
768
+ - publisher: Extract from poster. If not found, set to null — do NOT use placeholder text
752
769
  - formats: Set to ["PDF"] for PDF files, ["PNG"] or ["JPEG"] for images
753
770
  - imageCaptions/tableCaptions: Use "id" field (e.g., "fig1") for cross-referencing if needed
754
771
  - rightsList: OPTIONAL - include if license/copyright info found on poster
@@ -763,7 +780,7 @@ FALLBACK_PROMPT = """Convert poster text to JSON. REQUIRED FIELDS:
763
780
  2. SEPARATE section for EACH header found in the poster text. Use the poster's own headers. Lines starting with "## " are detected headers.
764
781
  3. Copy ALL text EXACTLY verbatim
765
782
  4. If title is ALL CAPS, convert to Title Case preserving acronyms (SARS-CoV-2, not SARS-COV-2)
766
- 5. Omit conference/publisher if not found on the poster never guess or use placeholders
783
+ 5. conference/publisher: extract ONLY if clearly visible on the poster. If not found, set to null. NEVER invent names, locations, dates, URLs, or use generic placeholders.
767
784
 
768
785
  {{
769
786
  "creators": [{{"name": "LastName, FirstName", "givenName": "FirstName", "familyName": "LastName", "affiliation": ["Institution"]}}],
@@ -772,7 +789,7 @@ FALLBACK_PROMPT = """Convert poster text to JSON. REQUIRED FIELDS:
772
789
  "subjects": [{{"subject": "keyword1"}}, {{"subject": "keyword2"}}],
773
790
  "descriptions": [{{"description": "Abstract text", "descriptionType": "Abstract"}}],
774
791
  "publisher": {{"name": "Conference or Institution"}},
775
- "conference": {{"conferenceName": "Conference Name", "conferenceYear": 2025, "conferenceLocation": "Location"}},
792
+ "conference": null,
776
793
  "formats": ["PDF"],
777
794
  "content": {{
778
795
  "sections": [{{"sectionTitle": "Header", "sectionContent": "verbatim text"}}]
@@ -1172,43 +1189,6 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
1172
1189
 
1173
1190
  result = enrich_json_with_identifiers(result, raw_text)
1174
1191
 
1175
- # Strip "Unknown" and prompt-placeholder values the LLM likes to hallucinate.
1176
- # These violate metadata quality expectations — better to omit than guess.
1177
- _UNKNOWN_RE = re.compile(r"^unknown\b", re.IGNORECASE)
1178
- # Prompt placeholders that the model echoes back verbatim when it can't
1179
- # find real conference metadata on the poster.
1180
- _PLACEHOLDER_VALS = {
1181
- "name of conference",
1182
- "conference name",
1183
- "city, country",
1184
- "location",
1185
- "conference organizer or institution name",
1186
- "conference or institution",
1187
- }
1188
- _PLACEHOLDER_DATE_RE = re.compile(r"^[Yy]{4}-[Mm]{2}-[Dd]{2}$")
1189
-
1190
- def _is_placeholder(val: str) -> bool:
1191
- s = val.strip()
1192
- return (
1193
- not s
1194
- or _UNKNOWN_RE.match(s)
1195
- or s.lower() in _PLACEHOLDER_VALS
1196
- or bool(_PLACEHOLDER_DATE_RE.match(s))
1197
- )
1198
-
1199
- if "conference" in result and isinstance(result["conference"], dict):
1200
- for key in list(result["conference"]):
1201
- val = result["conference"][key]
1202
- if isinstance(val, str) and _is_placeholder(val):
1203
- del result["conference"][key]
1204
- # Top-level optional string fields
1205
- for key in ("conferenceLocation", "publisher", "researchField"):
1206
- val = result.get(key)
1207
- if isinstance(val, str) and _is_placeholder(val):
1208
- del result[key]
1209
- elif isinstance(val, dict) and "name" in val and isinstance(val["name"], str) and _is_placeholder(val["name"]):
1210
- del result[key]
1211
-
1212
1192
  return result
1213
1193
 
1214
1194
 
@@ -1286,23 +1266,22 @@ def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
1286
1266
  return result
1287
1267
 
1288
1268
 
1289
- def extract_poster(poster_path: str) -> dict:
1269
+ def extract_poster(
1270
+ poster_path: str,
1271
+ model_id: Optional[str] = None,
1272
+ quantization: Optional[str] = None,
1273
+ ) -> dict:
1290
1274
  """
1291
1275
  Extract structured JSON metadata from a scientific poster.
1292
1276
 
1293
- This is the main entry point for poster extraction.
1294
-
1295
1277
  Args:
1296
- poster_path: Path to the poster file (PDF, JPG, or PNG)
1297
-
1298
- Returns:
1299
- Dictionary containing structured poster metadata conforming to
1300
- the poster-json-schema.
1301
-
1302
- Example:
1303
- >>> result = extract_poster("poster.pdf")
1304
- >>> print(result["titles"][0]["title"])
1305
- "Machine Learning Approaches to Diabetic Retinopathy Detection"
1278
+ poster_path: Path to the poster file (PDF, JPG, or PNG).
1279
+ model_id: Override the default JSON structuring model. Accepts any
1280
+ HuggingFace repo id (e.g. google/gemma-2-9b-it,
1281
+ Qwen/Qwen2.5-7B-Instruct) in addition to the default fine-tuned
1282
+ Llama.
1283
+ quantization: Precision mode: "fp16", "8bit", or "4bit".
1284
+ Defaults to "4bit" (NF4) when unset.
1306
1285
  """
1307
1286
  log(f"Processing poster: {poster_path}")
1308
1287
 
@@ -1317,12 +1296,13 @@ def extract_poster(poster_path: str) -> dict:
1317
1296
  log(f"Extracted {len(raw_text)} chars using {source} in {t_extract_elapsed:.2f}s")
1318
1297
 
1319
1298
  # Unload vision model before loading JSON model
1320
- ext = Path(poster_path).suffix.lower()
1321
- is_image_poster = ext in [".jpg", ".jpeg", ".png"]
1322
1299
  unload_vision_model()
1323
1300
 
1324
1301
  # Load JSON model
1325
- model, tokenizer = load_json_model(force_full_precision=is_image_poster)
1302
+ model, tokenizer = load_json_model(
1303
+ model_id=model_id,
1304
+ quantization=quantization,
1305
+ )
1326
1306
 
1327
1307
  try:
1328
1308
  t_json_start = time.time()
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
 
3
3
  name = "poster2json"
4
- version = "0.2.2"
4
+ version = "0.3.0"
5
5
  description = "Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models"
6
6
 
7
7
  packages = [{ include = "poster2json" }]
File without changes