PyPI - poster2json - Versions diffs - 0.2.2__tar.gz → 0.3.0__tar.gz - Mend

poster2json 0.2.2.tar.gz → 0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

{poster2json-0.2.2 → poster2json-0.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: poster2json
-Version: 0.2.2
+Version: 0.3.0
 Summary: Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models
 License: MIT
 License-File: LICENSE.md
@@ -44,7 +44,7 @@ Description-Content-Type: text/markdown
 <div align="center">
-<img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" />
+<img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" title="This image was generated by AI" />
 <br />
@@ -118,9 +118,16 @@ pip install poster2json
 ### CLI Usage
 ```bash
-# Extract metadata from a poster
+# Extract metadata from a poster (default: fine-tuned Llama @ 4bit)
 poster2json extract poster.pdf -o result.json
+# Use a different instruct model (any HuggingFace repo id works)
+poster2json extract poster.pdf --model google/gemma-2-9b-it --quantization 4bit
+# Trade VRAM for quality
+poster2json extract poster.pdf --quantization 8bit
+poster2json extract poster.pdf --quantization fp16
 # Validate extracted JSON
 poster2json validate result.json
@@ -175,7 +182,7 @@ Output conforms to the [poster-json-schema](https://github.com/fairdataihub/post
 | Requirement | Specification                    |
 | ----------- | -------------------------------- |
-| GPU         | NVIDIA CUDA-capable, ≥16GB VRAM  |
+| GPU         | NVIDIA CUDA-capable, ≥8GB VRAM (default 4bit); ≥16GB for `--quantization fp16` or image/OCR posters |
 | RAM         | ≥32GB recommended                |
 | Python      | 3.10+                            |
 | OS          | Linux, macOS, Windows (via WSL2) |
@@ -246,7 +253,7 @@ MIT License - see [LICENSE](LICENSE.md) for details.
   title = {poster2json: Scientific Poster to JSON Metadata Extraction},
   author = {O'Neill, James and Soundarajan, Sanjay and Portillo, Dorian and Patel, Bhavesh},
   year = {2026},
-  version = {0.2.2},
+  version = {0.2.3},
   url = {https://github.com/fairdataihub/poster2json},
   doi = {10.5281/zenodo.18320010}
 }

{poster2json-0.2.2 → poster2json-0.3.0}/README.md RENAMED Viewed

@@ -1,6 +1,6 @@
 <div align="center">
-<img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" />
+<img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" title="This image was generated by AI" />
 <br />
@@ -74,9 +74,16 @@ pip install poster2json
 ### CLI Usage
 ```bash
-# Extract metadata from a poster
+# Extract metadata from a poster (default: fine-tuned Llama @ 4bit)
 poster2json extract poster.pdf -o result.json
+# Use a different instruct model (any HuggingFace repo id works)
+poster2json extract poster.pdf --model google/gemma-2-9b-it --quantization 4bit
+# Trade VRAM for quality
+poster2json extract poster.pdf --quantization 8bit
+poster2json extract poster.pdf --quantization fp16
 # Validate extracted JSON
 poster2json validate result.json
@@ -131,7 +138,7 @@ Output conforms to the [poster-json-schema](https://github.com/fairdataihub/post
 | Requirement | Specification                    |
 | ----------- | -------------------------------- |
-| GPU         | NVIDIA CUDA-capable, ≥16GB VRAM  |
+| GPU         | NVIDIA CUDA-capable, ≥8GB VRAM (default 4bit); ≥16GB for `--quantization fp16` or image/OCR posters |
 | RAM         | ≥32GB recommended                |
 | Python      | 3.10+                            |
 | OS          | Linux, macOS, Windows (via WSL2) |
@@ -202,7 +209,7 @@ MIT License - see [LICENSE](LICENSE.md) for details.
   title = {poster2json: Scientific Poster to JSON Metadata Extraction},
   author = {O'Neill, James and Soundarajan, Sanjay and Portillo, Dorian and Patel, Bhavesh},
   year = {2026},
-  version = {0.2.2},
+  version = {0.2.3},
   url = {https://github.com/fairdataihub/poster2json},
   doi = {10.5281/zenodo.18320010}
 }

{poster2json-0.2.2 → poster2json-0.3.0}/poster2json/cli.py RENAMED Viewed

@@ -54,26 +54,52 @@ def main(ctx):
     default=True,
     help="Pretty-print JSON output (default: pretty)"
 )
-def extract(input_file: str, output: str, pretty: bool):
+@click.option(
+    "--model",
+    "model_id",
+    type=str,
+    default=None,
+    help=(
+        "HuggingFace model ID to use for JSON structuring. Overrides the "
+        "default fine-tuned Llama. Any instruct model works "
+        "(e.g. google/gemma-2-9b-it, Qwen/Qwen2.5-7B-Instruct)."
+    )
+)
+@click.option(
+    "--quantization",
+    type=click.Choice(["fp16", "8bit", "4bit"], case_sensitive=False),
+    default=None,
+    help="Precision mode for the JSON model. Defaults to 4bit (NF4)."
+)
+def extract(input_file: str, output: str, pretty: bool, model_id: str, quantization: str):
     """
     Extract structured JSON from a scientific poster.
     INPUT_FILE: Path to the poster file (PDF, JPG, or PNG)
-    Requires a CUDA-capable GPU with ≥16GB VRAM.
+    Requires a CUDA-capable GPU. The default 4bit quantization fits on
+    ~6GB VRAM; use --quantization 8bit or fp16 if you have headroom and
+    want slightly better quality. (Image/OCR posters also load a Qwen2-VL
+    vision model at bf16 — expect higher peak VRAM on that path.)
     Examples:
         poster2json extract poster.pdf
         poster2json extract poster.jpg -o output.json
+        poster2json extract poster.pdf --model google/gemma-2-9b-it --quantization 8bit
     """
     from .extract import extract_poster
     click.echo(f"Extracting metadata from: {input_file}", err=True)
+    if model_id:
+        click.echo(f"Model: {model_id}", err=True)
+    if quantization:
+        click.echo(f"Quantization: {quantization}", err=True)
     try:
-        result = extract_poster(input_file)
+        result = extract_poster(input_file, model_id=model_id, quantization=quantization)
         if "error" in result:
             click.echo(f"Error during extraction: {result['error']}", err=True)

{poster2json-0.2.2 → poster2json-0.3.0}/poster2json/extract.py RENAMED Viewed

@@ -33,6 +33,7 @@ from transformers import (
     AutoModelForCausalLM,
     AutoProcessor,
     AutoTokenizer,
+    BitsAndBytesConfig,
     Qwen2VLForConditionalGeneration,
     TextStreamer,
 )
@@ -589,9 +590,21 @@ _json_model = None
 _json_tokenizer = None
-def load_json_model(force_full_precision: bool = False):
-    """Load Llama 3.1 8B for JSON structuring."""
+def load_json_model(
+    model_id: Optional[str] = None,
+    quantization: Optional[str] = None,
+):
+    """Load the JSON-structuring LLM.
+    Args:
+        model_id: override the default JSON_MODEL_ID. Accepts any HuggingFace
+            repo id (e.g. the default fine-tuned Llama, or a generic instruct
+            model like google/gemma-2-9b-it, Qwen/Qwen2.5-7B-Instruct).
+        quantization: precision mode — one of "fp16", "8bit", "4bit".
+            Defaults to "4bit" (NF4), which fits on ~6GB VRAM.
+    """
     global _json_model, _json_tokenizer
+    resolved_model_id = model_id or JSON_MODEL_ID
     if _json_model is None:
         device = get_best_gpu()
@@ -604,12 +617,16 @@ def load_json_model(force_full_precision: bool = False):
             free_gb = 32
             device_map_value = "cpu"
-        log(f"Loading {JSON_MODEL_ID} for JSON structuring on {device}...")
+        log(f"Loading {resolved_model_id} for JSON structuring on {device}...")
         try:
-            _json_tokenizer = AutoTokenizer.from_pretrained(JSON_MODEL_ID)
+            _json_tokenizer = AutoTokenizer.from_pretrained(resolved_model_id)
-            use_8bit = free_gb < 16 and device != "cpu" and not force_full_precision
+            mode = (quantization or "4bit").lower()
+            if mode not in {"fp16", "8bit", "4bit"}:
+                raise ValueError(
+                    f"quantization must be one of fp16|8bit|4bit, got {quantization!r}"
+                )
             # Try Flash Attention 2
             try:
@@ -621,28 +638,30 @@ def load_json_model(force_full_precision: bool = False):
                 attn_impl = None
                 log("   Flash Attention not available, using default attention")
-            if use_8bit:
-                log(f"   Using 8-bit quantization (only {free_gb:.1f}GB free)")
-                model_kwargs = {
-                    "load_in_8bit": True,
-                    "device_map": device_map_value,
-                    "low_cpu_mem_usage": True,
-                }
-                if attn_impl:
-                    model_kwargs["attn_implementation"] = attn_impl
-                _json_model = AutoModelForCausalLM.from_pretrained(JSON_MODEL_ID, **model_kwargs)
-            else:
-                if force_full_precision and free_gb < 16:
-                    log(f"   Forcing full precision for quality ({free_gb:.1f}GB free)")
-                model_kwargs = {
-                    "torch_dtype": torch.bfloat16,
-                    "device_map": device_map_value,
-                    "low_cpu_mem_usage": True,
-                }
-                if attn_impl:
-                    model_kwargs["attn_implementation"] = attn_impl
-                _json_model = AutoModelForCausalLM.from_pretrained(JSON_MODEL_ID, **model_kwargs)
-            log(f"   ✓ JSON model loaded on {device}")
+            model_kwargs = {
+                "device_map": device_map_value,
+                "low_cpu_mem_usage": True,
+            }
+            if attn_impl:
+                model_kwargs["attn_implementation"] = attn_impl
+            if mode == "8bit":
+                log(f"   Using 8-bit quantization (free={free_gb:.1f}GB)")
+                model_kwargs["quantization_config"] = BitsAndBytesConfig(load_in_8bit=True)
+            elif mode == "4bit":
+                log(f"   Using 4-bit NF4 quantization (free={free_gb:.1f}GB)")
+                model_kwargs["quantization_config"] = BitsAndBytesConfig(
+                    load_in_4bit=True,
+                    bnb_4bit_quant_type="nf4",
+                    bnb_4bit_compute_dtype=torch.bfloat16,
+                    bnb_4bit_use_double_quant=True,
+                )
+            else:  # fp16 (bfloat16)
+                log(f"   Using bfloat16 (free={free_gb:.1f}GB)")
+                model_kwargs["torch_dtype"] = torch.bfloat16
+            _json_model = AutoModelForCausalLM.from_pretrained(resolved_model_id, **model_kwargs)
+            log(f"   ✓ JSON model loaded on {device} ({mode})")
         except Exception as e:
             log(f"   ✗ Failed to load JSON model: {e}")
             if _json_model is not None:
@@ -722,13 +741,7 @@ JSON SCHEMA (all top-level fields are REQUIRED):
   "subjects": [{{"subject": "keyword1"}}, {{"subject": "keyword2"}}, {{"subject": "keyword3"}}],
   "descriptions": [{{"description": "The abstract text from the poster...", "descriptionType": "Abstract"}}],
   "publisher": {{"name": "Conference Organizer or Institution Name"}},
-  "conference": {{
-    "conferenceName": "Name of Conference",
-    "conferenceYear": 2025,
-    "conferenceLocation": "City, Country",
-    "conferenceStartDate": "YYYY-MM-DD",
-    "conferenceEndDate": "YYYY-MM-DD"
-  }},
+  "conference": null,
   "formats": ["PDF"],
   "content": {{
     "sections": [
@@ -747,8 +760,12 @@ EXTRACTION NOTES:
 - descriptions: Use the Abstract section content, descriptionType is REQUIRED
 - publisher: Use conference organizer, hosting institution, or repository name
 - titles: If the poster title is ALL CAPS, convert to proper Title Case preserving acronyms (e.g. "RESEARCH ON SARS-CoV-2" not "RESEARCH ON SARS-COV-2")
-- conference: conferenceName and conferenceYear are REQUIRED; extract from poster header/footer. If not found on the poster, omit the field entirely — do NOT guess or use placeholders
-- publisher: Extract from poster. If not found, omit — do NOT use placeholder text
+- conference: Extract ONLY from text clearly visible on the poster (header, footer, logos).
+  * If conference details are NOT visible, set "conference": null — do NOT invent names, locations, dates, URLs, or acronyms.
+  * NEVER output generic values like "Name of Conference", "City, Country", "Conference Name", or made-up URLs.
+  * If only SOME fields are visible (e.g. name and year but not location), include only those: {{"conferenceName": "ACL 2024", "conferenceYear": 2024}}
+  * If no conference information is found at all, output "conference": null
+- publisher: Extract from poster. If not found, set to null — do NOT use placeholder text
 - formats: Set to ["PDF"] for PDF files, ["PNG"] or ["JPEG"] for images
 - imageCaptions/tableCaptions: Use "id" field (e.g., "fig1") for cross-referencing if needed
 - rightsList: OPTIONAL - include if license/copyright info found on poster
@@ -763,7 +780,7 @@ FALLBACK_PROMPT = """Convert poster text to JSON. REQUIRED FIELDS:
 2. SEPARATE section for EACH header found in the poster text. Use the poster's own headers. Lines starting with "## " are detected headers.
 3. Copy ALL text EXACTLY verbatim
 4. If title is ALL CAPS, convert to Title Case preserving acronyms (SARS-CoV-2, not SARS-COV-2)
-5. Omit conference/publisher if not found on the poster — never guess or use placeholders
+5. conference/publisher: extract ONLY if clearly visible on the poster. If not found, set to null. NEVER invent names, locations, dates, URLs, or use generic placeholders.
 {{
   "creators": [{{"name": "LastName, FirstName", "givenName": "FirstName", "familyName": "LastName", "affiliation": ["Institution"]}}],
@@ -772,7 +789,7 @@ FALLBACK_PROMPT = """Convert poster text to JSON. REQUIRED FIELDS:
   "subjects": [{{"subject": "keyword1"}}, {{"subject": "keyword2"}}],
   "descriptions": [{{"description": "Abstract text", "descriptionType": "Abstract"}}],
   "publisher": {{"name": "Conference or Institution"}},
-  "conference": {{"conferenceName": "Conference Name", "conferenceYear": 2025, "conferenceLocation": "Location"}},
+  "conference": null,
   "formats": ["PDF"],
   "content": {{
     "sections": [{{"sectionTitle": "Header", "sectionContent": "verbatim text"}}]
@@ -1172,43 +1189,6 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
         result = enrich_json_with_identifiers(result, raw_text)
-    # Strip "Unknown" and prompt-placeholder values the LLM likes to hallucinate.
-    # These violate metadata quality expectations — better to omit than guess.
-    _UNKNOWN_RE = re.compile(r"^unknown\b", re.IGNORECASE)
-    # Prompt placeholders that the model echoes back verbatim when it can't
-    # find real conference metadata on the poster.
-    _PLACEHOLDER_VALS = {
-        "name of conference",
-        "conference name",
-        "city, country",
-        "location",
-        "conference organizer or institution name",
-        "conference or institution",
-    }
-    _PLACEHOLDER_DATE_RE = re.compile(r"^[Yy]{4}-[Mm]{2}-[Dd]{2}$")
-    def _is_placeholder(val: str) -> bool:
-        s = val.strip()
-        return (
-            not s
-            or _UNKNOWN_RE.match(s)
-            or s.lower() in _PLACEHOLDER_VALS
-            or bool(_PLACEHOLDER_DATE_RE.match(s))
-        )
-    if "conference" in result and isinstance(result["conference"], dict):
-        for key in list(result["conference"]):
-            val = result["conference"][key]
-            if isinstance(val, str) and _is_placeholder(val):
-                del result["conference"][key]
-    # Top-level optional string fields
-    for key in ("conferenceLocation", "publisher", "researchField"):
-        val = result.get(key)
-        if isinstance(val, str) and _is_placeholder(val):
-            del result[key]
-        elif isinstance(val, dict) and "name" in val and isinstance(val["name"], str) and _is_placeholder(val["name"]):
-            del result[key]
     return result
@@ -1286,23 +1266,22 @@ def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
     return result
-def extract_poster(poster_path: str) -> dict:
+def extract_poster(
+    poster_path: str,
+    model_id: Optional[str] = None,
+    quantization: Optional[str] = None,
+) -> dict:
     """
     Extract structured JSON metadata from a scientific poster.
-    This is the main entry point for poster extraction.
     Args:
-        poster_path: Path to the poster file (PDF, JPG, or PNG)
-    Returns:
-        Dictionary containing structured poster metadata conforming to
-        the poster-json-schema.
-    Example:
-        >>> result = extract_poster("poster.pdf")
-        >>> print(result["titles"][0]["title"])
-        "Machine Learning Approaches to Diabetic Retinopathy Detection"
+        poster_path: Path to the poster file (PDF, JPG, or PNG).
+        model_id: Override the default JSON structuring model. Accepts any
+            HuggingFace repo id (e.g. google/gemma-2-9b-it,
+            Qwen/Qwen2.5-7B-Instruct) in addition to the default fine-tuned
+            Llama.
+        quantization: Precision mode: "fp16", "8bit", or "4bit".
+            Defaults to "4bit" (NF4) when unset.
     """
     log(f"Processing poster: {poster_path}")
@@ -1317,12 +1296,13 @@ def extract_poster(poster_path: str) -> dict:
     log(f"Extracted {len(raw_text)} chars using {source} in {t_extract_elapsed:.2f}s")
     # Unload vision model before loading JSON model
-    ext = Path(poster_path).suffix.lower()
-    is_image_poster = ext in [".jpg", ".jpeg", ".png"]
     unload_vision_model()
     # Load JSON model
-    model, tokenizer = load_json_model(force_full_precision=is_image_poster)
+    model, tokenizer = load_json_model(
+        model_id=model_id,
+        quantization=quantization,
+    )
     try:
         t_json_start = time.time()

{poster2json-0.2.2 → poster2json-0.3.0}/pyproject.toml RENAMED Viewed

@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "poster2json"
-version = "0.2.2"
+version = "0.3.0"
 description = "Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models"
 packages = [{ include = "poster2json" }]