PyPI - poster2json - Versions diffs - 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl - Mend

poster2json 0.1.1-py3-none-any.whl → 0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

poster2json/extract.py CHANGED Viewed

@@ -109,7 +109,7 @@ def free_gpu():
 def get_best_gpu(min_memory_gb: int = 16) -> str:
     """
     Get the GPU with most available memory.
     Returns device string like 'cuda:0' or 'cpu' if no GPU available.
     """
     if not torch.cuda.is_available():
@@ -332,12 +332,12 @@ def get_raw_text(
 ) -> Tuple[str, str]:
     """
     Get raw text from a poster file.
     Args:
         poster_path: Path to poster file (PDF, JPG, PNG)
         poster_id: Optional ID for caching
         output_dir: Optional directory for cached results
     Returns:
         Tuple of (text, source) where source indicates extraction method
     """
@@ -480,7 +480,9 @@ def _generate(model, tokenizer, prompt: str, max_tokens: int) -> str:
         )
     elapsed = time.time() - t0
     tokens_generated = outputs.shape[1] - inputs["input_ids"].shape[1]
-    log(f"   Generated {tokens_generated} tokens in {elapsed:.2f}s ({tokens_generated/elapsed:.1f} tok/s)")
+    log(
+        f"   Generated {tokens_generated} tokens in {elapsed:.2f}s ({tokens_generated/elapsed:.1f} tok/s)"
+    )
     return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True)
@@ -739,9 +741,22 @@ def _clean_unicode_artifacts(text: str) -> str:
         return text
     bidi_chars = [
-        "\u200e", "\u200f", "\u202a", "\u202b", "\u202c", "\u202d", "\u202e",
-        "\u2066", "\u2067", "\u2068", "\u2069", "\u200b", "\u200c", "\u200d",
-        "\ufeff", "\u00ad",
+        "\u200e",
+        "\u200f",
+        "\u202a",
+        "\u202b",
+        "\u202c",
+        "\u202d",
+        "\u202e",
+        "\u2066",
+        "\u2067",
+        "\u2068",
+        "\u2069",
+        "\u200b",
+        "\u200c",
+        "\u200d",
+        "\ufeff",
+        "\u00ad",
     ]
     for char in bidi_chars:
         text = text.replace(char, "")
@@ -811,7 +826,9 @@ def _postprocess_json(data: dict) -> dict:
                 content = section.get("sectionContent", "")
                 if isinstance(content, list):
                     content = " ".join(str(c) for c in content)
-                content = _clean_unicode_artifacts(content.strip() if isinstance(content, str) else "")
+                content = _clean_unicode_artifacts(
+                    content.strip() if isinstance(content, str) else ""
+                )
                 if content and len(content) > 10:
                     cleaned_sections.append({"sectionTitle": title, "sectionContent": content})
             result["posterContent"]["sections"] = cleaned_sections
@@ -839,7 +856,7 @@ def _postprocess_json(data: dict) -> dict:
 def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
     """
     Send raw poster text to the LLM and robustly parse the JSON response.
     This function:
       1. Calls the model with a full prompt
       2. Retries with more tokens if truncation is detected
@@ -876,16 +893,16 @@ def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
 def extract_poster(poster_path: str) -> dict:
     """
     Extract structured JSON metadata from a scientific poster.
     This is the main entry point for poster extraction.
     Args:
         poster_path: Path to the poster file (PDF, JPG, or PNG)
     Returns:
         Dictionary containing structured poster metadata conforming to
         the poster-json-schema.
     Example:
         >>> result = extract_poster("poster.pdf")
         >>> print(result["titles"][0]["title"])
@@ -930,4 +947,3 @@ def extract_poster(poster_path: str) -> dict:
         traceback.print_exc()
         unload_json_model()
         return {"error": str(e)}

{poster2json-0.1.1.dist-info → poster2json-0.1.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: poster2json
-Version: 0.1.1
+Version: 0.1.2
 Summary: Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models
 License: MIT
 License-File: LICENSE.md
@@ -103,7 +103,7 @@ Convert scientific posters (PDF/images) to structured JSON metadata using Large
 The pipeline uses:
-- **Llama 3.1 8B** (fine-tuned) for JSON structuring
+- [**Llama-3.1-8B-Poster-Extraction**](https://huggingface.co/jimnoneill/Llama-3.1-8B-Poster-Extraction) for JSON structuring
 - **Qwen2-VL-7B** for vision-based OCR of image posters
 - **pdfalto** for layout-aware PDF text extraction
@@ -189,7 +189,7 @@ Validated on 10 manually annotated scientific posters:
 | Word Capture     | 0.96  | ≥0.75     |
 | ROUGE-L          | 0.89  | ≥0.75     |
 | Number Capture   | 0.93  | ≥0.75     |
-| Field Proportion | 0.99  | 0.30–2.50 |
+| Field Proportion | 0.99  | 0.50–2.00 |
 **Pass Rate**: 10/10 (100%)

{poster2json-0.1.1.dist-info → poster2json-0.1.2.dist-info}/RECORD RENAMED Viewed

@@ -1,7 +1,7 @@
 poster2json/__init__.py,sha256=ESghvlxkRYajjeabM9G3l-9ZI0hIBEztIhhSh1uFXPE,1143
 poster2json/__main__.py,sha256=6s_0TtF9yosSvBWX3MLsgToQ4mUKsd4oLBemG0K_y3I,185
 poster2json/cli.py,sha256=RglnUqbkeRxcM7wkW7AgI7YDw88m2VnNnGJVELMzD6M,8187
-poster2json/extract.py,sha256=WAr8T9jglp0mUYafcEuIOuS3BFJc2I6la-dt6dFGT-g,30798
+poster2json/extract.py,sha256=ApSPAFhjq4BZuSssdmiRefa-gi3-dBfauXwaEThQ7uo,30909
 poster2json/gui.py,sha256=dWqkFkdePC3NcMssGvj7x_ZZtBAmToyiJUMVrXIU3vs,1217
 poster2json/schemas/poster_schema.json,sha256=ApDF-8aGqShgvy_ituXo1Nv0dXDBuVRrMAokfC7r6dg,46196
 poster2json/standards.py,sha256=crQ2skZaPfzC3WgAZ_dnFf30rwj1bDaX5Dwqfp15qTY,606
@@ -9,8 +9,8 @@ poster2json/tests/__init__.py,sha256=pKvyDCcstqme7louOrIDlfx_Y_drU7OTM8M421N_oKo
 poster2json/tests/conftest.py,sha256=OYkRsHW3HZpFPcVzGpaIPbsiPOF8qX428H0vlLGlKUE,37
 poster2json/utils.py,sha256=e9g0fCGrmWjJM8fI3bosJ9FVae3O_NQJS3bBS1vcafQ,4252
 poster2json/validate.py,sha256=gl6Bce6wVS1t8ZqxDpDgR46GYleX1TicGeARaeWvzcs,9955
-poster2json-0.1.1.dist-info/METADATA,sha256=oZNTUNyMiWBPezjF82wL_jxi8eUXSMwIxxamRx7gAFg,7839
-poster2json-0.1.1.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
-poster2json-0.1.1.dist-info/entry_points.txt,sha256=-WEakwMIKNqxmZZHuYsq1ZbGw-75Q_uHP6zodmqNCWU,52
-poster2json-0.1.1.dist-info/licenses/LICENSE.md,sha256=KovpHb1fK-CUpNdkkZObadFiyjEb9DKNki54nMm3KEM,1087
-poster2json-0.1.1.dist-info/RECORD,,
+poster2json-0.1.2.dist-info/METADATA,sha256=iuizPIChN1vVQUXgynHLmZIa8ahf0tfD3WZcwpXeuTk,7912
+poster2json-0.1.2.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
+poster2json-0.1.2.dist-info/entry_points.txt,sha256=-WEakwMIKNqxmZZHuYsq1ZbGw-75Q_uHP6zodmqNCWU,52
+poster2json-0.1.2.dist-info/licenses/LICENSE.md,sha256=KovpHb1fK-CUpNdkkZObadFiyjEb9DKNki54nMm3KEM,1087
+poster2json-0.1.2.dist-info/RECORD,,

{poster2json-0.1.1.dist-info → poster2json-0.1.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{poster2json-0.1.1.dist-info → poster2json-0.1.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{poster2json-0.1.1.dist-info → poster2json-0.1.2.dist-info}/licenses/LICENSE.md RENAMED Viewed

File without changes

poster2json 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl

poster2json 0.1.1-py3-none-any.whl → 0.1.2-py3-none-any.whl