poster2json 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
poster2json/extract.py CHANGED
@@ -109,7 +109,7 @@ def free_gpu():
109
109
  def get_best_gpu(min_memory_gb: int = 16) -> str:
110
110
  """
111
111
  Get the GPU with most available memory.
112
-
112
+
113
113
  Returns device string like 'cuda:0' or 'cpu' if no GPU available.
114
114
  """
115
115
  if not torch.cuda.is_available():
@@ -332,12 +332,12 @@ def get_raw_text(
332
332
  ) -> Tuple[str, str]:
333
333
  """
334
334
  Get raw text from a poster file.
335
-
335
+
336
336
  Args:
337
337
  poster_path: Path to poster file (PDF, JPG, PNG)
338
338
  poster_id: Optional ID for caching
339
339
  output_dir: Optional directory for cached results
340
-
340
+
341
341
  Returns:
342
342
  Tuple of (text, source) where source indicates extraction method
343
343
  """
@@ -480,7 +480,9 @@ def _generate(model, tokenizer, prompt: str, max_tokens: int) -> str:
480
480
  )
481
481
  elapsed = time.time() - t0
482
482
  tokens_generated = outputs.shape[1] - inputs["input_ids"].shape[1]
483
- log(f" Generated {tokens_generated} tokens in {elapsed:.2f}s ({tokens_generated/elapsed:.1f} tok/s)")
483
+ log(
484
+ f" Generated {tokens_generated} tokens in {elapsed:.2f}s ({tokens_generated/elapsed:.1f} tok/s)"
485
+ )
484
486
 
485
487
  return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True)
486
488
 
@@ -739,9 +741,22 @@ def _clean_unicode_artifacts(text: str) -> str:
739
741
  return text
740
742
 
741
743
  bidi_chars = [
742
- "\u200e", "\u200f", "\u202a", "\u202b", "\u202c", "\u202d", "\u202e",
743
- "\u2066", "\u2067", "\u2068", "\u2069", "\u200b", "\u200c", "\u200d",
744
- "\ufeff", "\u00ad",
744
+ "\u200e",
745
+ "\u200f",
746
+ "\u202a",
747
+ "\u202b",
748
+ "\u202c",
749
+ "\u202d",
750
+ "\u202e",
751
+ "\u2066",
752
+ "\u2067",
753
+ "\u2068",
754
+ "\u2069",
755
+ "\u200b",
756
+ "\u200c",
757
+ "\u200d",
758
+ "\ufeff",
759
+ "\u00ad",
745
760
  ]
746
761
  for char in bidi_chars:
747
762
  text = text.replace(char, "")
@@ -811,7 +826,9 @@ def _postprocess_json(data: dict) -> dict:
811
826
  content = section.get("sectionContent", "")
812
827
  if isinstance(content, list):
813
828
  content = " ".join(str(c) for c in content)
814
- content = _clean_unicode_artifacts(content.strip() if isinstance(content, str) else "")
829
+ content = _clean_unicode_artifacts(
830
+ content.strip() if isinstance(content, str) else ""
831
+ )
815
832
  if content and len(content) > 10:
816
833
  cleaned_sections.append({"sectionTitle": title, "sectionContent": content})
817
834
  result["posterContent"]["sections"] = cleaned_sections
@@ -839,7 +856,7 @@ def _postprocess_json(data: dict) -> dict:
839
856
  def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
840
857
  """
841
858
  Send raw poster text to the LLM and robustly parse the JSON response.
842
-
859
+
843
860
  This function:
844
861
  1. Calls the model with a full prompt
845
862
  2. Retries with more tokens if truncation is detected
@@ -876,16 +893,16 @@ def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
876
893
  def extract_poster(poster_path: str) -> dict:
877
894
  """
878
895
  Extract structured JSON metadata from a scientific poster.
879
-
896
+
880
897
  This is the main entry point for poster extraction.
881
-
898
+
882
899
  Args:
883
900
  poster_path: Path to the poster file (PDF, JPG, or PNG)
884
-
901
+
885
902
  Returns:
886
903
  Dictionary containing structured poster metadata conforming to
887
904
  the poster-json-schema.
888
-
905
+
889
906
  Example:
890
907
  >>> result = extract_poster("poster.pdf")
891
908
  >>> print(result["titles"][0]["title"])
@@ -930,4 +947,3 @@ def extract_poster(poster_path: str) -> dict:
930
947
  traceback.print_exc()
931
948
  unload_json_model()
932
949
  return {"error": str(e)}
933
-
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: poster2json
3
- Version: 0.1.0
3
+ Version: 0.1.2
4
4
  Summary: Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models
5
5
  License: MIT
6
6
  License-File: LICENSE.md
@@ -44,7 +44,7 @@ Description-Content-Type: text/markdown
44
44
 
45
45
  <div align="center">
46
46
 
47
- <img src="https://raw.githubusercontent.com/fairdataihub/poster2json/main/logo.svg" alt="logo" width="200" height="auto" />
47
+ <img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" />
48
48
 
49
49
  <br />
50
50
 
@@ -103,7 +103,7 @@ Convert scientific posters (PDF/images) to structured JSON metadata using Large
103
103
 
104
104
  The pipeline uses:
105
105
 
106
- - **Llama 3.1 8B** (fine-tuned) for JSON structuring
106
+ - [**Llama-3.1-8B-Poster-Extraction**](https://huggingface.co/jimnoneill/Llama-3.1-8B-Poster-Extraction) for JSON structuring
107
107
  - **Qwen2-VL-7B** for vision-based OCR of image posters
108
108
  - **pdfalto** for layout-aware PDF text extraction
109
109
 
@@ -189,7 +189,7 @@ Validated on 10 manually annotated scientific posters:
189
189
  | Word Capture | 0.96 | ≥0.75 |
190
190
  | ROUGE-L | 0.89 | ≥0.75 |
191
191
  | Number Capture | 0.93 | ≥0.75 |
192
- | Field Proportion | 0.99 | 0.30–2.50 |
192
+ | Field Proportion | 0.99 | 0.50–2.00 |
193
193
 
194
194
  **Pass Rate**: 10/10 (100%)
195
195
 
@@ -211,7 +211,8 @@ cd poster2json
211
211
  python -m venv .venv
212
212
 
213
213
  # Activate the virtual environment
214
- source venv/bin/activate # On Windows: .venv\Scripts\activate
214
+ source .venv/bin/activate
215
+ .venv\Scripts\activate # On Windows
215
216
 
216
217
  # Install poetry
217
218
  pip install poetry
@@ -1,7 +1,7 @@
1
1
  poster2json/__init__.py,sha256=ESghvlxkRYajjeabM9G3l-9ZI0hIBEztIhhSh1uFXPE,1143
2
2
  poster2json/__main__.py,sha256=6s_0TtF9yosSvBWX3MLsgToQ4mUKsd4oLBemG0K_y3I,185
3
3
  poster2json/cli.py,sha256=RglnUqbkeRxcM7wkW7AgI7YDw88m2VnNnGJVELMzD6M,8187
4
- poster2json/extract.py,sha256=WAr8T9jglp0mUYafcEuIOuS3BFJc2I6la-dt6dFGT-g,30798
4
+ poster2json/extract.py,sha256=ApSPAFhjq4BZuSssdmiRefa-gi3-dBfauXwaEThQ7uo,30909
5
5
  poster2json/gui.py,sha256=dWqkFkdePC3NcMssGvj7x_ZZtBAmToyiJUMVrXIU3vs,1217
6
6
  poster2json/schemas/poster_schema.json,sha256=ApDF-8aGqShgvy_ituXo1Nv0dXDBuVRrMAokfC7r6dg,46196
7
7
  poster2json/standards.py,sha256=crQ2skZaPfzC3WgAZ_dnFf30rwj1bDaX5Dwqfp15qTY,606
@@ -9,8 +9,8 @@ poster2json/tests/__init__.py,sha256=pKvyDCcstqme7louOrIDlfx_Y_drU7OTM8M421N_oKo
9
9
  poster2json/tests/conftest.py,sha256=OYkRsHW3HZpFPcVzGpaIPbsiPOF8qX428H0vlLGlKUE,37
10
10
  poster2json/utils.py,sha256=e9g0fCGrmWjJM8fI3bosJ9FVae3O_NQJS3bBS1vcafQ,4252
11
11
  poster2json/validate.py,sha256=gl6Bce6wVS1t8ZqxDpDgR46GYleX1TicGeARaeWvzcs,9955
12
- poster2json-0.1.0.dist-info/METADATA,sha256=VzGvlIslDFyjA8mZm9Ze-rZtcOq9NvGlSmUcRhAAmP0,7862
13
- poster2json-0.1.0.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
14
- poster2json-0.1.0.dist-info/entry_points.txt,sha256=-WEakwMIKNqxmZZHuYsq1ZbGw-75Q_uHP6zodmqNCWU,52
15
- poster2json-0.1.0.dist-info/licenses/LICENSE.md,sha256=KovpHb1fK-CUpNdkkZObadFiyjEb9DKNki54nMm3KEM,1087
16
- poster2json-0.1.0.dist-info/RECORD,,
12
+ poster2json-0.1.2.dist-info/METADATA,sha256=iuizPIChN1vVQUXgynHLmZIa8ahf0tfD3WZcwpXeuTk,7912
13
+ poster2json-0.1.2.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
14
+ poster2json-0.1.2.dist-info/entry_points.txt,sha256=-WEakwMIKNqxmZZHuYsq1ZbGw-75Q_uHP6zodmqNCWU,52
15
+ poster2json-0.1.2.dist-info/licenses/LICENSE.md,sha256=KovpHb1fK-CUpNdkkZObadFiyjEb9DKNki54nMm3KEM,1087
16
+ poster2json-0.1.2.dist-info/RECORD,,