poster2json 0.1.1__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- poster2json/extract.py +30 -14
- {poster2json-0.1.1.dist-info → poster2json-0.1.2.dist-info}/METADATA +3 -3
- {poster2json-0.1.1.dist-info → poster2json-0.1.2.dist-info}/RECORD +6 -6
- {poster2json-0.1.1.dist-info → poster2json-0.1.2.dist-info}/WHEEL +0 -0
- {poster2json-0.1.1.dist-info → poster2json-0.1.2.dist-info}/entry_points.txt +0 -0
- {poster2json-0.1.1.dist-info → poster2json-0.1.2.dist-info}/licenses/LICENSE.md +0 -0
poster2json/extract.py
CHANGED
|
@@ -109,7 +109,7 @@ def free_gpu():
|
|
|
109
109
|
def get_best_gpu(min_memory_gb: int = 16) -> str:
|
|
110
110
|
"""
|
|
111
111
|
Get the GPU with most available memory.
|
|
112
|
-
|
|
112
|
+
|
|
113
113
|
Returns device string like 'cuda:0' or 'cpu' if no GPU available.
|
|
114
114
|
"""
|
|
115
115
|
if not torch.cuda.is_available():
|
|
@@ -332,12 +332,12 @@ def get_raw_text(
|
|
|
332
332
|
) -> Tuple[str, str]:
|
|
333
333
|
"""
|
|
334
334
|
Get raw text from a poster file.
|
|
335
|
-
|
|
335
|
+
|
|
336
336
|
Args:
|
|
337
337
|
poster_path: Path to poster file (PDF, JPG, PNG)
|
|
338
338
|
poster_id: Optional ID for caching
|
|
339
339
|
output_dir: Optional directory for cached results
|
|
340
|
-
|
|
340
|
+
|
|
341
341
|
Returns:
|
|
342
342
|
Tuple of (text, source) where source indicates extraction method
|
|
343
343
|
"""
|
|
@@ -480,7 +480,9 @@ def _generate(model, tokenizer, prompt: str, max_tokens: int) -> str:
|
|
|
480
480
|
)
|
|
481
481
|
elapsed = time.time() - t0
|
|
482
482
|
tokens_generated = outputs.shape[1] - inputs["input_ids"].shape[1]
|
|
483
|
-
log(
|
|
483
|
+
log(
|
|
484
|
+
f" Generated {tokens_generated} tokens in {elapsed:.2f}s ({tokens_generated/elapsed:.1f} tok/s)"
|
|
485
|
+
)
|
|
484
486
|
|
|
485
487
|
return tokenizer.decode(outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
|
486
488
|
|
|
@@ -739,9 +741,22 @@ def _clean_unicode_artifacts(text: str) -> str:
|
|
|
739
741
|
return text
|
|
740
742
|
|
|
741
743
|
bidi_chars = [
|
|
742
|
-
"\u200e",
|
|
743
|
-
"\
|
|
744
|
-
"\
|
|
744
|
+
"\u200e",
|
|
745
|
+
"\u200f",
|
|
746
|
+
"\u202a",
|
|
747
|
+
"\u202b",
|
|
748
|
+
"\u202c",
|
|
749
|
+
"\u202d",
|
|
750
|
+
"\u202e",
|
|
751
|
+
"\u2066",
|
|
752
|
+
"\u2067",
|
|
753
|
+
"\u2068",
|
|
754
|
+
"\u2069",
|
|
755
|
+
"\u200b",
|
|
756
|
+
"\u200c",
|
|
757
|
+
"\u200d",
|
|
758
|
+
"\ufeff",
|
|
759
|
+
"\u00ad",
|
|
745
760
|
]
|
|
746
761
|
for char in bidi_chars:
|
|
747
762
|
text = text.replace(char, "")
|
|
@@ -811,7 +826,9 @@ def _postprocess_json(data: dict) -> dict:
|
|
|
811
826
|
content = section.get("sectionContent", "")
|
|
812
827
|
if isinstance(content, list):
|
|
813
828
|
content = " ".join(str(c) for c in content)
|
|
814
|
-
content = _clean_unicode_artifacts(
|
|
829
|
+
content = _clean_unicode_artifacts(
|
|
830
|
+
content.strip() if isinstance(content, str) else ""
|
|
831
|
+
)
|
|
815
832
|
if content and len(content) > 10:
|
|
816
833
|
cleaned_sections.append({"sectionTitle": title, "sectionContent": content})
|
|
817
834
|
result["posterContent"]["sections"] = cleaned_sections
|
|
@@ -839,7 +856,7 @@ def _postprocess_json(data: dict) -> dict:
|
|
|
839
856
|
def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
|
|
840
857
|
"""
|
|
841
858
|
Send raw poster text to the LLM and robustly parse the JSON response.
|
|
842
|
-
|
|
859
|
+
|
|
843
860
|
This function:
|
|
844
861
|
1. Calls the model with a full prompt
|
|
845
862
|
2. Retries with more tokens if truncation is detected
|
|
@@ -876,16 +893,16 @@ def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
|
|
|
876
893
|
def extract_poster(poster_path: str) -> dict:
|
|
877
894
|
"""
|
|
878
895
|
Extract structured JSON metadata from a scientific poster.
|
|
879
|
-
|
|
896
|
+
|
|
880
897
|
This is the main entry point for poster extraction.
|
|
881
|
-
|
|
898
|
+
|
|
882
899
|
Args:
|
|
883
900
|
poster_path: Path to the poster file (PDF, JPG, or PNG)
|
|
884
|
-
|
|
901
|
+
|
|
885
902
|
Returns:
|
|
886
903
|
Dictionary containing structured poster metadata conforming to
|
|
887
904
|
the poster-json-schema.
|
|
888
|
-
|
|
905
|
+
|
|
889
906
|
Example:
|
|
890
907
|
>>> result = extract_poster("poster.pdf")
|
|
891
908
|
>>> print(result["titles"][0]["title"])
|
|
@@ -930,4 +947,3 @@ def extract_poster(poster_path: str) -> dict:
|
|
|
930
947
|
traceback.print_exc()
|
|
931
948
|
unload_json_model()
|
|
932
949
|
return {"error": str(e)}
|
|
933
|
-
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: poster2json
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.2
|
|
4
4
|
Summary: Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE.md
|
|
@@ -103,7 +103,7 @@ Convert scientific posters (PDF/images) to structured JSON metadata using Large
|
|
|
103
103
|
|
|
104
104
|
The pipeline uses:
|
|
105
105
|
|
|
106
|
-
- **Llama
|
|
106
|
+
- [**Llama-3.1-8B-Poster-Extraction**](https://huggingface.co/jimnoneill/Llama-3.1-8B-Poster-Extraction) for JSON structuring
|
|
107
107
|
- **Qwen2-VL-7B** for vision-based OCR of image posters
|
|
108
108
|
- **pdfalto** for layout-aware PDF text extraction
|
|
109
109
|
|
|
@@ -189,7 +189,7 @@ Validated on 10 manually annotated scientific posters:
|
|
|
189
189
|
| Word Capture | 0.96 | ≥0.75 |
|
|
190
190
|
| ROUGE-L | 0.89 | ≥0.75 |
|
|
191
191
|
| Number Capture | 0.93 | ≥0.75 |
|
|
192
|
-
| Field Proportion | 0.99 | 0.
|
|
192
|
+
| Field Proportion | 0.99 | 0.50–2.00 |
|
|
193
193
|
|
|
194
194
|
**Pass Rate**: 10/10 (100%)
|
|
195
195
|
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
poster2json/__init__.py,sha256=ESghvlxkRYajjeabM9G3l-9ZI0hIBEztIhhSh1uFXPE,1143
|
|
2
2
|
poster2json/__main__.py,sha256=6s_0TtF9yosSvBWX3MLsgToQ4mUKsd4oLBemG0K_y3I,185
|
|
3
3
|
poster2json/cli.py,sha256=RglnUqbkeRxcM7wkW7AgI7YDw88m2VnNnGJVELMzD6M,8187
|
|
4
|
-
poster2json/extract.py,sha256=
|
|
4
|
+
poster2json/extract.py,sha256=ApSPAFhjq4BZuSssdmiRefa-gi3-dBfauXwaEThQ7uo,30909
|
|
5
5
|
poster2json/gui.py,sha256=dWqkFkdePC3NcMssGvj7x_ZZtBAmToyiJUMVrXIU3vs,1217
|
|
6
6
|
poster2json/schemas/poster_schema.json,sha256=ApDF-8aGqShgvy_ituXo1Nv0dXDBuVRrMAokfC7r6dg,46196
|
|
7
7
|
poster2json/standards.py,sha256=crQ2skZaPfzC3WgAZ_dnFf30rwj1bDaX5Dwqfp15qTY,606
|
|
@@ -9,8 +9,8 @@ poster2json/tests/__init__.py,sha256=pKvyDCcstqme7louOrIDlfx_Y_drU7OTM8M421N_oKo
|
|
|
9
9
|
poster2json/tests/conftest.py,sha256=OYkRsHW3HZpFPcVzGpaIPbsiPOF8qX428H0vlLGlKUE,37
|
|
10
10
|
poster2json/utils.py,sha256=e9g0fCGrmWjJM8fI3bosJ9FVae3O_NQJS3bBS1vcafQ,4252
|
|
11
11
|
poster2json/validate.py,sha256=gl6Bce6wVS1t8ZqxDpDgR46GYleX1TicGeARaeWvzcs,9955
|
|
12
|
-
poster2json-0.1.
|
|
13
|
-
poster2json-0.1.
|
|
14
|
-
poster2json-0.1.
|
|
15
|
-
poster2json-0.1.
|
|
16
|
-
poster2json-0.1.1.dist-info/RECORD,,
|
|
12
|
+
poster2json-0.1.2.dist-info/METADATA,sha256=iuizPIChN1vVQUXgynHLmZIa8ahf0tfD3WZcwpXeuTk,7912
|
|
13
|
+
poster2json-0.1.2.dist-info/WHEEL,sha256=kJCRJT_g0adfAJzTx2GUMmS80rTJIVHRCfG0DQgLq3o,88
|
|
14
|
+
poster2json-0.1.2.dist-info/entry_points.txt,sha256=-WEakwMIKNqxmZZHuYsq1ZbGw-75Q_uHP6zodmqNCWU,52
|
|
15
|
+
poster2json-0.1.2.dist-info/licenses/LICENSE.md,sha256=KovpHb1fK-CUpNdkkZObadFiyjEb9DKNki54nMm3KEM,1087
|
|
16
|
+
poster2json-0.1.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|