poster2json 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {poster2json-0.2.2 → poster2json-0.2.3}/PKG-INFO +3 -3
- {poster2json-0.2.2 → poster2json-0.2.3}/README.md +2 -2
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/extract.py +9 -48
- {poster2json-0.2.2 → poster2json-0.2.3}/pyproject.toml +1 -1
- {poster2json-0.2.2 → poster2json-0.2.3}/LICENSE.md +0 -0
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/__init__.py +0 -0
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/__main__.py +0 -0
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/cli.py +0 -0
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/gui.py +0 -0
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/identifiers.py +0 -0
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/schemas/poster_schema.json +0 -0
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/standards.py +0 -0
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/tests/__init__.py +0 -0
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/tests/conftest.py +0 -0
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/utils.py +0 -0
- {poster2json-0.2.2 → poster2json-0.2.3}/poster2json/validate.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: poster2json
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE.md
|
|
@@ -44,7 +44,7 @@ Description-Content-Type: text/markdown
|
|
|
44
44
|
|
|
45
45
|
<div align="center">
|
|
46
46
|
|
|
47
|
-
<img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" />
|
|
47
|
+
<img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" title="This image was generated by AI" />
|
|
48
48
|
|
|
49
49
|
<br />
|
|
50
50
|
|
|
@@ -246,7 +246,7 @@ MIT License - see [LICENSE](LICENSE.md) for details.
|
|
|
246
246
|
title = {poster2json: Scientific Poster to JSON Metadata Extraction},
|
|
247
247
|
author = {O'Neill, James and Soundarajan, Sanjay and Portillo, Dorian and Patel, Bhavesh},
|
|
248
248
|
year = {2026},
|
|
249
|
-
version = {0.2.2},
|
|
249
|
+
version = {0.2.3},
|
|
250
250
|
url = {https://github.com/fairdataihub/poster2json},
|
|
251
251
|
doi = {10.5281/zenodo.18320010}
|
|
252
252
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
<div align="center">
|
|
2
2
|
|
|
3
|
-
<img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" />
|
|
3
|
+
<img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" title="This image was generated by AI" />
|
|
4
4
|
|
|
5
5
|
<br />
|
|
6
6
|
|
|
@@ -202,7 +202,7 @@ MIT License - see [LICENSE](LICENSE.md) for details.
|
|
|
202
202
|
title = {poster2json: Scientific Poster to JSON Metadata Extraction},
|
|
203
203
|
author = {O'Neill, James and Soundarajan, Sanjay and Portillo, Dorian and Patel, Bhavesh},
|
|
204
204
|
year = {2026},
|
|
205
|
-
version = {0.2.2},
|
|
205
|
+
version = {0.2.3},
|
|
206
206
|
url = {https://github.com/fairdataihub/poster2json},
|
|
207
207
|
doi = {10.5281/zenodo.18320010}
|
|
208
208
|
}
|
|
@@ -722,13 +722,7 @@ JSON SCHEMA (all top-level fields are REQUIRED):
|
|
|
722
722
|
"subjects": [{{"subject": "keyword1"}}, {{"subject": "keyword2"}}, {{"subject": "keyword3"}}],
|
|
723
723
|
"descriptions": [{{"description": "The abstract text from the poster...", "descriptionType": "Abstract"}}],
|
|
724
724
|
"publisher": {{"name": "Conference Organizer or Institution Name"}},
|
|
725
|
-
"conference": {{
|
|
726
|
-
"conferenceName": "Name of Conference",
|
|
727
|
-
"conferenceYear": 2025,
|
|
728
|
-
"conferenceLocation": "City, Country",
|
|
729
|
-
"conferenceStartDate": "YYYY-MM-DD",
|
|
730
|
-
"conferenceEndDate": "YYYY-MM-DD"
|
|
731
|
-
}},
|
|
725
|
+
"conference": null,
|
|
732
726
|
"formats": ["PDF"],
|
|
733
727
|
"content": {{
|
|
734
728
|
"sections": [
|
|
@@ -747,8 +741,12 @@ EXTRACTION NOTES:
|
|
|
747
741
|
- descriptions: Use the Abstract section content, descriptionType is REQUIRED
|
|
748
742
|
- publisher: Use conference organizer, hosting institution, or repository name
|
|
749
743
|
- titles: If the poster title is ALL CAPS, convert to proper Title Case preserving acronyms (e.g. "RESEARCH ON SARS-CoV-2" not "RESEARCH ON SARS-COV-2")
|
|
750
|
-
- conference:
|
|
751
|
-
|
|
744
|
+
- conference: Extract ONLY from text clearly visible on the poster (header, footer, logos).
|
|
745
|
+
* If conference details are NOT visible, set "conference": null — do NOT invent names, locations, dates, URLs, or acronyms.
|
|
746
|
+
* NEVER output generic values like "Name of Conference", "City, Country", "Conference Name", or made-up URLs.
|
|
747
|
+
* If only SOME fields are visible (e.g. name and year but not location), include only those: {{"conferenceName": "ACL 2024", "conferenceYear": 2024}}
|
|
748
|
+
* If no conference information is found at all, output "conference": null
|
|
749
|
+
- publisher: Extract from poster. If not found, set to null — do NOT use placeholder text
|
|
752
750
|
- formats: Set to ["PDF"] for PDF files, ["PNG"] or ["JPEG"] for images
|
|
753
751
|
- imageCaptions/tableCaptions: Use "id" field (e.g., "fig1") for cross-referencing if needed
|
|
754
752
|
- rightsList: OPTIONAL - include if license/copyright info found on poster
|
|
@@ -763,7 +761,7 @@ FALLBACK_PROMPT = """Convert poster text to JSON. REQUIRED FIELDS:
|
|
|
763
761
|
2. SEPARATE section for EACH header found in the poster text. Use the poster's own headers. Lines starting with "## " are detected headers.
|
|
764
762
|
3. Copy ALL text EXACTLY verbatim
|
|
765
763
|
4. If title is ALL CAPS, convert to Title Case preserving acronyms (SARS-CoV-2, not SARS-COV-2)
|
|
766
|
-
5.
|
|
764
|
+
5. conference/publisher: extract ONLY if clearly visible on the poster. If not found, set to null. NEVER invent names, locations, dates, URLs, or use generic placeholders.
|
|
767
765
|
|
|
768
766
|
{{
|
|
769
767
|
"creators": [{{"name": "LastName, FirstName", "givenName": "FirstName", "familyName": "LastName", "affiliation": ["Institution"]}}],
|
|
@@ -772,7 +770,7 @@ FALLBACK_PROMPT = """Convert poster text to JSON. REQUIRED FIELDS:
|
|
|
772
770
|
"subjects": [{{"subject": "keyword1"}}, {{"subject": "keyword2"}}],
|
|
773
771
|
"descriptions": [{{"description": "Abstract text", "descriptionType": "Abstract"}}],
|
|
774
772
|
"publisher": {{"name": "Conference or Institution"}},
|
|
775
|
-
"conference":
|
|
773
|
+
"conference": null,
|
|
776
774
|
"formats": ["PDF"],
|
|
777
775
|
"content": {{
|
|
778
776
|
"sections": [{{"sectionTitle": "Header", "sectionContent": "verbatim text"}}]
|
|
@@ -1172,43 +1170,6 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
|
|
|
1172
1170
|
|
|
1173
1171
|
result = enrich_json_with_identifiers(result, raw_text)
|
|
1174
1172
|
|
|
1175
|
-
# Strip "Unknown" and prompt-placeholder values the LLM likes to hallucinate.
|
|
1176
|
-
# These violate metadata quality expectations — better to omit than guess.
|
|
1177
|
-
_UNKNOWN_RE = re.compile(r"^unknown\b", re.IGNORECASE)
|
|
1178
|
-
# Prompt placeholders that the model echoes back verbatim when it can't
|
|
1179
|
-
# find real conference metadata on the poster.
|
|
1180
|
-
_PLACEHOLDER_VALS = {
|
|
1181
|
-
"name of conference",
|
|
1182
|
-
"conference name",
|
|
1183
|
-
"city, country",
|
|
1184
|
-
"location",
|
|
1185
|
-
"conference organizer or institution name",
|
|
1186
|
-
"conference or institution",
|
|
1187
|
-
}
|
|
1188
|
-
_PLACEHOLDER_DATE_RE = re.compile(r"^[Yy]{4}-[Mm]{2}-[Dd]{2}$")
|
|
1189
|
-
|
|
1190
|
-
def _is_placeholder(val: str) -> bool:
|
|
1191
|
-
s = val.strip()
|
|
1192
|
-
return (
|
|
1193
|
-
not s
|
|
1194
|
-
or _UNKNOWN_RE.match(s)
|
|
1195
|
-
or s.lower() in _PLACEHOLDER_VALS
|
|
1196
|
-
or bool(_PLACEHOLDER_DATE_RE.match(s))
|
|
1197
|
-
)
|
|
1198
|
-
|
|
1199
|
-
if "conference" in result and isinstance(result["conference"], dict):
|
|
1200
|
-
for key in list(result["conference"]):
|
|
1201
|
-
val = result["conference"][key]
|
|
1202
|
-
if isinstance(val, str) and _is_placeholder(val):
|
|
1203
|
-
del result["conference"][key]
|
|
1204
|
-
# Top-level optional string fields
|
|
1205
|
-
for key in ("conferenceLocation", "publisher", "researchField"):
|
|
1206
|
-
val = result.get(key)
|
|
1207
|
-
if isinstance(val, str) and _is_placeholder(val):
|
|
1208
|
-
del result[key]
|
|
1209
|
-
elif isinstance(val, dict) and "name" in val and isinstance(val["name"], str) and _is_placeholder(val["name"]):
|
|
1210
|
-
del result[key]
|
|
1211
|
-
|
|
1212
1173
|
return result
|
|
1213
1174
|
|
|
1214
1175
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|