poster2json 0.2.2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: poster2json
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models
5
5
  License: MIT
6
6
  License-File: LICENSE.md
@@ -44,7 +44,7 @@ Description-Content-Type: text/markdown
44
44
 
45
45
  <div align="center">
46
46
 
47
- <img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" />
47
+ <img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" title="This image was generated by AI" />
48
48
 
49
49
  <br />
50
50
 
@@ -246,7 +246,7 @@ MIT License - see [LICENSE](LICENSE.md) for details.
246
246
  title = {poster2json: Scientific Poster to JSON Metadata Extraction},
247
247
  author = {O'Neill, James and Soundarajan, Sanjay and Portillo, Dorian and Patel, Bhavesh},
248
248
  year = {2026},
249
- version = {0.2.2},
249
+ version = {0.2.3},
250
250
  url = {https://github.com/fairdataihub/poster2json},
251
251
  doi = {10.5281/zenodo.18320010}
252
252
  }
@@ -1,6 +1,6 @@
1
1
  <div align="center">
2
2
 
3
- <img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" />
3
+ <img src="https://cdn.posters.science/logos/poster-fairy.png" alt="logo" width="200" height="auto" title="This image was generated by AI" />
4
4
 
5
5
  <br />
6
6
 
@@ -202,7 +202,7 @@ MIT License - see [LICENSE](LICENSE.md) for details.
202
202
  title = {poster2json: Scientific Poster to JSON Metadata Extraction},
203
203
  author = {O'Neill, James and Soundarajan, Sanjay and Portillo, Dorian and Patel, Bhavesh},
204
204
  year = {2026},
205
- version = {0.2.2},
205
+ version = {0.2.3},
206
206
  url = {https://github.com/fairdataihub/poster2json},
207
207
  doi = {10.5281/zenodo.18320010}
208
208
  }
@@ -722,13 +722,7 @@ JSON SCHEMA (all top-level fields are REQUIRED):
722
722
  "subjects": [{{"subject": "keyword1"}}, {{"subject": "keyword2"}}, {{"subject": "keyword3"}}],
723
723
  "descriptions": [{{"description": "The abstract text from the poster...", "descriptionType": "Abstract"}}],
724
724
  "publisher": {{"name": "Conference Organizer or Institution Name"}},
725
- "conference": {{
726
- "conferenceName": "Name of Conference",
727
- "conferenceYear": 2025,
728
- "conferenceLocation": "City, Country",
729
- "conferenceStartDate": "YYYY-MM-DD",
730
- "conferenceEndDate": "YYYY-MM-DD"
731
- }},
725
+ "conference": null,
732
726
  "formats": ["PDF"],
733
727
  "content": {{
734
728
  "sections": [
@@ -747,8 +741,12 @@ EXTRACTION NOTES:
747
741
  - descriptions: Use the Abstract section content, descriptionType is REQUIRED
748
742
  - publisher: Use conference organizer, hosting institution, or repository name
749
743
  - titles: If the poster title is ALL CAPS, convert to proper Title Case preserving acronyms (e.g. "RESEARCH ON SARS-CoV-2" not "RESEARCH ON SARS-COV-2")
750
- - conference: conferenceName and conferenceYear are REQUIRED; extract from poster header/footer. If not found on the poster, omit the field entirely — do NOT guess or use placeholders
751
- - publisher: Extract from poster. If not found, omit — do NOT use placeholder text
744
+ - conference: Extract ONLY from text clearly visible on the poster (header, footer, logos).
745
+ * If conference details are NOT visible, set "conference": null — do NOT invent names, locations, dates, URLs, or acronyms.
746
+ * NEVER output generic values like "Name of Conference", "City, Country", "Conference Name", or made-up URLs.
747
+ * If only SOME fields are visible (e.g. name and year but not location), include only those: {{"conferenceName": "ACL 2024", "conferenceYear": 2024}}
748
+ * If no conference information is found at all, output "conference": null
749
+ - publisher: Extract from poster. If not found, set to null — do NOT use placeholder text
752
750
  - formats: Set to ["PDF"] for PDF files, ["PNG"] or ["JPEG"] for images
753
751
  - imageCaptions/tableCaptions: Use "id" field (e.g., "fig1") for cross-referencing if needed
754
752
  - rightsList: OPTIONAL - include if license/copyright info found on poster
@@ -763,7 +761,7 @@ FALLBACK_PROMPT = """Convert poster text to JSON. REQUIRED FIELDS:
763
761
  2. SEPARATE section for EACH header found in the poster text. Use the poster's own headers. Lines starting with "## " are detected headers.
764
762
  3. Copy ALL text EXACTLY verbatim
765
763
  4. If title is ALL CAPS, convert to Title Case preserving acronyms (SARS-CoV-2, not SARS-COV-2)
766
- 5. Omit conference/publisher if not found on the poster never guess or use placeholders
764
+ 5. conference/publisher: extract ONLY if clearly visible on the poster. If not found, set to null. NEVER invent names, locations, dates, URLs, or use generic placeholders.
767
765
 
768
766
  {{
769
767
  "creators": [{{"name": "LastName, FirstName", "givenName": "FirstName", "familyName": "LastName", "affiliation": ["Institution"]}}],
@@ -772,7 +770,7 @@ FALLBACK_PROMPT = """Convert poster text to JSON. REQUIRED FIELDS:
772
770
  "subjects": [{{"subject": "keyword1"}}, {{"subject": "keyword2"}}],
773
771
  "descriptions": [{{"description": "Abstract text", "descriptionType": "Abstract"}}],
774
772
  "publisher": {{"name": "Conference or Institution"}},
775
- "conference": {{"conferenceName": "Conference Name", "conferenceYear": 2025, "conferenceLocation": "Location"}},
773
+ "conference": null,
776
774
  "formats": ["PDF"],
777
775
  "content": {{
778
776
  "sections": [{{"sectionTitle": "Header", "sectionContent": "verbatim text"}}]
@@ -1172,43 +1170,6 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
1172
1170
 
1173
1171
  result = enrich_json_with_identifiers(result, raw_text)
1174
1172
 
1175
- # Strip "Unknown" and prompt-placeholder values the LLM likes to hallucinate.
1176
- # These violate metadata quality expectations — better to omit than guess.
1177
- _UNKNOWN_RE = re.compile(r"^unknown\b", re.IGNORECASE)
1178
- # Prompt placeholders that the model echoes back verbatim when it can't
1179
- # find real conference metadata on the poster.
1180
- _PLACEHOLDER_VALS = {
1181
- "name of conference",
1182
- "conference name",
1183
- "city, country",
1184
- "location",
1185
- "conference organizer or institution name",
1186
- "conference or institution",
1187
- }
1188
- _PLACEHOLDER_DATE_RE = re.compile(r"^[Yy]{4}-[Mm]{2}-[Dd]{2}$")
1189
-
1190
- def _is_placeholder(val: str) -> bool:
1191
- s = val.strip()
1192
- return (
1193
- not s
1194
- or _UNKNOWN_RE.match(s)
1195
- or s.lower() in _PLACEHOLDER_VALS
1196
- or bool(_PLACEHOLDER_DATE_RE.match(s))
1197
- )
1198
-
1199
- if "conference" in result and isinstance(result["conference"], dict):
1200
- for key in list(result["conference"]):
1201
- val = result["conference"][key]
1202
- if isinstance(val, str) and _is_placeholder(val):
1203
- del result["conference"][key]
1204
- # Top-level optional string fields
1205
- for key in ("conferenceLocation", "publisher", "researchField"):
1206
- val = result.get(key)
1207
- if isinstance(val, str) and _is_placeholder(val):
1208
- del result[key]
1209
- elif isinstance(val, dict) and "name" in val and isinstance(val["name"], str) and _is_placeholder(val["name"]):
1210
- del result[key]
1211
-
1212
1173
  return result
1213
1174
 
1214
1175
 
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
 
3
3
  name = "poster2json"
4
- version = "0.2.2"
4
+ version = "0.2.3"
5
5
  description = "Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models"
6
6
 
7
7
  packages = [{ include = "poster2json" }]
File without changes