PyPI - philoch-bib-sdk - Versions diffs - 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl - Mend

philoch-bib-sdk 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py ADDED Viewed

@@ -0,0 +1,59 @@
+from functools import partial
+from typing import Callable, NamedTuple
+from philoch_bib_sdk.logic.functions.journal_article_matcher import (
+    TBibkey,
+    TJournalBibkeyIndex,
+    TJournalName,
+    TNumber,
+    TReadIndex,
+    TVolume,
+)
+class ColumnNames(NamedTuple):
+    """
+    Header names of the four spreadsheet columns consumed by `_read_from_ods`.
+    Each field holds the name of a column, which is used both to select the column and to look up cells in a row.
+    """
+    # NOTE(review): the fields reuse the value aliases (TBibkey, TJournalName, ...) although they carry column
+    # headers; presumably all of these aliases resolve to `str` -- confirm in journal_article_matcher.
+    bibkey: TBibkey  # column whose cells become the values of the index
+    journal: TJournalName  # column providing the first component of the index key
+    volume: TVolume  # column providing the second component of the index key
+    number: TNumber  # column providing the third component of the index key
+def _read_from_ods(
+    column_names: ColumnNames,
+    file_path: str,
+) -> TJournalBibkeyIndex:
+    """
+    Build a (journal, volume, number) -> bibkey lookup from an ODS spreadsheet.
+    Args:
+        column_names (ColumnNames): Header names of the journal, volume, number and bibkey columns.
+        file_path (str): Location of the ODS file.
+    Returns:
+        TJournalBibkeyIndex: Mapping of (journal, volume, number) tuples to the bibkey found in the same row.
+    Raises:
+        ValueError: If the frame read from the file has no rows.
+    """
+    import polars as pl
+    journal_col = column_names.journal
+    volume_col = column_names.volume
+    number_col = column_names.number
+    bibkey_col = column_names.bibkey
+    wanted_columns = [journal_col, volume_col, number_col, bibkey_col]
+    # Every requested column is read with the Utf8 dtype.
+    frame = pl.read_ods(
+        source=file_path,
+        has_header=True,
+        columns=wanted_columns,
+        schema_overrides={name: pl.Utf8 for name in wanted_columns},
+    )
+    if frame.is_empty():
+        raise ValueError(
+            f"Tabular data at '{file_path}' is empty or does not contain the expected columns: {column_names}"
+        )
+    index: TJournalBibkeyIndex = {}
+    # Rows are visited in order, so a later row overwrites an earlier one that has the same key.
+    for record in frame.to_dicts():
+        index[(record[journal_col], record[volume_col], record[number_col])] = record[bibkey_col]
+    return index
+# Signature of the higher-order reader: it takes the column names and hands back a reader (TReadIndex).
+type THOFReadFromOds = Callable[[ColumnNames], TReadIndex]
+# Pre-binds `column_names` with functools.partial; the returned callable still expects `file_path`.
+hof_read_from_ods: THOFReadFromOds = lambda column_names: partial(_read_from_ods, column_names)

philoch_bib_sdk/converters/latex.py ADDED Viewed

@@ -0,0 +1,6 @@
+def unicode_to_latex(unicode_str: str) -> str:
+    """Placeholder for the Unicode -> LaTeX conversion; every call raises NotImplementedError."""
+    message = "This function is not implemented yet."
+    raise NotImplementedError(message)
+def latex_to_unicode(latex_str: str) -> str:
+    """Placeholder for the LaTeX -> Unicode conversion; every call raises NotImplementedError."""
+    message = "This function is not implemented yet."
+    raise NotImplementedError(message)

philoch_bib_sdk/converters/plaintext/author/formatter.py ADDED Viewed

@@ -0,0 +1,31 @@
+from typing import Tuple
+from aletk.utils import get_logger
+from philoch_bib_sdk.logic.models import Author, TBibString
+# Module-level logger obtained from aletk's get_logger, keyed by this module's name.
+lgr = get_logger(__name__)
+def _full_name_generic(given_name: str, family_name: str, mononym: str) -> str:
+    """
+    Combine the name parts of one author into a single display string.
+    A non-empty mononym is returned as is. Otherwise the result is "family, given" when both parts are present,
+    the given name alone when the family name is empty, and "" when the given name is empty.
+    """
+    # A mononym takes precedence over every other name part.
+    if mononym:
+        return mononym
+    if given_name and family_name:
+        return f"{family_name}, {given_name}"
+    # Either the given name stands alone, or it is empty and nothing is rendered (even if a family name exists).
+    return given_name
+def _format_single(author: Author, bibstring_type: TBibString) -> str:
+    """
+    Render one author as a string, reading the `bibstring_type` variant of its given name, family name and mononym.
+    """
+    # Each name attribute is looked up by variant name and coerced to text before being combined.
+    given, family, single_name = (
+        f"{getattr(name_attr, bibstring_type)}"
+        for name_attr in (author.given_name, author.family_name, author.mononym)
+    )
+    return _full_name_generic(given, family, single_name)
+def format_author(authors: Tuple[Author, ...], bibstring_type: TBibString) -> str:
+    """
+    Render a tuple of authors as one ' and '-separated string, skipping authors that render to an empty string.
+    """
+    rendered = [_format_single(person, bibstring_type=bibstring_type) for person in authors]
+    # filter(None, ...) drops the empty renderings so they do not produce dangling separators.
+    return " and ".join(filter(None, rendered))

philoch_bib_sdk/converters/plaintext/author/parser.py ADDED Viewed

@@ -0,0 +1,72 @@
+import traceback
+from typing import Tuple
+from aletk.ResultMonad import Ok, Err
+from aletk.utils import get_logger, remove_extra_whitespace
+from philoch_bib_sdk.logic.models import Author, BibStringAttr, TBibString
+# Module-level logger obtained from aletk's get_logger, keyed by this module's name.
+lgr = get_logger(__name__)
+def _parse_normalize(text: str) -> Tuple[str, str, str]:
+    """
+    Split one author string into a `(given_name, family_name, mononym)` triple.
+    - "family, given" (exactly one comma) yields `(given, family, "")`.
+    - A string without a comma is treated as a mononym and yields `("", "", name)`.
+    Every part is passed through `remove_extra_whitespace` first.
+    Raises:
+        ValueError: If the text contains more than one comma.
+    """
+    parts = tuple(remove_extra_whitespace(part) for part in text.split(","))
+    # `str.split` with an explicit separator always yields at least one element, so there is no zero-part case.
+    if len(parts) > 2:
+        raise ValueError(f"Unexpected number of author parts found in '{text}': '{parts}'. Expected 2 or less.")
+    if len(parts) == 1:
+        # Mononym
+        return ("", "", parts[0])
+    # Full name: the family name comes first in the input, the given name first in the result.
+    family_name, given_name = parts
+    return (given_name, family_name, "")
+def _parse_single(normalized_name_parts: Tuple[str, str, str], bib_string_type: TBibString) -> Author:
+    """
+    Build an Author from a `(given_name, family_name, mononym)` triple.
+    Each name part is stored under the `bib_string_type` variant of a BibStringAttr; shorthand and famous name
+    are left as default BibStringAttr instances and the publications tuple is empty.
+    """
+    given, family, single_name = normalized_name_parts
+    variant = str(bib_string_type)
+    def _as_attr(value: str) -> BibStringAttr:
+        # Only the requested variant is populated; the remaining fields keep the BibStringAttr defaults.
+        return BibStringAttr(**{variant: value})
+    return Author(
+        given_name=_as_attr(given),
+        family_name=_as_attr(family),
+        mononym=_as_attr(single_name),
+        shorthand=BibStringAttr(),
+        famous_name=BibStringAttr(),
+        publications=(),
+    )
+def parse_author(text: str, bibstring_type: TBibString) -> Ok[Tuple[Author, ...]] | Err:
+    """
+    Return either a tuple of Author objects or an error.
+    The input string is expected to be an ' and '-separated list of authors, with each author in the format "family_name, given_name" or "mononym".
+    An empty or whitespace-only input yields `Ok(())`.
+    Args:
+        text (str): The raw author field.
+        bibstring_type (TBibString): The BibStringAttr variant under which the parsed names are stored.
+    Returns:
+        Ok with the parsed authors, or Err (error_type "ParsingError") carrying the exception text and traceback.
+    """
+    import re  # local import: this block is the only user
+    try:
+        if not text.strip():
+            lgr.debug("Empty author string, returning empty tuple.")
+            return Ok(())
+        # Split only on the word "and" surrounded by whitespace. Splitting on the bare substring would cut
+        # names such as "Anderson" or "Sandra" into pieces.
+        parts = tuple(remove_extra_whitespace(part) for part in re.split(r"\s+and\s+", text))
+        parts_normalized = (_parse_normalize(part) for part in parts)
+        authors = tuple(_parse_single(part, bibstring_type) for part in parts_normalized)
+        return Ok(authors)
+    except Exception as e:
+        # Parsing failures are reported through the Err branch instead of propagating.
+        return Err(
+            message=f"Could not parse 'author' field with value [[ {text} ]]. {e.__class__.__name__}: {e}",
+            code=-1,
+            error_type="ParsingError",
+            error_trace=f"{traceback.format_exc()}",
+        )

philoch_bib_sdk/converters/plaintext/bib_string_formatter.py ADDED Viewed

@@ -0,0 +1,8 @@
+from philoch_bib_sdk.logic.models import BibStringAttr, MaybeStr, TBibString
+def format_bib_string_attr(bib_string: MaybeStr[BibStringAttr], bibstring_type: TBibString) -> str:
+    """
+    Return the `bibstring_type` variant of a BibStringAttr.
+    A falsy `bib_string` (such as "") and a missing variant both yield "".
+    """
+    if bib_string:
+        return getattr(bib_string, bibstring_type, "")
+    return ""

philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py ADDED Viewed

@@ -0,0 +1,21 @@
+from philoch_bib_sdk.logic.models import BibKeyAttr, MaybeStr
+def format_bibkey(bibkey: MaybeStr[BibKeyAttr]) -> str:
+    """
+    Render a bibkey as "<authors>:<date>".
+    The authors part is the first author, followed by "-<other_authors>" when other authors are set. For an
+    integer date the suffix is appended directly ("2020a"); for any other date a non-empty suffix is attached
+    with a hyphen ("forthcoming-a"). An empty-string bibkey yields "".
+    """
+    if bibkey == "":
+        return ""
+    author_names = (bibkey.first_author, bibkey.other_authors) if bibkey.other_authors else (bibkey.first_author,)
+    authors_part = "-".join(author_names)
+    date, suffix = bibkey.date, bibkey.date_suffix
+    if isinstance(date, int):
+        date_part = f"{date}{suffix}"
+    elif suffix:
+        date_part = f"{date}-{suffix}"
+    else:
+        date_part = date
+    return f"{authors_part}:{date_part}"

philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py ADDED Viewed

@@ -0,0 +1,144 @@
+import traceback
+from typing import Tuple
+from aletk.ResultMonad import Ok, Err
+from aletk.utils import get_logger
+from philoch_bib_sdk.logic.literals import TBasicPubState
+from philoch_bib_sdk.logic.models import BibKeyAttr
+# Module-level logger obtained from aletk's get_logger, keyed by this module's name.
+lgr = get_logger(__name__)
+def _parse_bibkey_author(text: str) -> Tuple[str, str]:
+    """
+    Split the author part of a bibkey into `(first_author, other_authors)`.
+    The part holds either a single author or two author segments joined by '-'; with a single author the
+    second element of the result is "".
+    Raises:
+        ValueError: If the text contains more than one '-'.
+    """
+    pieces = text.split("-")
+    match pieces:
+        case [only_author]:
+            return only_author, ""
+        case [first_author, other_authors]:
+            return first_author, other_authors
+    raise ValueError(
+        f"Unexpected bibkey author parts in [[ {text} ]]. Found [[ {pieces} ]]. Expected 1 author, or 2 authors separated by '-'."
+    )
+def _parse_bibkey_date_int_part(text: str) -> Tuple[int | None, int | None]:
+    """
+    Read the leading run of digits of a bibkey date part.
+    Args:
+        text (str): The date part of a bibkey, e.g. "2020a" or "forthcoming-a".
+    Returns:
+        `(year, index_of_last_digit)` when the text starts with digits, `(None, None)` otherwise.
+    Raises:
+        ValueError: If the leading digit run is longer than four characters.
+    """
+    digit_count = 0
+    for char in text:
+        if not char.isdigit():
+            break
+        digit_count += 1
+    if digit_count == 0:
+        return None, None
+    # Validate the length of the digit run itself. Checking the converted integer would let zero-padded
+    # runs such as "00000" or "02020" slip through.
+    if digit_count > 4:
+        raise ValueError(f"Unexpected year value in '{text}': is not a valid year or publication state")
+    return int(text[:digit_count]), digit_count - 1
+def _parse_bibkey_date_suffix_part(
+    date_parts: str, year_int: int | None, int_breakpoint: int | None
+) -> Tuple[int | TBasicPubState, str]:
+    """
+    Split the date part of a bibkey into `(date, suffix)`.
+    Args:
+        date_parts (str): The raw date part, e.g. "2020a", "unpub" or "forthcoming-a".
+        year_int (int | None): The leading year, if the date part starts with digits.
+        int_breakpoint (int | None): Index of the last digit of the year inside `date_parts`.
+    Returns:
+        `(year_int, text_after_the_year)` for a numeric date, or `(pubstate, suffix)` where the pubstate is
+        "unpub" or "forthcoming" and the suffix is the text after '-' ("" when there is no '-').
+    Raises:
+        ValueError: If `year_int` and `int_breakpoint` disagree, or the text is not a valid publication state.
+    """
+    # Case 1. The first part of the year is a digit: everything after the last digit is the suffix.
+    if int_breakpoint is not None:
+        if year_int is None:
+            raise ValueError(
+                "Unexpected case! year_int is None but int_breakpoint is not None. This should not happen."
+            )
+        return year_int, date_parts[int_breakpoint + 1 :]
+    if year_int is not None:
+        raise ValueError("Unexpected case! year_int is not None but int_breakpoint is None. This should not happen.")
+    # Case 2. first characters are non-digits
+    # has to start with either "unpub" or "forthcoming" then
+    if not date_parts.startswith(("forthcoming", "unpub")):
+        raise ValueError(f"Unexpected year value in '{date_parts}': it is not a valid publication state.")
+    date_suffix_parts = date_parts.split("-")
+    if len(date_suffix_parts) > 2:
+        raise ValueError(f"Unexpected year value in '{date_parts}': it is not a valid publication state.")
+    if len(date_suffix_parts) == 2:
+        suffix = date_suffix_parts[1]
+        if not suffix:
+            raise ValueError(
+                f"Unexpected year value in '{date_parts}': it is not a valid publication state. Expected a suffix after '-'."
+            )
+    else:
+        suffix = ""
+    # The prefix check above is not enough: the state must match exactly (this rejects e.g. "unpublished").
+    pubstate: TBasicPubState
+    if date_suffix_parts[0] == "unpub":
+        pubstate = "unpub"
+    elif date_suffix_parts[0] == "forthcoming":
+        pubstate = "forthcoming"
+    else:
+        raise ValueError(f"Unexpected year value in '{date_parts}': it is not a valid publication state.")
+    return pubstate, suffix
+def parse_bibkey(text: str) -> Ok[BibKeyAttr] | Err:
+    """
+    Parse a bibkey of the form "<authors>:<date>" into a BibKeyAttr.
+    Args:
+        text (str): The raw bibkey, e.g. "doe-smith:2020a" or "doe:forthcoming-a".
+    Returns:
+        Ok wrapping the BibKeyAttr, or Err (error_type "BibkeyError") carrying the cause and traceback when
+        any step of the parsing raises.
+    """
+    try:
+        bibkey_parts = text.split(":")
+        if len(bibkey_parts) != 2:
+            raise ValueError(
+                f"Unexpected number of bibkey parts in [[ {text} ]]. Expected only two parts separated by ':'."
+            )
+        # Parse the author part
+        first_author, other_authors = _parse_bibkey_author(bibkey_parts[0])
+        # Parse the date part
+        date_parts = bibkey_parts[1]
+        year_int, int_breakpoint = _parse_bibkey_date_int_part(date_parts)
+        # Parse the date suffix part
+        date, date_suffix = _parse_bibkey_date_suffix_part(date_parts, year_int, int_breakpoint)
+        return Ok(
+            BibKeyAttr(
+                first_author=first_author,
+                other_authors=other_authors,
+                date=date,
+                date_suffix=date_suffix,
+            )
+        )
+    except Exception as e:
+        # Include the cause in the message so callers that only log `message` still see why parsing failed.
+        error_message = f"Could not parse bibkey for '{text}'. {e.__class__.__name__}: {e}"
+        return Err(
+            message=error_message,
+            code=-1,
+            error_type="BibkeyError",
+            error_trace=f"{traceback.format_exc()}",
+        )

philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py ADDED Viewed

@@ -0,0 +1,37 @@
+from typing import Literal
+from philoch_bib_sdk.logic.models import VALID_DATE_FORMATS, BibItemDateAttr
+def format_date(date: BibItemDateAttr | Literal["no date"]) -> str:
+    """
+    Render a date attribute as text.
+    "no date" is returned unchanged. A BibItemDateAttr is rendered as "<year>", "<year>-<MM>",
+    "<year>-<MM>-<DD>", "<year>-<year2>" or "<year>/<year2>", depending on which optional fields are set;
+    month and day are zero-padded to two characters.
+    Raises:
+        ValueError: For any other combination of fields, or a value that is not a BibItemDateAttr.
+    """
+    if date == "no date":
+        return "no date"
+    if isinstance(date, BibItemDateAttr):
+        year, month, day = date.year, date.month, date.day
+        second_year_hyphen, second_year_slash = date.year_part_2_hyphen, date.year_part_2_slash
+        no_month_or_day = month is None and day is None
+        if second_year_hyphen is None and second_year_slash is None:
+            # Plain year, optionally refined by a month or by a month and a day.
+            if no_month_or_day:
+                return str(year)
+            if month is not None:
+                padded_month = str(month).zfill(2)
+                if day is None:
+                    return f"{year}-{padded_month}"
+                return f"{year}-{padded_month}-{str(day).zfill(2)}"
+        elif no_month_or_day:
+            # Year range: exactly one of the two second-year fields may be set.
+            if second_year_slash is None:
+                return f"{year}-{second_year_hyphen}"
+            if second_year_hyphen is None:
+                return f"{year}/{second_year_slash}"
+    raise ValueError(f"Invalid date format. Expected one of {', '.join(VALID_DATE_FORMATS)}, but found '{date}'.")

philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py ADDED Viewed

@@ -0,0 +1,62 @@
+from aletk.utils import remove_extra_whitespace, get_logger
+from aletk.ResultMonad import Ok, Err
+from typing import Literal
+from philoch_bib_sdk.logic.models import VALID_DATE_FORMATS, BibItemDateAttr
+# Module-level logger obtained from aletk's get_logger, keyed by this module's name.
+lgr = get_logger(__name__)
+def _parse_date(text: str) -> BibItemDateAttr | Literal["no date"]:
+    """
+    Parse a single date attribute from a string.
+    Accepted shapes, after whitespace normalisation:
+        - "no date" (any letter case)
+        - "<year>"
+        - "<year>-<year2>", stored in `year_part_2_hyphen`
+        - "<year>/<year2>", stored in `year_part_2_slash`
+        - "<year>-<month>-<day>", with month and day at most two characters long
+    Raises:
+        ValueError: For any other shape (including mixed '-' and '/' delimiters) or a non-integer component.
+    """
+    # Assumes remove_extra_whitespace is idempotent, so normalising once is enough -- TODO confirm.
+    text = remove_extra_whitespace(text)
+    if text.lower() == "no date":
+        return "no date"
+    # Count each delimiter separately so that mixed forms such as "2020-05/06" are rejected instead of
+    # being read as year-month-day.
+    hyphen_count, slash_count = text.count("-"), text.count("/")
+    parts = text.replace("-", "/").split("/")
+    # NOTE(review): a two-part hyphenated value such as "2020-05" is stored as year_part_2_hyphen, never as
+    # a month -- confirm this is intended.
+    if hyphen_count == 0 and slash_count == 0:
+        return BibItemDateAttr(
+            year=int(parts[0]), year_part_2_hyphen=None, year_part_2_slash=None, month=None, day=None
+        )
+    if hyphen_count == 1 and slash_count == 0:
+        return BibItemDateAttr(
+            year=int(parts[0]), year_part_2_hyphen=int(parts[1]), year_part_2_slash=None, month=None, day=None
+        )
+    if hyphen_count == 0 and slash_count == 1:
+        return BibItemDateAttr(
+            year=int(parts[0]), year_part_2_hyphen=None, year_part_2_slash=int(parts[1]), month=None, day=None
+        )
+    if hyphen_count == 2 and slash_count == 0 and len(parts[1]) <= 2 and len(parts[2]) <= 2:
+        return BibItemDateAttr(
+            year=int(parts[0]), year_part_2_hyphen=None, year_part_2_slash=None, month=int(parts[1]), day=int(parts[2])
+        )
+    raise ValueError(f"Invalid date format found in '{text}'. Expected one of {', '.join(VALID_DATE_FORMATS)}.")
+def parse_date(text: str) -> Ok[BibItemDateAttr | Literal["no date"]] | Err:
+    """
+    Result-returning wrapper around `_parse_date`.
+    Returns Ok with the parsed value (a BibItemDateAttr or "no date"). Any exception raised while parsing is
+    converted into an Err whose message embeds the input and the exception text, whose error_type is the
+    exception class name, and whose error_trace is empty. The accepted formats are those of `_parse_date`.
+    """
+    try:
+        parsed = _parse_date(text)
+        return Ok(parsed)
+    except Exception as exc:
+        return Err(
+            message=f"Error parsing date from '{text}': {exc}",
+            code=-1,
+            error_type=exc.__class__.__name__,
+            error_trace="",
+        )

philoch_bib_sdk/converters/plaintext/bibitem/formatter.py ADDED Viewed

@@ -0,0 +1,182 @@
+from typing import TypedDict
+from aletk.utils import get_logger
+from philoch_bib_sdk.converters.plaintext.author.formatter import format_author
+from philoch_bib_sdk.converters.plaintext.bib_string_formatter import format_bib_string_attr
+from philoch_bib_sdk.converters.plaintext.bibitem.bibkey_formatter import format_bibkey
+from philoch_bib_sdk.converters.plaintext.bibitem.date_formatter import format_date
+from philoch_bib_sdk.converters.plaintext.bibitem.pages_formatter import format_pages
+from philoch_bib_sdk.converters.plaintext.journal.formatter import format_journal
+from philoch_bib_sdk.logic.literals import TBibTeXEntryType
+from philoch_bib_sdk.logic.models import BibItem
+# Module-level logger obtained from aletk's get_logger, keyed by this module's name.
+lgr = get_logger(__name__)
+def format_entry_type(entry_type: TBibTeXEntryType) -> str:
+    """
+    Render the entry type of a BibItem.
+    "UNKNOWN" is returned unchanged, any other non-empty value is prefixed with "@", and an empty value yields "".
+    """
+    if entry_type == "UNKNOWN":
+        return "UNKNOWN"
+    return f"@{entry_type}" if entry_type else ""
+class FormattedBibItem(TypedDict, total=True):
+    """
+    Flat representation of a BibItem in which every value is declared as a string.
+    This is the return type of `format_bibitem`. Because of `total=True`, every key listed here is required.
+    """
+    _to_do_general: str
+    _change_request: str
+    entry_type: str
+    bibkey: str
+    author: str
+    _author_ids: str
+    editor: str
+    _editor_ids: str
+    author_ids: str
+    options: str
+    shorthand: str
+    date: str
+    pubstate: str
+    title: str
+    _title_unicode: str
+    booktitle: str
+    crossref: str
+    journal: str
+    journal_id: str
+    volume: str
+    number: str
+    pages: str
+    eid: str
+    series: str
+    address: str
+    institution: str
+    school: str
+    publisher: str
+    publisher_id: str
+    type: str
+    edition: str
+    note: str
+    _issuetitle: str
+    _guesteditor: str
+    _extra_note: str
+    urn: str
+    eprint: str
+    doi: str
+    url: str
+    _kw_level1: str
+    _kw_level2: str
+    _kw_level3: str
+    _epoch: str
+    _person: str
+    _comm_for_profile_bib: str
+    _langid: str
+    _lang_der: str
+    _further_refs: str
+    _depends_on: str
+    _dltc_num: str
+    _spec_interest: str
+    _note_perso: str
+    _note_stock: str
+    _note_status: str
+    _num_inwork_coll: str
+    _num_inwork: str
+    _num_coll: str
+    _dltc_copyediting_note: str
+    _note_missing: str
+    _num_sort: str
+def format_bibitem(bibitem: BibItem) -> FormattedBibItem:
+    """
+    Flatten a BibItem into a FormattedBibItem.
+    Structured attributes (bibkey, authors, date, pages, journal, cross-references) are rendered through
+    their dedicated formatters, BibStringAttr values are rendered in their "latex" variant (the title is
+    additionally rendered in "unicode"), and optional numeric attributes become "" when they are None.
+    The id columns (`_author_ids`, `_editor_ids`, `author_ids`, `journal_id`, `publisher_id`) are always "".
+    Args:
+        bibitem (BibItem): The item to render.
+    Returns:
+        FormattedBibItem: The flat representation of `bibitem`.
+    """
+    bibkey = format_bibkey(bibitem.bibkey)
+    author = format_author(bibitem.author, "latex")
+    editor = format_author(bibitem.editor, "latex")
+    person = format_author((bibitem._person,), "latex") if bibitem._person else ""
+    # NOTE(review): the shorthand is assembled from the authors' mononyms, not from their `shorthand`
+    # attribute -- confirm this is intended.
+    shorthand = ", ".join([author.mononym.latex for author in bibitem.author if author.mononym.latex])
+    date = format_date(bibitem.date)
+    pages = format_pages(bibitem.pages)
+    journal = format_journal(bibitem.journal, "latex")
+    crossref = format_bibkey(bibitem.crossref.bibkey) if bibitem.crossref else ""
+    kw_level1, kw_level2, kw_level3 = (
+        bibitem._kws.level_1.name if bibitem._kws else "",
+        bibitem._kws.level_2.name if bibitem._kws else "",
+        bibitem._kws.level_3.name if bibitem._kws else "",
+    )
+    further_refs = ", ".join([format_bibkey(ref) for ref in bibitem._further_refs])
+    depends_on = ", ".join([format_bibkey(dep) for dep in bibitem._depends_on])
+    # format_author already returns one joined string. Joining that string again with ", " would insert
+    # the separator between every single character.
+    guesteditor = format_author(tuple(bibitem._guesteditor), "latex")
+    formatted: FormattedBibItem = {
+        "_to_do_general": bibitem._to_do_general,
+        "_change_request": bibitem._change_request,
+        "entry_type": format_entry_type(bibitem.entry_type),
+        "bibkey": bibkey,
+        "author": author,
+        "_author_ids": "",
+        "editor": editor,
+        "_editor_ids": "",
+        "author_ids": "",
+        "options": ", ".join(bibitem.options),
+        "shorthand": shorthand,
+        "date": date,
+        "pubstate": bibitem.pubstate,
+        "title": format_bib_string_attr(bibitem.title, "latex"),
+        "_title_unicode": format_bib_string_attr(bibitem.title, "unicode"),
+        "booktitle": format_bib_string_attr(bibitem.booktitle, "latex"),
+        "crossref": crossref,
+        "journal": journal,
+        "journal_id": "",
+        "volume": bibitem.volume,
+        "number": bibitem.number,
+        "pages": pages,
+        "eid": bibitem.eid,
+        "series": format_bib_string_attr(bibitem.series.name, "latex") if bibitem.series else "",
+        "address": format_bib_string_attr(bibitem.address, "latex"),
+        "institution": format_bib_string_attr(bibitem.institution, "latex"),
+        "school": format_bib_string_attr(bibitem.school, "latex"),
+        "publisher": format_bib_string_attr(bibitem.publisher, "latex"),
+        "publisher_id": "",
+        "type": format_bib_string_attr(bibitem.type, "latex"),
+        "edition": str(bibitem.edition) if bibitem.edition is not None else "",
+        "note": format_bib_string_attr(bibitem.note, "latex"),
+        "_issuetitle": format_bib_string_attr(bibitem.issuetitle, "latex") if bibitem.issuetitle else "",
+        "_guesteditor": guesteditor,
+        "_extra_note": format_bib_string_attr(bibitem._extra_note, "latex") if bibitem._extra_note else "",
+        "urn": bibitem.urn,
+        "eprint": bibitem.eprint,
+        "doi": bibitem.doi,
+        "url": bibitem.url,
+        "_kw_level1": kw_level1,
+        "_kw_level2": kw_level2,
+        "_kw_level3": kw_level3,
+        "_epoch": bibitem._epoch,
+        "_person": person,
+        "_comm_for_profile_bib": bibitem._comm_for_profile_bib,
+        "_langid": bibitem._langid,
+        "_lang_der": bibitem._lang_der,
+        "_further_refs": further_refs,
+        "_depends_on": depends_on,
+        "_dltc_num": str(bibitem._dltc_num) if bibitem._dltc_num is not None else "",
+        "_spec_interest": bibitem._spec_interest,
+        "_note_perso": bibitem._note_perso,
+        "_note_stock": bibitem._note_stock,
+        "_note_status": bibitem._note_status,
+        "_num_inwork_coll": str(bibitem._num_inwork_coll) if bibitem._num_inwork_coll is not None else "",
+        "_num_inwork": bibitem._num_inwork,
+        "_num_coll": str(bibitem._num_coll) if bibitem._num_coll is not None else "",
+        "_dltc_copyediting_note": bibitem._dltc_copyediting_note,
+        "_note_missing": bibitem._note_missing,
+        "_num_sort": str(bibitem._num_sort) if bibitem._num_sort is not None else "",
+    }
+    return formatted

philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py ADDED Viewed

@@ -0,0 +1,13 @@
+from typing import Tuple
+from philoch_bib_sdk.logic.models import PageAttr
+def _pages_single_str(page_pair: PageAttr) -> str:
+    """Render one page entry as "<start>--<end>", or as "<start>" alone when the end is empty."""
+    if not page_pair.end:
+        return page_pair.start
+    return "--".join([page_pair.start, page_pair.end])
+def format_pages(pages: Tuple[PageAttr, ...]) -> str:
+    """
+    Render a tuple of page entries as one ", "-separated string; an empty tuple yields "".
+    """
+    # Truthiness check instead of `pages is tuple()`: identity with a freshly built empty tuple only holds
+    # because of a CPython implementation detail.
+    if not pages:
+        return ""
+    return ", ".join(_pages_single_str(page_pair) for page_pair in pages)

philoch-bib-sdk 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

philoch-bib-sdk 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl