PyPI - philoch-bib-sdk - Versions diffs - 0.3.9__cp313-cp313-win_amd64.whl - Mend

philoch-bib-sdk 0.3.9__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

philoch_bib_sdk/__init__.py +0 -0
philoch_bib_sdk/_rust.cp313-win_amd64.pyd +0 -0
philoch_bib_sdk/adapters/io/__init__.py +115 -0
philoch_bib_sdk/adapters/io/csv/__init__.py +308 -0
philoch_bib_sdk/adapters/io/ods/__init__.py +145 -0
philoch_bib_sdk/adapters/plaintext/bibitem_reader.py +0 -0
philoch_bib_sdk/adapters/tabular_data/read_journal_volume_number_index.py +58 -0
philoch_bib_sdk/converters/latex.py +6 -0
philoch_bib_sdk/converters/plaintext/author/formatter.py +34 -0
philoch_bib_sdk/converters/plaintext/author/parser.py +83 -0
philoch_bib_sdk/converters/plaintext/bib_string_formatter.py +8 -0
philoch_bib_sdk/converters/plaintext/bibitem/bibkey_formatter.py +21 -0
philoch_bib_sdk/converters/plaintext/bibitem/bibkey_parser.py +158 -0
philoch_bib_sdk/converters/plaintext/bibitem/date_formatter.py +37 -0
philoch_bib_sdk/converters/plaintext/bibitem/date_parser.py +62 -0
philoch_bib_sdk/converters/plaintext/bibitem/formatter.py +182 -0
philoch_bib_sdk/converters/plaintext/bibitem/pages_formatter.py +13 -0
philoch_bib_sdk/converters/plaintext/bibitem/pages_parser.py +63 -0
philoch_bib_sdk/converters/plaintext/bibitem/parser.py +415 -0
philoch_bib_sdk/converters/plaintext/journal/formatter.py +25 -0
philoch_bib_sdk/converters/plaintext/journal/parser.py +36 -0
philoch_bib_sdk/converters/plaintext/shared/renderable_formatter.py +25 -0
philoch_bib_sdk/interfaces/cli/__init__.py +3 -0
philoch_bib_sdk/interfaces/cli/fuzzy_matching.py +135 -0
philoch_bib_sdk/logic/__init__.py +39 -0
philoch_bib_sdk/logic/default_models.py +315 -0
philoch_bib_sdk/logic/functions/__init__.py +31 -0
philoch_bib_sdk/logic/functions/comparator.py +414 -0
philoch_bib_sdk/logic/functions/fuzzy_matcher.py +796 -0
philoch_bib_sdk/logic/functions/journal_article_matcher.py +44 -0
philoch_bib_sdk/logic/literals.py +98 -0
philoch_bib_sdk/logic/models.py +366 -0
philoch_bib_sdk/logic/models_staging.py +173 -0
philoch_bib_sdk/procedures/fuzzy_matching.py +112 -0
philoch_bib_sdk/py.typed +0 -0
philoch_bib_sdk/rust_scorer/Cargo.lock +232 -0
philoch_bib_sdk/rust_scorer/Cargo.toml +26 -0
philoch_bib_sdk/rust_scorer/pyproject.toml +15 -0
philoch_bib_sdk/rust_scorer/rust_scorer.pyi +65 -0
philoch_bib_sdk/rust_scorer/src/lib.rs +362 -0
philoch_bib_sdk-0.3.9.dist-info/METADATA +15 -0
philoch_bib_sdk-0.3.9.dist-info/RECORD +44 -0
philoch_bib_sdk-0.3.9.dist-info/WHEEL +4 -0
philoch_bib_sdk-0.3.9.dist-info/licenses/LICENSE +21 -0

philoch_bib_sdk/logic/functions/journal_article_matcher.py ADDED Viewed

@@ -0,0 +1,44 @@
+from typing import Callable, Dict, Tuple
+from philoch_bib_sdk.converters.plaintext.journal.formatter import format_journal
+from philoch_bib_sdk.logic.models import BibItem, BibKeyAttr
+# String aliases naming the parts of an index key.
+type TJournalName = str
+type TVolume = str
+type TNumber = str
+# NOTE(review): TBibkey is not referenced in the visible part of this module; the index stores BibKeyAttr values.
+type TBibkey = str
+# Lookup table: key is the (journal, volume, number) triple, value is the BibKeyAttr filed under it.
+type TJournalBibkeyIndex = Dict[
+    Tuple[TJournalName, TVolume, TNumber], BibKeyAttr
+]  # key: (journal, volume, number) -> value: bibkey
+def get_bibkey_by_journal_volume_number(index: TJournalBibkeyIndex, subject: BibItem) -> BibKeyAttr:
+    """
+    Return the bibkey stored in `index` under the (journal, volume, number) triple of `subject`.
+    The journal component is the string produced by `format_journal` with `bibstring_type="latex"`.
+    Raises:
+        ValueError: if the formatted journal, the volume, or the number is an empty string.
+        KeyError: if the triple is not a key of `index` (plain mapping lookup, no fallback).
+    """
+    # TODO: need to ensure the index is unique, possibly via some fuzzy matching with the title or the author
+    lookup_key = (
+        format_journal(subject.journal, bibstring_type="latex"),
+        subject.volume,
+        subject.number,
+    )
+    if "" in lookup_key:
+        # Unpack only for the error message; all three parts are reported, not just the empty one.
+        journal, volume, number = lookup_key
+        raise ValueError(
+            f"Expected subject bibitem journal with non-empty journal, volume, and number. Found [[ journal: {journal}; volume: {volume}; number: {number} ]] instead."
+        )
+    return index[lookup_key]
+# Signature of an index loader: takes one string (the path to the index file)
+# and returns the journal/volume/number -> bibkey lookup table.
+type TReadIndex = Callable[
+    [
+        str,  # path to the index file
+    ],
+    TJournalBibkeyIndex,
+]

philoch_bib_sdk/logic/literals.py ADDED Viewed

@@ -0,0 +1,98 @@
+from typing import Literal, Tuple, get_args
+# Closed set of entry types accepted for a bibliographic item.
+# "UNKNOWN" is the only upper-case member; the comparison is case-sensitive because Literal values are exact strings.
+TBibTeXEntryType = Literal[
+    "article",
+    "book",
+    "incollection",
+    "inproceedings",
+    "mastersthesis",
+    "misc",
+    "phdthesis",
+    "proceedings",
+    "techreport",
+    "unpublished",
+    "UNKNOWN",
+]
+# Reduced set of publication states; every member also appears in TPubState.
+# The empty string is an accepted value.
+TBasicPubState = Literal[
+    "",
+    "unpub",
+    "forthcoming",
+]
+# Full set of publication states: the three TBasicPubState values plus "inwork", "submitted" and "published".
+TPubState = Literal[
+    "",
+    "unpub",
+    "forthcoming",
+    "inwork",
+    "submitted",
+    "published",
+]
+# Accepted language identifiers. Both "" and "unknown" are members.
+# NOTE(review): names such as "ngerman" look like LaTeX babel language names -- confirm before adding new ones.
+TLanguageID = Literal[
+    "",
+    "catalan",
+    "czech",
+    "danish",
+    "dutch",
+    "english",
+    "french",
+    "greek",
+    "italian",
+    "latin",
+    "lithuanian",
+    "ngerman",
+    "polish",
+    "portuguese",
+    "romanian",
+    "russian",
+    "slovak",
+    "spanish",
+    "swedish",
+    "unknown",
+]
+# Accepted epoch / school labels. The empty string is an accepted value.
+TEpoch = Literal[
+    "",
+    "ancient-philosophy",
+    "ancient-scientists",
+    "austrian-philosophy",
+    "british-idealism",
+    "classics",
+    "contemporaries",
+    "contemporary-scientists",
+    "continental-philosophy",
+    "critical-theory",
+    "cynics",
+    "enlightenment",
+    "existentialism",
+    "exotic-philosophy",
+    "german-idealism",
+    "german-rationalism",
+    "gestalt-psychology",
+    "hermeneutics",
+    "islamic-philosophy",
+    "mathematicians",
+    "medieval-philosophy",
+    "modern-philosophy",
+    "modern-scientists",
+    # NOTE(review): both spellings below are members; presumably kept for existing data -- confirm before merging them.
+    "neokantianism",
+    "neo-kantianism",
+    "neoplatonism",
+    "new-realism",
+    "ordinary-language-philosophy",
+    "phenomenology",
+    "polish-logic",
+    "pragmatism",
+    "presocratics",
+    "renaissance",
+    "stoics",
+    "theologians",
+    "vienna-circle",
+]
+# Literal value constants for runtime validation.
+# Each tuple is derived from its Literal alias via typing.get_args, so it cannot drift from the type.
+# There is no constant for TBasicPubState in this module.
+BIBTEX_ENTRY_TYPE_VALUES: Tuple[TBibTeXEntryType, ...] = get_args(TBibTeXEntryType)
+PUB_STATE_VALUES: Tuple[TPubState, ...] = get_args(TPubState)
+EPOCH_VALUES: Tuple[TEpoch, ...] = get_args(TEpoch)
+LANGUAGE_ID_VALUES: Tuple[TLanguageID, ...] = get_args(TLanguageID)

philoch_bib_sdk/logic/models.py ADDED Viewed

@@ -0,0 +1,366 @@
+from __future__ import annotations
+from typing import Literal, Tuple, get_args
+import attrs
+from philoch_bib_sdk.logic.literals import TBasicPubState, TBibTeXEntryType, TEpoch, TLanguageID, TPubState
+# Maybe[T]: a T or None.
+type Maybe[T] = T | None
+# MaybeStr[T]: a T or the empty string "" used as the "not set" marker.
+type MaybeStr[T] = T | Literal[""]
+@attrs.define(frozen=True, slots=True)
+class BibStringAttr:
+    """
+    A representation of the different forms of a string we may need for different purposes.
+    All three forms default to the empty string.
+    Args:
+        latex: formatted string for LaTeX, can be used in bib files
+        unicode: formatted string for Unicode, can be used in text. Produced from the LaTeX string
+        simplified: simplified string, can be used to match strings. Produced from the Unicode string
+    """
+    latex: str = ""
+    unicode: str = ""
+    simplified: str = ""
+# Names of the three BibStringAttr fields, as a Literal type, a `type` alias, and a runtime tuple.
+BibStringLiteral = Literal["latex", "unicode", "simplified"]
+type TBibString = BibStringLiteral
+BIB_STRING_VALUES: Tuple[str, ...] = get_args(BibStringLiteral)
+############
+# Base Renderables
+############
+@attrs.define(frozen=True, slots=True)
+class BaseRenderable:
+    """
+    Base class for renderable objects that contain a single 'text' attribute.
+    Args:
+        text: BibStringAttr
+        id: Maybe[int] = None
+    """
+    text: BibStringAttr
+    id: Maybe[int] = None
+@attrs.define(frozen=True, slots=True)
+class BaseNamedRenderable:
+    """
+    Base class for renderable objects that contain a single 'name' attribute.
+    Args:
+        name: BibStringAttr
+        id: Maybe[int] = None
+    """
+    name: BibStringAttr
+    id: Maybe[int] = None
+# Names of the attribute that carries the BibStringAttr on BaseRenderable ("text") and BaseNamedRenderable ("name").
+RenderablesLiteral = Literal["text", "name"]
+type TRenderable = RenderablesLiteral
+RENDERABLES_VALUES: Tuple[str, ...] = get_args(RenderablesLiteral)
+############
+# Author
+############
+@attrs.define(frozen=True, slots=True)
+class Author:
+    """
+    An author of a publication.
+    Every field except `id` is required; none of them declares a default.
+    Args:
+        given_name: BibStringAttr
+        family_name: BibStringAttr
+        mononym: BibStringAttr
+        shorthand: BibStringAttr
+        famous_name: BibStringAttr
+        publications: Tuple[BibItem, ...]
+        id: Maybe[int] = None
+    """
+    given_name: BibStringAttr
+    family_name: BibStringAttr
+    mononym: BibStringAttr
+    shorthand: BibStringAttr
+    famous_name: BibStringAttr
+    publications: Tuple[BibItem, ...]
+    id: Maybe[int] = None
+############
+# Journal
+############
+@attrs.define(frozen=True, slots=True)
+class Journal:
+    """
+    A journal that publishes publications.
+    Args:
+        name: BibStringAttr
+        issn_print: str
+        issn_electronic: str
+        id: Maybe[int] = None
+    """
+    name: BibStringAttr
+    issn_print: str
+    issn_electronic: str
+    id: Maybe[int] = None
+############
+# Keyword
+############
+@attrs.define(frozen=True, slots=True)
+class Keyword:
+    """
+    Keyword of a publication.
+    Unlike the other named models in this module, `name` is a plain `str`, not a BibStringAttr.
+    Args:
+        name: str
+        id: Maybe[int] = None
+    """
+    name: str
+    id: Maybe[int] = None
+############
+# BibItem
+############
+class BibKeyValidationError(Exception):
+    """Raised by `BibKeyAttr` when `first_author` or `date` is empty."""
+    pass
+@attrs.define(frozen=True, slots=True)
+class BibKeyAttr:
+    """
+    A unique identifier for a publication, kept as its separate parts.
+    Args:
+        first_author: str
+        other_authors: str
+        date: int | TBasicPubState
+        date_suffix: str
+    Raises:
+        BibKeyValidationError: if `first_author` or `date` is falsy (empty string or 0).
+    """
+    first_author: str
+    other_authors: str
+    date: int | TBasicPubState
+    date_suffix: str
+    def __attrs_post_init__(self) -> None:
+        # Both parts are mandatory; the check is a truthiness test, so "" and 0 both count as missing.
+        has_required_parts = bool(self.first_author and self.date)
+        if not has_required_parts:
+            raise BibKeyValidationError("Both 'first_author' and 'date' must not be empty.")
+class BibItemDateValidationError(Exception):
+    """Raised by `BibItemDateAttr` when its year / month / day parts are inconsistent."""
+    pass
+@attrs.define(frozen=True, slots=True)
+class BibItemDateAttr:
+    """
+    Date of a publication: a year, optionally extended by a second year or by a month and day.
+    Example:
+        BibItemDateAttr(year=2021, year_part_2_slash=2022) represents `2021/2022`.
+        BibItemDateAttr(year=2021, year_part_2_hyphen=2022) represents `2021-2022`.
+        BibItemDateAttr(year=2021, month=1, day=1) represents `2021-01-01`.
+    Args:
+        year: int
+        year_part_2_hyphen: Maybe[int] = None
+        year_part_2_slash: Maybe[int] = None
+        month: Maybe[int] = None
+        day: Maybe[int] = None
+    Raises:
+        BibItemDateValidationError: if a second year part or a month is given while `year` is falsy,
+            if only one of `month` / `day` is given, or if both second year parts are given.
+    Note:
+        All checks are truthiness tests, so 0 is treated the same as None ("not set").
+    """
+    year: int
+    year_part_2_hyphen: Maybe[int] = None
+    year_part_2_slash: Maybe[int] = None
+    month: Maybe[int] = None
+    day: Maybe[int] = None
+    def __attrs_post_init__(self) -> None:
+        # A second year part only makes sense as an extension of a first year.
+        if any([self.year_part_2_hyphen, self.year_part_2_slash]) and not self.year:
+            raise BibItemDateValidationError(
+                "If 'year_part_2_hyphen' or 'year_part_2_slash' is set, 'year' must not be empty."
+            )
+        # Month and day must be given together or not at all.
+        if not ((self.month and self.day) or (not self.month and not self.day)):
+            raise BibItemDateValidationError("If 'day' is set, 'month' must be set too, and vice versa.")
+        if self.month and not self.year:
+            raise BibItemDateValidationError("If 'month' is set, 'year' must not be empty.")
+        # The hyphen and slash forms are mutually exclusive.
+        if self.year_part_2_hyphen and self.year_part_2_slash:
+            raise BibItemDateValidationError("If 'year_part_2_hyphen' is set, 'year_part_2_slash' must not be set.")
+# Date string templates.
+# NOTE(review): the hyphen range uses the placeholder "{year_1}" while the slash range uses "{year}" -- confirm
+# whether consumers rely on the differing names.
+# NOTE(review): "{year}-{month}" is listed, but BibItemDateAttr rejects a month without a day -- confirm intent.
+VALID_DATE_FORMATS = [
+    "{year}",
+    "{year_1}-{year_2}",
+    "{year}/{year_2}",
+    "{year}-{month}-{day}",
+    "{year}-{month}",
+]
+@attrs.define(frozen=True, slots=True)
+class KeywordsAttr:
+    """
+    Keywords of a publication, one `Keyword` per level.
+    All three levels are required; none declares a default.
+    Args:
+        level_1: Keyword
+        level_2: Keyword
+        level_3: Keyword
+    """
+    level_1: Keyword
+    level_2: Keyword
+    level_3: Keyword
+class PageValidationError(Exception):
+    """Raised by `PageAttr` when `end` is set but `start` is empty."""
+    pass
+@attrs.define(frozen=True, slots=True)
+class PageAttr:
+    """
+    Page numbers of a publication. Can be a range, roman numerals, or a single page.
+    Both parts are strings; an empty string means "not set".
+    Args:
+        start: str
+        end: str
+    Raises:
+        PageValidationError: if `end` is non-empty while `start` is empty.
+    """
+    start: str
+    end: str
+    def __attrs_post_init__(self) -> None:
+        # Nothing to validate unless an end page was given.
+        if not self.end:
+            return
+        if not self.start:
+            raise PageValidationError("If 'end' is set, 'start' must not be empty.")
+class BibItemValidationError(Exception):
+    """Raised by `BibItem` and `CrossrefBibItemAttr` when a crossref carries the same bibkey as its owner."""
+    pass
+@attrs.define(frozen=True, slots=True)
+class BibItem:
+    """
+    A single bibliographic entry: standard BibTeX-style fields plus underscore-prefixed project-specific fields.
+    Only `id` and `_bib_info_source` declare defaults; every other field must be supplied at construction.
+    The field types allow an "empty" value per field: `""` for `str` and `MaybeStr[...]` fields,
+    `None` for `Maybe[...]` fields, and an empty tuple for `Tuple[..., ...]` fields.
+    Note: per the attrs documentation, underscore-prefixed fields are passed to `__init__` without the
+    leading underscore.
+    Raises:
+        BibItemValidationError: if `crossref` is set and its bibkey equals this item's `bibkey`.
+    """
+    # Internal bookkeeping notes (plain strings)
+    _to_do_general: str
+    _change_request: str
+    # Official fields, may be stored in different formats
+    entry_type: TBibTeXEntryType
+    bibkey: MaybeStr[BibKeyAttr]
+    author: Tuple[Author, ...]
+    editor: Tuple[Author, ...]
+    options: Tuple[str, ...]
+    # shorthand: BibStringAttr  # Mononym of the author
+    date: BibItemDateAttr | Literal["no date"]  # the literal "no date" marks an undated item
+    pubstate: TPubState
+    title: MaybeStr[BibStringAttr]
+    booktitle: MaybeStr[BibStringAttr]
+    crossref: MaybeStr[CrossrefBibItemAttr]
+    journal: Maybe[Journal]
+    volume: str
+    number: str
+    pages: Tuple[PageAttr, ...]
+    eid: str
+    series: MaybeStr[BaseNamedRenderable]
+    address: MaybeStr[BibStringAttr]
+    institution: MaybeStr[BibStringAttr]
+    school: MaybeStr[BibStringAttr]
+    publisher: MaybeStr[BibStringAttr]
+    type: MaybeStr[BibStringAttr]
+    edition: Maybe[int]
+    note: MaybeStr[BibStringAttr]
+    issuetitle: MaybeStr[BibStringAttr]
+    _guesteditor: Tuple[Author, ...]  # Custom field
+    _extra_note: MaybeStr[BibStringAttr]  # Custom field
+    urn: str
+    eprint: str
+    doi: str
+    url: str
+    # Project-specific fields (mixed types, not only strings)
+    _kws: MaybeStr[KeywordsAttr]
+    _epoch: TEpoch
+    _person: MaybeStr[Author]
+    _comm_for_profile_bib: str
+    _langid: TLanguageID
+    _lang_der: str
+    _further_refs: Tuple[BibKeyAttr, ...]
+    _depends_on: Tuple[BibKeyAttr, ...]
+    _dltc_num: Maybe[int]
+    _spec_interest: str
+    _note_perso: str
+    _note_stock: str
+    _note_status: str
+    _num_inwork_coll: Maybe[int]
+    _num_inwork: str
+    _num_coll: Maybe[int]
+    _dltc_copyediting_note: str
+    _note_missing: str
+    _num_sort: Maybe[int]
+    # Additional fields (the only ones with defaults)
+    id: Maybe[int] = None
+    _bib_info_source: str = ""
+    def __attrs_post_init__(self) -> None:
+        # An item must not cross-reference an entry that carries its own bibkey.
+        if self.crossref and self.bibkey == self.crossref.bibkey:
+            raise BibItemValidationError("Crossref bibkey must be different from the main bibkey.")
+@attrs.define(frozen=True, slots=True)
+class CrossrefBibItemAttr(BibItem):
+    """
+    A cross-reference to another bibliographic item.
+    Carries every `BibItem` field and adds stricter validation on top.
+    Raises:
+        ValueError: if `entry_type` is not "book", or if `booktitle` or `bibkey` is empty.
+        BibItemValidationError: if this item has its own `crossref` with the same bibkey.
+    """
+    def __attrs_post_init__(self) -> None:
+        is_book = self.entry_type == "book"
+        if not is_book:
+            raise ValueError("Crossref must have a 'type' of 'book'.")
+        # Required-value checks, in the order their errors take precedence.
+        required_values = (
+            (self.booktitle, "Crossref must have a 'booktitle'."),
+            (self.bibkey, "Crossref must have a 'bibkey'."),
+        )
+        for field_value, message in required_values:
+            if not field_value:
+                raise ValueError(message)
+        # Same self-reference guard as BibItem, repeated because this method replaces the parent's hook.
+        nested = self.crossref
+        if nested and self.bibkey == nested.bibkey:
+            raise BibItemValidationError("Crossref bibkey must be different from the main bibkey.")

philoch_bib_sdk/logic/models_staging.py ADDED Viewed

@@ -0,0 +1,173 @@
+"""Data models for staged bibliography matching.
+This module provides models for tracking fuzzy matching results when comparing
+new BibItems against an existing bibliography.
+"""
+import json
+from enum import StrEnum
+from typing import Tuple, TypedDict
+import attrs
+class SearchMetadata(TypedDict, total=False):
+    """Metadata about a fuzzy matching search operation.
+    Declared with `total=False`, so every key may be absent.
+    Attributes:
+        search_time_ms: Time taken for the search in milliseconds
+        candidates_searched: Number of candidates evaluated
+        scorer: Which scorer was used ("rust" or "python")
+    """
+    search_time_ms: int
+    candidates_searched: int
+    scorer: str
+from philoch_bib_sdk.converters.plaintext.author.formatter import format_author
+from philoch_bib_sdk.logic.models import BibItem
+class ScoreComponent(StrEnum):
+    """Components used in calculating similarity scores between BibItems.
+    Being a StrEnum, each member is also its string value; `Match.to_json_summary` uses that value as a key.
+    """
+    TITLE = "title"
+    AUTHOR = "author"
+    DATE = "date"
+    DOI = "doi"
+    JOURNAL_VOLUME_NUMBER = "journal_volume_number"
+    PAGES = "pages"
+    PUBLISHER = "publisher"
+@attrs.define(frozen=True, slots=True)
+class PartialScore:
+    """Individual score component with weight and explanation.
+    The class stores the values as given; it does not check the weight range or recompute `weighted_score`.
+    Attributes:
+        component: The type of comparison (title, author, etc.)
+        score: Raw score value (before weighting)
+        weight: Weight factor applied to this component (0.0-1.0)
+        weighted_score: Final score after applying weight (score * weight)
+        details: Human-readable explanation of the score
+    """
+    component: ScoreComponent
+    score: int
+    weight: float
+    weighted_score: float
+    details: str
+@attrs.define(frozen=True, slots=True)
+class Match:
+    """One candidate match together with its scoring breakdown.
+    Attributes:
+        bibkey: The bibliography key of the matched item
+        matched_bibitem: The full BibItem that was matched
+        total_score: Sum of all weighted partial scores (stored as given, not recomputed here)
+        partial_scores: Detailed breakdown of each score component
+        rank: Position in the results (1-based, 1 = best match)
+    """
+    bibkey: str
+    matched_bibitem: BibItem
+    total_score: float
+    partial_scores: Tuple[PartialScore, ...]
+    rank: int
+    def to_json_summary(self) -> dict[str, object]:
+        """Build a JSON-serializable summary of this match.
+        Title and author are cut to 100 characters plus "..." when longer; scores are rounded to 2 decimals.
+        Returns:
+            Dictionary with bibkey, rank, total score, title, author, and a per-component score breakdown
+        """
+        from philoch_bib_sdk.logic.models import BibStringAttr
+        def _clip(text: str) -> str:
+            # Keep the summary compact: at most 100 characters, with an ellipsis marking the cut.
+            if len(text) > 100:
+                return text[:100] + "..."
+            return text
+        item = self.matched_bibitem
+        # A title that is not a BibStringAttr (i.e. the "" marker) is reported as an empty string.
+        raw_title = item.title.simplified if isinstance(item.title, BibStringAttr) else ""
+        short_title = _clip(raw_title)
+        short_author = _clip(format_author(item.author, "simplified"))
+        summary: dict[str, object] = {
+            "bibkey": self.bibkey,
+            "rank": self.rank,
+            "total_score": round(self.total_score, 2),
+            "title": short_title,
+            "author": short_author,
+        }
+        # Keyed by component value; a repeated component overwrites the earlier entry.
+        breakdown: dict[str, dict[str, object]] = {}
+        for partial in self.partial_scores:
+            breakdown[partial.component.value] = {
+                "score": partial.score,
+                "weight": partial.weight,
+                "weighted": round(partial.weighted_score, 2),
+                "details": partial.details,
+            }
+        summary["score_breakdown"] = breakdown
+        return summary
+@attrs.define(frozen=True, slots=True)
+class BibItemStaged:
+    """An incoming BibItem paired with the results of matching it against a bibliography.
+    Used for processing new/incoming bibliographic entries and comparing them
+    against an existing bibliography to find potential matches or duplicates.
+    Attributes:
+        bibitem: The new/incoming item to match
+        top_matches: Top N best matches found in the bibliography
+        search_metadata: Performance and search statistics
+    """
+    bibitem: BibItem
+    top_matches: Tuple[Match, ...]
+    search_metadata: SearchMetadata
+    def to_csv_row(self) -> dict[str, str | int | float]:
+        """Flatten this staged item into one CSV row; match details are embedded as a JSON string.
+        The first element of `top_matches` is reported as the best match.
+        Returns:
+            Dictionary suitable for CSV writing with json-encoded top_matches
+        """
+        from philoch_bib_sdk.converters.plaintext.bibitem.bibkey_formatter import format_bibkey
+        from philoch_bib_sdk.logic.models import BibItemDateAttr, BibStringAttr
+        item = self.bibitem
+        # Only a structured date yields a year; the "no date" marker leaves the column empty.
+        staged_year = str(item.date.year) if isinstance(item.date, BibItemDateAttr) else ""
+        # Best match columns fall back to 0.0 / "" when there are no matches.
+        leader = self.top_matches[0] if self.top_matches else None
+        best_match_score = leader.total_score if leader is not None else 0.0
+        best_match_bibkey = leader.bibkey if leader is not None else ""
+        staged_bibkey = format_bibkey(item.bibkey)
+        # A title that is not a BibStringAttr (i.e. the "" marker) is exported as an empty string.
+        staged_title = item.title.simplified if isinstance(item.title, BibStringAttr) else ""
+        metadata = self.search_metadata
+        return {
+            "staged_bibkey": staged_bibkey,
+            "staged_title": staged_title,
+            "staged_author": format_author(item.author, "simplified"),
+            "staged_year": staged_year,
+            "num_matches": len(self.top_matches),
+            "best_match_score": round(best_match_score, 2),
+            "best_match_bibkey": best_match_bibkey,
+            "top_matches_json": json.dumps(tuple(m.to_json_summary() for m in self.top_matches)),
+            # The metadata keys are optional, so missing values are exported as 0.
+            "search_time_ms": metadata.get("search_time_ms", 0),
+            "candidates_searched": metadata.get("candidates_searched", 0),
+        }