PyPI - biblealignlib - Versions diffs - 0.3.1__tar.gz → 0.3.2__tar.gz - Mend

biblealignlib 0.3.1tar.gz → 0.3.2tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

{biblealignlib-0.3.1 → biblealignlib-0.3.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblealignlib
-Version: 0.3.1
+Version: 0.3.2
 Summary: Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments.
 License: MIT
 License-File: LICENSE
@@ -36,9 +36,13 @@ Description-Content-Type: text/markdown
 # biblealignlib
-Biblica's code for working with Bible alignment data from
+Biblica's Python code for working with Bible alignment data from
 https://github.com/Clear-Bible/Alignments .
+This code is ©2024-2026 by [Biblica, Inc](http://biblica.com) and is
+licensed under [CC BY SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
 ## Installing extra dependencies
 ### eflomal

{biblealignlib-0.3.1 → biblealignlib-0.3.2}/README.md RENAMED Viewed

@@ -1,8 +1,12 @@
 # biblealignlib
-Biblica's code for working with Bible alignment data from
+Biblica's Python code for working with Bible alignment data from
 https://github.com/Clear-Bible/Alignments .
+This code is ©2024-2026 by [Biblica, Inc](http://biblica.com) and is
+licensed under [CC BY SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
 ## Installing extra dependencies
 ### eflomal

{biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/autoalign/mapper.py RENAMED Viewed

@@ -18,9 +18,9 @@ commonly used by automated alignment algorithms.
 >>> pm.bcv["mappings"]["41004003"]
 <CorpusMapping: 41004003>
 >>> pm.bcv["mappings"]["41004003"].source_pairs
-[(<Source: n41004003001>, 0), (<Source: n41004003002>, 1), (<Source: n41004003003>, 2), ...
+[(<Source: n41004003001|Ἀκούετε>, 0), (<Source: n41004003002|ἰδοὺ>, 1), (<Source: n41004003003|ἐξῆλθεν>, 2), ...
 >>> pm.bcv["mappings"]["41004003"].target_pairs
-[(<Target: 410040030011>, 0), (<Target: 410040030021>, 1), (<Target: 410040030031>, 2), ...
+[(<Target: 410040030011|Listen>, 0), (<Target: 410040030021|A>, 1), (<Target: 410040030031|sower>, 2), ...
 """

{biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/AlignmentGroup.py RENAMED Viewed

@@ -283,6 +283,14 @@ class AlignmentRecord:
         """True if any selectors in references are incomplete."""
         return any(ref.incomplete for ref in self.references.values())
+    def update_target_selectors(self, selectors: list[str]) -> None:
+        """Overwrite the selectors of this record's "target" reference.
+        The stored value is a new, sorted list built from `selectors`,
+        which matches what AlignmentReference.__post_init__ does.
+        """
+        ordered = sorted(selectors)
+        self.references["target"].selectors = ordered
     def asdict(
         self, positional: bool = False, withmeta: bool = True, withmaculaprefix: bool = False
     ) -> dict[str, Any]:

{biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/BaseToken.py RENAMED Viewed

@@ -25,7 +25,7 @@ class BaseToken:
     def __repr__(self) -> str:
         """Return a printed representation."""
-        return f"<{self.__class__.__name__}: {self.id}>"
+        return f"<{self.__class__.__name__}: {self.tokenstr}>"
     #
     def __hash__(self) -> int:

{biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/VerseData.py RENAMED Viewed

@@ -156,6 +156,16 @@ class VerseData:
                 for trg in targets:
                     print(f"Target: {trg._display}")
+    def display_record(self, alrec: AlignmentRecord) -> None:
+        """Print one alignment record as source and target token strings.
+        Each selector of alrec is looked up in this instance's
+        sourceitems / targetitems, and the tokenstr values are joined
+        with ", ". The printed line has the form
+        "<meta id>: <sources> --- <targets>".
+        """
+        def _joined(items, selectors) -> str:
+            # comma-separated tokenstr values for the given selectors
+            return ", ".join(items[selector].tokenstr for selector in selectors)
+        sources = _joined(self.sourceitems, alrec.source_selectors)
+        targets = _joined(self.targetitems, alrec.target_selectors)
+        print(f"{alrec.meta.id}: {sources} --- {targets}")
     def unaligned(self, typeattr: str = "targets", keepexcluded: bool = False) -> None:
         """Display tokens from typeattr that are _not_ aligned."""
         assert typeattr in self._typeattrs, f"typeattr should be one of {self._typeattrs}"
@@ -172,13 +182,13 @@ class VerseData:
         if aligned:
             for sources, targets in self.alignments:
                 print(
-                    f"{str([src.idtext for src in sources]):{srcwidth}}\t\t{[trg.idtext for trg in targets]}"
+                    f"{str([src.tokenstr for src in sources]):{srcwidth}}\t\t{[trg.tokenstr for trg in targets]}"
                 )
         else:
             # show all sources with their (possibly empty) target alignments
             for source in self.sources:
                 print(
-                    f"{str(source.idtext):{srcwidth}}\t\t{[trg.idtext for trg in self.get_source_alignments(source)]}"
+                    f"{str(source.tokenstr):{srcwidth}}\t\t{[trg.tokenstr for trg in self.get_source_alignments(source)]}"
                 )
     def get_texts(

{biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/alignments.py RENAMED Viewed

@@ -301,6 +301,7 @@ def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True)
     """Write JSON data for an arbitrary group in Scripture Burrito format.
     Writes some of the JSON by hand to get records on the same line.
+    Record meta.id values are assigned sequentially per BCV, e.g. "40001001.01".
     """
     def _write_documents(out: TextIO, documents: tuple[Document, Document]) -> None:
@@ -311,19 +312,32 @@ def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True)
         out.write(" ],\n")
     def _write_meta(out: TextIO, meta: Metadata) -> None:
-        """Write metdatadata to out."""
+        """Write metadata to out."""
         metarow = '"meta": ' + json.dumps(meta.asdict())
         f.write(f" {metarow},\n")
+    def _record_dict(arec: AlignmentRecord, bcv_counters: dict[str, int]) -> dict[str, Any]:
+        """Serialize arec, replacing its meta id with a per-BCV sequence id.
+        The counter for the record's source BCV in bcv_counters is
+        incremented in place, and the new id is "<bcv>.<n>" with n
+        zero-padded to at least two digits (e.g. "40001001.01").
+        This converts the ClearAligner opaque IDs to something
+        meaningful, attempting to make files more diff-able.
+        """
+        verse = arec.source_bcv
+        sequence = bcv_counters.get(verse, 0) + 1
+        bcv_counters[verse] = sequence
+        serialized = arec.asdict()
+        serialized["meta"]["id"] = f"{verse}.{sequence:02}"
+        return serialized
     f.write("{\n")
     _write_documents(f, group.documents)
     _write_meta(f, group.meta)
     f.write(f' "roles": {json.dumps(group.roles)},\n')
     f.write(f' "type": "{group._type}",\n "records": [\n ')
     # should sort the records: NIV11 doesn't appear to be sorted
+    bcv_counters: dict[str, int] = {}
     for arec in group.records[:-1]:
-        json.dump(arec.asdict(), f)
+        json.dump(_record_dict(arec, bcv_counters), f)
         f.write(",\n ")
     # now the last one without a comma, because JSON
-    json.dump(group.records[-1].asdict(), f)
+    json.dump(_record_dict(group.records[-1], bcv_counters), f)
     f.write("\n ]}")

{biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/manager.py RENAMED Viewed

@@ -114,6 +114,10 @@ class Manager(UserDict):
             keepbadrecords=self.keepbadrecords,
         )
         self.alignmentsreader.clean_alignments(self.sourceitems, self.targetitems)
+        # TODO: upgrade the selectors to use tokenstr. This requires
+        # knowing the source and targetitems, but alignmentsreader
+        # doesn't have that data
+        # self.add_tokenstr_to_records(self)
         # group records by BCV
         self.bcv["records"] = groupby_bcv(
             list(self.alignmentsreader.alignmentgroup.records), lambda r: r.source_bcv

{biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/source.py RENAMED Viewed

@@ -17,12 +17,11 @@ called from burrito.manager.Manager().
 5468
 # dict: token ID -> Source() instance
 >>> src["n41004003001"]
-src["n41004003001"]
-<Source: n41004003001>
+<Source: n41004003001|Ἀκούετε>
 >>> src["n41004003001"].display()
 n41004003001: Ἀκούετε		 (Listen, ἀκούω, verb)
->>> src["n41004003001"].idtext
-('n41004003001', 'Ἀκούετε')
+>>> src["n41004003001"].tokenstr
+'n41004003001|Ἀκούετε'
 >>> src["n41004003001"].asdict()
 {'identifier': 'n41004003001',
  'altId': 'Ἀκούετε-1',

{biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/target.py RENAMED Viewed

@@ -1,4 +1,14 @@
-"""Manage the target/translation data for Grape City (gc) alignment data.
+"""Manage the target/translation data for alignment data.
+This typically reads the output of kathairo.
+Limitations:
+- Each token is assigned to the relevant source verse, which may be
+  different than the verse assignments in the target text. This is
+  version-specific, not necessarily a versification issue. So verse
+  identifiers may need mapping.
+    - Example: SBLGNT for 3JN has v. 15, but all these tokens are in v. 14 in the NIV11.
 >>> from biblealignlib.burrito import target
 # Reading is normally done by Manager
@@ -15,6 +25,7 @@
 # write the tokens out
 >>> LANGDATAPATH = CLEARROOT / "alignments-eng/data"
 >>> tr.write_tsv(tokenlist=tr.data.values(), outpath=(LANGDATAPATH / "targets/BSB/new-nt_BSB.tsv"))
 """
 from collections import UserDict, defaultdict

biblealignlib-0.3.2/biblealignlib/util/DiffAlignments.py ADDED Viewed

@@ -0,0 +1,168 @@
+"""Compare two alignment groups record by record.
+Both groups must share the same sourceid, targetid, and
+targetlanguage. This is most useful for checking minor changes to
+ensure you haven't introduced errors.
+Comparison ignores meta.id (which is assigned on write) but reports
+differences in targets and all other meta fields (status, origin, creator, note).
+>>> from biblealignlib.burrito import CLEARROOT, AlignmentSet
+>>> from biblealignlib.util.DiffAlignments import DiffAlignments
+>>> LANGDATAPATH = CLEARROOT / "alignments-eng/data"
+>>> alset1 = AlignmentSet(sourceid="SBLGNT", targetid="BSB",
+...                       targetlanguage="eng", langdatapath=LANGDATAPATH,
+...                       alternateid="manual")
+>>> alset2 = AlignmentSet(sourceid="SBLGNT", targetid="BSB",
+...                       targetlanguage="eng", langdatapath=LANGDATAPATH,
+...                       alternateid="updated")
+>>> da = DiffAlignments(alset1, alset2)
+>>> da.show()
+"""
+from dataclasses import dataclass, field
+from ..burrito.AlignmentGroup import AlignmentGroup, AlignmentRecord
+from ..burrito.AlignmentSet import AlignmentSet
+from ..burrito.alignments import AlignmentsReader
+# Meta fields compared between records (id is intentionally excluded)
+_COMPARED_META_FIELDS = ("creator", "note", "origin", "status")
+def _record_key(rec: AlignmentRecord) -> tuple[str, ...]:
+    """Build the key used to match a record across two groups.
+    The key is the record's source selectors in sorted order, as a
+    tuple; targets are left out because they may differ between groups.
+    """
+    ordered = sorted(rec.source_selectors)
+    return tuple(ordered)
+def _meta_diffs(rec1: AlignmentRecord, rec2: AlignmentRecord) -> dict[str, tuple[str, str]]:
+    """Collect the compared meta fields whose values differ.
+    Only the fields named in _COMPARED_META_FIELDS are examined (id is
+    not among them). A field missing from a record's meta is treated
+    as "". The result maps field name -> (str of rec1's value, str of
+    rec2's value) and is empty when nothing differs.
+    """
+    changed: dict[str, tuple[str, str]] = {}
+    for name in _COMPARED_META_FIELDS:
+        before, after = getattr(rec1.meta, name, ""), getattr(rec2.meta, name, "")
+        if before != after:
+            changed[name] = (str(before), str(after))
+    return changed
+@dataclass
+class RecordDiff:
+    """Differences between two records matched on their source selectors."""
+    # source selectors shared by the two compared records
+    source_selectors: tuple[str, ...]
+    # target selectors of the first and of the second record
+    targets1: list[str] = field(default_factory=list)
+    targets2: list[str] = field(default_factory=list)
+    # differing meta fields (id excluded): name -> (first value, second value)
+    meta_diffs: dict[str, tuple[str, str]] = field(default_factory=dict)
+    @property
+    def targets_differ(self) -> bool:
+        """Report whether the two target selector lists are unequal."""
+        return self.targets1 != self.targets2
+    def __repr__(self) -> str:
+        """Return a one-line summary that lists only what differs."""
+        pieces = ["<RecordDiff src=[" + ", ".join(self.source_selectors) + "]"]
+        if self.targets_differ:
+            pieces.append(f" targets: {self.targets1} -> {self.targets2}")
+        pieces.extend(
+            f" {name}: {old!r} -> {new!r}" for name, (old, new) in self.meta_diffs.items()
+        )
+        return "".join(pieces) + ">"
+class DiffAlignments:
+    """Compare two alignment groups from the same source/target pair.
+    Records are matched by their sorted source selectors. Differences in
+    target selectors and metadata (excluding id) are reported.
+    Results are computed in __init__ and stored on the instance:
+    - only_in_1 / only_in_2: records whose key occurs in one group only
+    - record_diffs: RecordDiff instances for keys present in both groups
+      whose targets or compared meta fields differ
+    - duplicate_keys1 / duplicate_keys2: keys shared by more than one
+      record within a single group. Only the last record with such a
+      key takes part in the comparison, so these are surfaced rather
+      than silently dropped.
+    """
+    def __init__(self, alset1: AlignmentSet, alset2: AlignmentSet) -> None:
+        """Read both alignment sets and compute their differences.
+        Raises ValueError if the sets differ on sourceid, targetid, or
+        targetlanguage.
+        """
+        for attr in ("sourceid", "targetid", "targetlanguage"):
+            v1 = getattr(alset1, attr)
+            v2 = getattr(alset2, attr)
+            if v1 != v2:
+                raise ValueError(f"AlignmentSets differ on {attr!r}: {v1!r} vs {v2!r}")
+        self.alset1 = alset1
+        self.alset2 = alset2
+        self.group1: AlignmentGroup = AlignmentsReader(alset1).alignmentgroup
+        self.group2: AlignmentGroup = AlignmentsReader(alset2).alignmentgroup
+        # index each group's records by source-selector key, remembering
+        # any keys that more than one record maps to
+        self._recs1, self.duplicate_keys1 = self._index_records(self.group1.records)
+        self._recs2, self.duplicate_keys2 = self._index_records(self.group2.records)
+        keys1 = set(self._recs1)
+        keys2 = set(self._recs2)
+        # records present only in one group
+        self.only_in_1: list[AlignmentRecord] = [self._recs1[k] for k in sorted(keys1 - keys2)]
+        self.only_in_2: list[AlignmentRecord] = [self._recs2[k] for k in sorted(keys2 - keys1)]
+        # records present in both; compare targets and meta
+        self.record_diffs: list[RecordDiff] = []
+        for key in sorted(keys1 & keys2):
+            r1, r2 = self._recs1[key], self._recs2[key]
+            # sort so that selector order alone never counts as a difference
+            t1, t2 = sorted(r1.target_selectors), sorted(r2.target_selectors)
+            mdiffs = _meta_diffs(r1, r2)
+            if t1 != t2 or mdiffs:
+                self.record_diffs.append(
+                    RecordDiff(source_selectors=key, targets1=t1, targets2=t2, meta_diffs=mdiffs)
+                )
+    @staticmethod
+    def _index_records(
+        records,
+    ) -> tuple[dict[tuple[str, ...], AlignmentRecord], list[tuple[str, ...]]]:
+        """Index records by source-selector key.
+        Returns (index, duplicates). When several records share a key the
+        last one is kept in the index, and the key is listed once in the
+        sorted duplicates list.
+        """
+        index: dict[tuple[str, ...], AlignmentRecord] = {}
+        duplicates: set[tuple[str, ...]] = set()
+        for rec in records:
+            key = _record_key(rec)
+            if key in index:
+                duplicates.add(key)
+            index[key] = rec
+        return index, sorted(duplicates)
+    @property
+    def has_diffs(self) -> bool:
+        """True if any differences were found.
+        Duplicate keys within one group are not counted as differences.
+        """
+        return bool(self.only_in_1 or self.only_in_2 or self.record_diffs)
+    def show(self) -> None:
+        """Print a human-readable summary of all differences."""
+        label1 = self.alset1.identifier
+        label2 = self.alset2.identifier
+        print(f"Comparing {label1!r} vs {label2!r}")
+        print(
+            f"  {len(self.group1.records)} records in {label1}, "
+            f"{len(self.group2.records)} records in {label2}"
+        )
+        # warn when the comparison could not see every record
+        for label, dupes in ((label1, self.duplicate_keys1), (label2, self.duplicate_keys2)):
+            if dupes:
+                print(
+                    f"  Warning: {len(dupes)} source-selector key(s) occur in more than "
+                    f"one record in {label}; only the last record for each is compared."
+                )
+        if not self.has_diffs:
+            print("  No differences found.")
+            return
+        if self.only_in_1:
+            print(f"\n  Records only in {label1} ({len(self.only_in_1)}):")
+            for rec in self.only_in_1:
+                src = ", ".join(rec.source_selectors)
+                print(f"    - src=[{src}]  tgt={rec.target_selectors}")
+        if self.only_in_2:
+            print(f"\n  Records only in {label2} ({len(self.only_in_2)}):")
+            for rec in self.only_in_2:
+                src = ", ".join(rec.source_selectors)
+                print(f"    + src=[{src}]  tgt={rec.target_selectors}")
+        if self.record_diffs:
+            print(f"\n  Records with differences ({len(self.record_diffs)}):")
+            for diff in self.record_diffs:
+                src = ", ".join(diff.source_selectors)
+                print(f"    src=[{src}]")
+                if diff.targets_differ:
+                    print(f"      targets: {diff.targets1}")
+                    print(f"            -> {diff.targets2}")
+                for fname, (v1, v2) in diff.meta_diffs.items():
+                    print(f"      {fname}: {v1!r} -> {v2!r}")

biblealignlib 0.3.1__tar.gz → 0.3.2__tar.gz

biblealignlib 0.3.1tar.gz → 0.3.2tar.gz