PyPI - biblealignlib - Versions diffs - 0.3.2__tar.gz → 0.4.0__tar.gz - Mend

biblealignlib 0.3.2tar.gz → 0.4.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

{biblealignlib-0.3.2 → biblealignlib-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: biblealignlib
-Version: 0.3.2
+Version: 0.4.0
 Summary: Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments.
 License: MIT
 License-File: LICENSE

{biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/AlignmentGroup.py RENAMED Viewed

@@ -27,7 +27,7 @@ from biblelib.word import bcvwpid
 import biblealignlib as bal
 from .AlignmentType import TranslationType
-from .source import macula_prefixer
+from .source import macula_prefixer, macula_unprefixer
 # hoisting means this can be defined at several different levels, so
@@ -291,8 +291,15 @@ class AlignmentRecord:
         """
         self.references["target"].selectors = sorted(selectors)
+    # note that source/target_tokens are only available from a Manager
+    # instance, so the default repr doesn't include tokenstrs.
     def asdict(
-        self, positional: bool = False, withmeta: bool = True, withmaculaprefix: bool = False
+        self,
+        positional: bool = False,
+        withmeta: bool = True,
+        withmaculaprefix: bool = False,
+        source_tokens: Optional[dict[str, Any]] = None,
+        target_tokens: Optional[dict[str, Any]] = None,
     ) -> dict[str, Any]:
         """Return a dict of values suitable for serialization.
@@ -307,6 +314,14 @@ class AlignmentRecord:
         With withmaculaprefix=True (the default is False), prefix
         source references with 'o' or 'n' depending on canon.
+        With source_tokens provided as a dict mapping bare token IDs to token
+        objects, source selectors are replaced with tokenstr representations
+        ("{id}|{text}"). With withmaculaprefix=True, the prefixed ID is used.
+        With target_tokens provided as a dict mapping token IDs to token
+        objects, target selectors are replaced with tokenstr representations
+        ("{id}|{text}").
         """
         recdict: dict[str, Any] = {}
         if positional:
@@ -319,12 +334,28 @@ class AlignmentRecord:
         else:
             # typical case
             sourcerefs: list[str] = self.references["source"].selectors
-            if withmaculaprefix:
+            if source_tokens is not None:
+                # Build tokenstr: use bare ID by default, prefixed ID if withmaculaprefix
+                bare_ids = [macula_unprefixer(sel) for sel in sourcerefs]
+                display_ids = (
+                    [macula_prefixer(b) for b in bare_ids] if withmaculaprefix else bare_ids
+                )
+                sourcerefs = [
+                    f"{did}|{tok.text}" if (tok := source_tokens.get(bare)) else did
+                    for bare, did in zip(bare_ids, display_ids)
+                ]
+            elif withmaculaprefix:
                 # default: add back the Macula prefix
                 sourcerefs = [macula_prefixer(srcstr) for srcstr in sourcerefs]
             # else leave as is (atypical)
             recdict["source"] = sourcerefs
-            recdict["target"] = self.references["target"].selectors
+            targetrefs: list[str] = self.references["target"].selectors
+            if target_tokens is not None:
+                targetrefs = [
+                    f"{sel}|{tok.text}" if (tok := target_tokens.get(sel)) else sel
+                    for sel in targetrefs
+                ]
+            recdict["target"] = targetrefs
         if withmeta:
             recdict.update(
                 {
@@ -380,12 +411,25 @@ class AlignmentGroup:
         docids: tuple[str, str] = tuple([doc.asdict()["docid"] for doc in self.documents])
         return f"<AlignmentGroup{docids}: {len(self.records)} records>"
-    def asdict(self, hoist: bool = True) -> dict[str, Any]:
+    def asdict(
+        self,
+        hoist: bool = True,
+        source_tokens: Optional[dict[str, Any]] = None,
+        target_tokens: Optional[dict[str, Any]] = None,
+    ) -> dict[str, Any]:
         """Return a dict of values suitable for serialization.
         This is opinionated about the preferred serialization: hoists
         as much as possible to upper levels.
+        With source_tokens provided as a dict mapping bare token IDs to token
+        objects, source selectors in each record are replaced with tokenstr
+        representations ("{id}|{text}").
+        With target_tokens provided as a dict mapping token IDs to token
+        objects, target selectors in each record are replaced with tokenstr
+        representations ("{id}|{text}").
         """
         # for now
         positional: bool = False
@@ -395,7 +439,13 @@ class AlignmentGroup:
             "meta": self.meta.asdict(),
             "type": self._type,
             "records": [
-                rec.asdict(positional=positional, withmeta=withmeta) for rec in self.records
+                rec.asdict(
+                    positional=positional,
+                    withmeta=withmeta,
+                    source_tokens=source_tokens,
+                    target_tokens=target_tokens,
+                )
+                for rec in self.records
             ],
         }
@@ -446,10 +496,27 @@ class TopLevelGroups:
         """Return a printed representation."""
         return f"<TopLevelGroups({self.targetdocid}): {self.sourcedocids}>"
-    def asdict(self, hoist: bool = True) -> dict[str, Any]:
-        """Return an opionated dict of values suitable for serialization."""
+    def asdict(
+        self,
+        hoist: bool = True,
+        source_tokens: Optional[dict[str, Any]] = None,
+        target_tokens: Optional[dict[str, Any]] = None,
+    ) -> dict[str, Any]:
+        """Return an opinionated dict of values suitable for serialization.
+        With source_tokens and target_tokens, passes them to each group's
+        asdict() so that selectors are replaced with tokenstr representations.
+        """
         return {
             "format": self.format,
             "version": self.version,
-            "groups": [self.groups[0].asdict(hoist=hoist), self.groups[1].asdict(hoist=hoist)],
+            "groups": [
+                self.groups[0].asdict(
+                    hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens
+                ),
+                self.groups[1].asdict(
+                    hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens
+                ),
+            ],
         }

{biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/BaseToken.py RENAMED Viewed

@@ -92,3 +92,12 @@ def bare_id(identifier: str) -> str:
         identifier
     ), f"'{identifier}' does not look like a valid BCVWPID identifier."
     return identifier[1:] if identifier[0].isalpha() else identifier
+def strip_tokenstr(selector: str) -> str:
+    """Return only the ID portion of a selector, dropping any tokenstr text suffix.
+    A tokenstr selector has the form "{id}|{text}" (e.g. "n41004003001|Ἀκούετε").
+    Plain IDs without a '|' are returned unchanged.
+    """
+    return selector.split("|", 1)[0] if "|" in selector else selector

{biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/VerseData.py RENAMED Viewed

@@ -156,15 +156,32 @@ class VerseData:
                 for trg in targets:
                     print(f"Target: {trg._display}")
-    def display_record(self, alrec: AlignmentRecord) -> None:
-        """Display an alignment record from this instance."""
+    def display_record(self, alrec: AlignmentRecord, srcwidth: Optional[int] = None) -> None:
+        """Display an alignment record from this instance.
+        srcwidth sets the minimum column width for the source token string;
+        defaults to the length of the source token string (no padding).
+        The source column is left-justified within that width.
+        """
         source_tokenstring: str = ", ".join(
             [self.sourceitems[sel].tokenstr for sel in alrec.source_selectors]
         )
         target_tokenstring: str = ", ".join(
             [self.targetitems[sel].tokenstr for sel in alrec.target_selectors]
         )
-        print(f"{alrec.meta.id}: {source_tokenstring} --- {target_tokenstring}")
+        width = srcwidth if srcwidth is not None else len(source_tokenstring)
+        print(f"{alrec.meta.id}: {source_tokenstring:<{width}} --- {target_tokenstring}")
+    def display_records(self) -> None:
+        """Display all alignment records with a consistent source column width."""
+        if not self.records:
+            return
+        srcwidth: int = max(
+            len(", ".join(self.sourceitems[sel].tokenstr for sel in alrec.source_selectors))
+            for alrec in self.records
+        )
+        for alrec in self.records:
+            self.display_record(alrec, srcwidth=srcwidth)
     def unaligned(self, typeattr: str = "targets", keepexcluded: bool = False) -> None:
         """Display tokens from typeattr that are _not_ aligned."""

{biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/__init__.py RENAMED Viewed

@@ -18,7 +18,7 @@ from .AlignmentSet import AlignmentSet
 from .AlignmentType import TranslationType
 from .alignments import AlignmentsReader, write_alignment_group
 from .manager import Manager, VerseData
-from .BaseToken import BaseToken, asbool, bare_id
+from .BaseToken import BaseToken, asbool, bare_id, strip_tokenstr
 from .DiffRecord import DiffReason, DiffRecord
 from .source import macula_prefixer, macula_unprefixer, Source, SourceReader
 from .target import Target, TargetReader
@@ -42,6 +42,7 @@ __all__ = [
     "BaseToken",
     "asbool",
     "bare_id",
+    "strip_tokenstr",
     # DiffRecord
     "DiffReason",
     "DiffRecord",

{biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/alignments.py RENAMED Viewed

@@ -28,6 +28,7 @@ from .AlignmentGroup import Document, Metadata, AlignmentGroup, AlignmentReferen
 from .AlignmentSet import AlignmentSet
 from .AlignmentType import TranslationType
 from .BadRecord import BadRecord, Reason
+from .BaseToken import strip_tokenstr
 from .source import SourceReader, macula_unprefixer
 from .target import TargetReader
@@ -110,10 +111,12 @@ class AlignmentsReader:
         #
     def _targetid(self, targetid: str) -> str:
-        """Return a normalized target ID.
+        """Return a normalized target ID, optionally dropping the word-part digit.
-        With self.keeptargetwordpart = False, drop the last digit.
+        Accepts both plain IDs and tokenstr selectors ("{id}|{text}").
+        With self.keeptargetwordpart = False, a 12-character ID is truncated to 11.
         """
+        targetid = strip_tokenstr(targetid)
         if not self.keeptargetwordpart and len(targetid) == 12:
             return targetid[:11]
         else:
@@ -297,23 +300,35 @@ class AlignmentsReader:
 # copied from gc2sb.manager.write_alignment_group with minor changes
-def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True) -> None:
+def write_alignment_group(
+    group: AlignmentGroup,
+    f: TextIO,
+    source_tokens: Optional[dict[str, Any]] = None,
+    target_tokens: Optional[dict[str, Any]] = None,
+) -> None:
     """Write JSON data for an arbitrary group in Scripture Burrito format.
     Writes some of the JSON by hand to get records on the same line.
     Record meta.id values are assigned sequentially per BCV, e.g. "40001001.1".
+    With source_tokens provided as a dict mapping bare token IDs to token
+    objects, source selectors are written as tokenstr representations
+    ("{id}|{text}") instead of plain IDs.
+    With target_tokens provided as a dict mapping token IDs to token objects,
+    target selectors are written as tokenstr representations ("{id}|{text}").
     """
     def _write_documents(out: TextIO, documents: tuple[Document, Document]) -> None:
         """Write documents tuple to out."""
         out.write(' "documents": [\n')
-        out.write("    " + json.dumps(documents[0].asdict()) + ",\n")
-        out.write("    " + json.dumps(documents[1].asdict()) + "\n")
+        out.write("    " + json.dumps(documents[0].asdict(), ensure_ascii=False) + ",\n")
+        out.write("    " + json.dumps(documents[1].asdict(), ensure_ascii=False) + "\n")
         out.write(" ],\n")
     def _write_meta(out: TextIO, meta: Metadata) -> None:
         """Write metadata to out."""
-        metarow = '"meta": ' + json.dumps(meta.asdict())
+        metarow = '"meta": ' + json.dumps(meta.asdict(), ensure_ascii=False)
         f.write(f" {metarow},\n")
     def _record_dict(arec: AlignmentRecord, bcv_counters: dict[str, int]) -> dict[str, Any]:
@@ -324,20 +339,21 @@ def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True)
         """
         bcv = arec.source_bcv
         bcv_counters[bcv] = bcv_counters.get(bcv, 0) + 1
-        recdict = arec.asdict()
+        recdict = arec.asdict(source_tokens=source_tokens, target_tokens=target_tokens)
         recdict["meta"]["id"] = f"{bcv}.{bcv_counters[bcv]:02}"
         return recdict
     f.write("{\n")
     _write_documents(f, group.documents)
     _write_meta(f, group.meta)
-    f.write(f' "roles": {json.dumps(group.roles)},\n')
+    f.write(f' "roles": {json.dumps(group.roles, ensure_ascii=False)},\n')
     f.write(f' "type": "{group._type}",\n "records": [\n ')
     # should sort the records: NIV11 doesn't appear to be sorted
     bcv_counters: dict[str, int] = {}
-    for arec in group.records[:-1]:
-        json.dump(_record_dict(arec, bcv_counters), f)
+    records = sorted(group.records)
+    for arec in records[:-1]:
+        json.dump(_record_dict(arec, bcv_counters), f, ensure_ascii=False)
         f.write(",\n ")
     # now the last one without a comma, because JSON
-    json.dump(_record_dict(group.records[-1], bcv_counters), f)
+    json.dump(_record_dict(group.records[-1], bcv_counters), f, ensure_ascii=False)
     f.write("\n ]}")

{biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/manager.py RENAMED Viewed

@@ -35,7 +35,7 @@ from collections import UserDict
 from typing import TypedDict
 from warnings import warn
-from .AlignmentGroup import AlignmentRecord
+from .AlignmentGroup import AlignmentGroup, AlignmentRecord
 from .AlignmentSet import AlignmentSet
 from .VerseData import VerseData
 from .alignments import AlignmentsReader
@@ -114,6 +114,7 @@ class Manager(UserDict):
             keepbadrecords=self.keepbadrecords,
         )
         self.alignmentsreader.clean_alignments(self.sourceitems, self.targetitems)
+        self.alignmentgroup: AlignmentGroup = self.alignmentsreader.alignmentgroup
         # TODO: upgrade the selectors to use tokenstr. This requires
         # knowing the source and targetitems, but alignmentsreader
         # doesn't have that data

{biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/source.py RENAMED Viewed

@@ -53,7 +53,7 @@ from biblealignlib import normalize_strongs, get_canonid
 # should eventually come from Clearlib
 from .util import groupby_key
-from .BaseToken import BaseToken
+from .BaseToken import BaseToken, strip_tokenstr
 PREFIXRE = re.compile(r"^[no]")
@@ -76,7 +76,13 @@ def macula_prefixer(bcvwp: str) -> str:
 def macula_unprefixer(bcvwp: str) -> str:
-    """Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged."""
+    """Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged.
+    Also strips any tokenstr text suffix ("{id}|{text}" → "{id}") before
+    checking for the prefix, so both plain IDs and tokenstr selectors are
+    handled correctly.
+    """
+    bcvwp = strip_tokenstr(bcvwp)
     if PREFIXRE.match(bcvwp):
         return bcvwp[1:]
     else:

{biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/DiffTargets.py RENAMED Viewed

@@ -17,12 +17,7 @@ are stored as DiffRecord instances in a dict keyed by BCV.
 >>> mgr84 = Manager(alset1)
 >>> dt84 = DiffTargets.DiffTargets84(mgr84, niv11targets)
 >>> len(dt84)
-3784
-# now run it again to account for single-token replacements
->>> dt84 = DiffTargets.DiffTargets84(mgr84, niv11targets, dt84.get_single_token_replacements())
->>> len(dt84)
-2860
+2565
 >>> alset2 = AlignmentSet(targetlanguage=targetlang,
         targetid="NIV11",
@@ -38,6 +33,7 @@ are stored as DiffRecord instances in a dict keyed by BCV.
 from __future__ import annotations
 from collections import UserDict
+import copy
 from dataclasses import dataclass
 import difflib
 from itertools import zip_longest
@@ -45,13 +41,18 @@ from pathlib import Path
 from typing import Optional, TextIO, TYPE_CHECKING
 from biblealignlib.burrito import (
+    AlignmentGroup,
     AlignmentRecord,
+    AlignmentReference,
     AlignmentSet,
     BaseToken,
     DiffReason,
     DiffRecord,
+    Document,
     Manager,
+    Metadata,
     Target,
+    TargetReader,
 )
 from ..burrito.alignments import write_alignment_group
 from ..burrito.util import groupby_bcv
@@ -226,15 +227,11 @@ def diff_verse_targets(
     )
-# this is a two-pass operation:
-# Run with default (empty) bcvequivalents
-# Run again supplying get_single_token_replacements() as bcvequivalents
-# >>> dt84 = DiffTargets.DiffTargets84(mgr84, niv11targets.BCVEQUIVALENT84)
-# >>> len(dt84)
-# 3784
-# >>> dt84 = DiffTargets.DiffTargets84(mgr84, niv11targets, dt84.get_single_token_replacements())
-# >>> len(dt84)
-# 2879
+# could try here to find alignment records that are a subset of an
+# equal region, and then map the token IDs?
+# then write out revised records and patch onto the alignment data??
 # this still doesn't handle multi-term direct replacements: for those we need to ensure semantic compatibility
 class DiffTargets84(UserDict):
     missing84: set[str] = {"42023018", "47013014", "64001015"}
@@ -306,21 +303,21 @@ class DiffTargets84(UserDict):
         "64001014033": "64001015018",
     }
     # hacky way to avoid outputting the same alignment record more than once
-    output_alrecs: dict[str, AlignmentRecord] = {}
+    output_alrecs: dict[str, bool] = {}
     def __init__(
         self,
         mgr84: Manager,
-        targets11: dict[str, Target],
+        targets11: TargetReader,
         bcvequivalents: dict[str, dict[str, str]] = {},
     ) -> None:
         super().__init__()
         self.mgr84 = mgr84
         self.niv84bcvtargets: dict[str, list[Target]] = mgr84.bcv["targets"]
-        self.targets11: dict[str, Target] = targets11
+        self.targets11: TargetReader = targets11
         self.bcvequivalents = bcvequivalents
         # not correct for versification differences??
-        self.niv11bcvtargets: dict[str, list[Target]] = groupby_bcv(self.targets11.values())
+        self.niv11bcvtargets: dict[str, list[Target]] = groupby_bcv(list(self.targets11.values()))
         for bcv in self.niv11bcvtargets:
             if bcv not in self.missing84:
                 trg84: list[Target] = self.niv84bcvtargets.get(bcv, [])
@@ -329,40 +326,56 @@ class DiffTargets84(UserDict):
                     self.bcvequivalents.get(bcv, {}) if self.bcvequivalents else {}
                 )
                 record = diff_verse_targets(bcv, trg84, trg11, equivalents)
-                # record.data is like (('equal', 0, 5, 0, 5), ('replace', 5, 6, 5, 6), ('equal', 6, 34, 6, 34), ('replace', 34, 35, 34, 35), ('equal', 35, 38, 35, 38))
-                if record is not None:
+                # record.data is like
+                # (('equal', 0, 5, 0, 5), ('replace', 5, 6, 5, 6), ('equal', 6, 34, 6, 34),
+                # ('replace', 34, 35, 34, 35), ('equal', 35, 38, 35, 38))
+                if record and not self._replaceonly_same_length(record):
+                    # then record as a difference
                     self.data[bcv] = record
         # items that are only replacements
-        self.replaceonly: dict[str, DiffRecord] = {
+        # self.replaceonly: dict[str, DiffRecord] = {
+        #     bcv: drec
+        #     for bcv, drec in self.items()
+        #     if all([(op.opcode in ("equal", "replace")) for op in drec.data])
+        # }
+        # self.single_replaceonly: dict[str, list[Operation]] = {
+        #     bcv: oplist
+        #     for bcv, drec in self.replaceonly.items()
+        #     if (oplist := [op for op in drec.data if op.single_replace])
+        #     if oplist
+        # }
+        # self.dual_replaceonly: dict[str, list[Operation]] = {
+        #     bcv: oplist
+        #     for bcv, drec in self.replaceonly.items()
+        #     if (oplist := [op for op in drec.data if op.dual_replace])
+        #     if oplist
+        # }
+        self.replaceonly_same_length: dict[str, DiffRecord] = {
             bcv: drec
             for bcv, drec in self.items()
-            if all([(op.opcode in ("equal", "replace")) for op in drec.data])
-        }
-        self.single_replaceonly: dict[str, list[Operation]] = {
-            bcv: oplist
-            for bcv, drec in self.replaceonly.items()
-            if (oplist := [op for op in drec.data if op.single_replace])
-            if oplist
-        }
-        self.dual_replaceonly: dict[str, list[Operation]] = {
-            bcv: oplist
-            for bcv, drec in self.replaceonly.items()
-            if (oplist := [op for op in drec.data if op.dual_replace])
-            if oplist
+            if all((op.opcode in ("equal", "replace")) for op in drec.data)
+            if all(op.same_length for op in drec.data)
         }
-    def _get_bcv_texts(self, bcv) -> tuple[list[str], list[str]]:
+    def _replaceonly_same_length(self, diffrec: DiffRecord) -> bool:
+        """True if all operations are 'equal' or 'replace' of same length.
+        That means token IDs don't need to change in NIV11.
+        """
+        return all((op.opcode in ("equal", "replace")) and op.same_length for op in diffrec.data)
+    def _get_bcv_texts(self, bcv: str) -> tuple[list[str], list[str]]:
         record = self.data.get(bcv)
         if record is None:
             print(f"{bcv}: No differences")
-            return
+            return [], []
         text84 = [trg.text for trg in self.niv84bcvtargets[bcv]]
         text11 = [trg.text for trg in self.niv11bcvtargets[bcv]]
         return text84, text11
-    def get_single_token_replacements(self) -> dict[str, dict[str, str]]:
-        # bcv-specific single token replacements
-        return {bcv: self.replace_single_text(bcv) for bcv, ops in self.single_replaceonly.items()}
+    # def get_single_token_replacements(self) -> dict[str, dict[str, str]]:
+    #     # bcv-specific single token replacements
+    #     return {bcv: self.replace_single_text(bcv) for bcv, ops in self.single_replaceonly.items()}
     def display_pair_text(self, bcv: str) -> None:
         text84, text11 = self._get_bcv_texts(bcv)
@@ -371,27 +384,14 @@ class DiffTargets84(UserDict):
     # only for single-token replace operations
     def replace_single_text(self, bcv: str) -> dict[str, str]:
-        record = self.data.get(bcv)
         text84, text11 = self._get_bcv_texts(bcv)
         replacements: dict[str, str] = {}
-        for op in record.data:
+        for op in self.data.get(bcv, []):
             if op.single_replace:
                 k = text11[op.start2 : op.end2][0]
                 replacements[k] = text84[op.start1 : op.end1][0]
         return replacements
-    # replacements where one or both sides have two tokens
-    # could consolidate this with replace_single_text
-    def replace_dual_text(self, bcv: str) -> dict[str, str]:
-        record = self.data.get(bcv)
-        text84, text11 = self._get_bcv_texts(bcv)
-        replacements: dict[str, str] = {}
-        for op in record.data:
-            if op.dual_replace:
-                k = tuple(text11[op.start2 : op.end2])
-                replacements[k] = tuple(text84[op.start1 : op.end1])
-        return replacements
     def mismatched_verses(self) -> dict[str, AlignmentRecord]:
         """Some alignment records have a source in one verse and a target in another."""
         return {
@@ -486,8 +486,7 @@ class DiffTargets84(UserDict):
                 self.output_alrecs[alrec.meta.id] = True
             except KeyError as e:
                 # Selectors: ['42001025012']
-                # niv11map keys: dict_keys(['42001024001', '42001024002', '42001024003', '42001024004', '42001024005', '42001024006', '42001024007', '42001024008', '42001024009', '42001024010', '42001024011', '42001024012', '42001024013', '42001024014', '42001024015'])
+                # niv11map keys: dict_keys(['42001024001', '42001024002', '42001024003', ...])
                 print(f"--- {versedata.bcvid}, KeyError on {e}")
                 print(f"Record: {alrec}")
@@ -584,26 +583,51 @@ class DiffTargets84(UserDict):
                         #     niv11replace
-class Interlinear:
-    """Line up NIV84 and NIV11 tokens opposite source (SBLGNT) tokens.
+class Serialize:
+    """Serialize confident alignments for ClearAligner.
-    For each source verse, outputs one row per source token (in verse order)
-    showing the aligned NIV84 token(s) and corresponding NIV11 token(s).
+    With include_partials True, this also includes partial alignments.
-    Alignment records with one source token expand to one row per target, with
-    the source column blank on continuation rows.  Records with multiple source
-    tokens (many-to-many) collapse to one row per source token with all targets
-    joined on that row.
+    Also outputs difference records and difference information on
+    tokens as a checklist of things to review.
-    After all source-token rows, any NIV11 tokens not yet emitted are appended
-    on blank source rows, sorted by token ID.
     """
-    def __init__(self, dt84: DiffTargets84) -> None:
+    # collects alignment records that didn't produce partials: BCVID -> list of records
+    unmapped_records: dict[str, list[AlignmentRecord]] = {}
+    def __init__(self, dt84: DiffTargets84, include_partials: bool = False) -> None:
         self.dt84 = dt84
         self.mgr84 = dt84.mgr84
         self.niv84bcvtargets = dt84.niv84bcvtargets
         self.niv11bcvtargets = dt84.niv11bcvtargets
+        # new Document for AlignmentRecord instances
+        self.niv11_document: Document = Document(docid="NIV11", scheme="BCVW")
+        # construct a new manager, with mappings to NIV11
+        self.niv11alset: AlignmentSet = AlignmentSet(
+            targetlanguage=self.mgr84.alignmentset.targetlanguage,
+            targetid="NIV11",
+            sourceid=self.mgr84.alignmentset.sourceid,
+            langdatapath=self.mgr84.alignmentset.langdatapath,
+        )
+        # read the existing alignments but then replace the alignment records
+        self.mgr11: Manager = Manager(self.niv11alset)
+        self.mgr11.targetitems = self.dt84.targets11
+        self.niv11_algroup: AlignmentGroup = self.niv11_alignment_group(
+            include_partials=include_partials
+        )
+        self.mgr11.bcv["records"] = groupby_bcv(
+            list(self.niv11_algroup.records), lambda r: r.source_bcv
+        )
+        # and make VerseData instances for alignments
+        versedata: dict[str, VerseData] = {}
+        for bcvid in self.mgr11.bcv["records"]:
+            try:
+                vd: VerseData = self.mgr11.make_versedata(bcvid)
+                versedata[bcvid] = vd
+            except KeyError:
+                print(f"Warning: no records for {bcvid} in NIV11; skipping verse")
+        self.mgr11.bcv["versedata"] = versedata
     def _niv84_to_niv11(self, bcv: str) -> dict[str, list[Target]]:
         """Map NIV84 token IDs to NIV11 Target tokens for a verse.
@@ -637,142 +661,193 @@ class Interlinear:
                 # delete → key omitted; insert → no NIV84 token, omitted from map
         return result
-    def _verse_rows(self, bcv: str, versedata: "VerseData") -> list[tuple[str, str, str, str]]:
-        """Return (source_str, niv84_str, niv11_str) tuples for one verse."""
-        # def _niv11_token_string(tokens: list[BaseToken]) -> str:
-        #     niv11_tokens = [t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])]
-        #     return " ".join(t.tokenstr for t in niv11_tokens)
-        interlabels: dict[str, str] = {
-            "many-to-many": "+-+",
-            "one-to-many": "1-1",
-            "unaligned": "0",
-            "unmatched": "-",
-        }
-        niv84_by_id: dict[str, Target] = {t.id: t for t in self.niv84bcvtargets.get(bcv, [])}
-        niv84_to_niv11: dict[str, list[Target]] = self._niv84_to_niv11(bcv)
-        src_to_alrecs: dict[str, list[AlignmentRecord]] = {}
-        for alrec in versedata.records:
-            for src_id in alrec.source_selectors:
-                src_to_alrecs.setdefault(src_id, []).append(alrec)
-        niv11_emitted: set[str] = set()
-        rows: list[tuple[str, str, str, str]] = []
-        for src_token in versedata.sources:
-            alrecs = src_to_alrecs.get(src_token.id, [])
-            if not alrecs:
-                rows.append((interlabels["unaligned"], src_token.tokenstr, "", ""))
-                continue
-            for alrec in alrecs:
-                niv84_ids = alrec.target_selectors
-                niv84_tokens = [niv84_by_id[tid] for tid in niv84_ids if tid in niv84_by_id]
-                niv84_str = " ".join(t.tokenstr for t in niv84_tokens)
-                if len(alrec.source_selectors) > 1:
-                    # many-to-many: join all targets on one row per source token
-                    # duplicates here for 40001023004|ἐν
-                    niv11_tokens = sorted(
-                        {t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])}
-                    )
-                    niv11_str = " ".join(t.tokenstr for t in niv11_tokens)
-                    niv11_emitted.update(t.id for t in niv11_tokens)
-                    rows.append(
-                        (
-                            interlabels["many-to-many"],
-                            src_token.tokenstr,
-                            niv84_str,
-                            niv11_str,
-                        )
-                    )
-                else:
-                    # one-to-many: one row per group of NIV84 targets, blank source on continuation
-                    niv84_tokens = [niv84_by_id[tid] for tid in niv84_ids if tid in niv84_by_id]
-                    niv84_str = " ".join(t.tokenstr for t in niv84_tokens)
-                    # duplicates here for 40001019006|δίκαιος
-                    niv11_tokens = sorted(
-                        {t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])}
-                    )
-                    # niv11_tokens = [t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])]
-                    niv11_str = " ".join(t.tokenstr for t in niv11_tokens)
-                    niv11_emitted.update(t.id for t in niv11_tokens)
-                    rows.append(
-                        (
-                            interlabels["one-to-many"],
-                            src_token.tokenstr,
-                            niv84_str,
-                            niv11_str,
-                        )
-                    )
-        # Append any NIV11 tokens not yet emitted, sorted by ID
-        unmatched = sorted(
-            [t for t in self.niv11bcvtargets.get(bcv, []) if t.id not in niv11_emitted],
-            key=lambda t: t.id,
-        )
-        allunmatched = " ".join(t.tokenstr for t in unmatched)
-        rows.append((interlabels["unmatched"], "", "", allunmatched))
-        return rows
-    def write_tsv(self, outpath: Optional[Path] = None) -> None:
-        """Write one TSV row per source token across all verses."""
-        if not outpath:
-            outdir = self.mgr84.alignmentset.langdatapath / "NIV84-NIV11"
-            outdir.mkdir(parents=True, exist_ok=True)
-            outpath = outdir / "NIV84-NIV11-interlinear.tsv"
-        with outpath.open("w", encoding="utf-8") as f:
-            f.write("Label\tSource\tNIV84\tNIV11\n")
-            for bcv, versedata in self.mgr84.bcv["versedata"].items():
-                for row in self._verse_rows(bcv, versedata):
-                    f.write("\t".join(row) + "\n")
-class Serialize:
-    """Serialize confident alignments for ClearAligner.
-    Also outputs difference records and difference information on
-    tokens as a checklist of things to review.
-    """
-    def __init__(self, dt84: DiffTargets84) -> None:
-        self.dt84 = dt84
-        self.mgr84 = dt84.mgr84
-        self.niv84bcvtargets = dt84.niv84bcvtargets
-        self.niv11bcvtargets = dt84.niv11bcvtargets
-        # construct a new manager, with mappings to NIV11
-        niv11alset: AlignmentSet = AlignmentSet(
-            targetlanguage=self.mgr84.alignmentset.targetlanguage,
-            targetid="NIV11",
-            sourceid=self.mgr84.alignmentset.sourceid,
-            langdatapath=self.mgr84.alignmentset.langdatapath,
-        )
-        # read the existing alignments but then replace the alignment records
-        self.mgr11: Manager = Manager(niv11alset)
+    # If the target IDs are the same, even if the NIV11 tokens are
+    # different, we can still use the NIV84 alignment records,
+    # but the cross-verse (xverse) ID mapping must still be applied.
     def _alrecs_to_niv11(self, bcv: str) -> list[AlignmentRecord]:
-        """Return the alignment records from NIV84, mappNIV11 targets.
+        """Return the alignment records from NIV84, mapped to NIV11 targets.
         Only when there aren't significant differences.
         """
         niv84_tokens = self.niv84bcvtargets.get(bcv, [])
         # niv11_tokens = self.niv11bcvtargets.get(bcv, [])
         alrecs_niv84: list[AlignmentRecord] = self.mgr84.bcv["records"][bcv]
-        # map token IDs to token instances
-        # niv84_id_tokens: dict[str, Target] = {t.id: t for t in niv84_tokens}
-        # niv11_id_tokens: dict[str, Target] = {t.id: t for t in niv11_tokens}
+        # niv84_to_niv11 = self._niv84_to_niv11(bcv)
         # this only works because each token is equal or equivalent
         #
         # this handles any cases of cross-verse boundary changes
-        niv84_niv11_map: dict[str, str] = {
+        niv84_niv11_xverse_map: dict[str, str] = {
             t84.id: self.dt84.niv84_niv11_map.get(t84.id, t84.id) for t84 in niv84_tokens
         }
+        # in theory, the cross-verse (xverse) map shouldn't interact with the diff-based map ...
+        new_alrecs: list[AlignmentRecord] = []
         for alrec in alrecs_niv84:
             niv11_selectors: list[str] = [
-                niv84_niv11_map.get(sel, sel) for sel in alrec.target_selectors
+                xverse_sel
+                for sel in alrec.target_selectors
+                if (xverse_sel := niv84_niv11_xverse_map.get(sel, sel))
+                # if (to_niv11 := niv84_to_niv11.get(xverse_sel, xverse_sel))
+                # for niv11_tok in to_niv11
+            ]
+            new_reference: AlignmentReference = AlignmentReference(
+                document=self.niv11_document, selectors=niv11_selectors
+            )
+            newmeta = copy.deepcopy(alrec.meta)
+            newmeta.origin = "NIV84_transfer"
+            new_alrecs.append(
+                AlignmentRecord(
+                    meta=alrec.meta,
+                    references={
+                        "source": alrec.references["source"],
+                        "target": new_reference,
+                    },
+                    type=alrec.type,
+                )
+            )
+        return new_alrecs
+    # from Claude
+    def collect_partial_records(self, bcv: str) -> list[AlignmentRecord]:
+        """Generate NIV11 AlignmentRecords for confidently-mapped records in a diff verse.
+        For each operation in the verse's DiffRecord whose opcode is 'equal' or
+        'replace' with equal length (same token count on both sides), the positional
+        zip gives a one-to-one NIV84 → NIV11 token correspondence.
+        An AlignmentRecord is included only when every one of its target selectors
+        falls within the span of such a confident operation, so the full NIV11
+        mapping is unambiguous.  Records that straddle operation boundaries, or
+        whose selectors sit in a delete/insert/unequal-replace span, are skipped.
+        Returns an empty list for verses without a DiffRecord (those are handled
+        by the existing _alrecs_to_niv11 / niv11_alignment_group path).
+        """
+        diffrec = self.dt84.data.get(bcv)
+        if diffrec is None:
+            return []
+        versedata = self.mgr84.bcv["versedata"].get(bcv)
+        if versedata is None or not versedata.records:
+            return []
+        niv84_tokens = self.niv84bcvtargets.get(bcv, [])
+        niv11_tokens = self.niv11bcvtargets.get(bcv, [])
+        # Build a confident NIV84 token ID → NIV11 token ID map.
+        # equal ops: texts match; same-length replace ops: unique positional partner.
+        confident_map: dict[str, str] = {}
+        for op in diffrec.data:
+            if op.opcode == "equal" or (op.opcode == "replace" and op.same_length):
+                for t84, t11 in zip(
+                    niv84_tokens[op.start1 : op.end1],
+                    niv11_tokens[op.start2 : op.end2],
+                ):
+                    confident_map[t84.id] = t11.id
+        if not confident_map:
+            return []
+        new_alrecs: list[AlignmentRecord] = []
+        for alrec in versedata.records:
+            niv11_selectors: list[str] = []
+            for sel in alrec.target_selectors:
+                niv11_id = confident_map.get(sel)
+                if niv11_id is None:
+                    if bcv not in self.unmapped_records:
+                        self.unmapped_records[bcv] = [alrec]
+                    else:
+                        if alrec not in self.unmapped_records[bcv]:
+                            self.unmapped_records[bcv].append(alrec)
+                    break  # selector not in any confident span → skip record
+                niv11_selectors.append(niv11_id)
+            else:
+                # all selectors mapped confidently
+                newmeta = copy.deepcopy(alrec.meta)
+                newmeta.origin = "NIV84_partial_transfer"
+                new_alrecs.append(
+                    AlignmentRecord(
+                        meta=newmeta,
+                        references={
+                            "source": alrec.references["source"],
+                            "target": AlignmentReference(
+                                document=self.niv11_document, selectors=niv11_selectors
+                            ),
+                        },
+                        type=alrec.type,
+                    )
+                )
+        return new_alrecs
+    def niv11_alignment_group(self, include_partials: bool = False) -> AlignmentGroup:
+        """Return an AlignmentGroup for NIV11, with aligned records from NIV84 where possible."""
+        niv84_algroup: AlignmentGroup = self.mgr84.alignmentsreader.alignmentgroup
+        sblgnt_document: Document = niv84_algroup.documents[0]
+        niv11_metadata: Metadata = Metadata(conformsTo="0.3", creator="NIV84-NIV11 transfer")
+        niv11_alrecs: list[AlignmentRecord] = [
+            alrec
+            for bcv in self.mgr84.bcv["records"].keys()
+            # only those that map cleanly
+            if bcv not in self.dt84
+            for alrec in self._alrecs_to_niv11(bcv)
+        ]
+        if include_partials:
+            niv11_partials: list[AlignmentRecord] = [
+                alrec
+                for bcv in self.dt84.data.keys()
+                for alrec in self.collect_partial_records(bcv)
             ]
-            alrec.update_target_selectors(niv11_selectors)
-        return alrecs_niv84
+            niv11_alrecs = sorted(niv11_alrecs + niv11_partials)
+        niv11_algroup: AlignmentGroup = AlignmentGroup(
+            documents=(sblgnt_document, self.niv11_document),
+            meta=niv11_metadata,
+            records=niv11_alrecs,
+            roles=niv84_algroup.roles,
+            sourcedocid=niv84_algroup.sourcedocid,
+            canon=niv84_algroup.canon,
+            _type=niv84_algroup._type,
+        )
+        return niv11_algroup
+    def write_unmapped_records(self, outpath: Path = None) -> None:
+        """Write alignment records that could not be included in the partials (confidently-mapped spans)."""
+        unmapped_output: set[AlignmentRecord] = set()
+        if not outpath:
+            outdir = self.mgr84.alignmentset.langdatapath / "NIV84-NIV11"
+            outdir.mkdir(parents=True, exist_ok=True)
+            outpath = outdir / "NIV84-NIV11-unmappedrecords.tsv"
+        with outpath.open("w", encoding="utf-8") as f:
+            f.write("Verse\tNIV84 Tokens\n")
+            for bcv, alreclist in self.unmapped_records.items():
+                niv84_bcv_tokens: list[Target] = self.niv84bcvtargets.get(bcv, [])
+                niv84_bcv_tokenstrs: dict[str, str] = {t.id: t.tokenstr for t in niv84_bcv_tokens}
+                for alrec in alreclist:
+                    if alrec in unmapped_output:
+                        continue
+                    else:
+                        unmapped_output.add(alrec)
+                        niv84_str = {
+                            sel: niv84_bcv_tokenstrs.get(sel, "<unknown>")
+                            for sel in alrec.target_selectors
+                        }
+                        f.write(f"{bcv}\t{" ".join(niv84_str.values())}\n")
+    def write_diffs(self, outpath: Path = None) -> None:
+        """Write diffs as a checklist for manual alignment."""
+        if not outpath:
+            outdir = self.mgr84.alignmentset.langdatapath / "NIV84-NIV11"
+            outdir.mkdir(parents=True, exist_ok=True)
+            outpath = outdir / "NIV84-NIV11-diffs.tsv"
+        with outpath.open("w", encoding="utf-8") as f:
+            f.write("Verse\tOpCode\tNIV84 Tokens\tNIV11 Tokens\n")
+            for bcv, diffrec in self.dt84.data.items():
+                niv84_tokens = self.niv84bcvtargets.get(bcv, [])
+                niv11_tokens = self.niv11bcvtargets.get(bcv, [])
+                for op in diffrec.data:
+                    seq84 = niv84_tokens[op.start1 : op.end1]
+                    seq11 = niv11_tokens[op.start2 : op.end2]
+                    f.write(
+                        f"{bcv}\t{op.opcode}\t"
+                        f"{' '.join(t.tokenstr for t in seq84)}\t"
+                        f"{' '.join(t.tokenstr for t in seq11)}\n"
+                    )

{biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/merger.py RENAMED Viewed

@@ -77,8 +77,7 @@ class Merger:
             data2: Optional[VerseData] = cast(
                 Optional[VerseData], self.mgr2.bcv["versedata"].get(bcv)
             )
-            if data1 and data2:
-                diffs: list[DiffRecord] = data1.diff(data2)
+            diffs: list[DiffRecord] = data1.diff(data2) if data1 and data2 else []
             bcv_pairs[bcv] = BCVPair(
                 bcv=bcv,
                 mgr1_data=data1,

{biblealignlib-0.3.2 → biblealignlib-0.4.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "biblealignlib"
-version = "0.3.2"
+version = "0.4.0"
 description = "Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments."
 authors = ["Sean Boisen <sean.boisen@biblica.com>"]
 license = "MIT"

biblealignlib-0.3.2/biblealignlib/util/Transfer.py DELETED Viewed

@@ -1,93 +0,0 @@
-"""Transfer alignment data from one AlignmentSet to another on a closely related target version.
-Example: if NIV84 alignments are more complete than NIV2011
-alignments, this code can transfer the data from the former to the
-latter, where the surface text is the same, and where NIV84 has
-an alignment record that is missing from NIV11.
-Input is two Manager instances, which must be based on the same source
-and target language. If the target versions are the _same_, use
-merger.py instead.
-- If the source or target languages aren't the same, this code is
-  not relevant to your problem.
->>> from biblealignlib.burrito import CLEARROOT, Manager, AlignmentSet
->>> from biblealignlib.util import Transfer
->>> targetlang, sourceid = ("eng", "SBLGNT")
-# get manager instances for two sets of alignments
->>> niv84as = AlignmentSet(targetlanguage=targetlang,
-        targetid="NIV84",
-        sourceid=sourceid,
-        langdatapath=(CLEARROOT / f"alignments-{targetlang}/data"))
->>> niv84mgr = Manager(niv84as)
->>> niv11as = AlignmentSet(targetlanguage=targetlang,
-        targetid="NIV11",
-        sourceid=sourceid,
-        langdatapath=(CLEARROOT / f"alignments-{targetlang}/data"))
->>> niv11mgr = Manager(niv11as)
-# instantiate a Transfer instance
->>> transferinst = Transfer.Transfer(niv84mgr, niv11mgr)
-"""
-from collections import Counter, UserDict
-from typing import cast, Optional
-from ..burrito import Manager, VerseData
-from ..burrito import DiffRecord
-from ..burrito.util import groupby_bcid
-from . import BCVPair
-class Transfer(UserDict):
-    def __init__(self, mgr1: Manager, mgr2: Manager) -> None:
-        """Initialize an instance."""
-        super().__init__()
-        self.mgr1 = mgr1
-        self.mgr2 = mgr2
-        for attr in ("sourceid", "targetlanguage"):
-            mgr1attr = getattr(self.mgr1.alignmentset, attr)
-            mgr2attr = getattr(self.mgr2.alignmentset, attr)
-            if mgr1attr != mgr2attr:
-                raise ValueError(
-                    f"Managers must have the same {attr!r} attribute, but {mgr1attr} != {mgr2attr}"
-                )
-        # should be the same for both
-        self.allsrcbcv = mgr1.bcv["sources"]
-        self.data: dict[str, BCVPair] = self.get_bcv_pairs()
-        self.pairingcounts = Counter(bcvp.pairing for bcvp in self.values())
-        # overlaps
-        self.overlaps = [bcvp for bcvp in self.values() if bcvp.pairing == "both"]
-        # overlaps with differences
-        self.diffpairs = [bcvp for bcvp in self.overlaps if bcvp.diffs]
-    def get_bcv_pairs(self) -> dict[str, BCVPair]:
-        """Return a dictionary of BCVPair instances."""
-        bcv_pairs: dict[str, BCVPair] = {}
-        for bcv in self.allsrcbcv:
-            data1: Optional[VerseData] = cast(
-                Optional[VerseData], self.mgr1.bcv["versedata"].get(bcv)
-            )
-            data2: Optional[VerseData] = cast(
-                Optional[VerseData], self.mgr2.bcv["versedata"].get(bcv)
-            )
-            if data1 and data2:
-                diffs: list[DiffRecord] =
-            bcv_pairs[bcv] = BCVPair(
-                bcv=bcv,
-                mgr1_data=data1,
-                mgr2_data=data2,
-            )
-        return bcv_pairs
-    def show_diffs(self) -> None:
-        """Display information about overlaps that differ."""
-        overlap_bcs = groupby_bcid([bcvp.bcv for bcvp in self.diffpairs])
-        print(f"{len(overlap_bcs)} overlapping and different chapters: {overlap_bcs.keys()}")
-        for bcvpair in self.diffpairs:
-            vd1 = bcvpair.mgr1_data.alignments if bcvpair.mgr1_data else ()
-            vd2 = bcvpair.mgr2_data.alignments if bcvpair.mgr2_data else ()
-            print(bcvpair.bcv, ": ", str(len(vd1)), "---", str(len(vd2)))