biblealignlib 0.3.2__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/PKG-INFO +1 -1
  2. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/AlignmentGroup.py +76 -9
  3. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/BaseToken.py +9 -0
  4. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/VerseData.py +20 -3
  5. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/__init__.py +2 -1
  6. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/alignments.py +27 -11
  7. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/manager.py +2 -1
  8. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/source.py +8 -2
  9. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/DiffTargets.py +266 -191
  10. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/merger.py +1 -2
  11. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/pyproject.toml +1 -1
  12. biblealignlib-0.3.2/biblealignlib/util/Transfer.py +0 -93
  13. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/LICENSE +0 -0
  14. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/LICENSE.md +0 -0
  15. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/README.md +0 -0
  16. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/__init__.py +0 -0
  17. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/Score.py +0 -0
  18. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/__init__.py +0 -0
  19. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/corpusmapping.py +0 -0
  20. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/eflomal.py +0 -0
  21. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/mapper.py +0 -0
  22. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/reader.py +0 -0
  23. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/runeflomal.py +0 -0
  24. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/scorer.py +0 -0
  25. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/writer.py +0 -0
  26. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/AlignmentSet.py +0 -0
  27. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/AlignmentType.py +0 -0
  28. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/BadRecord.py +0 -0
  29. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/DiffRecord.py +0 -0
  30. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/target.py +0 -0
  31. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/util.py +0 -0
  32. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/coverage/Coverage.py +0 -0
  33. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/coverage/__init__.py +0 -0
  34. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/coverage/analyzer.py +0 -0
  35. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/coverage/exporter.py +0 -0
  36. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/coverage/filters.py +0 -0
  37. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/interlinear/__init__.py +0 -0
  38. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/interlinear/reverse.py +0 -0
  39. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/interlinear/token.py +0 -0
  40. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/strongs.py +0 -0
  41. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/DiffAlignments.py +0 -0
  42. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/__init__.py +0 -0
  43. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/canonsplit.py +0 -0
  44. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/tokens_to_chars.py +0 -0
  45. {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/vocab.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblealignlib
3
- Version: 0.3.2
3
+ Version: 0.4.0
4
4
  Summary: Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -27,7 +27,7 @@ from biblelib.word import bcvwpid
27
27
  import biblealignlib as bal
28
28
 
29
29
  from .AlignmentType import TranslationType
30
- from .source import macula_prefixer
30
+ from .source import macula_prefixer, macula_unprefixer
31
31
 
32
32
 
33
33
  # hoisting means this can be defined at several different levels, so
@@ -291,8 +291,15 @@ class AlignmentRecord:
291
291
  """
292
292
  self.references["target"].selectors = sorted(selectors)
293
293
 
294
+ # note that source/target_tokens are only available from a Manager
295
+ # instance, so the default repr doesn't include tokenstrs.
294
296
  def asdict(
295
- self, positional: bool = False, withmeta: bool = True, withmaculaprefix: bool = False
297
+ self,
298
+ positional: bool = False,
299
+ withmeta: bool = True,
300
+ withmaculaprefix: bool = False,
301
+ source_tokens: Optional[dict[str, Any]] = None,
302
+ target_tokens: Optional[dict[str, Any]] = None,
296
303
  ) -> dict[str, Any]:
297
304
  """Return a dict of values suitable for serialization.
298
305
 
@@ -307,6 +314,14 @@ class AlignmentRecord:
307
314
  With withmaculaprefix=True (the default is False), prefix
308
315
  source references with 'o' or 'n' depending on canon.
309
316
 
317
+ With source_tokens provided as a dict mapping bare token IDs to token
318
+ objects, source selectors are replaced with tokenstr representations
319
+ ("{id}|{text}"). With withmaculaprefix=True, the prefixed ID is used.
320
+
321
+ With target_tokens provided as a dict mapping token IDs to token
322
+ objects, target selectors are replaced with tokenstr representations
323
+ ("{id}|{text}").
324
+
310
325
  """
311
326
  recdict: dict[str, Any] = {}
312
327
  if positional:
@@ -319,12 +334,28 @@ class AlignmentRecord:
319
334
  else:
320
335
  # typical case
321
336
  sourcerefs: list[str] = self.references["source"].selectors
322
- if withmaculaprefix:
337
+ if source_tokens is not None:
338
+ # Build tokenstr: use bare ID by default, prefixed ID if withmaculaprefix
339
+ bare_ids = [macula_unprefixer(sel) for sel in sourcerefs]
340
+ display_ids = (
341
+ [macula_prefixer(b) for b in bare_ids] if withmaculaprefix else bare_ids
342
+ )
343
+ sourcerefs = [
344
+ f"{did}|{tok.text}" if (tok := source_tokens.get(bare)) else did
345
+ for bare, did in zip(bare_ids, display_ids)
346
+ ]
347
+ elif withmaculaprefix:
323
348
  # default: add back the Macula prefix
324
349
  sourcerefs = [macula_prefixer(srcstr) for srcstr in sourcerefs]
325
350
  # else leave as is (atypical)
326
351
  recdict["source"] = sourcerefs
327
- recdict["target"] = self.references["target"].selectors
352
+ targetrefs: list[str] = self.references["target"].selectors
353
+ if target_tokens is not None:
354
+ targetrefs = [
355
+ f"{sel}|{tok.text}" if (tok := target_tokens.get(sel)) else sel
356
+ for sel in targetrefs
357
+ ]
358
+ recdict["target"] = targetrefs
328
359
  if withmeta:
329
360
  recdict.update(
330
361
  {
@@ -380,12 +411,25 @@ class AlignmentGroup:
380
411
  docids: tuple[str, str] = tuple([doc.asdict()["docid"] for doc in self.documents])
381
412
  return f"<AlignmentGroup{docids}: {len(self.records)} records>"
382
413
 
383
- def asdict(self, hoist: bool = True) -> dict[str, Any]:
414
+ def asdict(
415
+ self,
416
+ hoist: bool = True,
417
+ source_tokens: Optional[dict[str, Any]] = None,
418
+ target_tokens: Optional[dict[str, Any]] = None,
419
+ ) -> dict[str, Any]:
384
420
  """Return a dict of values suitable for serialization.
385
421
 
386
422
  This is opinionated about the preferred serialization: hoists
387
423
  as much as possible to upper levels.
388
424
 
425
+ With source_tokens provided as a dict mapping bare token IDs to token
426
+ objects, source selectors in each record are replaced with tokenstr
427
+ representations ("{id}|{text}").
428
+
429
+ With target_tokens provided as a dict mapping token IDs to token
430
+ objects, target selectors in each record are replaced with tokenstr
431
+ representations ("{id}|{text}").
432
+
389
433
  """
390
434
  # for now
391
435
  positional: bool = False
@@ -395,7 +439,13 @@ class AlignmentGroup:
395
439
  "meta": self.meta.asdict(),
396
440
  "type": self._type,
397
441
  "records": [
398
- rec.asdict(positional=positional, withmeta=withmeta) for rec in self.records
442
+ rec.asdict(
443
+ positional=positional,
444
+ withmeta=withmeta,
445
+ source_tokens=source_tokens,
446
+ target_tokens=target_tokens,
447
+ )
448
+ for rec in self.records
399
449
  ],
400
450
  }
401
451
 
@@ -446,10 +496,27 @@ class TopLevelGroups:
446
496
  """Return a printed representation."""
447
497
  return f"<TopLevelGroups({self.targetdocid}): {self.sourcedocids}>"
448
498
 
449
- def asdict(self, hoist: bool = True) -> dict[str, Any]:
450
- """Return an opionated dict of values suitable for serialization."""
499
+ def asdict(
500
+ self,
501
+ hoist: bool = True,
502
+ source_tokens: Optional[dict[str, Any]] = None,
503
+ target_tokens: Optional[dict[str, Any]] = None,
504
+ ) -> dict[str, Any]:
505
+ """Return an opinionated dict of values suitable for serialization.
506
+
507
+ With source_tokens and target_tokens, passes them to each group's
508
+ asdict() so that selectors are replaced with tokenstr representations.
509
+
510
+ """
451
511
  return {
452
512
  "format": self.format,
453
513
  "version": self.version,
454
- "groups": [self.groups[0].asdict(hoist=hoist), self.groups[1].asdict(hoist=hoist)],
514
+ "groups": [
515
+ self.groups[0].asdict(
516
+ hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens
517
+ ),
518
+ self.groups[1].asdict(
519
+ hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens
520
+ ),
521
+ ],
455
522
  }
@@ -92,3 +92,12 @@ def bare_id(identifier: str) -> str:
92
92
  identifier
93
93
  ), f"'{identifier}' does not look like a valid BCVWPID identifier."
94
94
  return identifier[1:] if identifier[0].isalpha() else identifier
95
+
96
+
97
+ def strip_tokenstr(selector: str) -> str:
98
+ """Return only the ID portion of a selector, dropping any tokenstr text suffix.
99
+
100
+ A tokenstr selector has the form "{id}|{text}" (e.g. "n41004003001|Ἀκούετε").
101
+ Plain IDs without a '|' are returned unchanged.
102
+ """
103
+ return selector.split("|", 1)[0] if "|" in selector else selector
@@ -156,15 +156,32 @@ class VerseData:
156
156
  for trg in targets:
157
157
  print(f"Target: {trg._display}")
158
158
 
159
- def display_record(self, alrec: AlignmentRecord) -> None:
160
- """Display an alignment record from this instance."""
159
+ def display_record(self, alrec: AlignmentRecord, srcwidth: Optional[int] = None) -> None:
160
+ """Display an alignment record from this instance.
161
+
162
+ srcwidth sets the minimum column width for the source token string;
163
+ defaults to the length of the source token string (no padding).
164
+ The source column is left-justified within that width.
165
+ """
161
166
  source_tokenstring: str = ", ".join(
162
167
  [self.sourceitems[sel].tokenstr for sel in alrec.source_selectors]
163
168
  )
164
169
  target_tokenstring: str = ", ".join(
165
170
  [self.targetitems[sel].tokenstr for sel in alrec.target_selectors]
166
171
  )
167
- print(f"{alrec.meta.id}: {source_tokenstring} --- {target_tokenstring}")
172
+ width = srcwidth if srcwidth is not None else len(source_tokenstring)
173
+ print(f"{alrec.meta.id}: {source_tokenstring:<{width}} --- {target_tokenstring}")
174
+
175
+ def display_records(self) -> None:
176
+ """Display all alignment records with a consistent source column width."""
177
+ if not self.records:
178
+ return
179
+ srcwidth: int = max(
180
+ len(", ".join(self.sourceitems[sel].tokenstr for sel in alrec.source_selectors))
181
+ for alrec in self.records
182
+ )
183
+ for alrec in self.records:
184
+ self.display_record(alrec, srcwidth=srcwidth)
168
185
 
169
186
  def unaligned(self, typeattr: str = "targets", keepexcluded: bool = False) -> None:
170
187
  """Display tokens from typeattr that are _not_ aligned."""
@@ -18,7 +18,7 @@ from .AlignmentSet import AlignmentSet
18
18
  from .AlignmentType import TranslationType
19
19
  from .alignments import AlignmentsReader, write_alignment_group
20
20
  from .manager import Manager, VerseData
21
- from .BaseToken import BaseToken, asbool, bare_id
21
+ from .BaseToken import BaseToken, asbool, bare_id, strip_tokenstr
22
22
  from .DiffRecord import DiffReason, DiffRecord
23
23
  from .source import macula_prefixer, macula_unprefixer, Source, SourceReader
24
24
  from .target import Target, TargetReader
@@ -42,6 +42,7 @@ __all__ = [
42
42
  "BaseToken",
43
43
  "asbool",
44
44
  "bare_id",
45
+ "strip_tokenstr",
45
46
  # DiffRecord
46
47
  "DiffReason",
47
48
  "DiffRecord",
@@ -28,6 +28,7 @@ from .AlignmentGroup import Document, Metadata, AlignmentGroup, AlignmentReferen
28
28
  from .AlignmentSet import AlignmentSet
29
29
  from .AlignmentType import TranslationType
30
30
  from .BadRecord import BadRecord, Reason
31
+ from .BaseToken import strip_tokenstr
31
32
  from .source import SourceReader, macula_unprefixer
32
33
  from .target import TargetReader
33
34
 
@@ -110,10 +111,12 @@ class AlignmentsReader:
110
111
  #
111
112
 
112
113
  def _targetid(self, targetid: str) -> str:
113
- """Return a normalized target ID.
114
+ """Return a normalized target ID, optionally dropping the word-part digit.
114
115
 
115
- With self.keeptargetwordpart = False, drop the last digit.
116
+ Accepts both plain IDs and tokenstr selectors ("{id}|{text}").
117
+ With self.keeptargetwordpart = False, a 12-character ID is truncated to 11.
116
118
  """
119
+ targetid = strip_tokenstr(targetid)
117
120
  if not self.keeptargetwordpart and len(targetid) == 12:
118
121
  return targetid[:11]
119
122
  else:
@@ -297,23 +300,35 @@ class AlignmentsReader:
297
300
 
298
301
 
299
302
  # copied from gc2sb.manager.write_alignment_group with minor changes
300
- def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True) -> None:
303
+ def write_alignment_group(
304
+ group: AlignmentGroup,
305
+ f: TextIO,
306
+ source_tokens: Optional[dict[str, Any]] = None,
307
+ target_tokens: Optional[dict[str, Any]] = None,
308
+ ) -> None:
301
309
  """Write JSON data for an arbitrary group in Scripture Burrito format.
302
310
 
303
311
  Writes some of the JSON by hand to get records on the same line.
304
312
  Record meta.id values are assigned sequentially per BCV, e.g. "40001001.1".
313
+
314
+ With source_tokens provided as a dict mapping bare token IDs to token
315
+ objects, source selectors are written as tokenstr representations
316
+ ("{id}|{text}") instead of plain IDs.
317
+
318
+ With target_tokens provided as a dict mapping token IDs to token objects,
319
+ target selectors are written as tokenstr representations ("{id}|{text}").
305
320
  """
306
321
 
307
322
  def _write_documents(out: TextIO, documents: tuple[Document, Document]) -> None:
308
323
  """Write documents tuple to out."""
309
324
  out.write(' "documents": [\n')
310
- out.write(" " + json.dumps(documents[0].asdict()) + ",\n")
311
- out.write(" " + json.dumps(documents[1].asdict()) + "\n")
325
+ out.write(" " + json.dumps(documents[0].asdict(), ensure_ascii=False) + ",\n")
326
+ out.write(" " + json.dumps(documents[1].asdict(), ensure_ascii=False) + "\n")
312
327
  out.write(" ],\n")
313
328
 
314
329
  def _write_meta(out: TextIO, meta: Metadata) -> None:
315
330
  """Write metadata to out."""
316
- metarow = '"meta": ' + json.dumps(meta.asdict())
331
+ metarow = '"meta": ' + json.dumps(meta.asdict(), ensure_ascii=False)
317
332
  f.write(f" {metarow},\n")
318
333
 
319
334
  def _record_dict(arec: AlignmentRecord, bcv_counters: dict[str, int]) -> dict[str, Any]:
@@ -324,20 +339,21 @@ def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True)
324
339
  """
325
340
  bcv = arec.source_bcv
326
341
  bcv_counters[bcv] = bcv_counters.get(bcv, 0) + 1
327
- recdict = arec.asdict()
342
+ recdict = arec.asdict(source_tokens=source_tokens, target_tokens=target_tokens)
328
343
  recdict["meta"]["id"] = f"{bcv}.{bcv_counters[bcv]:02}"
329
344
  return recdict
330
345
 
331
346
  f.write("{\n")
332
347
  _write_documents(f, group.documents)
333
348
  _write_meta(f, group.meta)
334
- f.write(f' "roles": {json.dumps(group.roles)},\n')
349
+ f.write(f' "roles": {json.dumps(group.roles, ensure_ascii=False)},\n')
335
350
  f.write(f' "type": "{group._type}",\n "records": [\n ')
336
351
  # should sort the records: NIV11 doesn't appear to be sorted
337
352
  bcv_counters: dict[str, int] = {}
338
- for arec in group.records[:-1]:
339
- json.dump(_record_dict(arec, bcv_counters), f)
353
+ records = sorted(group.records)
354
+ for arec in records[:-1]:
355
+ json.dump(_record_dict(arec, bcv_counters), f, ensure_ascii=False)
340
356
  f.write(",\n ")
341
357
  # now the last one without a comma, because JSON
342
- json.dump(_record_dict(group.records[-1], bcv_counters), f)
358
+ json.dump(_record_dict(group.records[-1], bcv_counters), f, ensure_ascii=False)
343
359
  f.write("\n ]}")
@@ -35,7 +35,7 @@ from collections import UserDict
35
35
  from typing import TypedDict
36
36
  from warnings import warn
37
37
 
38
- from .AlignmentGroup import AlignmentRecord
38
+ from .AlignmentGroup import AlignmentGroup, AlignmentRecord
39
39
  from .AlignmentSet import AlignmentSet
40
40
  from .VerseData import VerseData
41
41
  from .alignments import AlignmentsReader
@@ -114,6 +114,7 @@ class Manager(UserDict):
114
114
  keepbadrecords=self.keepbadrecords,
115
115
  )
116
116
  self.alignmentsreader.clean_alignments(self.sourceitems, self.targetitems)
117
+ self.alignmentgroup: AlignmentGroup = self.alignmentsreader.alignmentgroup
117
118
  # TODO: upgrade the selectors to use tokenstr. This requires
118
119
  # knowing the source and targetitems, but alignmentsreader
119
120
  # doesn't have that data
@@ -53,7 +53,7 @@ from biblealignlib import normalize_strongs, get_canonid
53
53
 
54
54
  # should eventually come from Clearlib
55
55
  from .util import groupby_key
56
- from .BaseToken import BaseToken
56
+ from .BaseToken import BaseToken, strip_tokenstr
57
57
 
58
58
  PREFIXRE = re.compile(r"^[no]")
59
59
 
@@ -76,7 +76,13 @@ def macula_prefixer(bcvwp: str) -> str:
76
76
 
77
77
 
78
78
  def macula_unprefixer(bcvwp: str) -> str:
79
- """Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged."""
79
+ """Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged.
80
+
81
+ Also strips any tokenstr text suffix ("{id}|{text}" → "{id}") before
82
+ checking for the prefix, so both plain IDs and tokenstr selectors are
83
+ handled correctly.
84
+ """
85
+ bcvwp = strip_tokenstr(bcvwp)
80
86
  if PREFIXRE.match(bcvwp):
81
87
  return bcvwp[1:]
82
88
  else:
@@ -17,12 +17,7 @@ are stored as DiffRecord instances in a dict keyed by BCV.
17
17
  >>> mgr84 = Manager(alset1)
18
18
  >>> dt84 = DiffTargets.DiffTargets84(mgr84, niv11targets)
19
19
  >>> len(dt84)
20
- 3784
21
- # now run it again to account for single-token replacements
22
- >>> dt84 = DiffTargets.DiffTargets84(mgr84, niv11targets, dt84.get_single_token_replacements())
23
- >>> len(dt84)
24
- 2860
25
-
20
+ 2565
26
21
 
27
22
  >>> alset2 = AlignmentSet(targetlanguage=targetlang,
28
23
  targetid="NIV11",
@@ -38,6 +33,7 @@ are stored as DiffRecord instances in a dict keyed by BCV.
38
33
  from __future__ import annotations
39
34
 
40
35
  from collections import UserDict
36
+ import copy
41
37
  from dataclasses import dataclass
42
38
  import difflib
43
39
  from itertools import zip_longest
@@ -45,13 +41,18 @@ from pathlib import Path
45
41
  from typing import Optional, TextIO, TYPE_CHECKING
46
42
 
47
43
  from biblealignlib.burrito import (
44
+ AlignmentGroup,
48
45
  AlignmentRecord,
46
+ AlignmentReference,
49
47
  AlignmentSet,
50
48
  BaseToken,
51
49
  DiffReason,
52
50
  DiffRecord,
51
+ Document,
53
52
  Manager,
53
+ Metadata,
54
54
  Target,
55
+ TargetReader,
55
56
  )
56
57
  from ..burrito.alignments import write_alignment_group
57
58
  from ..burrito.util import groupby_bcv
@@ -226,15 +227,11 @@ def diff_verse_targets(
226
227
  )
227
228
 
228
229
 
229
- # this is a two-pass operation:
230
- # Run with default (empty) bcvequivalents
231
- # Run again supplying get_single_token_replacements() as bcvequivalents
232
- # >>> dt84 = DiffTargets.DiffTargets84(mgr84, niv11targets.BCVEQUIVALENT84)
233
- # >>> len(dt84)
234
- # 3784
235
- # >>> dt84 = DiffTargets.DiffTargets84(mgr84, niv11targets, dt84.get_single_token_replacements())
236
- # >>> len(dt84)
237
- # 2879
230
+ # could try here to find alignment records that are a subset of an
231
+ # equal region, and then map the token IDs?
232
+ # then write out revised records and patch onto the alignment data??
233
+
234
+
238
235
  # this still doesn't handle multi-term direct replacements: for those we need to ensure semantic compatibility
239
236
  class DiffTargets84(UserDict):
240
237
  missing84: set[str] = {"42023018", "47013014", "64001015"}
@@ -306,21 +303,21 @@ class DiffTargets84(UserDict):
306
303
  "64001014033": "64001015018",
307
304
  }
308
305
  # hacky way to avoid outputting the same alignment record more than once
309
- output_alrecs: dict[str, AlignmentRecord] = {}
306
+ output_alrecs: dict[str, bool] = {}
310
307
 
311
308
  def __init__(
312
309
  self,
313
310
  mgr84: Manager,
314
- targets11: dict[str, Target],
311
+ targets11: TargetReader,
315
312
  bcvequivalents: dict[str, dict[str, str]] = {},
316
313
  ) -> None:
317
314
  super().__init__()
318
315
  self.mgr84 = mgr84
319
316
  self.niv84bcvtargets: dict[str, list[Target]] = mgr84.bcv["targets"]
320
- self.targets11: dict[str, Target] = targets11
317
+ self.targets11: TargetReader = targets11
321
318
  self.bcvequivalents = bcvequivalents
322
319
  # not correct for versification differences??
323
- self.niv11bcvtargets: dict[str, list[Target]] = groupby_bcv(self.targets11.values())
320
+ self.niv11bcvtargets: dict[str, list[Target]] = groupby_bcv(list(self.targets11.values()))
324
321
  for bcv in self.niv11bcvtargets:
325
322
  if bcv not in self.missing84:
326
323
  trg84: list[Target] = self.niv84bcvtargets.get(bcv, [])
@@ -329,40 +326,56 @@ class DiffTargets84(UserDict):
329
326
  self.bcvequivalents.get(bcv, {}) if self.bcvequivalents else {}
330
327
  )
331
328
  record = diff_verse_targets(bcv, trg84, trg11, equivalents)
332
- # record.data is like (('equal', 0, 5, 0, 5), ('replace', 5, 6, 5, 6), ('equal', 6, 34, 6, 34), ('replace', 34, 35, 34, 35), ('equal', 35, 38, 35, 38))
333
- if record is not None:
329
+ # record.data is like
330
+ # (('equal', 0, 5, 0, 5), ('replace', 5, 6, 5, 6), ('equal', 6, 34, 6, 34),
331
+ # ('replace', 34, 35, 34, 35), ('equal', 35, 38, 35, 38))
332
+ if record and not self._replaceonly_same_length(record):
333
+ # then record as a difference
334
334
  self.data[bcv] = record
335
335
  # items that are only replacements
336
- self.replaceonly: dict[str, DiffRecord] = {
336
+ # self.replaceonly: dict[str, DiffRecord] = {
337
+ # bcv: drec
338
+ # for bcv, drec in self.items()
339
+ # if all([(op.opcode in ("equal", "replace")) for op in drec.data])
340
+ # }
341
+ # self.single_replaceonly: dict[str, list[Operation]] = {
342
+ # bcv: oplist
343
+ # for bcv, drec in self.replaceonly.items()
344
+ # if (oplist := [op for op in drec.data if op.single_replace])
345
+ # if oplist
346
+ # }
347
+ # self.dual_replaceonly: dict[str, list[Operation]] = {
348
+ # bcv: oplist
349
+ # for bcv, drec in self.replaceonly.items()
350
+ # if (oplist := [op for op in drec.data if op.dual_replace])
351
+ # if oplist
352
+ # }
353
+ self.replaceonly_same_length: dict[str, DiffRecord] = {
337
354
  bcv: drec
338
355
  for bcv, drec in self.items()
339
- if all([(op.opcode in ("equal", "replace")) for op in drec.data])
340
- }
341
- self.single_replaceonly: dict[str, list[Operation]] = {
342
- bcv: oplist
343
- for bcv, drec in self.replaceonly.items()
344
- if (oplist := [op for op in drec.data if op.single_replace])
345
- if oplist
346
- }
347
- self.dual_replaceonly: dict[str, list[Operation]] = {
348
- bcv: oplist
349
- for bcv, drec in self.replaceonly.items()
350
- if (oplist := [op for op in drec.data if op.dual_replace])
351
- if oplist
356
+ if all((op.opcode in ("equal", "replace")) for op in drec.data)
357
+ if all(op.same_length for op in drec.data)
352
358
  }
353
359
 
354
- def _get_bcv_texts(self, bcv) -> tuple[list[str], list[str]]:
360
+ def _replaceonly_same_length(self, diffrec: DiffRecord) -> bool:
361
+ """True if all operations are 'equal' or 'replace' of same length.
362
+
363
+ That means token IDs don't need to change in NIV11.
364
+ """
365
+ return all((op.opcode in ("equal", "replace")) and op.same_length for op in diffrec.data)
366
+
367
+ def _get_bcv_texts(self, bcv: str) -> tuple[list[str], list[str]]:
355
368
  record = self.data.get(bcv)
356
369
  if record is None:
357
370
  print(f"{bcv}: No differences")
358
- return
371
+ return [], []
359
372
  text84 = [trg.text for trg in self.niv84bcvtargets[bcv]]
360
373
  text11 = [trg.text for trg in self.niv11bcvtargets[bcv]]
361
374
  return text84, text11
362
375
 
363
- def get_single_token_replacements(self) -> dict[str, dict[str, str]]:
364
- # bcv-specific single token replacements
365
- return {bcv: self.replace_single_text(bcv) for bcv, ops in self.single_replaceonly.items()}
376
+ # def get_single_token_replacements(self) -> dict[str, dict[str, str]]:
377
+ # # bcv-specific single token replacements
378
+ # return {bcv: self.replace_single_text(bcv) for bcv, ops in self.single_replaceonly.items()}
366
379
 
367
380
  def display_pair_text(self, bcv: str) -> None:
368
381
  text84, text11 = self._get_bcv_texts(bcv)
@@ -371,27 +384,14 @@ class DiffTargets84(UserDict):
371
384
 
372
385
  # only for single-token replace operations
373
386
  def replace_single_text(self, bcv: str) -> dict[str, str]:
374
- record = self.data.get(bcv)
375
387
  text84, text11 = self._get_bcv_texts(bcv)
376
388
  replacements: dict[str, str] = {}
377
- for op in record.data:
389
+ for op in self.data.get(bcv, []):
378
390
  if op.single_replace:
379
391
  k = text11[op.start2 : op.end2][0]
380
392
  replacements[k] = text84[op.start1 : op.end1][0]
381
393
  return replacements
382
394
 
383
- # replacements where one or both sides have two tokens
384
- # could consolidate this with replace_single_text
385
- def replace_dual_text(self, bcv: str) -> dict[str, str]:
386
- record = self.data.get(bcv)
387
- text84, text11 = self._get_bcv_texts(bcv)
388
- replacements: dict[str, str] = {}
389
- for op in record.data:
390
- if op.dual_replace:
391
- k = tuple(text11[op.start2 : op.end2])
392
- replacements[k] = tuple(text84[op.start1 : op.end1])
393
- return replacements
394
-
395
395
  def mismatched_verses(self) -> dict[str, AlignmentRecord]:
396
396
  """Some alignment records have a source in one verse and a target in another."""
397
397
  return {
@@ -486,8 +486,7 @@ class DiffTargets84(UserDict):
486
486
  self.output_alrecs[alrec.meta.id] = True
487
487
  except KeyError as e:
488
488
  # Selectors: ['42001025012']
489
- # niv11map keys: dict_keys(['42001024001', '42001024002', '42001024003', '42001024004', '42001024005', '42001024006', '42001024007', '42001024008', '42001024009', '42001024010', '42001024011', '42001024012', '42001024013', '42001024014', '42001024015'])
490
-
489
+ # niv11map keys: dict_keys(['42001024001', '42001024002', '42001024003', ...])
491
490
  print(f"--- {versedata.bcvid}, KeyError on {e}")
492
491
  print(f"Record: {alrec}")
493
492
 
@@ -584,26 +583,51 @@ class DiffTargets84(UserDict):
584
583
  # niv11replace
585
584
 
586
585
 
587
- class Interlinear:
588
- """Line up NIV84 and NIV11 tokens opposite source (SBLGNT) tokens.
586
+ class Serialize:
587
+ """Serialize confident alignments for ClearAligner.
589
588
 
590
- For each source verse, outputs one row per source token (in verse order)
591
- showing the aligned NIV84 token(s) and corresponding NIV11 token(s).
589
+ With include_partials True, this also includes partial alignments.
592
590
 
593
- Alignment records with one source token expand to one row per target, with
594
- the source column blank on continuation rows. Records with multiple source
595
- tokens (many-to-many) collapse to one row per source token with all targets
596
- joined on that row.
591
+ Also outputs difference records and difference information on
592
+ tokens as a checklist of things to review.
597
593
 
598
- After all source-token rows, any NIV11 tokens not yet emitted are appended
599
- on blank source rows, sorted by token ID.
600
594
  """
601
595
 
602
- def __init__(self, dt84: DiffTargets84) -> None:
596
+ # collects alignment records that didn't produce partials: BCVID -> list of records
597
+ unmapped_records: dict[str, list[AlignmentRecord]] = {}
598
+
599
+ def __init__(self, dt84: DiffTargets84, include_partials: bool = False) -> None:
603
600
  self.dt84 = dt84
604
601
  self.mgr84 = dt84.mgr84
605
602
  self.niv84bcvtargets = dt84.niv84bcvtargets
606
603
  self.niv11bcvtargets = dt84.niv11bcvtargets
604
+ # new Document for AlignmentRecord instances
605
+ self.niv11_document: Document = Document(docid="NIV11", scheme="BCVW")
606
+ # construct a new manager, with mappings to NIV11
607
+ self.niv11alset: AlignmentSet = AlignmentSet(
608
+ targetlanguage=self.mgr84.alignmentset.targetlanguage,
609
+ targetid="NIV11",
610
+ sourceid=self.mgr84.alignmentset.sourceid,
611
+ langdatapath=self.mgr84.alignmentset.langdatapath,
612
+ )
613
+ # read the existing alignments but then replace the alignment records
614
+ self.mgr11: Manager = Manager(self.niv11alset)
615
+ self.mgr11.targetitems = self.dt84.targets11
616
+ self.niv11_algroup: AlignmentGroup = self.niv11_alignment_group(
617
+ include_partials=include_partials
618
+ )
619
+ self.mgr11.bcv["records"] = groupby_bcv(
620
+ list(self.niv11_algroup.records), lambda r: r.source_bcv
621
+ )
622
+ # and make VerseData instances for alignments
623
+ versedata: dict[str, VerseData] = {}
624
+ for bcvid in self.mgr11.bcv["records"]:
625
+ try:
626
+ vd: VerseData = self.mgr11.make_versedata(bcvid)
627
+ versedata[bcvid] = vd
628
+ except KeyError:
629
+ print(f"Warning: no records for {bcvid} in NIV11; skipping verse")
630
+ self.mgr11.bcv["versedata"] = versedata
607
631
 
608
632
  def _niv84_to_niv11(self, bcv: str) -> dict[str, list[Target]]:
609
633
  """Map NIV84 token IDs to NIV11 Target tokens for a verse.
@@ -637,142 +661,193 @@ class Interlinear:
637
661
  # delete → key omitted; insert → no NIV84 token, omitted from map
638
662
  return result
639
663
 
640
- def _verse_rows(self, bcv: str, versedata: "VerseData") -> list[tuple[str, str, str, str]]:
641
- """Return (source_str, niv84_str, niv11_str) tuples for one verse."""
642
-
643
- # def _niv11_token_string(tokens: list[BaseToken]) -> str:
644
- # niv11_tokens = [t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])]
645
- # return " ".join(t.tokenstr for t in niv11_tokens)
646
-
647
- interlabels: dict[str, str] = {
648
- "many-to-many": "+-+",
649
- "one-to-many": "1-1",
650
- "unaligned": "0",
651
- "unmatched": "-",
652
- }
653
- niv84_by_id: dict[str, Target] = {t.id: t for t in self.niv84bcvtargets.get(bcv, [])}
654
- niv84_to_niv11: dict[str, list[Target]] = self._niv84_to_niv11(bcv)
655
-
656
- src_to_alrecs: dict[str, list[AlignmentRecord]] = {}
657
- for alrec in versedata.records:
658
- for src_id in alrec.source_selectors:
659
- src_to_alrecs.setdefault(src_id, []).append(alrec)
660
-
661
- niv11_emitted: set[str] = set()
662
- rows: list[tuple[str, str, str, str]] = []
663
-
664
- for src_token in versedata.sources:
665
- alrecs = src_to_alrecs.get(src_token.id, [])
666
- if not alrecs:
667
- rows.append((interlabels["unaligned"], src_token.tokenstr, "", ""))
668
- continue
669
- for alrec in alrecs:
670
- niv84_ids = alrec.target_selectors
671
- niv84_tokens = [niv84_by_id[tid] for tid in niv84_ids if tid in niv84_by_id]
672
- niv84_str = " ".join(t.tokenstr for t in niv84_tokens)
673
- if len(alrec.source_selectors) > 1:
674
- # many-to-many: join all targets on one row per source token
675
- # duplicates here for 40001023004|ἐν
676
- niv11_tokens = sorted(
677
- {t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])}
678
- )
679
- niv11_str = " ".join(t.tokenstr for t in niv11_tokens)
680
- niv11_emitted.update(t.id for t in niv11_tokens)
681
- rows.append(
682
- (
683
- interlabels["many-to-many"],
684
- src_token.tokenstr,
685
- niv84_str,
686
- niv11_str,
687
- )
688
- )
689
- else:
690
- # one-to-many: one row per group of NIV84 targets, blank source on continuation
691
- niv84_tokens = [niv84_by_id[tid] for tid in niv84_ids if tid in niv84_by_id]
692
- niv84_str = " ".join(t.tokenstr for t in niv84_tokens)
693
- # duplicates here for 40001019006|δίκαιος
694
- niv11_tokens = sorted(
695
- {t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])}
696
- )
697
- # niv11_tokens = [t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])]
698
- niv11_str = " ".join(t.tokenstr for t in niv11_tokens)
699
- niv11_emitted.update(t.id for t in niv11_tokens)
700
- rows.append(
701
- (
702
- interlabels["one-to-many"],
703
- src_token.tokenstr,
704
- niv84_str,
705
- niv11_str,
706
- )
707
- )
708
-
709
- # Append any NIV11 tokens not yet emitted, sorted by ID
710
- unmatched = sorted(
711
- [t for t in self.niv11bcvtargets.get(bcv, []) if t.id not in niv11_emitted],
712
- key=lambda t: t.id,
713
- )
714
- allunmatched = " ".join(t.tokenstr for t in unmatched)
715
- rows.append((interlabels["unmatched"], "", "", allunmatched))
716
-
717
- return rows
718
-
719
- def write_tsv(self, outpath: Optional[Path] = None) -> None:
720
- """Write one TSV row per source token across all verses."""
721
- if not outpath:
722
- outdir = self.mgr84.alignmentset.langdatapath / "NIV84-NIV11"
723
- outdir.mkdir(parents=True, exist_ok=True)
724
- outpath = outdir / "NIV84-NIV11-interlinear.tsv"
725
- with outpath.open("w", encoding="utf-8") as f:
726
- f.write("Label\tSource\tNIV84\tNIV11\n")
727
- for bcv, versedata in self.mgr84.bcv["versedata"].items():
728
- for row in self._verse_rows(bcv, versedata):
729
- f.write("\t".join(row) + "\n")
730
-
731
-
732
- class Serialize:
733
- """Serialize confident alignments for ClearAligner.
734
-
735
- Also outputs difference records and difference information on
736
- tokens as a checklist of things to review.
737
-
738
- """
739
-
740
- def __init__(self, dt84: DiffTargets84) -> None:
741
- self.dt84 = dt84
742
- self.mgr84 = dt84.mgr84
743
- self.niv84bcvtargets = dt84.niv84bcvtargets
744
- self.niv11bcvtargets = dt84.niv11bcvtargets
745
- # construct a new manager, with mappings to NIV11
746
- niv11alset: AlignmentSet = AlignmentSet(
747
- targetlanguage=self.mgr84.alignmentset.targetlanguage,
748
- targetid="NIV11",
749
- sourceid=self.mgr84.alignmentset.sourceid,
750
- langdatapath=self.mgr84.alignmentset.langdatapath,
751
- )
752
- # read the existing alignments but then replace the alignment records
753
- self.mgr11: Manager = Manager(niv11alset)
754
-
664
+ # if the target IDs are the same, even if the NIV11 tokens are
665
+ # different, we can still use the NIV84 alignment records
666
+ # but xverse mapping is needed
755
667
def _alrecs_to_niv11(self, bcv: str) -> list[AlignmentRecord]:
    """Return the alignment records from NIV84, mapped to NIV11 targets.

    Only meaningful when there aren't significant differences between the
    two versions for this verse: each NIV84 target selector is carried
    over unchanged unless the cross-verse map supplies a replacement ID.

    Args:
        bcv: verse identifier used to look up NIV84 tokens and records.

    Returns:
        New AlignmentRecord instances (the NIV84 records are not modified)
        whose target reference points at the NIV11 document and whose
        metadata copy is tagged with origin "NIV84_transfer".
    """
    niv84_tokens = self.niv84bcvtargets.get(bcv, [])
    alrecs_niv84: list[AlignmentRecord] = self.mgr84.bcv["records"][bcv]
    # this only works because each token is equal or equivalent
    #
    # this handles any cases of cross-verse boundary changes
    niv84_niv11_xverse_map: dict[str, str] = {
        t84.id: self.dt84.niv84_niv11_map.get(t84.id, t84.id) for t84 in niv84_tokens
    }
    # in theory, the xverse map shouldn't interact with the diff-based map ...
    new_alrecs: list[AlignmentRecord] = []
    for alrec in alrecs_niv84:
        # selectors outside this verse's tokens pass through unchanged;
        # falsy (empty) results are dropped
        niv11_selectors: list[str] = [
            xverse_sel
            for sel in alrec.target_selectors
            if (xverse_sel := niv84_niv11_xverse_map.get(sel, sel))
        ]
        new_reference: AlignmentReference = AlignmentReference(
            document=self.niv11_document, selectors=niv11_selectors
        )
        # copy the metadata so tagging the origin does not touch the NIV84 record
        newmeta = copy.deepcopy(alrec.meta)
        newmeta.origin = "NIV84_transfer"
        new_alrecs.append(
            AlignmentRecord(
                # use the tagged copy, not alrec.meta, so the origin is recorded
                meta=newmeta,
                references={
                    "source": alrec.references["source"],
                    "target": new_reference,
                },
                type=alrec.type,
            )
        )

    return new_alrecs
709
+
710
+ # from Claude
711
def collect_partial_records(self, bcv: str) -> list[AlignmentRecord]:
    """Transfer the confidently-mappable NIV84 records of a diff verse to NIV11.

    Diff operations with opcode 'equal', or 'replace' with the same token
    count on both sides, pair NIV84 and NIV11 tokens positionally. A record
    is transferred only if every one of its target selectors has such a
    pairing; otherwise it is remembered in self.unmapped_records[bcv]
    (once per verse) and left out of the result.

    Returns an empty list when the verse has no DiffRecord, no verse data
    or records, or no confident token pairing at all.
    """
    diffrec = self.dt84.data.get(bcv)
    if diffrec is None:
        return []
    versedata = self.mgr84.bcv["versedata"].get(bcv)
    if versedata is None or not versedata.records:
        return []

    tokens84 = self.niv84bcvtargets.get(bcv, [])
    tokens11 = self.niv11bcvtargets.get(bcv, [])

    # NIV84 token ID -> NIV11 token ID, for confident operations only
    id84_to_id11: dict[str, str] = {}
    for op in diffrec.data:
        is_confident = op.opcode == "equal" or (op.opcode == "replace" and op.same_length)
        if not is_confident:
            continue
        span84 = tokens84[op.start1 : op.end1]
        span11 = tokens11[op.start2 : op.end2]
        id84_to_id11.update((t84.id, t11.id) for t84, t11 in zip(span84, span11))

    if not id84_to_id11:
        return []

    transferred: list[AlignmentRecord] = []
    for alrec in versedata.records:
        mapped_ids = [id84_to_id11.get(sel) for sel in alrec.target_selectors]
        if any(mapped is None for mapped in mapped_ids):
            # some selector is outside every confident span: skip the record,
            # but keep track of it for later review
            pending = self.unmapped_records.setdefault(bcv, [])
            if alrec not in pending:
                pending.append(alrec)
            continue
        # all selectors mapped confidently
        tagged_meta = copy.deepcopy(alrec.meta)
        tagged_meta.origin = "NIV84_partial_transfer"
        target_reference = AlignmentReference(
            document=self.niv11_document, selectors=mapped_ids
        )
        transferred.append(
            AlignmentRecord(
                meta=tagged_meta,
                references={
                    "source": alrec.references["source"],
                    "target": target_reference,
                },
                type=alrec.type,
            )
        )

    return transferred
781
+
782
def niv11_alignment_group(self, include_partials: bool = False) -> AlignmentGroup:
    """Assemble the NIV11 AlignmentGroup from records transferred out of NIV84.

    Verses that have an entry in self.dt84 are skipped in the main pass;
    all other verses contribute the output of _alrecs_to_niv11(). When
    include_partials is true, the records from collect_partial_records()
    for every verse in self.dt84.data are added and the combined list is
    sorted. Roles, source document ID, canon and type are taken from the
    NIV84 group.
    """
    source_group: AlignmentGroup = self.mgr84.alignmentsreader.alignmentgroup
    source_document: Document = source_group.documents[0]
    group_meta: Metadata = Metadata(conformsTo="0.3", creator="NIV84-NIV11 transfer")

    collected: list[AlignmentRecord] = []
    for bcv in self.mgr84.bcv["records"].keys():
        # only those that map cleanly
        if bcv in self.dt84:
            continue
        collected.extend(self._alrecs_to_niv11(bcv))

    if include_partials:
        partials: list[AlignmentRecord] = []
        for bcv in self.dt84.data.keys():
            partials.extend(self.collect_partial_records(bcv))
        collected = sorted(collected + partials)

    return AlignmentGroup(
        documents=(source_document, self.niv11_document),
        meta=group_meta,
        records=collected,
        roles=source_group.roles,
        sourcedocid=source_group.sourcedocid,
        canon=source_group.canon,
        _type=source_group._type,
    )
811
+
812
def write_unmapped_records(self, outpath: Path = None) -> None:
    """Write the NIV84 records that were not transferred as partials.

    One TSV row is written per record collected in self.unmapped_records:
    the verse ID and the space-joined NIV84 token strings of the record's
    target selectors ("<unknown>" for a selector with no token in that
    verse). A record is written only once, even if it is listed again.

    Args:
        outpath: file to write. When omitted, the file
            NIV84-NIV11-unmappedrecords.tsv is written to the NIV84-NIV11
            directory under the NIV84 langdatapath, creating the directory
            if needed.
    """
    if not outpath:
        outdir = self.mgr84.alignmentset.langdatapath / "NIV84-NIV11"
        outdir.mkdir(parents=True, exist_ok=True)
        outpath = outdir / "NIV84-NIV11-unmappedrecords.tsv"
    # records already written, so each one is emitted at most once
    unmapped_output: set[AlignmentRecord] = set()
    with outpath.open("w", encoding="utf-8") as f:
        f.write("Verse\tNIV84 Tokens\n")
        for bcv, alreclist in self.unmapped_records.items():
            niv84_bcv_tokens: list[Target] = self.niv84bcvtargets.get(bcv, [])
            niv84_bcv_tokenstrs: dict[str, str] = {t.id: t.tokenstr for t in niv84_bcv_tokens}
            for alrec in alreclist:
                if alrec in unmapped_output:
                    continue
                unmapped_output.add(alrec)
                # keyed by selector, so a repeated selector contributes one token
                niv84_str = {
                    sel: niv84_bcv_tokenstrs.get(sel, "<unknown>")
                    for sel in alrec.target_selectors
                }
                # join outside the f-string: reusing the enclosing quote
                # character inside a replacement field needs Python 3.12+
                tokens = " ".join(niv84_str.values())
                f.write(f"{bcv}\t{tokens}\n")
834
+
835
def write_diffs(self, outpath: Path = None) -> None:
    """Dump every diff operation as a TSV checklist for manual alignment.

    Each row holds the verse ID, the operation's opcode, and the
    space-joined NIV84 and NIV11 token strings covered by the operation.
    Without an outpath, NIV84-NIV11-diffs.tsv is written to the
    NIV84-NIV11 directory under the NIV84 langdatapath (created if needed).
    """
    if not outpath:
        outdir = self.mgr84.alignmentset.langdatapath / "NIV84-NIV11"
        outdir.mkdir(parents=True, exist_ok=True)
        outpath = outdir / "NIV84-NIV11-diffs.tsv"

    def _rows():
        # one formatted line per diff operation, in verse then operation order
        for bcv, diffrec in self.dt84.data.items():
            tokens84 = self.niv84bcvtargets.get(bcv, [])
            tokens11 = self.niv11bcvtargets.get(bcv, [])
            for op in diffrec.data:
                text84 = " ".join(t.tokenstr for t in tokens84[op.start1 : op.end1])
                text11 = " ".join(t.tokenstr for t in tokens11[op.start2 : op.end2])
                yield f"{bcv}\t{op.opcode}\t{text84}\t{text11}\n"

    with outpath.open("w", encoding="utf-8") as f:
        f.write("Verse\tOpCode\tNIV84 Tokens\tNIV11 Tokens\n")
        f.writelines(_rows())
@@ -77,8 +77,7 @@ class Merger:
77
77
  data2: Optional[VerseData] = cast(
78
78
  Optional[VerseData], self.mgr2.bcv["versedata"].get(bcv)
79
79
  )
80
- if data1 and data2:
81
- diffs: list[DiffRecord] = data1.diff(data2)
80
+ diffs: list[DiffRecord] = data1.diff(data2) if data1 and data2 else []
82
81
  bcv_pairs[bcv] = BCVPair(
83
82
  bcv=bcv,
84
83
  mgr1_data=data1,
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "biblealignlib"
3
- version = "0.3.2"
3
+ version = "0.4.0"
4
4
  description = "Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments."
5
5
  authors = ["Sean Boisen <sean.boisen@biblica.com>"]
6
6
  license = "MIT"
@@ -1,93 +0,0 @@
1
- """Transfer alignment data from one AlignmentSet to another on a closely related targe version.
2
-
3
- Example: if NIV84 alignments are more complete than NIV2011
4
- alignments, this code can transfer the data from the former to the
5
- latter, where the surface text is the same, and where NIV84 has
6
- alignment record that is missing from NIV11.
7
-
8
- Input is two Manager instances, which must be based on the same source
9
- and target language. If the target versions are the _same_, use
10
- merger.py instead.
11
-
12
- - If the source or target languages aren't the same, this code is
13
- not relevant to your problem.
14
-
15
-
16
- >>> from biblealignlib.burrito import CLEARROOT, Manager, AlignmentSet
17
- >>> from biblealignlib.util import Transfer
18
- >>> targetlang, sourceid = ("eng", "SBLGNT")
19
- # get manager instances for two sets of alignments
20
- >>> niv84as = AlignmentSet(targetlanguage=targetlang,
21
- targetid="NIV84",
22
- sourceid=sourceid,
23
- langdatapath=(CLEARROOT / f"alignments-{targetlang}/data"))
24
- >>> niv84mgr = Manager(niv84as)
25
- >>> niv11as = AlignmentSet(targetlanguage=targetlang,
26
- targetid="NIV11",
27
- sourceid=sourceid,
28
- langdatapath=(CLEARROOT / f"alignments-{targetlang}/data"))
29
- >>> niv11mgr = Manager(niv11as)
30
- # instantiate a Transfer instance
31
- >>> transferinst = Transfer.Transfer(niv84mgr, niv11mgr)
32
-
33
- """
34
-
35
- from collections import Counter, UserDict
36
- from typing import cast, Optional
37
-
38
- from ..burrito import Manager, VerseData
39
- from ..burrito import DiffRecord
40
- from ..burrito.util import groupby_bcid
41
- from . import BCVPair
42
-
43
-
44
- class Transfer(UserDict):
45
-
46
- def __init__(self, mgr1: Manager, mgr2: Manager) -> None:
47
- """Initialize an instance."""
48
- super().__init__()
49
- self.mgr1 = mgr1
50
- self.mgr2 = mgr2
51
- for attr in ("sourceid", "targetlanguage"):
52
- mgr1attr = getattr(self.mgr1.alignmentset, attr)
53
- mgr2attr = getattr(self.mgr2.alignmentset, attr)
54
- if mgr1attr != mgr2attr:
55
- raise ValueError(
56
- f"Managers must have the same {attr!r} attribute, but {mgr1attr} != {mgr2attr}"
57
- )
58
- # should be the same for both
59
- self.allsrcbcv = mgr1.bcv["sources"]
60
- self.data: dict[str, BCVPair] = self.get_bcv_pairs()
61
- self.pairingcounts = Counter(bcvp.pairing for bcvp in self.values())
62
- # overlaps
63
- self.overlaps = [bcvp for bcvp in self.values() if bcvp.pairing == "both"]
64
- # overlaps with differences
65
- self.diffpairs = [bcvp for bcvp in self.overlaps if bcvp.diffs]
66
-
67
- def get_bcv_pairs(self) -> dict[str, BCVPair]:
68
- """Return a dictionary of BCVPair instances."""
69
- bcv_pairs: dict[str, BCVPair] = {}
70
- for bcv in self.allsrcbcv:
71
- data1: Optional[VerseData] = cast(
72
- Optional[VerseData], self.mgr1.bcv["versedata"].get(bcv)
73
- )
74
- data2: Optional[VerseData] = cast(
75
- Optional[VerseData], self.mgr2.bcv["versedata"].get(bcv)
76
- )
77
- if data1 and data2:
78
- diffs: list[DiffRecord] =
79
- bcv_pairs[bcv] = BCVPair(
80
- bcv=bcv,
81
- mgr1_data=data1,
82
- mgr2_data=data2,
83
- )
84
- return bcv_pairs
85
-
86
- def show_diffs(self) -> None:
87
- """Display information about overlaps that differ."""
88
- overlap_bcs = groupby_bcid([bcvp.bcv for bcvp in self.diffpairs])
89
- print(f"{len(overlap_bcs)} overlapping and different chapters: {overlap_bcs.keys()}")
90
- for bcvpair in self.diffpairs:
91
- vd1 = bcvpair.mgr1_data.alignments if bcvpair.mgr1_data else ()
92
- vd2 = bcvpair.mgr2_data.alignments if bcvpair.mgr2_data else ()
93
- print(bcvpair.bcv, ": ", str(len(vd1)), "---", str(len(vd2)))
File without changes
File without changes
File without changes