biblealignlib 0.3.2__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/PKG-INFO +1 -1
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/AlignmentGroup.py +76 -9
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/BaseToken.py +9 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/VerseData.py +20 -3
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/__init__.py +2 -1
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/alignments.py +27 -11
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/manager.py +2 -1
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/source.py +8 -2
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/DiffTargets.py +266 -191
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/merger.py +1 -2
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/pyproject.toml +1 -1
- biblealignlib-0.3.2/biblealignlib/util/Transfer.py +0 -93
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/LICENSE +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/LICENSE.md +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/README.md +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/__init__.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/Score.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/__init__.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/corpusmapping.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/eflomal.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/mapper.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/reader.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/runeflomal.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/scorer.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/autoalign/writer.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/AlignmentSet.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/AlignmentType.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/BadRecord.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/DiffRecord.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/target.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/burrito/util.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/coverage/Coverage.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/coverage/__init__.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/coverage/analyzer.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/coverage/exporter.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/coverage/filters.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/interlinear/__init__.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/interlinear/reverse.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/interlinear/token.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/strongs.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/DiffAlignments.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/__init__.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/canonsplit.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/tokens_to_chars.py +0 -0
- {biblealignlib-0.3.2 → biblealignlib-0.4.0}/biblealignlib/util/vocab.py +0 -0
|
@@ -27,7 +27,7 @@ from biblelib.word import bcvwpid
|
|
|
27
27
|
import biblealignlib as bal
|
|
28
28
|
|
|
29
29
|
from .AlignmentType import TranslationType
|
|
30
|
-
from .source import macula_prefixer
|
|
30
|
+
from .source import macula_prefixer, macula_unprefixer
|
|
31
31
|
|
|
32
32
|
|
|
33
33
|
# hoisting means this can be defined at several different levels, so
|
|
@@ -291,8 +291,15 @@ class AlignmentRecord:
|
|
|
291
291
|
"""
|
|
292
292
|
self.references["target"].selectors = sorted(selectors)
|
|
293
293
|
|
|
294
|
+
# note that source/target_tokens are only available from a Manager
|
|
295
|
+
# instance, so the default repr doesn't include tokenstrs.
|
|
294
296
|
def asdict(
|
|
295
|
-
self,
|
|
297
|
+
self,
|
|
298
|
+
positional: bool = False,
|
|
299
|
+
withmeta: bool = True,
|
|
300
|
+
withmaculaprefix: bool = False,
|
|
301
|
+
source_tokens: Optional[dict[str, Any]] = None,
|
|
302
|
+
target_tokens: Optional[dict[str, Any]] = None,
|
|
296
303
|
) -> dict[str, Any]:
|
|
297
304
|
"""Return a dict of values suitable for serialization.
|
|
298
305
|
|
|
@@ -307,6 +314,14 @@ class AlignmentRecord:
|
|
|
307
314
|
With withmaculaprefix=True (the default is False), prefix
|
|
308
315
|
source references with 'o' or 'n' depending on canon.
|
|
309
316
|
|
|
317
|
+
With source_tokens provided as a dict mapping bare token IDs to token
|
|
318
|
+
objects, source selectors are replaced with tokenstr representations
|
|
319
|
+
("{id}|{text}"). With withmaculaprefix=True, the prefixed ID is used.
|
|
320
|
+
|
|
321
|
+
With target_tokens provided as a dict mapping token IDs to token
|
|
322
|
+
objects, target selectors are replaced with tokenstr representations
|
|
323
|
+
("{id}|{text}").
|
|
324
|
+
|
|
310
325
|
"""
|
|
311
326
|
recdict: dict[str, Any] = {}
|
|
312
327
|
if positional:
|
|
@@ -319,12 +334,28 @@ class AlignmentRecord:
|
|
|
319
334
|
else:
|
|
320
335
|
# typical case
|
|
321
336
|
sourcerefs: list[str] = self.references["source"].selectors
|
|
322
|
-
if
|
|
337
|
+
if source_tokens is not None:
|
|
338
|
+
# Build tokenstr: use bare ID by default, prefixed ID if withmaculaprefix
|
|
339
|
+
bare_ids = [macula_unprefixer(sel) for sel in sourcerefs]
|
|
340
|
+
display_ids = (
|
|
341
|
+
[macula_prefixer(b) for b in bare_ids] if withmaculaprefix else bare_ids
|
|
342
|
+
)
|
|
343
|
+
sourcerefs = [
|
|
344
|
+
f"{did}|{tok.text}" if (tok := source_tokens.get(bare)) else did
|
|
345
|
+
for bare, did in zip(bare_ids, display_ids)
|
|
346
|
+
]
|
|
347
|
+
elif withmaculaprefix:
|
|
323
348
|
# default: add back the Macula prefix
|
|
324
349
|
sourcerefs = [macula_prefixer(srcstr) for srcstr in sourcerefs]
|
|
325
350
|
# else leave as is (atypical)
|
|
326
351
|
recdict["source"] = sourcerefs
|
|
327
|
-
|
|
352
|
+
targetrefs: list[str] = self.references["target"].selectors
|
|
353
|
+
if target_tokens is not None:
|
|
354
|
+
targetrefs = [
|
|
355
|
+
f"{sel}|{tok.text}" if (tok := target_tokens.get(sel)) else sel
|
|
356
|
+
for sel in targetrefs
|
|
357
|
+
]
|
|
358
|
+
recdict["target"] = targetrefs
|
|
328
359
|
if withmeta:
|
|
329
360
|
recdict.update(
|
|
330
361
|
{
|
|
@@ -380,12 +411,25 @@ class AlignmentGroup:
|
|
|
380
411
|
docids: tuple[str, str] = tuple([doc.asdict()["docid"] for doc in self.documents])
|
|
381
412
|
return f"<AlignmentGroup{docids}: {len(self.records)} records>"
|
|
382
413
|
|
|
383
|
-
def asdict(
|
|
414
|
+
def asdict(
|
|
415
|
+
self,
|
|
416
|
+
hoist: bool = True,
|
|
417
|
+
source_tokens: Optional[dict[str, Any]] = None,
|
|
418
|
+
target_tokens: Optional[dict[str, Any]] = None,
|
|
419
|
+
) -> dict[str, Any]:
|
|
384
420
|
"""Return a dict of values suitable for serialization.
|
|
385
421
|
|
|
386
422
|
This is opinionated about the preferred serialization: hoists
|
|
387
423
|
as much as possible to upper levels.
|
|
388
424
|
|
|
425
|
+
With source_tokens provided as a dict mapping bare token IDs to token
|
|
426
|
+
objects, source selectors in each record are replaced with tokenstr
|
|
427
|
+
representations ("{id}|{text}").
|
|
428
|
+
|
|
429
|
+
With target_tokens provided as a dict mapping token IDs to token
|
|
430
|
+
objects, target selectors in each record are replaced with tokenstr
|
|
431
|
+
representations ("{id}|{text}").
|
|
432
|
+
|
|
389
433
|
"""
|
|
390
434
|
# for now
|
|
391
435
|
positional: bool = False
|
|
@@ -395,7 +439,13 @@ class AlignmentGroup:
|
|
|
395
439
|
"meta": self.meta.asdict(),
|
|
396
440
|
"type": self._type,
|
|
397
441
|
"records": [
|
|
398
|
-
rec.asdict(
|
|
442
|
+
rec.asdict(
|
|
443
|
+
positional=positional,
|
|
444
|
+
withmeta=withmeta,
|
|
445
|
+
source_tokens=source_tokens,
|
|
446
|
+
target_tokens=target_tokens,
|
|
447
|
+
)
|
|
448
|
+
for rec in self.records
|
|
399
449
|
],
|
|
400
450
|
}
|
|
401
451
|
|
|
@@ -446,10 +496,27 @@ class TopLevelGroups:
|
|
|
446
496
|
"""Return a printed representation."""
|
|
447
497
|
return f"<TopLevelGroups({self.targetdocid}): {self.sourcedocids}>"
|
|
448
498
|
|
|
449
|
-
def asdict(
|
|
450
|
-
|
|
499
|
+
def asdict(
|
|
500
|
+
self,
|
|
501
|
+
hoist: bool = True,
|
|
502
|
+
source_tokens: Optional[dict[str, Any]] = None,
|
|
503
|
+
target_tokens: Optional[dict[str, Any]] = None,
|
|
504
|
+
) -> dict[str, Any]:
|
|
505
|
+
"""Return an opinionated dict of values suitable for serialization.
|
|
506
|
+
|
|
507
|
+
With source_tokens and target_tokens, passes them to each group's
|
|
508
|
+
asdict() so that selectors are replaced with tokenstr representations.
|
|
509
|
+
|
|
510
|
+
"""
|
|
451
511
|
return {
|
|
452
512
|
"format": self.format,
|
|
453
513
|
"version": self.version,
|
|
454
|
-
"groups": [
|
|
514
|
+
"groups": [
|
|
515
|
+
self.groups[0].asdict(
|
|
516
|
+
hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens
|
|
517
|
+
),
|
|
518
|
+
self.groups[1].asdict(
|
|
519
|
+
hoist=hoist, source_tokens=source_tokens, target_tokens=target_tokens
|
|
520
|
+
),
|
|
521
|
+
],
|
|
455
522
|
}
|
|
@@ -92,3 +92,12 @@ def bare_id(identifier: str) -> str:
|
|
|
92
92
|
identifier
|
|
93
93
|
), f"'{identifier}' does not look like a valid BCVWPID identifier."
|
|
94
94
|
return identifier[1:] if identifier[0].isalpha() else identifier
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def strip_tokenstr(selector: str) -> str:
|
|
98
|
+
"""Return only the ID portion of a selector, dropping any tokenstr text suffix.
|
|
99
|
+
|
|
100
|
+
A tokenstr selector has the form "{id}|{text}" (e.g. "n41004003001|Ἀκούετε").
|
|
101
|
+
Plain IDs without a '|' are returned unchanged.
|
|
102
|
+
"""
|
|
103
|
+
return selector.split("|", 1)[0] if "|" in selector else selector
|
|
@@ -156,15 +156,32 @@ class VerseData:
|
|
|
156
156
|
for trg in targets:
|
|
157
157
|
print(f"Target: {trg._display}")
|
|
158
158
|
|
|
159
|
-
def display_record(self, alrec: AlignmentRecord) -> None:
|
|
160
|
-
"""Display an alignment record from this instance.
|
|
159
|
+
def display_record(self, alrec: AlignmentRecord, srcwidth: Optional[int] = None) -> None:
|
|
160
|
+
"""Display an alignment record from this instance.
|
|
161
|
+
|
|
162
|
+
srcwidth sets the minimum column width for the source token string;
|
|
163
|
+
defaults to the length of the source token string (no padding).
|
|
164
|
+
The source column is left-justified within that width.
|
|
165
|
+
"""
|
|
161
166
|
source_tokenstring: str = ", ".join(
|
|
162
167
|
[self.sourceitems[sel].tokenstr for sel in alrec.source_selectors]
|
|
163
168
|
)
|
|
164
169
|
target_tokenstring: str = ", ".join(
|
|
165
170
|
[self.targetitems[sel].tokenstr for sel in alrec.target_selectors]
|
|
166
171
|
)
|
|
167
|
-
|
|
172
|
+
width = srcwidth if srcwidth is not None else len(source_tokenstring)
|
|
173
|
+
print(f"{alrec.meta.id}: {source_tokenstring:<{width}} --- {target_tokenstring}")
|
|
174
|
+
|
|
175
|
+
def display_records(self) -> None:
|
|
176
|
+
"""Display all alignment records with a consistent source column width."""
|
|
177
|
+
if not self.records:
|
|
178
|
+
return
|
|
179
|
+
srcwidth: int = max(
|
|
180
|
+
len(", ".join(self.sourceitems[sel].tokenstr for sel in alrec.source_selectors))
|
|
181
|
+
for alrec in self.records
|
|
182
|
+
)
|
|
183
|
+
for alrec in self.records:
|
|
184
|
+
self.display_record(alrec, srcwidth=srcwidth)
|
|
168
185
|
|
|
169
186
|
def unaligned(self, typeattr: str = "targets", keepexcluded: bool = False) -> None:
|
|
170
187
|
"""Display tokens from typeattr that are _not_ aligned."""
|
|
@@ -18,7 +18,7 @@ from .AlignmentSet import AlignmentSet
|
|
|
18
18
|
from .AlignmentType import TranslationType
|
|
19
19
|
from .alignments import AlignmentsReader, write_alignment_group
|
|
20
20
|
from .manager import Manager, VerseData
|
|
21
|
-
from .BaseToken import BaseToken, asbool, bare_id
|
|
21
|
+
from .BaseToken import BaseToken, asbool, bare_id, strip_tokenstr
|
|
22
22
|
from .DiffRecord import DiffReason, DiffRecord
|
|
23
23
|
from .source import macula_prefixer, macula_unprefixer, Source, SourceReader
|
|
24
24
|
from .target import Target, TargetReader
|
|
@@ -42,6 +42,7 @@ __all__ = [
|
|
|
42
42
|
"BaseToken",
|
|
43
43
|
"asbool",
|
|
44
44
|
"bare_id",
|
|
45
|
+
"strip_tokenstr",
|
|
45
46
|
# DiffRecord
|
|
46
47
|
"DiffReason",
|
|
47
48
|
"DiffRecord",
|
|
@@ -28,6 +28,7 @@ from .AlignmentGroup import Document, Metadata, AlignmentGroup, AlignmentReferen
|
|
|
28
28
|
from .AlignmentSet import AlignmentSet
|
|
29
29
|
from .AlignmentType import TranslationType
|
|
30
30
|
from .BadRecord import BadRecord, Reason
|
|
31
|
+
from .BaseToken import strip_tokenstr
|
|
31
32
|
from .source import SourceReader, macula_unprefixer
|
|
32
33
|
from .target import TargetReader
|
|
33
34
|
|
|
@@ -110,10 +111,12 @@ class AlignmentsReader:
|
|
|
110
111
|
#
|
|
111
112
|
|
|
112
113
|
def _targetid(self, targetid: str) -> str:
|
|
113
|
-
"""Return a normalized target ID.
|
|
114
|
+
"""Return a normalized target ID, optionally dropping the word-part digit.
|
|
114
115
|
|
|
115
|
-
|
|
116
|
+
Accepts both plain IDs and tokenstr selectors ("{id}|{text}").
|
|
117
|
+
With self.keeptargetwordpart = False, a 12-character ID is truncated to 11.
|
|
116
118
|
"""
|
|
119
|
+
targetid = strip_tokenstr(targetid)
|
|
117
120
|
if not self.keeptargetwordpart and len(targetid) == 12:
|
|
118
121
|
return targetid[:11]
|
|
119
122
|
else:
|
|
@@ -297,23 +300,35 @@ class AlignmentsReader:
|
|
|
297
300
|
|
|
298
301
|
|
|
299
302
|
# copied from gc2sb.manager.write_alignment_group with minor changes
|
|
300
|
-
def write_alignment_group(
|
|
303
|
+
def write_alignment_group(
|
|
304
|
+
group: AlignmentGroup,
|
|
305
|
+
f: TextIO,
|
|
306
|
+
source_tokens: Optional[dict[str, Any]] = None,
|
|
307
|
+
target_tokens: Optional[dict[str, Any]] = None,
|
|
308
|
+
) -> None:
|
|
301
309
|
"""Write JSON data for an arbitrary group in Scripture Burrito format.
|
|
302
310
|
|
|
303
311
|
Writes some of the JSON by hand to get records on the same line.
|
|
304
312
|
Record meta.id values are assigned sequentially per BCV, e.g. "40001001.1".
|
|
313
|
+
|
|
314
|
+
With source_tokens provided as a dict mapping bare token IDs to token
|
|
315
|
+
objects, source selectors are written as tokenstr representations
|
|
316
|
+
("{id}|{text}") instead of plain IDs.
|
|
317
|
+
|
|
318
|
+
With target_tokens provided as a dict mapping token IDs to token objects,
|
|
319
|
+
target selectors are written as tokenstr representations ("{id}|{text}").
|
|
305
320
|
"""
|
|
306
321
|
|
|
307
322
|
def _write_documents(out: TextIO, documents: tuple[Document, Document]) -> None:
|
|
308
323
|
"""Write documents tuple to out."""
|
|
309
324
|
out.write(' "documents": [\n')
|
|
310
|
-
out.write(" " + json.dumps(documents[0].asdict()) + ",\n")
|
|
311
|
-
out.write(" " + json.dumps(documents[1].asdict()) + "\n")
|
|
325
|
+
out.write(" " + json.dumps(documents[0].asdict(), ensure_ascii=False) + ",\n")
|
|
326
|
+
out.write(" " + json.dumps(documents[1].asdict(), ensure_ascii=False) + "\n")
|
|
312
327
|
out.write(" ],\n")
|
|
313
328
|
|
|
314
329
|
def _write_meta(out: TextIO, meta: Metadata) -> None:
|
|
315
330
|
"""Write metadata to out."""
|
|
316
|
-
metarow = '"meta": ' + json.dumps(meta.asdict())
|
|
331
|
+
metarow = '"meta": ' + json.dumps(meta.asdict(), ensure_ascii=False)
|
|
317
332
|
f.write(f" {metarow},\n")
|
|
318
333
|
|
|
319
334
|
def _record_dict(arec: AlignmentRecord, bcv_counters: dict[str, int]) -> dict[str, Any]:
|
|
@@ -324,20 +339,21 @@ def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True)
|
|
|
324
339
|
"""
|
|
325
340
|
bcv = arec.source_bcv
|
|
326
341
|
bcv_counters[bcv] = bcv_counters.get(bcv, 0) + 1
|
|
327
|
-
recdict = arec.asdict()
|
|
342
|
+
recdict = arec.asdict(source_tokens=source_tokens, target_tokens=target_tokens)
|
|
328
343
|
recdict["meta"]["id"] = f"{bcv}.{bcv_counters[bcv]:02}"
|
|
329
344
|
return recdict
|
|
330
345
|
|
|
331
346
|
f.write("{\n")
|
|
332
347
|
_write_documents(f, group.documents)
|
|
333
348
|
_write_meta(f, group.meta)
|
|
334
|
-
f.write(f' "roles": {json.dumps(group.roles)},\n')
|
|
349
|
+
f.write(f' "roles": {json.dumps(group.roles, ensure_ascii=False)},\n')
|
|
335
350
|
f.write(f' "type": "{group._type}",\n "records": [\n ')
|
|
336
351
|
# should sort the records: NIV11 doesn't appear to be sorted
|
|
337
352
|
bcv_counters: dict[str, int] = {}
|
|
338
|
-
|
|
339
|
-
|
|
353
|
+
records = sorted(group.records)
|
|
354
|
+
for arec in records[:-1]:
|
|
355
|
+
json.dump(_record_dict(arec, bcv_counters), f, ensure_ascii=False)
|
|
340
356
|
f.write(",\n ")
|
|
341
357
|
# now the last one without a comma, because JSON
|
|
342
|
-
json.dump(_record_dict(group.records[-1], bcv_counters), f)
|
|
358
|
+
json.dump(_record_dict(group.records[-1], bcv_counters), f, ensure_ascii=False)
|
|
343
359
|
f.write("\n ]}")
|
|
@@ -35,7 +35,7 @@ from collections import UserDict
|
|
|
35
35
|
from typing import TypedDict
|
|
36
36
|
from warnings import warn
|
|
37
37
|
|
|
38
|
-
from .AlignmentGroup import AlignmentRecord
|
|
38
|
+
from .AlignmentGroup import AlignmentGroup, AlignmentRecord
|
|
39
39
|
from .AlignmentSet import AlignmentSet
|
|
40
40
|
from .VerseData import VerseData
|
|
41
41
|
from .alignments import AlignmentsReader
|
|
@@ -114,6 +114,7 @@ class Manager(UserDict):
|
|
|
114
114
|
keepbadrecords=self.keepbadrecords,
|
|
115
115
|
)
|
|
116
116
|
self.alignmentsreader.clean_alignments(self.sourceitems, self.targetitems)
|
|
117
|
+
self.alignmentgroup: AlignmentGroup = self.alignmentsreader.alignmentgroup
|
|
117
118
|
# TODO: upgrade the selectors to use tokenstr. This requires
|
|
118
119
|
# knowing the source and targetitems, but alignmentsreader
|
|
119
120
|
# doesn't have that data
|
|
@@ -53,7 +53,7 @@ from biblealignlib import normalize_strongs, get_canonid
|
|
|
53
53
|
|
|
54
54
|
# should eventually come from Clearlib
|
|
55
55
|
from .util import groupby_key
|
|
56
|
-
from .BaseToken import BaseToken
|
|
56
|
+
from .BaseToken import BaseToken, strip_tokenstr
|
|
57
57
|
|
|
58
58
|
PREFIXRE = re.compile(r"^[no]")
|
|
59
59
|
|
|
@@ -76,7 +76,13 @@ def macula_prefixer(bcvwp: str) -> str:
|
|
|
76
76
|
|
|
77
77
|
|
|
78
78
|
def macula_unprefixer(bcvwp: str) -> str:
|
|
79
|
-
"""Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged.
|
|
79
|
+
"""Drop a corpus prefix ('n' or 'o') from BCVWP, else return unchanged.
|
|
80
|
+
|
|
81
|
+
Also strips any tokenstr text suffix ("{id}|{text}" → "{id}") before
|
|
82
|
+
checking for the prefix, so both plain IDs and tokenstr selectors are
|
|
83
|
+
handled correctly.
|
|
84
|
+
"""
|
|
85
|
+
bcvwp = strip_tokenstr(bcvwp)
|
|
80
86
|
if PREFIXRE.match(bcvwp):
|
|
81
87
|
return bcvwp[1:]
|
|
82
88
|
else:
|
|
@@ -17,12 +17,7 @@ are stored as DiffRecord instances in a dict keyed by BCV.
|
|
|
17
17
|
>>> mgr84 = Manager(alset1)
|
|
18
18
|
>>> dt84 = DiffTargets.DiffTargets84(mgr84, niv11targets)
|
|
19
19
|
>>> len(dt84)
|
|
20
|
-
|
|
21
|
-
# now run it again to account for single-token replacements
|
|
22
|
-
>>> dt84 = DiffTargets.DiffTargets84(mgr84, niv11targets, dt84.get_single_token_replacements())
|
|
23
|
-
>>> len(dt84)
|
|
24
|
-
2860
|
|
25
|
-
|
|
20
|
+
2565
|
|
26
21
|
|
|
27
22
|
>>> alset2 = AlignmentSet(targetlanguage=targetlang,
|
|
28
23
|
targetid="NIV11",
|
|
@@ -38,6 +33,7 @@ are stored as DiffRecord instances in a dict keyed by BCV.
|
|
|
38
33
|
from __future__ import annotations
|
|
39
34
|
|
|
40
35
|
from collections import UserDict
|
|
36
|
+
import copy
|
|
41
37
|
from dataclasses import dataclass
|
|
42
38
|
import difflib
|
|
43
39
|
from itertools import zip_longest
|
|
@@ -45,13 +41,18 @@ from pathlib import Path
|
|
|
45
41
|
from typing import Optional, TextIO, TYPE_CHECKING
|
|
46
42
|
|
|
47
43
|
from biblealignlib.burrito import (
|
|
44
|
+
AlignmentGroup,
|
|
48
45
|
AlignmentRecord,
|
|
46
|
+
AlignmentReference,
|
|
49
47
|
AlignmentSet,
|
|
50
48
|
BaseToken,
|
|
51
49
|
DiffReason,
|
|
52
50
|
DiffRecord,
|
|
51
|
+
Document,
|
|
53
52
|
Manager,
|
|
53
|
+
Metadata,
|
|
54
54
|
Target,
|
|
55
|
+
TargetReader,
|
|
55
56
|
)
|
|
56
57
|
from ..burrito.alignments import write_alignment_group
|
|
57
58
|
from ..burrito.util import groupby_bcv
|
|
@@ -226,15 +227,11 @@ def diff_verse_targets(
|
|
|
226
227
|
)
|
|
227
228
|
|
|
228
229
|
|
|
229
|
-
#
|
|
230
|
-
#
|
|
231
|
-
#
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
# 3784
|
|
235
|
-
# >>> dt84 = DiffTargets.DiffTargets84(mgr84, niv11targets, dt84.get_single_token_replacements())
|
|
236
|
-
# >>> len(dt84)
|
|
237
|
-
# 2879
|
|
230
|
+
# could try here to find alignment records that are a subset of an
|
|
231
|
+
# equal region, and then map the token IDs?
|
|
232
|
+
# then write out revised records and patch onto the alignment data??
|
|
233
|
+
|
|
234
|
+
|
|
238
235
|
# this still doesn't handle multi-term direct replacements: for those we need to ensure semantic compatability
|
|
239
236
|
class DiffTargets84(UserDict):
|
|
240
237
|
missing84: set[str] = {"42023018", "47013014", "64001015"}
|
|
@@ -306,21 +303,21 @@ class DiffTargets84(UserDict):
|
|
|
306
303
|
"64001014033": "64001015018",
|
|
307
304
|
}
|
|
308
305
|
# hacky way to avoid outputing the same alignment record more than once
|
|
309
|
-
output_alrecs: dict[str,
|
|
306
|
+
output_alrecs: dict[str, bool] = {}
|
|
310
307
|
|
|
311
308
|
def __init__(
|
|
312
309
|
self,
|
|
313
310
|
mgr84: Manager,
|
|
314
|
-
targets11:
|
|
311
|
+
targets11: TargetReader,
|
|
315
312
|
bcvequivalents: dict[str, dict[str, str]] = {},
|
|
316
313
|
) -> None:
|
|
317
314
|
super().__init__()
|
|
318
315
|
self.mgr84 = mgr84
|
|
319
316
|
self.niv84bcvtargets: dict[str, list[Target]] = mgr84.bcv["targets"]
|
|
320
|
-
self.targets11:
|
|
317
|
+
self.targets11: TargetReader = targets11
|
|
321
318
|
self.bcvequivalents = bcvequivalents
|
|
322
319
|
# not correct for versification differences??
|
|
323
|
-
self.niv11bcvtargets: dict[str, list[Target]] = groupby_bcv(self.targets11.values())
|
|
320
|
+
self.niv11bcvtargets: dict[str, list[Target]] = groupby_bcv(list(self.targets11.values()))
|
|
324
321
|
for bcv in self.niv11bcvtargets:
|
|
325
322
|
if bcv not in self.missing84:
|
|
326
323
|
trg84: list[Target] = self.niv84bcvtargets.get(bcv, [])
|
|
@@ -329,40 +326,56 @@ class DiffTargets84(UserDict):
|
|
|
329
326
|
self.bcvequivalents.get(bcv, {}) if self.bcvequivalents else {}
|
|
330
327
|
)
|
|
331
328
|
record = diff_verse_targets(bcv, trg84, trg11, equivalents)
|
|
332
|
-
# record.data is like
|
|
333
|
-
|
|
329
|
+
# record.data is like
|
|
330
|
+
# (('equal', 0, 5, 0, 5), ('replace', 5, 6, 5, 6), ('equal', 6, 34, 6, 34),
|
|
331
|
+
# ('replace', 34, 35, 34, 35), ('equal', 35, 38, 35, 38))
|
|
332
|
+
if record and not self._replaceonly_same_length(record):
|
|
333
|
+
# then record as a difference
|
|
334
334
|
self.data[bcv] = record
|
|
335
335
|
# items that are only replacements
|
|
336
|
-
self.replaceonly: dict[str, DiffRecord] = {
|
|
336
|
+
# self.replaceonly: dict[str, DiffRecord] = {
|
|
337
|
+
# bcv: drec
|
|
338
|
+
# for bcv, drec in self.items()
|
|
339
|
+
# if all([(op.opcode in ("equal", "replace")) for op in drec.data])
|
|
340
|
+
# }
|
|
341
|
+
# self.single_replaceonly: dict[str, list[Operation]] = {
|
|
342
|
+
# bcv: oplist
|
|
343
|
+
# for bcv, drec in self.replaceonly.items()
|
|
344
|
+
# if (oplist := [op for op in drec.data if op.single_replace])
|
|
345
|
+
# if oplist
|
|
346
|
+
# }
|
|
347
|
+
# self.dual_replaceonly: dict[str, list[Operation]] = {
|
|
348
|
+
# bcv: oplist
|
|
349
|
+
# for bcv, drec in self.replaceonly.items()
|
|
350
|
+
# if (oplist := [op for op in drec.data if op.dual_replace])
|
|
351
|
+
# if oplist
|
|
352
|
+
# }
|
|
353
|
+
self.replaceonly_same_length: dict[str, DiffRecord] = {
|
|
337
354
|
bcv: drec
|
|
338
355
|
for bcv, drec in self.items()
|
|
339
|
-
if all(
|
|
340
|
-
|
|
341
|
-
self.single_replaceonly: dict[str, list[Operation]] = {
|
|
342
|
-
bcv: oplist
|
|
343
|
-
for bcv, drec in self.replaceonly.items()
|
|
344
|
-
if (oplist := [op for op in drec.data if op.single_replace])
|
|
345
|
-
if oplist
|
|
346
|
-
}
|
|
347
|
-
self.dual_replaceonly: dict[str, list[Operation]] = {
|
|
348
|
-
bcv: oplist
|
|
349
|
-
for bcv, drec in self.replaceonly.items()
|
|
350
|
-
if (oplist := [op for op in drec.data if op.dual_replace])
|
|
351
|
-
if oplist
|
|
356
|
+
if all((op.opcode in ("equal", "replace")) for op in drec.data)
|
|
357
|
+
if all(op.same_length for op in drec.data)
|
|
352
358
|
}
|
|
353
359
|
|
|
354
|
-
def
|
|
360
|
+
def _replaceonly_same_length(self, diffrec: DiffRecord) -> bool:
|
|
361
|
+
"""True if all operations are 'equal' or 'replace' of same length.
|
|
362
|
+
|
|
363
|
+
That means token IDs don't need to change in NIV11.
|
|
364
|
+
"""
|
|
365
|
+
return all((op.opcode in ("equal", "replace")) and op.same_length for op in diffrec.data)
|
|
366
|
+
|
|
367
|
+
def _get_bcv_texts(self, bcv: str) -> tuple[list[str], list[str]]:
|
|
355
368
|
record = self.data.get(bcv)
|
|
356
369
|
if record is None:
|
|
357
370
|
print(f"{bcv}: No differences")
|
|
358
|
-
return
|
|
371
|
+
return [], []
|
|
359
372
|
text84 = [trg.text for trg in self.niv84bcvtargets[bcv]]
|
|
360
373
|
text11 = [trg.text for trg in self.niv11bcvtargets[bcv]]
|
|
361
374
|
return text84, text11
|
|
362
375
|
|
|
363
|
-
def get_single_token_replacements(self) -> dict[str, dict[str, str]]:
|
|
364
|
-
|
|
365
|
-
|
|
376
|
+
# def get_single_token_replacements(self) -> dict[str, dict[str, str]]:
|
|
377
|
+
# # bcv-specific single token replacements
|
|
378
|
+
# return {bcv: self.replace_single_text(bcv) for bcv, ops in self.single_replaceonly.items()}
|
|
366
379
|
|
|
367
380
|
def display_pair_text(self, bcv: str) -> None:
|
|
368
381
|
text84, text11 = self._get_bcv_texts(bcv)
|
|
@@ -371,27 +384,14 @@ class DiffTargets84(UserDict):
|
|
|
371
384
|
|
|
372
385
|
# only for single-token replace operations
|
|
373
386
|
def replace_single_text(self, bcv: str) -> dict[str, str]:
|
|
374
|
-
record = self.data.get(bcv)
|
|
375
387
|
text84, text11 = self._get_bcv_texts(bcv)
|
|
376
388
|
replacements: dict[str, str] = {}
|
|
377
|
-
for op in
|
|
389
|
+
for op in self.data.get(bcv, []):
|
|
378
390
|
if op.single_replace:
|
|
379
391
|
k = text11[op.start2 : op.end2][0]
|
|
380
392
|
replacements[k] = text84[op.start1 : op.end1][0]
|
|
381
393
|
return replacements
|
|
382
394
|
|
|
383
|
-
# replacements where one or both sides have two tokens
|
|
384
|
-
# could consolidate this with replace_single_text
|
|
385
|
-
def replace_dual_text(self, bcv: str) -> dict[str, str]:
|
|
386
|
-
record = self.data.get(bcv)
|
|
387
|
-
text84, text11 = self._get_bcv_texts(bcv)
|
|
388
|
-
replacements: dict[str, str] = {}
|
|
389
|
-
for op in record.data:
|
|
390
|
-
if op.dual_replace:
|
|
391
|
-
k = tuple(text11[op.start2 : op.end2])
|
|
392
|
-
replacements[k] = tuple(text84[op.start1 : op.end1])
|
|
393
|
-
return replacements
|
|
394
|
-
|
|
395
395
|
def mismatched_verses(self) -> dict[str, AlignmentRecord]:
|
|
396
396
|
"""Some alignment records have a source in one verse and a target in another."""
|
|
397
397
|
return {
|
|
@@ -486,8 +486,7 @@ class DiffTargets84(UserDict):
|
|
|
486
486
|
self.output_alrecs[alrec.meta.id] = True
|
|
487
487
|
except KeyError as e:
|
|
488
488
|
# Selectors: ['42001025012']
|
|
489
|
-
# niv11map keys: dict_keys(['42001024001', '42001024002', '42001024003',
|
|
490
|
-
|
|
489
|
+
# niv11map keys: dict_keys(['42001024001', '42001024002', '42001024003', ...])
|
|
491
490
|
print(f"--- {versedata.bcvid}, KeyError on {e}")
|
|
492
491
|
print(f"Record: {alrec}")
|
|
493
492
|
|
|
@@ -584,26 +583,51 @@ class DiffTargets84(UserDict):
|
|
|
584
583
|
# niv11replace
|
|
585
584
|
|
|
586
585
|
|
|
587
|
-
class
|
|
588
|
-
"""
|
|
586
|
+
class Serialize:
|
|
587
|
+
"""Serialize confident alignments for ClearAligner.
|
|
589
588
|
|
|
590
|
-
|
|
591
|
-
showing the aligned NIV84 token(s) and corresponding NIV11 token(s).
|
|
589
|
+
With include_partials True, this also includes partial alignments.
|
|
592
590
|
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
tokens (many-to-many) collapse to one row per source token with all targets
|
|
596
|
-
joined on that row.
|
|
591
|
+
Also outputs difference records and difference information on
|
|
592
|
+
tokens as a checklist of things to review.
|
|
597
593
|
|
|
598
|
-
After all source-token rows, any NIV11 tokens not yet emitted are appended
|
|
599
|
-
on blank source rows, sorted by token ID.
|
|
600
594
|
"""
|
|
601
595
|
|
|
602
|
-
|
|
596
|
+
# collects alignment records that didn't produce partials: BCVID -> list of records
|
|
597
|
+
unmapped_records: dict[str, list[AlignmentRecord]] = {}
|
|
598
|
+
|
|
599
|
+
def __init__(self, dt84: DiffTargets84, include_partials: bool = False) -> None:
|
|
603
600
|
self.dt84 = dt84
|
|
604
601
|
self.mgr84 = dt84.mgr84
|
|
605
602
|
self.niv84bcvtargets = dt84.niv84bcvtargets
|
|
606
603
|
self.niv11bcvtargets = dt84.niv11bcvtargets
|
|
604
|
+
# new Document for AlignmentRecord instances
|
|
605
|
+
self.niv11_document: Document = Document(docid="NIV11", scheme="BCVW")
|
|
606
|
+
# construct a new manager, with mappings to NIV11
|
|
607
|
+
self.niv11alset: AlignmentSet = AlignmentSet(
|
|
608
|
+
targetlanguage=self.mgr84.alignmentset.targetlanguage,
|
|
609
|
+
targetid="NIV11",
|
|
610
|
+
sourceid=self.mgr84.alignmentset.sourceid,
|
|
611
|
+
langdatapath=self.mgr84.alignmentset.langdatapath,
|
|
612
|
+
)
|
|
613
|
+
# read the existing alignments but then replace the alignment records
|
|
614
|
+
self.mgr11: Manager = Manager(self.niv11alset)
|
|
615
|
+
self.mgr11.targetitems = self.dt84.targets11
|
|
616
|
+
self.niv11_algroup: AlignmentGroup = self.niv11_alignment_group(
|
|
617
|
+
include_partials=include_partials
|
|
618
|
+
)
|
|
619
|
+
self.mgr11.bcv["records"] = groupby_bcv(
|
|
620
|
+
list(self.niv11_algroup.records), lambda r: r.source_bcv
|
|
621
|
+
)
|
|
622
|
+
# and make VerseData instances for alignments
|
|
623
|
+
versedata: dict[str, VerseData] = {}
|
|
624
|
+
for bcvid in self.mgr11.bcv["records"]:
|
|
625
|
+
try:
|
|
626
|
+
vd: VerseData = self.mgr11.make_versedata(bcvid)
|
|
627
|
+
versedata[bcvid] = vd
|
|
628
|
+
except KeyError:
|
|
629
|
+
print(f"Warning: no records for {bcvid} in NIV11; skipping verse")
|
|
630
|
+
self.mgr11.bcv["versedata"] = versedata
|
|
607
631
|
|
|
608
632
|
def _niv84_to_niv11(self, bcv: str) -> dict[str, list[Target]]:
|
|
609
633
|
"""Map NIV84 token IDs to NIV11 Target tokens for a verse.
|
|
@@ -637,142 +661,193 @@ class Interlinear:
|
|
|
637
661
|
# delete → key omitted; insert → no NIV84 token, omitted from map
|
|
638
662
|
return result
|
|
639
663
|
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
# def _niv11_token_string(tokens: list[BaseToken]) -> str:
|
|
644
|
-
# niv11_tokens = [t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])]
|
|
645
|
-
# return " ".join(t.tokenstr for t in niv11_tokens)
|
|
646
|
-
|
|
647
|
-
interlabels: dict[str, str] = {
|
|
648
|
-
"many-to-many": "+-+",
|
|
649
|
-
"one-to-many": "1-1",
|
|
650
|
-
"unaligned": "0",
|
|
651
|
-
"unmatched": "-",
|
|
652
|
-
}
|
|
653
|
-
niv84_by_id: dict[str, Target] = {t.id: t for t in self.niv84bcvtargets.get(bcv, [])}
|
|
654
|
-
niv84_to_niv11: dict[str, list[Target]] = self._niv84_to_niv11(bcv)
|
|
655
|
-
|
|
656
|
-
src_to_alrecs: dict[str, list[AlignmentRecord]] = {}
|
|
657
|
-
for alrec in versedata.records:
|
|
658
|
-
for src_id in alrec.source_selectors:
|
|
659
|
-
src_to_alrecs.setdefault(src_id, []).append(alrec)
|
|
660
|
-
|
|
661
|
-
niv11_emitted: set[str] = set()
|
|
662
|
-
rows: list[tuple[str, str, str, str]] = []
|
|
663
|
-
|
|
664
|
-
for src_token in versedata.sources:
|
|
665
|
-
alrecs = src_to_alrecs.get(src_token.id, [])
|
|
666
|
-
if not alrecs:
|
|
667
|
-
rows.append((interlabels["unaligned"], src_token.tokenstr, "", ""))
|
|
668
|
-
continue
|
|
669
|
-
for alrec in alrecs:
|
|
670
|
-
niv84_ids = alrec.target_selectors
|
|
671
|
-
niv84_tokens = [niv84_by_id[tid] for tid in niv84_ids if tid in niv84_by_id]
|
|
672
|
-
niv84_str = " ".join(t.tokenstr for t in niv84_tokens)
|
|
673
|
-
if len(alrec.source_selectors) > 1:
|
|
674
|
-
# many-to-many: join all targets on one row per source token
|
|
675
|
-
# duplicates here for 40001023004|ἐν
|
|
676
|
-
niv11_tokens = sorted(
|
|
677
|
-
{t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])}
|
|
678
|
-
)
|
|
679
|
-
niv11_str = " ".join(t.tokenstr for t in niv11_tokens)
|
|
680
|
-
niv11_emitted.update(t.id for t in niv11_tokens)
|
|
681
|
-
rows.append(
|
|
682
|
-
(
|
|
683
|
-
interlabels["many-to-many"],
|
|
684
|
-
src_token.tokenstr,
|
|
685
|
-
niv84_str,
|
|
686
|
-
niv11_str,
|
|
687
|
-
)
|
|
688
|
-
)
|
|
689
|
-
else:
|
|
690
|
-
# one-to-many: one row per group of NIV84 targets, blank source on continuation
|
|
691
|
-
niv84_tokens = [niv84_by_id[tid] for tid in niv84_ids if tid in niv84_by_id]
|
|
692
|
-
niv84_str = " ".join(t.tokenstr for t in niv84_tokens)
|
|
693
|
-
# duplicates here for 40001019006|δίκαιος
|
|
694
|
-
niv11_tokens = sorted(
|
|
695
|
-
{t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])}
|
|
696
|
-
)
|
|
697
|
-
# niv11_tokens = [t for tid in niv84_ids for t in niv84_to_niv11.get(tid, [])]
|
|
698
|
-
niv11_str = " ".join(t.tokenstr for t in niv11_tokens)
|
|
699
|
-
niv11_emitted.update(t.id for t in niv11_tokens)
|
|
700
|
-
rows.append(
|
|
701
|
-
(
|
|
702
|
-
interlabels["one-to-many"],
|
|
703
|
-
src_token.tokenstr,
|
|
704
|
-
niv84_str,
|
|
705
|
-
niv11_str,
|
|
706
|
-
)
|
|
707
|
-
)
|
|
708
|
-
|
|
709
|
-
# Append any NIV11 tokens not yet emitted, sorted by ID
|
|
710
|
-
unmatched = sorted(
|
|
711
|
-
[t for t in self.niv11bcvtargets.get(bcv, []) if t.id not in niv11_emitted],
|
|
712
|
-
key=lambda t: t.id,
|
|
713
|
-
)
|
|
714
|
-
allunmatched = " ".join(t.tokenstr for t in unmatched)
|
|
715
|
-
rows.append((interlabels["unmatched"], "", "", allunmatched))
|
|
716
|
-
|
|
717
|
-
return rows
|
|
718
|
-
|
|
719
|
-
def write_tsv(self, outpath: Optional[Path] = None) -> None:
|
|
720
|
-
"""Write one TSV row per source token across all verses."""
|
|
721
|
-
if not outpath:
|
|
722
|
-
outdir = self.mgr84.alignmentset.langdatapath / "NIV84-NIV11"
|
|
723
|
-
outdir.mkdir(parents=True, exist_ok=True)
|
|
724
|
-
outpath = outdir / "NIV84-NIV11-interlinear.tsv"
|
|
725
|
-
with outpath.open("w", encoding="utf-8") as f:
|
|
726
|
-
f.write("Label\tSource\tNIV84\tNIV11\n")
|
|
727
|
-
for bcv, versedata in self.mgr84.bcv["versedata"].items():
|
|
728
|
-
for row in self._verse_rows(bcv, versedata):
|
|
729
|
-
f.write("\t".join(row) + "\n")
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
class Serialize:
|
|
733
|
-
"""Serialize confident alignments for ClearAligner.
|
|
734
|
-
|
|
735
|
-
Also outputs difference records and difference information on
|
|
736
|
-
tokens as a checklist of things to review.
|
|
737
|
-
|
|
738
|
-
"""
|
|
739
|
-
|
|
740
|
-
def __init__(self, dt84: DiffTargets84) -> None:
|
|
741
|
-
self.dt84 = dt84
|
|
742
|
-
self.mgr84 = dt84.mgr84
|
|
743
|
-
self.niv84bcvtargets = dt84.niv84bcvtargets
|
|
744
|
-
self.niv11bcvtargets = dt84.niv11bcvtargets
|
|
745
|
-
# construct a new manager, with mappings to NIV11
|
|
746
|
-
niv11alset: AlignmentSet = AlignmentSet(
|
|
747
|
-
targetlanguage=self.mgr84.alignmentset.targetlanguage,
|
|
748
|
-
targetid="NIV11",
|
|
749
|
-
sourceid=self.mgr84.alignmentset.sourceid,
|
|
750
|
-
langdatapath=self.mgr84.alignmentset.langdatapath,
|
|
751
|
-
)
|
|
752
|
-
# read the existing alignments but then replace the alignment records
|
|
753
|
-
self.mgr11: Manager = Manager(niv11alset)
|
|
754
|
-
|
|
664
|
+
# if the target IDs are the same, even if the NIV11 tokens are
|
|
665
|
+
# different, we can still use the NIV84 alignment records
|
|
666
|
+
# but xverse mapping is needed
|
|
755
667
|
def _alrecs_to_niv11(self, bcv: str) -> list[AlignmentRecord]:
|
|
756
|
-
"""Return the alignment records from NIV84,
|
|
668
|
+
"""Return the alignment records from NIV84, mapped to NIV11 targets.
|
|
757
669
|
|
|
758
670
|
Only when there aren't significant differences.
|
|
759
671
|
"""
|
|
760
672
|
niv84_tokens = self.niv84bcvtargets.get(bcv, [])
|
|
761
673
|
# niv11_tokens = self.niv11bcvtargets.get(bcv, [])
|
|
762
674
|
alrecs_niv84: list[AlignmentRecord] = self.mgr84.bcv["records"][bcv]
|
|
763
|
-
#
|
|
764
|
-
# niv84_id_tokens: dict[str, Target] = {t.id: t for t in niv84_tokens}
|
|
765
|
-
# niv11_id_tokens: dict[str, Target] = {t.id: t for t in niv11_tokens}
|
|
675
|
+
# niv84_to_niv11 = self._niv84_to_niv11(bcv)
|
|
766
676
|
# this only works because each token is equal or equivalent
|
|
767
677
|
#
|
|
768
678
|
# this handles any cases of cross-verse boundary changes
|
|
769
|
-
|
|
679
|
+
niv84_niv11_xverse_map: dict[str, str] = {
|
|
770
680
|
t84.id: self.dt84.niv84_niv11_map.get(t84.id, t84.id) for t84 in niv84_tokens
|
|
771
681
|
}
|
|
772
|
-
|
|
682
|
+
# in theory, the xverse map shouldn't interact with the diff-based map ...
|
|
683
|
+
new_alrecs: list[AlignmentRecord] = []
|
|
773
684
|
for alrec in alrecs_niv84:
|
|
774
685
|
niv11_selectors: list[str] = [
|
|
775
|
-
|
|
686
|
+
xverse_sel
|
|
687
|
+
for sel in alrec.target_selectors
|
|
688
|
+
if (xverse_sel := niv84_niv11_xverse_map.get(sel, sel))
|
|
689
|
+
# if (to_niv11 := niv84_to_niv11.get(xverse_sel, xverse_sel))
|
|
690
|
+
# for niv11_tok in to_niv11
|
|
691
|
+
]
|
|
692
|
+
new_reference: AlignmentReference = AlignmentReference(
|
|
693
|
+
document=self.niv11_document, selectors=niv11_selectors
|
|
694
|
+
)
|
|
695
|
+
newmeta = copy.deepcopy(alrec.meta)
|
|
696
|
+
newmeta.origin = "NIV84_transfer"
|
|
697
|
+
new_alrecs.append(
|
|
698
|
+
AlignmentRecord(
|
|
699
|
+
meta=alrec.meta,
|
|
700
|
+
references={
|
|
701
|
+
"source": alrec.references["source"],
|
|
702
|
+
"target": new_reference,
|
|
703
|
+
},
|
|
704
|
+
type=alrec.type,
|
|
705
|
+
)
|
|
706
|
+
)
|
|
707
|
+
|
|
708
|
+
return new_alrecs
|
|
709
|
+
|
|
710
|
+
# from Claude
|
|
711
|
+
def collect_partial_records(self, bcv: str) -> list[AlignmentRecord]:
|
|
712
|
+
"""Generate NIV11 AlignmentRecords for confidently-mapped records in a diff verse.
|
|
713
|
+
|
|
714
|
+
For each operation in the verse's DiffRecord whose opcode is 'equal' or
|
|
715
|
+
'replace' with equal length (same token count on both sides), the positional
|
|
716
|
+
zip gives a one-to-one NIV84 → NIV11 token correspondence.
|
|
717
|
+
|
|
718
|
+
An AlignmentRecord is included only when every one of its target selectors
|
|
719
|
+
falls within the span of such a confident operation, so the full NIV11
|
|
720
|
+
mapping is unambiguous. Records that straddle operation boundaries, or
|
|
721
|
+
whose selectors sit in a delete/insert/unequal-replace span, are skipped.
|
|
722
|
+
|
|
723
|
+
Returns an empty list for verses without a DiffRecord (those are handled
|
|
724
|
+
by the existing _alrecs_to_niv11 / niv11_alignment_group path).
|
|
725
|
+
"""
|
|
726
|
+
diffrec = self.dt84.data.get(bcv)
|
|
727
|
+
if diffrec is None:
|
|
728
|
+
return []
|
|
729
|
+
versedata = self.mgr84.bcv["versedata"].get(bcv)
|
|
730
|
+
if versedata is None or not versedata.records:
|
|
731
|
+
return []
|
|
732
|
+
|
|
733
|
+
niv84_tokens = self.niv84bcvtargets.get(bcv, [])
|
|
734
|
+
niv11_tokens = self.niv11bcvtargets.get(bcv, [])
|
|
735
|
+
|
|
736
|
+
# Build a confident NIV84 token ID → NIV11 token ID map.
|
|
737
|
+
# equal ops: texts match; same-length replace ops: unique positional partner.
|
|
738
|
+
confident_map: dict[str, str] = {}
|
|
739
|
+
for op in diffrec.data:
|
|
740
|
+
if op.opcode == "equal" or (op.opcode == "replace" and op.same_length):
|
|
741
|
+
for t84, t11 in zip(
|
|
742
|
+
niv84_tokens[op.start1 : op.end1],
|
|
743
|
+
niv11_tokens[op.start2 : op.end2],
|
|
744
|
+
):
|
|
745
|
+
confident_map[t84.id] = t11.id
|
|
746
|
+
|
|
747
|
+
if not confident_map:
|
|
748
|
+
return []
|
|
749
|
+
|
|
750
|
+
new_alrecs: list[AlignmentRecord] = []
|
|
751
|
+
for alrec in versedata.records:
|
|
752
|
+
niv11_selectors: list[str] = []
|
|
753
|
+
for sel in alrec.target_selectors:
|
|
754
|
+
niv11_id = confident_map.get(sel)
|
|
755
|
+
if niv11_id is None:
|
|
756
|
+
if bcv not in self.unmapped_records:
|
|
757
|
+
self.unmapped_records[bcv] = [alrec]
|
|
758
|
+
else:
|
|
759
|
+
if alrec not in self.unmapped_records[bcv]:
|
|
760
|
+
self.unmapped_records[bcv].append(alrec)
|
|
761
|
+
break # selector not in any confident span → skip record
|
|
762
|
+
niv11_selectors.append(niv11_id)
|
|
763
|
+
else:
|
|
764
|
+
# all selectors mapped confidently
|
|
765
|
+
newmeta = copy.deepcopy(alrec.meta)
|
|
766
|
+
newmeta.origin = "NIV84_partial_transfer"
|
|
767
|
+
new_alrecs.append(
|
|
768
|
+
AlignmentRecord(
|
|
769
|
+
meta=newmeta,
|
|
770
|
+
references={
|
|
771
|
+
"source": alrec.references["source"],
|
|
772
|
+
"target": AlignmentReference(
|
|
773
|
+
document=self.niv11_document, selectors=niv11_selectors
|
|
774
|
+
),
|
|
775
|
+
},
|
|
776
|
+
type=alrec.type,
|
|
777
|
+
)
|
|
778
|
+
)
|
|
779
|
+
|
|
780
|
+
return new_alrecs
|
|
781
|
+
|
|
782
|
+
def niv11_alignment_group(self, include_partials: bool = False) -> AlignmentGroup:
|
|
783
|
+
"""Return an AlignmentGroup for NIV11, with aligned records from NIV84 where possible."""
|
|
784
|
+
niv84_algroup: AlignmentGroup = self.mgr84.alignmentsreader.alignmentgroup
|
|
785
|
+
sblgnt_document: Document = niv84_algroup.documents[0]
|
|
786
|
+
niv11_metadata: Metadata = Metadata(conformsTo="0.3", creator="NIV84-NIV11 transfer")
|
|
787
|
+
niv11_alrecs: list[AlignmentRecord] = [
|
|
788
|
+
alrec
|
|
789
|
+
for bcv in self.mgr84.bcv["records"].keys()
|
|
790
|
+
# only those that map cleanly
|
|
791
|
+
if bcv not in self.dt84
|
|
792
|
+
for alrec in self._alrecs_to_niv11(bcv)
|
|
793
|
+
]
|
|
794
|
+
if include_partials:
|
|
795
|
+
niv11_partials: list[AlignmentRecord] = [
|
|
796
|
+
alrec
|
|
797
|
+
for bcv in self.dt84.data.keys()
|
|
798
|
+
for alrec in self.collect_partial_records(bcv)
|
|
776
799
|
]
|
|
777
|
-
|
|
778
|
-
|
|
800
|
+
niv11_alrecs = sorted(niv11_alrecs + niv11_partials)
|
|
801
|
+
niv11_algroup: AlignmentGroup = AlignmentGroup(
|
|
802
|
+
documents=(sblgnt_document, self.niv11_document),
|
|
803
|
+
meta=niv11_metadata,
|
|
804
|
+
records=niv11_alrecs,
|
|
805
|
+
roles=niv84_algroup.roles,
|
|
806
|
+
sourcedocid=niv84_algroup.sourcedocid,
|
|
807
|
+
canon=niv84_algroup.canon,
|
|
808
|
+
_type=niv84_algroup._type,
|
|
809
|
+
)
|
|
810
|
+
return niv11_algroup
|
|
811
|
+
|
|
812
|
+
def write_unmapped_records(self, outpath: Path = None) -> None:
|
|
813
|
+
"""Write partials that were not included in partials (confidently-mapped spans)."""
|
|
814
|
+
unmapped_output: set[AlignmentRecord] = set()
|
|
815
|
+
if not outpath:
|
|
816
|
+
outdir = self.mgr84.alignmentset.langdatapath / "NIV84-NIV11"
|
|
817
|
+
outdir.mkdir(parents=True, exist_ok=True)
|
|
818
|
+
outpath = outdir / "NIV84-NIV11-unmappedrecords.tsv"
|
|
819
|
+
with outpath.open("w", encoding="utf-8") as f:
|
|
820
|
+
f.write("Verse\tNIV84 Tokens\n")
|
|
821
|
+
for bcv, alreclist in self.unmapped_records.items():
|
|
822
|
+
niv84_bcv_tokens: list[Target] = self.niv84bcvtargets.get(bcv, [])
|
|
823
|
+
niv84_bcv_tokenstrs: dict[str, str] = {t.id: t.tokenstr for t in niv84_bcv_tokens}
|
|
824
|
+
for alrec in alreclist:
|
|
825
|
+
if alrec in unmapped_output:
|
|
826
|
+
continue
|
|
827
|
+
else:
|
|
828
|
+
unmapped_output.add(alrec)
|
|
829
|
+
niv84_str = {
|
|
830
|
+
sel: niv84_bcv_tokenstrs.get(sel, "<unknown>")
|
|
831
|
+
for sel in alrec.target_selectors
|
|
832
|
+
}
|
|
833
|
+
f.write(f"{bcv}\t{" ".join(niv84_str.values())}\n")
|
|
834
|
+
|
|
835
|
+
def write_diffs(self, outpath: Path = None) -> None:
|
|
836
|
+
"""Write diffs as a checklist for manual alignment."""
|
|
837
|
+
if not outpath:
|
|
838
|
+
outdir = self.mgr84.alignmentset.langdatapath / "NIV84-NIV11"
|
|
839
|
+
outdir.mkdir(parents=True, exist_ok=True)
|
|
840
|
+
outpath = outdir / "NIV84-NIV11-diffs.tsv"
|
|
841
|
+
with outpath.open("w", encoding="utf-8") as f:
|
|
842
|
+
f.write("Verse\tOpCode\tNIV84 Tokens\tNIV11 Tokens\n")
|
|
843
|
+
for bcv, diffrec in self.dt84.data.items():
|
|
844
|
+
niv84_tokens = self.niv84bcvtargets.get(bcv, [])
|
|
845
|
+
niv11_tokens = self.niv11bcvtargets.get(bcv, [])
|
|
846
|
+
for op in diffrec.data:
|
|
847
|
+
seq84 = niv84_tokens[op.start1 : op.end1]
|
|
848
|
+
seq11 = niv11_tokens[op.start2 : op.end2]
|
|
849
|
+
f.write(
|
|
850
|
+
f"{bcv}\t{op.opcode}\t"
|
|
851
|
+
f"{' '.join(t.tokenstr for t in seq84)}\t"
|
|
852
|
+
f"{' '.join(t.tokenstr for t in seq11)}\n"
|
|
853
|
+
)
|
|
@@ -77,8 +77,7 @@ class Merger:
|
|
|
77
77
|
data2: Optional[VerseData] = cast(
|
|
78
78
|
Optional[VerseData], self.mgr2.bcv["versedata"].get(bcv)
|
|
79
79
|
)
|
|
80
|
-
if data1 and data2:
|
|
81
|
-
diffs: list[DiffRecord] = data1.diff(data2)
|
|
80
|
+
diffs: list[DiffRecord] = data1.diff(data2) if data1 and data2 else []
|
|
82
81
|
bcv_pairs[bcv] = BCVPair(
|
|
83
82
|
bcv=bcv,
|
|
84
83
|
mgr1_data=data1,
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "biblealignlib"
|
|
3
|
-
version = "0.3.2"
|
|
3
|
+
version = "0.4.0"
|
|
4
4
|
description = "Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments."
|
|
5
5
|
authors = ["Sean Boisen <sean.boisen@biblica.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -1,93 +0,0 @@
|
|
|
1
|
-
"""Transfer alignment data from one AlignmentSet to another on a closely related targe version.
|
|
2
|
-
|
|
3
|
-
Example: if NIV84 alignments are more complete than NIV2011
|
|
4
|
-
alignments, this code can transfer the data from the former to the
|
|
5
|
-
latter, where the surface text is the same, and where NIV84 has
|
|
6
|
-
alignment record that is missing from NIV11.
|
|
7
|
-
|
|
8
|
-
Input is two Manager instances, which must be based on the same source
|
|
9
|
-
and target language. If the target versions are the _same_, use
|
|
10
|
-
merger.py instead.
|
|
11
|
-
|
|
12
|
-
- If the source or target languages aren't the same, this code is
|
|
13
|
-
not relevant to your problem.
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
>>> from biblealignlib.burrito import CLEARROOT, Manager, AlignmentSet
|
|
17
|
-
>>> from biblealignlib.util import Transfer
|
|
18
|
-
>>> targetlang, sourceid = ("eng", "SBLGNT")
|
|
19
|
-
# get manager instances for two sets of alignments
|
|
20
|
-
>>> niv84as = AlignmentSet(targetlanguage=targetlang,
|
|
21
|
-
targetid="NIV84",
|
|
22
|
-
sourceid=sourceid,
|
|
23
|
-
langdatapath=(CLEARROOT / f"alignments-{targetlang}/data"))
|
|
24
|
-
>>> niv84mgr = Manager(niv84as)
|
|
25
|
-
>>> niv11as = AlignmentSet(targetlanguage=targetlang,
|
|
26
|
-
targetid="NIV11",
|
|
27
|
-
sourceid=sourceid,
|
|
28
|
-
langdatapath=(CLEARROOT / f"alignments-{targetlang}/data"))
|
|
29
|
-
>>> niv11mgr = Manager(niv11as)
|
|
30
|
-
# instantiate a Transfer instance
|
|
31
|
-
>>> transferinst = Transfer.Transfer(niv84mgr, niv11mgr)
|
|
32
|
-
|
|
33
|
-
"""
|
|
34
|
-
|
|
35
|
-
from collections import Counter, UserDict
|
|
36
|
-
from typing import cast, Optional
|
|
37
|
-
|
|
38
|
-
from ..burrito import Manager, VerseData
|
|
39
|
-
from ..burrito import DiffRecord
|
|
40
|
-
from ..burrito.util import groupby_bcid
|
|
41
|
-
from . import BCVPair
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class Transfer(UserDict):
|
|
45
|
-
|
|
46
|
-
def __init__(self, mgr1: Manager, mgr2: Manager) -> None:
|
|
47
|
-
"""Initialize an instance."""
|
|
48
|
-
super().__init__()
|
|
49
|
-
self.mgr1 = mgr1
|
|
50
|
-
self.mgr2 = mgr2
|
|
51
|
-
for attr in ("sourceid", "targetlanguage"):
|
|
52
|
-
mgr1attr = getattr(self.mgr1.alignmentset, attr)
|
|
53
|
-
mgr2attr = getattr(self.mgr2.alignmentset, attr)
|
|
54
|
-
if mgr1attr != mgr2attr:
|
|
55
|
-
raise ValueError(
|
|
56
|
-
f"Managers must have the same {attr!r} attribute, but {mgr1attr} != {mgr2attr}"
|
|
57
|
-
)
|
|
58
|
-
# should be the same for both
|
|
59
|
-
self.allsrcbcv = mgr1.bcv["sources"]
|
|
60
|
-
self.data: dict[str, BCVPair] = self.get_bcv_pairs()
|
|
61
|
-
self.pairingcounts = Counter(bcvp.pairing for bcvp in self.values())
|
|
62
|
-
# overlaps
|
|
63
|
-
self.overlaps = [bcvp for bcvp in self.values() if bcvp.pairing == "both"]
|
|
64
|
-
# overlaps with differences
|
|
65
|
-
self.diffpairs = [bcvp for bcvp in self.overlaps if bcvp.diffs]
|
|
66
|
-
|
|
67
|
-
def get_bcv_pairs(self) -> dict[str, BCVPair]:
|
|
68
|
-
"""Return a dictionary of BCVPair instances."""
|
|
69
|
-
bcv_pairs: dict[str, BCVPair] = {}
|
|
70
|
-
for bcv in self.allsrcbcv:
|
|
71
|
-
data1: Optional[VerseData] = cast(
|
|
72
|
-
Optional[VerseData], self.mgr1.bcv["versedata"].get(bcv)
|
|
73
|
-
)
|
|
74
|
-
data2: Optional[VerseData] = cast(
|
|
75
|
-
Optional[VerseData], self.mgr2.bcv["versedata"].get(bcv)
|
|
76
|
-
)
|
|
77
|
-
if data1 and data2:
|
|
78
|
-
diffs: list[DiffRecord] = data1.diff(data2)
|
|
79
|
-
bcv_pairs[bcv] = BCVPair(
|
|
80
|
-
bcv=bcv,
|
|
81
|
-
mgr1_data=data1,
|
|
82
|
-
mgr2_data=data2,
|
|
83
|
-
)
|
|
84
|
-
return bcv_pairs
|
|
85
|
-
|
|
86
|
-
def show_diffs(self) -> None:
|
|
87
|
-
"""Display information about overlaps that differ."""
|
|
88
|
-
overlap_bcs = groupby_bcid([bcvp.bcv for bcvp in self.diffpairs])
|
|
89
|
-
print(f"{len(overlap_bcs)} overlapping and different chapters: {overlap_bcs.keys()}")
|
|
90
|
-
for bcvpair in self.diffpairs:
|
|
91
|
-
vd1 = bcvpair.mgr1_data.alignments if bcvpair.mgr1_data else ()
|
|
92
|
-
vd2 = bcvpair.mgr2_data.alignments if bcvpair.mgr2_data else ()
|
|
93
|
-
print(bcvpair.bcv, ": ", str(len(vd1)), "---", str(len(vd2)))
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|