biblealignlib 0.3.0__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/PKG-INFO +6 -2
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/README.md +5 -1
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/mapper.py +2 -2
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/AlignmentGroup.py +8 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/BaseToken.py +5 -5
- biblealignlib-0.3.2/biblealignlib/burrito/DiffRecord.py +73 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/VerseData.py +48 -45
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/__init__.py +4 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/alignments.py +17 -3
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/manager.py +4 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/source.py +3 -4
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/target.py +12 -1
- biblealignlib-0.3.2/biblealignlib/util/DiffAlignments.py +168 -0
- biblealignlib-0.3.2/biblealignlib/util/DiffTargets.py +778 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/util/Transfer.py +18 -5
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/util/__init__.py +19 -4
- biblealignlib-0.3.2/biblealignlib/util/canonsplit.py +77 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/util/merger.py +4 -1
- biblealignlib-0.3.2/biblealignlib/util/tokens_to_chars.py +38 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/pyproject.toml +2 -1
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/LICENSE +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/LICENSE.md +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/__init__.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/Score.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/__init__.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/corpusmapping.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/eflomal.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/reader.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/runeflomal.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/scorer.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/writer.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/AlignmentSet.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/AlignmentType.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/BadRecord.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/util.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/coverage/Coverage.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/coverage/__init__.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/coverage/analyzer.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/coverage/exporter.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/coverage/filters.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/interlinear/__init__.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/interlinear/reverse.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/interlinear/token.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/strongs.py +0 -0
- {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/util/vocab.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: biblealignlib
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments.
|
|
5
5
|
License: MIT
|
|
6
6
|
License-File: LICENSE
|
|
@@ -36,9 +36,13 @@ Description-Content-Type: text/markdown
|
|
|
36
36
|
|
|
37
37
|
# biblealignlib
|
|
38
38
|
|
|
39
|
-
Biblica's code for working with Bible alignment data from
|
|
39
|
+
Biblica's Python code for working with Bible alignment data from
|
|
40
40
|
https://github.com/Clear-Bible/Alignments .
|
|
41
41
|
|
|
42
|
+
This code is ©2024-2026 by [Biblica, Inc](http://biblica.com) and is
|
|
43
|
+
licensed under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
|
|
44
|
+
|
|
45
|
+
|
|
42
46
|
## Installing extra dependencies
|
|
43
47
|
|
|
44
48
|
### eflomal
|
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
# biblealignlib
|
|
2
2
|
|
|
3
|
-
Biblica's code for working with Bible alignment data from
|
|
3
|
+
Biblica's Python code for working with Bible alignment data from
|
|
4
4
|
https://github.com/Clear-Bible/Alignments .
|
|
5
5
|
|
|
6
|
+
This code is ©2024-2026 by [Biblica, Inc](http://biblica.com) and is
|
|
7
|
+
licensed under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
|
|
8
|
+
|
|
9
|
+
|
|
6
10
|
## Installing extra dependencies
|
|
7
11
|
|
|
8
12
|
### eflomal
|
|
@@ -18,9 +18,9 @@ commonly used by automated alignment algorithms.
|
|
|
18
18
|
>>> pm.bcv["mappings"]["41004003"]
|
|
19
19
|
<CorpusMapping: 41004003>
|
|
20
20
|
>>> pm.bcv["mappings"]["41004003"].source_pairs
|
|
21
|
-
[(<Source: n41004003001
|
|
21
|
+
[(<Source: n41004003001|Ἀκούετε>, 0), (<Source: n41004003002|ἰδοὺ>, 1), (<Source: n41004003003|ἐξῆλθεν>, 2), ...
|
|
22
22
|
>>> pm.bcv["mappings"]["41004003"].target_pairs
|
|
23
|
-
[(<Target: 410040030011>, 0), (<Target: 410040030021>, 1), (<Target: 410040030031>, 2), ...
|
|
23
|
+
[(<Target: 410040030011|Listen>, 0), (<Target: 410040030021|A>, 1), (<Target: 410040030031|sower>, 2), ...
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
"""
|
|
@@ -283,6 +283,14 @@ class AlignmentRecord:
|
|
|
283
283
|
"""True if any selectors in references are incomplete."""
|
|
284
284
|
return any(ref.incomplete for ref in self.references.values())
|
|
285
285
|
|
|
286
|
+
def update_target_selectors(self, selectors: list[str]) -> None:
|
|
287
|
+
"""Replace the target selectors for this record.
|
|
288
|
+
|
|
289
|
+
Selectors are sorted, matching the behaviour of
|
|
290
|
+
AlignmentReference.__post_init__.
|
|
291
|
+
"""
|
|
292
|
+
self.references["target"].selectors = sorted(selectors)
|
|
293
|
+
|
|
286
294
|
def asdict(
|
|
287
295
|
self, positional: bool = False, withmeta: bool = True, withmaculaprefix: bool = False
|
|
288
296
|
) -> dict[str, Any]:
|
|
@@ -25,7 +25,7 @@ class BaseToken:
|
|
|
25
25
|
|
|
26
26
|
def __repr__(self) -> str:
|
|
27
27
|
"""Return a printed representation."""
|
|
28
|
-
return f"<{self.__class__.__name__}: {self.id}>"
|
|
28
|
+
return f"<{self.__class__.__name__}: {self.tokenstr}>"
|
|
29
29
|
|
|
30
30
|
#
|
|
31
31
|
def __hash__(self) -> int:
|
|
@@ -48,6 +48,10 @@ class BaseToken:
|
|
|
48
48
|
"""Return the BCV-format verse reference for a token instance."""
|
|
49
49
|
return str(self.bcv)
|
|
50
50
|
|
|
51
|
+
@property
|
|
52
|
+
def tokenstr(self) -> str:
|
|
53
|
+
return f"{self.id}|{self.text}"
|
|
54
|
+
|
|
51
55
|
@property
|
|
52
56
|
def idtext(self) -> tuple[str, str]:
|
|
53
57
|
"""Return a tuple of id and text.
|
|
@@ -59,10 +63,6 @@ class BaseToken:
|
|
|
59
63
|
self.text,
|
|
60
64
|
)
|
|
61
65
|
|
|
62
|
-
@property
|
|
63
|
-
def tokenstr(self) -> str:
|
|
64
|
-
return f"{self.id}, {self.text}"
|
|
65
|
-
|
|
66
66
|
@property
|
|
67
67
|
def bare_id(self) -> str:
|
|
68
68
|
"""Return the ID minus any canon prefixes."""
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from .source import Source
|
|
6
|
+
from .target import Target
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DiffReason(Enum):
|
|
10
|
+
"""Enumerate constants for alignment differences."""
|
|
11
|
+
|
|
12
|
+
DIFFLEN = "Different number of alignments"
|
|
13
|
+
DIFFSOURCES = "Source selectors differ"
|
|
14
|
+
DIFFTARGETS = "Target selectors differ"
|
|
15
|
+
DIFFNOTES = "Different notes"
|
|
16
|
+
DIFFSTATUS = "Different status"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
|
|
20
|
+
class DiffRecord:
|
|
21
|
+
"""Container for data on alignment differences for a verse.
|
|
22
|
+
|
|
23
|
+
The same verse could have multiple alignment differences.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
# the alignment BCV
|
|
27
|
+
bcvid: str
|
|
28
|
+
# the data in the first alignment
|
|
29
|
+
sources1: tuple[Source, ...] = ()
|
|
30
|
+
targets1: tuple[Target, ...] = ()
|
|
31
|
+
# the data in the second alignment
|
|
32
|
+
sources2: tuple[Source, ...] = ()
|
|
33
|
+
targets2: tuple[Target, ...] = ()
|
|
34
|
+
# why it's different
|
|
35
|
+
diffreason: Optional[DiffReason] = None
|
|
36
|
+
# any auxiliary data
|
|
37
|
+
data: tuple = ()
|
|
38
|
+
# optional
|
|
39
|
+
n_differences: Optional[int] = None
|
|
40
|
+
|
|
41
|
+
def __hash__(self) -> int:
|
|
42
|
+
"""Return a hash based on bcvid, diffreason, and data."""
|
|
43
|
+
return hash((self.bcvid, self.diffreason, self.data))
|
|
44
|
+
|
|
45
|
+
def __repr__(self) -> str:
|
|
46
|
+
"""Return a string representation."""
|
|
47
|
+
basestr = (
|
|
48
|
+
f"<DiffRecord ({self.bcvid}): '{self.diffreason.value if self.diffreason else None}'"
|
|
49
|
+
)
|
|
50
|
+
if self.data:
|
|
51
|
+
basestr += ", " + repr(self.data)
|
|
52
|
+
basestr += ">"
|
|
53
|
+
return basestr
|
|
54
|
+
|
|
55
|
+
@property
|
|
56
|
+
def n_sources1(self) -> int:
|
|
57
|
+
"""Return the number of sources in the first alignment."""
|
|
58
|
+
return len(self.sources1)
|
|
59
|
+
|
|
60
|
+
@property
|
|
61
|
+
def n_sources2(self) -> int:
|
|
62
|
+
"""Return the number of sources in the second alignment."""
|
|
63
|
+
return len(self.sources2)
|
|
64
|
+
|
|
65
|
+
@property
|
|
66
|
+
def n_targets1(self) -> int:
|
|
67
|
+
"""Return the number of targets in the first alignment."""
|
|
68
|
+
return len(self.targets1)
|
|
69
|
+
|
|
70
|
+
@property
|
|
71
|
+
def n_targets2(self) -> int:
|
|
72
|
+
"""Return the number of targets in the second alignment."""
|
|
73
|
+
return len(self.targets2)
|
|
@@ -29,7 +29,6 @@ Target: 44020020021: করিনি ('', False, False)
|
|
|
29
29
|
|
|
30
30
|
from collections import Counter
|
|
31
31
|
from dataclasses import dataclass
|
|
32
|
-
from enum import Enum
|
|
33
32
|
from typing import Optional
|
|
34
33
|
|
|
35
34
|
import pandas as pd
|
|
@@ -39,47 +38,7 @@ from .BaseToken import BaseToken
|
|
|
39
38
|
from .source import Source
|
|
40
39
|
from .target import Target
|
|
41
40
|
from .AlignmentGroup import AlignmentRecord
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
class DiffReason(Enum):
|
|
45
|
-
"""Enumerate constants for alignment differences."""
|
|
46
|
-
|
|
47
|
-
DIFFLEN = "Different number of alignments"
|
|
48
|
-
DIFFSOURCES = "Source selectors differ"
|
|
49
|
-
DIFFTARGETS = "Target selectors differ"
|
|
50
|
-
DIFFNOTES = "Different notes"
|
|
51
|
-
DIFFSTATUS = "Different status"
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
@dataclass
|
|
55
|
-
class DiffRecord:
|
|
56
|
-
"""Container for data on alignment differences.
|
|
57
|
-
|
|
58
|
-
The same verse could have multiple alignment differences.
|
|
59
|
-
"""
|
|
60
|
-
|
|
61
|
-
# the alignment BCV
|
|
62
|
-
bcvid: str
|
|
63
|
-
# the data in the first alignment
|
|
64
|
-
sources1: tuple[Source, ...] = ()
|
|
65
|
-
targets1: tuple[Target, ...] = ()
|
|
66
|
-
# the data in the second alignment
|
|
67
|
-
sources2: tuple[Source, ...] = ()
|
|
68
|
-
targets2: tuple[Target, ...] = ()
|
|
69
|
-
# why it's different
|
|
70
|
-
diffreason: Optional[DiffReason] = None
|
|
71
|
-
# any auxiliary data
|
|
72
|
-
data: tuple = ()
|
|
73
|
-
|
|
74
|
-
def __repr__(self) -> str:
|
|
75
|
-
"""Return a string representation."""
|
|
76
|
-
basestr = (
|
|
77
|
-
f"<DiffRecord ({self.bcvid}): '{self.diffreason.value if self.diffreason else None}'"
|
|
78
|
-
)
|
|
79
|
-
if self.data:
|
|
80
|
-
basestr += ", " + repr(self.data)
|
|
81
|
-
basestr += ">"
|
|
82
|
-
return basestr
|
|
41
|
+
from .DiffRecord import DiffRecord, DiffReason
|
|
83
42
|
|
|
84
43
|
|
|
85
44
|
@dataclass
|
|
@@ -110,6 +69,16 @@ class VerseData:
|
|
|
110
69
|
"""Return a string representation."""
|
|
111
70
|
return f"<VerseData: {self.bcvid}>"
|
|
112
71
|
|
|
72
|
+
@property
|
|
73
|
+
def sourceitems(self) -> dict[str, Source]:
|
|
74
|
+
"""Return mapping from BCVW to source tokens."""
|
|
75
|
+
return {src.bare_id: src for src in self.sources}
|
|
76
|
+
|
|
77
|
+
@property
|
|
78
|
+
def targetitems(self) -> dict[str, Target]:
|
|
79
|
+
"""Return mapping from BCVW to target tokens."""
|
|
80
|
+
return {src.bare_id: src for src in self.targets}
|
|
81
|
+
|
|
113
82
|
@property
|
|
114
83
|
def aligned_sources(self) -> list[Source]:
|
|
115
84
|
"""Return list of aligned source tokens.
|
|
@@ -187,6 +156,16 @@ class VerseData:
|
|
|
187
156
|
for trg in targets:
|
|
188
157
|
print(f"Target: {trg._display}")
|
|
189
158
|
|
|
159
|
+
def display_record(self, alrec: AlignmentRecord) -> None:
|
|
160
|
+
"""Display an alignment record from this instance."""
|
|
161
|
+
source_tokenstring: str = ", ".join(
|
|
162
|
+
[self.sourceitems[sel].tokenstr for sel in alrec.source_selectors]
|
|
163
|
+
)
|
|
164
|
+
target_tokenstring: str = ", ".join(
|
|
165
|
+
[self.targetitems[sel].tokenstr for sel in alrec.target_selectors]
|
|
166
|
+
)
|
|
167
|
+
print(f"{alrec.meta.id}: {source_tokenstring} --- {target_tokenstring}")
|
|
168
|
+
|
|
190
169
|
def unaligned(self, typeattr: str = "targets", keepexcluded: bool = False) -> None:
|
|
191
170
|
"""Display tokens from typeattr that are _not_ aligned."""
|
|
192
171
|
assert typeattr in self._typeattrs, f"typeattr should be one of {self._typeattrs}"
|
|
@@ -203,13 +182,13 @@ class VerseData:
|
|
|
203
182
|
if aligned:
|
|
204
183
|
for sources, targets in self.alignments:
|
|
205
184
|
print(
|
|
206
|
-
f"{str([src.
|
|
185
|
+
f"{str([src.tokenstr for src in sources]):{srcwidth}}\t\t{[trg.tokenstr for trg in targets]}"
|
|
207
186
|
)
|
|
208
187
|
else:
|
|
209
188
|
# show all sources with their (possibly empty) target alignments
|
|
210
189
|
for source in self.sources:
|
|
211
190
|
print(
|
|
212
|
-
f"{str(source.
|
|
191
|
+
f"{str(source.tokenstr):{srcwidth}}\t\t{[trg.tokenstr for trg in self.get_source_alignments(source)]}"
|
|
213
192
|
)
|
|
214
193
|
|
|
215
194
|
def get_texts(
|
|
@@ -244,6 +223,30 @@ class VerseData:
|
|
|
244
223
|
texts = [item.text for item in tokens]
|
|
245
224
|
return texts
|
|
246
225
|
|
|
226
|
+
def tokenstrings(self, record: AlignmentRecord, typeattr: str) -> list[str]:
|
|
227
|
+
"""Return a list of id|text strings for the tokens in this record."""
|
|
228
|
+
assert typeattr in self._typeattrs, f"typeattr should be one of {self._typeattrs}"
|
|
229
|
+
items = self.sourceitems if typeattr == "sources" else self.targetitems
|
|
230
|
+
selectors: list[str] = (
|
|
231
|
+
record.source_selectors if typeattr == "sources" else record.target_selectors
|
|
232
|
+
)
|
|
233
|
+
return [srctoken.tokenstr for sel in selectors if (srctoken := items[sel])]
|
|
234
|
+
|
|
235
|
+
def record_as_tsv(self, record: AlignmentRecord) -> str:
|
|
236
|
+
"""Return a 3-column TSV string representation of this record.
|
|
237
|
+
|
|
238
|
+
Represents tokens using combined id|text notation.
|
|
239
|
+
"""
|
|
240
|
+
sourcestrings = self.tokenstrings(record, "sources")
|
|
241
|
+
targetstrings = self.tokenstrings(record, "targets")
|
|
242
|
+
return "\t".join(
|
|
243
|
+
[
|
|
244
|
+
record.identifier,
|
|
245
|
+
", ".join(sourcestrings),
|
|
246
|
+
", ".join(targetstrings),
|
|
247
|
+
]
|
|
248
|
+
)
|
|
249
|
+
|
|
247
250
|
## NOT YET WORKING
|
|
248
251
|
# def generate_html_table(self) -> str:
|
|
249
252
|
# """Generate an HTML table with one row for each source item and one column for each target item."""
|
|
@@ -316,7 +319,7 @@ class VerseData:
|
|
|
316
319
|
return None
|
|
317
320
|
|
|
318
321
|
# TODO: compare
|
|
319
|
-
def diff(self, other: "VerseData") ->
|
|
322
|
+
def diff(self, other: "VerseData") -> list[DiffRecord]:
|
|
320
323
|
"""Return a (possibly empty) list of differences between the alignments data.
|
|
321
324
|
|
|
322
325
|
If there are a different number of alignments, that's the only
|
|
@@ -19,6 +19,7 @@ from .AlignmentType import TranslationType
|
|
|
19
19
|
from .alignments import AlignmentsReader, write_alignment_group
|
|
20
20
|
from .manager import Manager, VerseData
|
|
21
21
|
from .BaseToken import BaseToken, asbool, bare_id
|
|
22
|
+
from .DiffRecord import DiffReason, DiffRecord
|
|
22
23
|
from .source import macula_prefixer, macula_unprefixer, Source, SourceReader
|
|
23
24
|
from .target import Target, TargetReader
|
|
24
25
|
from .util import groupby_key, groupby_bcid, groupby_bcv, token_groupby_bc, filter_by_bcv
|
|
@@ -41,6 +42,9 @@ __all__ = [
|
|
|
41
42
|
"BaseToken",
|
|
42
43
|
"asbool",
|
|
43
44
|
"bare_id",
|
|
45
|
+
# DiffRecord
|
|
46
|
+
"DiffReason",
|
|
47
|
+
"DiffRecord",
|
|
44
48
|
# alignments
|
|
45
49
|
"AlignmentsReader",
|
|
46
50
|
"write_alignment_group",
|
|
@@ -301,6 +301,7 @@ def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True)
|
|
|
301
301
|
"""Write JSON data for an arbitrary group in Scripture Burrito format.
|
|
302
302
|
|
|
303
303
|
Writes some of the JSON by hand to get records on the same line.
|
|
304
|
+
Record meta.id values are assigned sequentially per BCV and zero-padded to two digits, e.g. "40001001.01".
|
|
304
305
|
"""
|
|
305
306
|
|
|
306
307
|
def _write_documents(out: TextIO, documents: tuple[Document, Document]) -> None:
|
|
@@ -311,19 +312,32 @@ def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True)
|
|
|
311
312
|
out.write(" ],\n")
|
|
312
313
|
|
|
313
314
|
def _write_meta(out: TextIO, meta: Metadata) -> None:
|
|
314
|
-
"""Write
|
|
315
|
+
"""Write metadata to out."""
|
|
315
316
|
metarow = '"meta": ' + json.dumps(meta.asdict())
|
|
316
317
|
f.write(f" {metarow},\n")
|
|
317
318
|
|
|
319
|
+
def _record_dict(arec: AlignmentRecord, bcv_counters: dict[str, int]) -> dict[str, Any]:
|
|
320
|
+
"""Return the serialized dict for arec with a sequential BCV-based id.
|
|
321
|
+
|
|
322
|
+
This converts the ClearAligner opaque IDs to something
|
|
323
|
+
meaningful, attempting to make files more diff-able.
|
|
324
|
+
"""
|
|
325
|
+
bcv = arec.source_bcv
|
|
326
|
+
bcv_counters[bcv] = bcv_counters.get(bcv, 0) + 1
|
|
327
|
+
recdict = arec.asdict()
|
|
328
|
+
recdict["meta"]["id"] = f"{bcv}.{bcv_counters[bcv]:02}"
|
|
329
|
+
return recdict
|
|
330
|
+
|
|
318
331
|
f.write("{\n")
|
|
319
332
|
_write_documents(f, group.documents)
|
|
320
333
|
_write_meta(f, group.meta)
|
|
321
334
|
f.write(f' "roles": {json.dumps(group.roles)},\n')
|
|
322
335
|
f.write(f' "type": "{group._type}",\n "records": [\n ')
|
|
323
336
|
# should sort the records: NIV11 doesn't appear to be sorted
|
|
337
|
+
bcv_counters: dict[str, int] = {}
|
|
324
338
|
for arec in group.records[:-1]:
|
|
325
|
-
json.dump(arec
|
|
339
|
+
json.dump(_record_dict(arec, bcv_counters), f)
|
|
326
340
|
f.write(",\n ")
|
|
327
341
|
# now the last one without a comma, because JSON
|
|
328
|
-
json.dump(group.records[-1]
|
|
342
|
+
json.dump(_record_dict(group.records[-1], bcv_counters), f)
|
|
329
343
|
f.write("\n ]}")
|
|
@@ -114,6 +114,10 @@ class Manager(UserDict):
|
|
|
114
114
|
keepbadrecords=self.keepbadrecords,
|
|
115
115
|
)
|
|
116
116
|
self.alignmentsreader.clean_alignments(self.sourceitems, self.targetitems)
|
|
117
|
+
# TODO: upgrade the selectors to use tokenstr. This requires
|
|
118
|
+
# knowing the source and targetitems, but alignmentsreader
|
|
119
|
+
# doesn't have that data
|
|
120
|
+
# self.add_tokenstr_to_records(self)
|
|
117
121
|
# group records by BCV
|
|
118
122
|
self.bcv["records"] = groupby_bcv(
|
|
119
123
|
list(self.alignmentsreader.alignmentgroup.records), lambda r: r.source_bcv
|
|
@@ -17,12 +17,11 @@ called from burrito.manager.Manager().
|
|
|
17
17
|
5468
|
|
18
18
|
# dict: token ID -> Source() instance
|
|
19
19
|
>>> src["n41004003001"]
|
|
20
|
-
|
|
21
|
-
<Source: n41004003001>
|
|
20
|
+
<Source: n41004003001|Ἀκούετε>
|
|
22
21
|
>>> src["n41004003001"].display()
|
|
23
22
|
n41004003001: Ἀκούετε (Listen, ἀκούω, verb)
|
|
24
|
-
>>> src["n41004003001"].
|
|
25
|
-
|
|
23
|
+
>>> src["n41004003001"].tokenstr
|
|
24
|
+
'n41004003001|Ἀκούετε'
|
|
26
25
|
>>> src["n41004003001"].asdict()
|
|
27
26
|
{'identifier': 'n41004003001',
|
|
28
27
|
'altId': 'Ἀκούετε-1',
|
|
@@ -1,4 +1,14 @@
|
|
|
1
|
-
"""Manage the target/translation data for
|
|
1
|
+
"""Manage the target/translation data for alignment data.
|
|
2
|
+
|
|
3
|
+
This typically reads the output of kathairo.
|
|
4
|
+
|
|
5
|
+
Limitations:
|
|
6
|
+
|
|
7
|
+
- Each token is assigned to the relevant source verse, which may be
|
|
8
|
+
different than the verse assignments in the target text. This is
|
|
9
|
+
version-specific, not necessarily a versification issue. So verse
|
|
10
|
+
identifiers may need mapping.
|
|
11
|
+
- Example: SBLGNT for 3JN has v. 15, but all these tokens are in v. 14 in the NIV11.
|
|
2
12
|
|
|
3
13
|
>>> from biblealignlib.burrito import target
|
|
4
14
|
# Reading is normally done by Manager
|
|
@@ -15,6 +25,7 @@
|
|
|
15
25
|
# write the tokens out
|
|
16
26
|
>>> LANGDATAPATH = CLEARROOT / "alignments-eng/data"
|
|
17
27
|
>>> tr.write_tsv(tokenlist=tr.data.values(), outpath=(LANGDATAPATH / "targets/BSB/new-nt_BSB.tsv"))
|
|
28
|
+
|
|
18
29
|
"""
|
|
19
30
|
|
|
20
31
|
from collections import UserDict, defaultdict
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""Compare two alignment groups record by record.
|
|
2
|
+
|
|
3
|
+
Both groups must share the same sourceid, targetid, and
|
|
4
|
+
targetlanguage. This is most useful for checking minor changes to
|
|
5
|
+
ensure you haven't introduced errors.
|
|
6
|
+
|
|
7
|
+
Comparison ignores meta.id (which is assigned on write) but reports
|
|
8
|
+
differences in targets and all other meta fields (status, origin, creator, note).
|
|
9
|
+
|
|
10
|
+
>>> from biblealignlib.burrito import CLEARROOT, AlignmentSet
|
|
11
|
+
>>> from biblealignlib.util.DiffAlignments import DiffAlignments
|
|
12
|
+
>>> LANGDATAPATH = CLEARROOT / "alignments-eng/data"
|
|
13
|
+
>>> alset1 = AlignmentSet(sourceid="SBLGNT", targetid="BSB",
|
|
14
|
+
... targetlanguage="eng", langdatapath=LANGDATAPATH,
|
|
15
|
+
... alternateid="manual")
|
|
16
|
+
>>> alset2 = AlignmentSet(sourceid="SBLGNT", targetid="BSB",
|
|
17
|
+
... targetlanguage="eng", langdatapath=LANGDATAPATH,
|
|
18
|
+
... alternateid="updated")
|
|
19
|
+
>>> da = DiffAlignments(alset1, alset2)
|
|
20
|
+
>>> da.show()
|
|
21
|
+
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
|
|
26
|
+
from ..burrito.AlignmentGroup import AlignmentGroup, AlignmentRecord
|
|
27
|
+
from ..burrito.AlignmentSet import AlignmentSet
|
|
28
|
+
from ..burrito.alignments import AlignmentsReader
|
|
29
|
+
|
|
30
|
+
# Meta fields compared between records (id is intentionally excluded)
|
|
31
|
+
_COMPARED_META_FIELDS = ("creator", "note", "origin", "status")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _record_key(rec: AlignmentRecord) -> tuple[str, ...]:
|
|
35
|
+
"""Return a stable key for matching records across groups.
|
|
36
|
+
|
|
37
|
+
Keyed by sorted source selectors, since records are matched on the
|
|
38
|
+
source side and targets may differ.
|
|
39
|
+
"""
|
|
40
|
+
return tuple(sorted(rec.source_selectors))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _meta_diffs(rec1: AlignmentRecord, rec2: AlignmentRecord) -> dict[str, tuple[str, str]]:
|
|
44
|
+
"""Return a dict of differing meta fields (excluding id).
|
|
45
|
+
|
|
46
|
+
Keys are field names; values are (val_in_rec1, val_in_rec2).
|
|
47
|
+
"""
|
|
48
|
+
diffs: dict[str, tuple[str, str]] = {}
|
|
49
|
+
for field_name in _COMPARED_META_FIELDS:
|
|
50
|
+
v1 = getattr(rec1.meta, field_name, "")
|
|
51
|
+
v2 = getattr(rec2.meta, field_name, "")
|
|
52
|
+
if v1 != v2:
|
|
53
|
+
diffs[field_name] = (str(v1), str(v2))
|
|
54
|
+
return diffs
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass
|
|
58
|
+
class RecordDiff:
|
|
59
|
+
"""Captures differences between two matched alignment records."""
|
|
60
|
+
|
|
61
|
+
source_selectors: tuple[str, ...]
|
|
62
|
+
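# sorted target selectors of each record; as built by DiffAlignments these are populated even when only meta differs (check targets_differ)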
|
|
63
|
+
targets1: list[str] = field(default_factory=list)
|
|
64
|
+
targets2: list[str] = field(default_factory=list)
|
|
65
|
+
# non-empty when meta fields (excluding id) differ
|
|
66
|
+
meta_diffs: dict[str, tuple[str, str]] = field(default_factory=dict)
|
|
67
|
+
|
|
68
|
+
@property
|
|
69
|
+
def targets_differ(self) -> bool:
|
|
70
|
+
"""True if target selectors differ between the two records."""
|
|
71
|
+
return self.targets1 != self.targets2
|
|
72
|
+
|
|
73
|
+
def __repr__(self) -> str:
|
|
74
|
+
src = ", ".join(self.source_selectors)
|
|
75
|
+
parts = [f"<RecordDiff src=[{src}]"]
|
|
76
|
+
if self.targets_differ:
|
|
77
|
+
parts.append(f" targets: {self.targets1} -> {self.targets2}")
|
|
78
|
+
for fname, (v1, v2) in self.meta_diffs.items():
|
|
79
|
+
parts.append(f" {fname}: {v1!r} -> {v2!r}")
|
|
80
|
+
parts.append(">")
|
|
81
|
+
return "".join(parts)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class DiffAlignments:
|
|
85
|
+
"""Compare two alignment groups from the same source/target pair.
|
|
86
|
+
|
|
87
|
+
Records are matched by their source selectors. Differences in
|
|
88
|
+
target selectors and metadata (excluding id) are reported.
|
|
89
|
+
"""
|
|
90
|
+
|
|
91
|
+
def __init__(self, alset1: AlignmentSet, alset2: AlignmentSet) -> None:
|
|
92
|
+
"""Initialize and compute differences."""
|
|
93
|
+
for attr in ("sourceid", "targetid", "targetlanguage"):
|
|
94
|
+
v1 = getattr(alset1, attr)
|
|
95
|
+
v2 = getattr(alset2, attr)
|
|
96
|
+
if v1 != v2:
|
|
97
|
+
raise ValueError(f"AlignmentSets differ on {attr!r}: {v1!r} vs {v2!r}")
|
|
98
|
+
self.alset1 = alset1
|
|
99
|
+
self.alset2 = alset2
|
|
100
|
+
self.group1: AlignmentGroup = AlignmentsReader(alset1).alignmentgroup
|
|
101
|
+
self.group2: AlignmentGroup = AlignmentsReader(alset2).alignmentgroup
|
|
102
|
+
|
|
103
|
+
# index each group's records by source-selector key
|
|
104
|
+
self._recs1: dict[tuple[str, ...], AlignmentRecord] = {
|
|
105
|
+
_record_key(r): r for r in self.group1.records
|
|
106
|
+
}
|
|
107
|
+
self._recs2: dict[tuple[str, ...], AlignmentRecord] = {
|
|
108
|
+
_record_key(r): r for r in self.group2.records
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
keys1 = set(self._recs1)
|
|
112
|
+
keys2 = set(self._recs2)
|
|
113
|
+
|
|
114
|
+
# records present only in one group
|
|
115
|
+
self.only_in_1: list[AlignmentRecord] = [self._recs1[k] for k in sorted(keys1 - keys2)]
|
|
116
|
+
self.only_in_2: list[AlignmentRecord] = [self._recs2[k] for k in sorted(keys2 - keys1)]
|
|
117
|
+
|
|
118
|
+
# records present in both; compare targets and meta
|
|
119
|
+
self.record_diffs: list[RecordDiff] = []
|
|
120
|
+
for key in sorted(keys1 & keys2):
|
|
121
|
+
r1, r2 = self._recs1[key], self._recs2[key]
|
|
122
|
+
t1, t2 = sorted(r1.target_selectors), sorted(r2.target_selectors)
|
|
123
|
+
mdiffs = _meta_diffs(r1, r2)
|
|
124
|
+
if t1 != t2 or mdiffs:
|
|
125
|
+
self.record_diffs.append(
|
|
126
|
+
RecordDiff(source_selectors=key, targets1=t1, targets2=t2, meta_diffs=mdiffs)
|
|
127
|
+
)
|
|
128
|
+
|
|
129
|
+
@property
|
|
130
|
+
def has_diffs(self) -> bool:
|
|
131
|
+
"""True if any differences were found."""
|
|
132
|
+
return bool(self.only_in_1 or self.only_in_2 or self.record_diffs)
|
|
133
|
+
|
|
134
|
+
def show(self) -> None:
|
|
135
|
+
"""Print a human-readable summary of all differences."""
|
|
136
|
+
label1 = self.alset1.identifier
|
|
137
|
+
label2 = self.alset2.identifier
|
|
138
|
+
print(f"Comparing {label1!r} vs {label2!r}")
|
|
139
|
+
print(
|
|
140
|
+
f" {len(self.group1.records)} records in {label1}, "
|
|
141
|
+
f"{len(self.group2.records)} records in {label2}"
|
|
142
|
+
)
|
|
143
|
+
if not self.has_diffs:
|
|
144
|
+
print(" No differences found.")
|
|
145
|
+
return
|
|
146
|
+
|
|
147
|
+
if self.only_in_1:
|
|
148
|
+
print(f"\n Records only in {label1} ({len(self.only_in_1)}):")
|
|
149
|
+
for rec in self.only_in_1:
|
|
150
|
+
src = ", ".join(rec.source_selectors)
|
|
151
|
+
print(f" - src=[{src}] tgt={rec.target_selectors}")
|
|
152
|
+
|
|
153
|
+
if self.only_in_2:
|
|
154
|
+
print(f"\n Records only in {label2} ({len(self.only_in_2)}):")
|
|
155
|
+
for rec in self.only_in_2:
|
|
156
|
+
src = ", ".join(rec.source_selectors)
|
|
157
|
+
print(f" + src=[{src}] tgt={rec.target_selectors}")
|
|
158
|
+
|
|
159
|
+
if self.record_diffs:
|
|
160
|
+
print(f"\n Records with differences ({len(self.record_diffs)}):")
|
|
161
|
+
for diff in self.record_diffs:
|
|
162
|
+
src = ", ".join(diff.source_selectors)
|
|
163
|
+
print(f" src=[{src}]")
|
|
164
|
+
if diff.targets_differ:
|
|
165
|
+
print(f" targets: {diff.targets1}")
|
|
166
|
+
print(f" -> {diff.targets2}")
|
|
167
|
+
for fname, (v1, v2) in diff.meta_diffs.items():
|
|
168
|
+
print(f" {fname}: {v1!r} -> {v2!r}")
|