biblealignlib 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/PKG-INFO +6 -2
  2. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/README.md +5 -1
  3. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/mapper.py +2 -2
  4. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/AlignmentGroup.py +8 -0
  5. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/BaseToken.py +5 -5
  6. biblealignlib-0.3.2/biblealignlib/burrito/DiffRecord.py +73 -0
  7. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/VerseData.py +48 -45
  8. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/__init__.py +4 -0
  9. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/alignments.py +17 -3
  10. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/manager.py +4 -0
  11. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/source.py +3 -4
  12. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/target.py +12 -1
  13. biblealignlib-0.3.2/biblealignlib/util/DiffAlignments.py +168 -0
  14. biblealignlib-0.3.2/biblealignlib/util/DiffTargets.py +778 -0
  15. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/util/Transfer.py +18 -5
  16. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/util/__init__.py +19 -4
  17. biblealignlib-0.3.2/biblealignlib/util/canonsplit.py +77 -0
  18. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/util/merger.py +4 -1
  19. biblealignlib-0.3.2/biblealignlib/util/tokens_to_chars.py +38 -0
  20. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/pyproject.toml +2 -1
  21. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/LICENSE +0 -0
  22. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/LICENSE.md +0 -0
  23. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/__init__.py +0 -0
  24. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/Score.py +0 -0
  25. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/__init__.py +0 -0
  26. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/corpusmapping.py +0 -0
  27. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/eflomal.py +0 -0
  28. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/reader.py +0 -0
  29. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/runeflomal.py +0 -0
  30. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/scorer.py +0 -0
  31. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/autoalign/writer.py +0 -0
  32. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/AlignmentSet.py +0 -0
  33. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/AlignmentType.py +0 -0
  34. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/BadRecord.py +0 -0
  35. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/burrito/util.py +0 -0
  36. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/coverage/Coverage.py +0 -0
  37. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/coverage/__init__.py +0 -0
  38. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/coverage/analyzer.py +0 -0
  39. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/coverage/exporter.py +0 -0
  40. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/coverage/filters.py +0 -0
  41. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/interlinear/__init__.py +0 -0
  42. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/interlinear/reverse.py +0 -0
  43. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/interlinear/token.py +0 -0
  44. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/strongs.py +0 -0
  45. {biblealignlib-0.3.0 → biblealignlib-0.3.2}/biblealignlib/util/vocab.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblealignlib
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -36,9 +36,13 @@ Description-Content-Type: text/markdown
36
36
 
37
37
  # biblealignlib
38
38
 
39
- Biblica's code for working with Bible alignment data from
39
+ Biblica's Python code for working with Bible alignment data from
40
40
  https://github.com/Clear-Bible/Alignments .
41
41
 
42
+ This code is ©2024-2026 by [Biblica, Inc](http://biblica.com) and is
43
+ licensed under [CC BY SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
44
+
45
+
42
46
  ## Installing extra dependencies
43
47
 
44
48
  ### eflomal
@@ -1,8 +1,12 @@
1
1
  # biblealignlib
2
2
 
3
- Biblica's code for working with Bible alignment data from
3
+ Biblica's Python code for working with Bible alignment data from
4
4
  https://github.com/Clear-Bible/Alignments .
5
5
 
6
+ This code is ©2024-2026 by [Biblica, Inc](http://biblica.com) and is
7
+ licensed under [CC BY SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
8
+
9
+
6
10
  ## Installing extra dependencies
7
11
 
8
12
  ### eflomal
@@ -18,9 +18,9 @@ commonly used by automated alignment algorithms.
18
18
  >>> pm.bcv["mappings"]["41004003"]
19
19
  <CorpusMapping: 41004003>
20
20
  >>> pm.bcv["mappings"]["41004003"].source_pairs
21
- [(<Source: n41004003001>, 0), (<Source: n41004003002>, 1), (<Source: n41004003003>, 2), ...
21
+ [(<Source: n41004003001|Ἀκούετε>, 0), (<Source: n41004003002|ἰδοὺ>, 1), (<Source: n41004003003|ἐξῆλθεν>, 2), ...
22
22
  >>> pm.bcv["mappings"]["41004003"].target_pairs
23
- [(<Target: 410040030011>, 0), (<Target: 410040030021>, 1), (<Target: 410040030031>, 2), ...
23
+ [(<Target: 410040030011|Listen>, 0), (<Target: 410040030021|A>, 1), (<Target: 410040030031|sower>, 2), ...
24
24
 
25
25
 
26
26
  """
@@ -283,6 +283,14 @@ class AlignmentRecord:
283
283
  """True if any selectors in references are incomplete."""
284
284
  return any(ref.incomplete for ref in self.references.values())
285
285
 
286
def update_target_selectors(self, selectors: list[str]) -> None:
    """Install a new list of target selectors on this record.

    The selectors are stored in sorted order, matching the behaviour
    of AlignmentReference.__post_init__. The caller's list is left
    untouched: a sorted copy is what gets stored.
    """
    target_reference = self.references["target"]
    ordered_selectors = list(selectors)
    ordered_selectors.sort()
    target_reference.selectors = ordered_selectors
293
+
286
294
  def asdict(
287
295
  self, positional: bool = False, withmeta: bool = True, withmaculaprefix: bool = False
288
296
  ) -> dict[str, Any]:
@@ -25,7 +25,7 @@ class BaseToken:
25
25
 
26
26
  def __repr__(self) -> str:
27
27
  """Return a printed representation."""
28
- return f"<{self.__class__.__name__}: {self.id}>"
28
+ return f"<{self.__class__.__name__}: {self.tokenstr}>"
29
29
 
30
30
  #
31
31
  def __hash__(self) -> int:
@@ -48,6 +48,10 @@ class BaseToken:
48
48
  """Return the BCV-format verse reference for a token instance."""
49
49
  return str(self.bcv)
50
50
 
51
@property
def tokenstr(self) -> str:
    """Return the token's id and text joined by a pipe, as 'id|text'."""
    return "{}|{}".format(self.id, self.text)
54
+
51
55
  @property
52
56
  def idtext(self) -> tuple[str, str]:
53
57
  """Return a tuple of id and text.
@@ -59,10 +63,6 @@ class BaseToken:
59
63
  self.text,
60
64
  )
61
65
 
62
- @property
63
- def tokenstr(self) -> str:
64
- return f"{self.id}, {self.text}"
65
-
66
66
  @property
67
67
  def bare_id(self) -> str:
68
68
  """Return the ID minus any canon prefixes."""
@@ -0,0 +1,73 @@
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ from typing import Optional
4
+
5
+ from .source import Source
6
+ from .target import Target
7
+
8
+
9
class DiffReason(Enum):
    """Closed set of reasons why two alignments can differ.

    Each member's value is the human-readable description of that
    kind of difference.
    """

    DIFFLEN = "Different number of alignments"
    DIFFSOURCES = "Source selectors differ"
    DIFFTARGETS = "Target selectors differ"
    DIFFNOTES = "Different notes"
    DIFFSTATUS = "Different status"
17
+
18
+
19
@dataclass
class DiffRecord:
    """Hold the data describing one alignment difference for a verse.

    A single verse can produce several DiffRecord instances, one for
    each difference found.
    """

    # BCV identifier of the alignment
    bcvid: str
    # tokens from the first alignment
    sources1: tuple[Source, ...] = ()
    targets1: tuple[Target, ...] = ()
    # tokens from the second alignment
    sources2: tuple[Source, ...] = ()
    targets2: tuple[Target, ...] = ()
    # the reason the alignments differ, when one has been assigned
    diffreason: Optional[DiffReason] = None
    # any auxiliary data
    data: tuple = ()
    # optional count of differences
    n_differences: Optional[int] = None

    def __hash__(self) -> int:
        """Hash on bcvid, diffreason, and data only."""
        identity = (self.bcvid, self.diffreason, self.data)
        return hash(identity)

    def __repr__(self) -> str:
        """Show the bcvid, the reason text, and any auxiliary data."""
        reason_text = self.diffreason.value if self.diffreason else None
        pieces = [f"<DiffRecord ({self.bcvid}): '{reason_text}'"]
        if self.data:
            pieces.append(", " + repr(self.data))
        pieces.append(">")
        return "".join(pieces)

    @property
    def n_sources1(self) -> int:
        """Count the source tokens of the first alignment."""
        return len(self.sources1)

    @property
    def n_sources2(self) -> int:
        """Count the source tokens of the second alignment."""
        return len(self.sources2)

    @property
    def n_targets1(self) -> int:
        """Count the target tokens of the first alignment."""
        return len(self.targets1)

    @property
    def n_targets2(self) -> int:
        """Count the target tokens of the second alignment."""
        return len(self.targets2)
@@ -29,7 +29,6 @@ Target: 44020020021: করিনি ('', False, False)
29
29
 
30
30
  from collections import Counter
31
31
  from dataclasses import dataclass
32
- from enum import Enum
33
32
  from typing import Optional
34
33
 
35
34
  import pandas as pd
@@ -39,47 +38,7 @@ from .BaseToken import BaseToken
39
38
  from .source import Source
40
39
  from .target import Target
41
40
  from .AlignmentGroup import AlignmentRecord
42
-
43
-
44
- class DiffReason(Enum):
45
- """Enumerate constants for alignment differences."""
46
-
47
- DIFFLEN = "Different number of alignments"
48
- DIFFSOURCES = "Source selectors differ"
49
- DIFFTARGETS = "Target selectors differ"
50
- DIFFNOTES = "Different notes"
51
- DIFFSTATUS = "Different status"
52
-
53
-
54
- @dataclass
55
- class DiffRecord:
56
- """Container for data on alignment differences.
57
-
58
- The same verse could have multiple alignment differences.
59
- """
60
-
61
- # the alignment BCV
62
- bcvid: str
63
- # the data in the first alignment
64
- sources1: tuple[Source, ...] = ()
65
- targets1: tuple[Target, ...] = ()
66
- # the data in the second alignment
67
- sources2: tuple[Source, ...] = ()
68
- targets2: tuple[Target, ...] = ()
69
- # why it's different
70
- diffreason: Optional[DiffReason] = None
71
- # any auxiliary data
72
- data: tuple = ()
73
-
74
- def __repr__(self) -> str:
75
- """Return a string representation."""
76
- basestr = (
77
- f"<DiffRecord ({self.bcvid}): '{self.diffreason.value if self.diffreason else None}'"
78
- )
79
- if self.data:
80
- basestr += ", " + repr(self.data)
81
- basestr += ">"
82
- return basestr
41
+ from .DiffRecord import DiffRecord, DiffReason
83
42
 
84
43
 
85
44
  @dataclass
@@ -110,6 +69,16 @@ class VerseData:
110
69
  """Return a string representation."""
111
70
  return f"<VerseData: {self.bcvid}>"
112
71
 
72
@property
def sourceitems(self) -> dict[str, Source]:
    """Return a dict keyed by bare ID (BCVW) for this verse's source tokens.

    The dict is built afresh from self.sources on every access.
    """
    by_bare_id: dict[str, Source] = {}
    for token in self.sources:
        by_bare_id[token.bare_id] = token
    return by_bare_id
76
+
77
@property
def targetitems(self) -> dict[str, Target]:
    """Return a dict keyed by bare ID (BCVW) for this verse's target tokens.

    The dict is built afresh from self.targets on every access.
    """
    return dict((token.bare_id, token) for token in self.targets)
81
+
113
82
  @property
114
83
  def aligned_sources(self) -> list[Source]:
115
84
  """Return list of aligned source tokens.
@@ -187,6 +156,16 @@ class VerseData:
187
156
  for trg in targets:
188
157
  print(f"Target: {trg._display}")
189
158
 
159
def display_record(self, alrec: AlignmentRecord) -> None:
    """Print one alignment record as 'id: sources --- targets'.

    Each token is shown by its ``tokenstr``; tokens on the same side
    are joined with ", ".

    Args:
        alrec: the record to display. Its source and target selectors
            are looked up in this instance's sourceitems and
            targetitems mappings.

    Raises:
        KeyError: if a selector is not a key of the matching mapping.
    """
    # sourceitems/targetitems are properties that rebuild their dict on
    # every access, so fetch each mapping once instead of once per selector.
    sourceitems = self.sourceitems
    targetitems = self.targetitems
    source_tokenstring: str = ", ".join(
        sourceitems[sel].tokenstr for sel in alrec.source_selectors
    )
    target_tokenstring: str = ", ".join(
        targetitems[sel].tokenstr for sel in alrec.target_selectors
    )
    print(f"{alrec.meta.id}: {source_tokenstring} --- {target_tokenstring}")
168
+
190
169
  def unaligned(self, typeattr: str = "targets", keepexcluded: bool = False) -> None:
191
170
  """Display tokens from typeattr that are _not_ aligned."""
192
171
  assert typeattr in self._typeattrs, f"typeattr should be one of {self._typeattrs}"
@@ -203,13 +182,13 @@ class VerseData:
203
182
  if aligned:
204
183
  for sources, targets in self.alignments:
205
184
  print(
206
- f"{str([src.idtext for src in sources]):{srcwidth}}\t\t{[trg.idtext for trg in targets]}"
185
+ f"{str([src.tokenstr for src in sources]):{srcwidth}}\t\t{[trg.tokenstr for trg in targets]}"
207
186
  )
208
187
  else:
209
188
  # show all sources with their (possibly empty) target alignments
210
189
  for source in self.sources:
211
190
  print(
212
- f"{str(source.idtext):{srcwidth}}\t\t{[trg.idtext for trg in self.get_source_alignments(source)]}"
191
+ f"{str(source.tokenstr):{srcwidth}}\t\t{[trg.tokenstr for trg in self.get_source_alignments(source)]}"
213
192
  )
214
193
 
215
194
  def get_texts(
@@ -244,6 +223,30 @@ class VerseData:
244
223
  texts = [item.text for item in tokens]
245
224
  return texts
246
225
 
226
def tokenstrings(self, record: AlignmentRecord, typeattr: str) -> list[str]:
    """Return the id|text strings for one side of an alignment record.

    typeattr must be one of self._typeattrs. "sources" selects the
    record's source selectors and this instance's source tokens; any
    other accepted value selects the target side. Tokens that evaluate
    as false are left out of the result.
    """
    assert typeattr in self._typeattrs, f"typeattr should be one of {self._typeattrs}"
    if typeattr == "sources":
        items = self.sourceitems
        selectors: list[str] = record.source_selectors
    else:
        items = self.targetitems
        selectors = record.target_selectors
    strings: list[str] = []
    for sel in selectors:
        token = items[sel]
        if token:
            strings.append(token.tokenstr)
    return strings
234
+
235
def record_as_tsv(self, record: AlignmentRecord) -> str:
    """Render an alignment record as one 3-column TSV line.

    The columns are the record identifier, the source tokens, and the
    target tokens. Tokens use the combined id|text notation and are
    joined with ", " inside a column. No trailing newline is added.
    """
    source_side = self.tokenstrings(record, "sources")
    target_side = self.tokenstrings(record, "targets")
    columns = (record.identifier, ", ".join(source_side), ", ".join(target_side))
    return "\t".join(columns)
249
+
247
250
  ## NOT YET WORKING
248
251
  # def generate_html_table(self) -> str:
249
252
  # """Generate an HTML table with one row for each source item and one column for each target item."""
@@ -316,7 +319,7 @@ class VerseData:
316
319
  return None
317
320
 
318
321
  # TODO: compare
319
- def diff(self, other: "VerseData") -> Optional[list[DiffRecord]]:
322
+ def diff(self, other: "VerseData") -> list[DiffRecord]:
320
323
  """Return a (possibly empty) list of differences between the alignments data.
321
324
 
322
325
  If there are a different number of alignments, that's the only
@@ -19,6 +19,7 @@ from .AlignmentType import TranslationType
19
19
  from .alignments import AlignmentsReader, write_alignment_group
20
20
  from .manager import Manager, VerseData
21
21
  from .BaseToken import BaseToken, asbool, bare_id
22
+ from .DiffRecord import DiffReason, DiffRecord
22
23
  from .source import macula_prefixer, macula_unprefixer, Source, SourceReader
23
24
  from .target import Target, TargetReader
24
25
  from .util import groupby_key, groupby_bcid, groupby_bcv, token_groupby_bc, filter_by_bcv
@@ -41,6 +42,9 @@ __all__ = [
41
42
  "BaseToken",
42
43
  "asbool",
43
44
  "bare_id",
45
+ # DiffRecord
46
+ "DiffReason",
47
+ "DiffRecord",
44
48
  # alignments
45
49
  "AlignmentsReader",
46
50
  "write_alignment_group",
@@ -301,6 +301,7 @@ def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True)
301
301
  """Write JSON data for an arbitrary group in Scripture Burrito format.
302
302
 
303
303
  Writes some of the JSON by hand to get records on the same line.
304
+ Record meta.id values are assigned sequentially per BCV, e.g. "40001001.01".
304
305
  """
305
306
 
306
307
  def _write_documents(out: TextIO, documents: tuple[Document, Document]) -> None:
@@ -311,19 +312,32 @@ def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True)
311
312
  out.write(" ],\n")
312
313
 
313
314
  def _write_meta(out: TextIO, meta: Metadata) -> None:
314
- """Write metdatadata to out."""
315
+ """Write metadata to out."""
315
316
  metarow = '"meta": ' + json.dumps(meta.asdict())
316
317
  f.write(f" {metarow},\n")
317
318
 
319
def _record_dict(arec: AlignmentRecord, bcv_counters: dict[str, int]) -> dict[str, Any]:
    """Serialize arec, giving it a sequential id based on its source BCV.

    The id written to the "meta" entry has the form "<bcv>.<nn>",
    where nn is the 1-based position of this record among those seen
    so far for the same BCV, zero-padded to two digits. bcv_counters
    holds the running count per BCV and is updated in place.

    This replaces the opaque ClearAligner IDs with something
    meaningful, attempting to make files more diff-able.
    """
    verse = arec.source_bcv
    sequence = bcv_counters.get(verse, 0) + 1
    bcv_counters[verse] = sequence
    serialized = arec.asdict()
    serialized["meta"]["id"] = f"{verse}.{sequence:02}"
    return serialized
330
+
318
331
  f.write("{\n")
319
332
  _write_documents(f, group.documents)
320
333
  _write_meta(f, group.meta)
321
334
  f.write(f' "roles": {json.dumps(group.roles)},\n')
322
335
  f.write(f' "type": "{group._type}",\n "records": [\n ')
323
336
  # should sort the records: NIV11 doesn't appear to be sorted
337
+ bcv_counters: dict[str, int] = {}
324
338
  for arec in group.records[:-1]:
325
- json.dump(arec.asdict(), f)
339
+ json.dump(_record_dict(arec, bcv_counters), f)
326
340
  f.write(",\n ")
327
341
  # now the last one without a comma, because JSON
328
- json.dump(group.records[-1].asdict(), f)
342
+ json.dump(_record_dict(group.records[-1], bcv_counters), f)
329
343
  f.write("\n ]}")
@@ -114,6 +114,10 @@ class Manager(UserDict):
114
114
  keepbadrecords=self.keepbadrecords,
115
115
  )
116
116
  self.alignmentsreader.clean_alignments(self.sourceitems, self.targetitems)
117
+ # TODO: upgrade the selectors to use tokenstr. This requires
118
+ # knowing the source and targetitems, but alignmentsreader
119
+ # doesn't have that data
120
+ # self.add_tokenstr_to_records(self)
117
121
  # group records by BCV
118
122
  self.bcv["records"] = groupby_bcv(
119
123
  list(self.alignmentsreader.alignmentgroup.records), lambda r: r.source_bcv
@@ -17,12 +17,11 @@ called from burrito.manager.Manager().
17
17
  5468
18
18
  # dict: token ID -> Source() instance
19
19
  >>> src["n41004003001"]
20
- src["n41004003001"]
21
- <Source: n41004003001>
20
+ <Source: n41004003001|Ἀκούετε>
22
21
  >>> src["n41004003001"].display()
23
22
  n41004003001: Ἀκούετε (Listen, ἀκούω, verb)
24
- >>> src["n41004003001"].idtext
25
- ('n41004003001', 'Ἀκούετε')
23
+ >>> src["n41004003001"].tokenstr
24
+ 'n41004003001|Ἀκούετε'
26
25
  >>> src["n41004003001"].asdict()
27
26
  {'identifier': 'n41004003001',
28
27
  'altId': 'Ἀκούετε-1',
@@ -1,4 +1,14 @@
1
- """Manage the target/translation data for Grape City (gc) alignment data.
1
+ """Manage the target/translation data for alignment data.
2
+
3
+ This typically reads the output of kathairo.
4
+
5
+ Limitations:
6
+
7
+ - Each token is assigned to the relevant source verse, which may be
8
+ different than the verse assignments in the target text. This is
9
+ version-specific, not necessarily a versification issue. So verse
10
+ identifiers may need mapping.
11
+ - Example: SBLGNT for 3JN has v. 15, but all these tokens are in v. 14 in the NIV11.
2
12
 
3
13
  >>> from biblealignlib.burrito import target
4
14
  # Reading is normally done by Manager
@@ -15,6 +25,7 @@
15
25
  # write the tokens out
16
26
  >>> LANGDATAPATH = CLEARROOT / "alignments-eng/data"
17
27
  >>> tr.write_tsv(tokenlist=tr.data.values(), outpath=(LANGDATAPATH / "targets/BSB/new-nt_BSB.tsv"))
28
+
18
29
  """
19
30
 
20
31
  from collections import UserDict, defaultdict
@@ -0,0 +1,168 @@
1
+ """Compare two alignment groups record by record.
2
+
3
+ Both groups must share the same sourceid, targetid, and
4
+ targetlanguage. This is most useful for checking minor changes to
5
+ ensure you haven't introduced errors.
6
+
7
+ Comparison ignores meta.id (which is assigned on write) but reports
8
+ differences in targets and all other meta fields (status, origin, creator, note).
9
+
10
+ >>> from biblealignlib.burrito import CLEARROOT, AlignmentSet
11
+ >>> from biblealignlib.util.DiffAlignments import DiffAlignments
12
+ >>> LANGDATAPATH = CLEARROOT / "alignments-eng/data"
13
+ >>> alset1 = AlignmentSet(sourceid="SBLGNT", targetid="BSB",
14
+ ... targetlanguage="eng", langdatapath=LANGDATAPATH,
15
+ ... alternateid="manual")
16
+ >>> alset2 = AlignmentSet(sourceid="SBLGNT", targetid="BSB",
17
+ ... targetlanguage="eng", langdatapath=LANGDATAPATH,
18
+ ... alternateid="updated")
19
+ >>> da = DiffAlignments(alset1, alset2)
20
+ >>> da.show()
21
+
22
+ """
23
+
24
+ from dataclasses import dataclass, field
25
+
26
+ from ..burrito.AlignmentGroup import AlignmentGroup, AlignmentRecord
27
+ from ..burrito.AlignmentSet import AlignmentSet
28
+ from ..burrito.alignments import AlignmentsReader
29
+
30
+ # Meta fields compared between records (id is intentionally excluded)
31
+ _COMPARED_META_FIELDS = ("creator", "note", "origin", "status")
32
+
33
+
34
def _record_key(rec: AlignmentRecord) -> tuple[str, ...]:
    """Build the key used to pair up records from two groups.

    The key is the record's source selectors, sorted and frozen into a
    tuple. Matching is done on the source side because the targets are
    what may differ between the groups.
    """
    selectors = list(rec.source_selectors)
    selectors.sort()
    return tuple(selectors)
41
+
42
+
43
def _meta_diffs(rec1: AlignmentRecord, rec2: AlignmentRecord) -> dict[str, tuple[str, str]]:
    """Collect the compared meta fields whose values differ between two records.

    Only the fields named in _COMPARED_META_FIELDS are examined, so
    id is never reported. A field missing from a record's meta is
    treated as the empty string. Each differing field maps to the pair
    (value in rec1, value in rec2), both converted to str.
    """
    changed: dict[str, tuple[str, str]] = {}
    for field_name in _COMPARED_META_FIELDS:
        first = getattr(rec1.meta, field_name, "")
        second = getattr(rec2.meta, field_name, "")
        if first != second:
            changed[field_name] = (str(first), str(second))
    return changed
55
+
56
+
57
@dataclass
class RecordDiff:
    """Describe how two matched alignment records differ."""

    # the source selectors the two records were matched on
    source_selectors: tuple[str, ...]
    # target selectors of the first and second record
    targets1: list[str] = field(default_factory=list)
    targets2: list[str] = field(default_factory=list)
    # differing meta fields (id excluded): name -> (first value, second value)
    meta_diffs: dict[str, tuple[str, str]] = field(default_factory=dict)

    @property
    def targets_differ(self) -> bool:
        """Report whether the two target selector lists are unequal."""
        return self.targets1 != self.targets2

    def __repr__(self) -> str:
        """Show the source selectors followed by whatever differs."""
        text = "<RecordDiff src=[" + ", ".join(self.source_selectors) + "]"
        if self.targets_differ:
            text += f" targets: {self.targets1} -> {self.targets2}"
        for fname, (v1, v2) in self.meta_diffs.items():
            text += f" {fname}: {v1!r} -> {v2!r}"
        return text + ">"
82
+
83
+
84
class DiffAlignments:
    """Report how two alignment groups for one source/target pair differ.

    Records from the two groups are paired by their source selectors.
    Records found in only one group are collected separately; paired
    records are compared on target selectors and on metadata other
    than id.
    """

    def __init__(self, alset1: AlignmentSet, alset2: AlignmentSet) -> None:
        """Read both alignment groups and compute their differences.

        Raises ValueError if the two sets disagree on sourceid,
        targetid, or targetlanguage.
        """
        for attr in ("sourceid", "targetid", "targetlanguage"):
            first, second = getattr(alset1, attr), getattr(alset2, attr)
            if first != second:
                raise ValueError(f"AlignmentSets differ on {attr!r}: {first!r} vs {second!r}")
        self.alset1 = alset1
        self.alset2 = alset2
        self.group1: AlignmentGroup = AlignmentsReader(alset1).alignmentgroup
        self.group2: AlignmentGroup = AlignmentsReader(alset2).alignmentgroup

        # index each group's records by source-selector key; if two
        # records share a key, the later one replaces the earlier
        self._recs1: dict[tuple[str, ...], AlignmentRecord] = {}
        for record in self.group1.records:
            self._recs1[_record_key(record)] = record
        self._recs2: dict[tuple[str, ...], AlignmentRecord] = {}
        for record in self.group2.records:
            self._recs2[_record_key(record)] = record

        # records whose key appears in only one of the groups
        unmatched1 = sorted(self._recs1.keys() - self._recs2.keys())
        unmatched2 = sorted(self._recs2.keys() - self._recs1.keys())
        self.only_in_1: list[AlignmentRecord] = [self._recs1[key] for key in unmatched1]
        self.only_in_2: list[AlignmentRecord] = [self._recs2[key] for key in unmatched2]

        # records whose key appears in both groups: compare targets and meta
        self.record_diffs: list[RecordDiff] = []
        for key in sorted(self._recs1.keys() & self._recs2.keys()):
            left = self._recs1[key]
            right = self._recs2[key]
            left_targets = sorted(left.target_selectors)
            right_targets = sorted(right.target_selectors)
            meta_changes = _meta_diffs(left, right)
            if left_targets == right_targets and not meta_changes:
                continue
            self.record_diffs.append(
                RecordDiff(
                    source_selectors=key,
                    targets1=left_targets,
                    targets2=right_targets,
                    meta_diffs=meta_changes,
                )
            )

    @property
    def has_diffs(self) -> bool:
        """Report whether any difference of any kind was found."""
        return any((self.only_in_1, self.only_in_2, self.record_diffs))

    def show(self) -> None:
        """Print a human-readable summary of all differences."""
        label1 = self.alset1.identifier
        label2 = self.alset2.identifier
        print(f"Comparing {label1!r} vs {label2!r}")
        print(
            f" {len(self.group1.records)} records in {label1}, "
            f"{len(self.group2.records)} records in {label2}"
        )
        if not self.has_diffs:
            print(" No differences found.")
            return

        if self.only_in_1:
            print(f"\n Records only in {label1} ({len(self.only_in_1)}):")
            for rec in self.only_in_1:
                src = ", ".join(rec.source_selectors)
                print(f" - src=[{src}] tgt={rec.target_selectors}")

        if self.only_in_2:
            print(f"\n Records only in {label2} ({len(self.only_in_2)}):")
            for rec in self.only_in_2:
                src = ", ".join(rec.source_selectors)
                print(f" + src=[{src}] tgt={rec.target_selectors}")

        if not self.record_diffs:
            return
        print(f"\n Records with differences ({len(self.record_diffs)}):")
        for diff in self.record_diffs:
            src = ", ".join(diff.source_selectors)
            print(f" src=[{src}]")
            if diff.targets_differ:
                print(f" targets: {diff.targets1}")
                print(f" -> {diff.targets2}")
            for fname, (v1, v2) in diff.meta_diffs.items():
                print(f" {fname}: {v1!r} -> {v2!r}")