biblealignlib 0.3.1__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/PKG-INFO +6 -2
  2. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/README.md +5 -1
  3. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/autoalign/mapper.py +2 -2
  4. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/AlignmentGroup.py +8 -0
  5. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/BaseToken.py +1 -1
  6. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/VerseData.py +12 -2
  7. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/alignments.py +17 -3
  8. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/manager.py +4 -0
  9. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/source.py +3 -4
  10. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/target.py +12 -1
  11. biblealignlib-0.3.2/biblealignlib/util/DiffAlignments.py +168 -0
  12. biblealignlib-0.3.2/biblealignlib/util/DiffTargets.py +778 -0
  13. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/pyproject.toml +1 -1
  14. biblealignlib-0.3.1/biblealignlib/util/DiffTargets.py +0 -402
  15. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/LICENSE +0 -0
  16. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/LICENSE.md +0 -0
  17. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/__init__.py +0 -0
  18. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/autoalign/Score.py +0 -0
  19. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/autoalign/__init__.py +0 -0
  20. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/autoalign/corpusmapping.py +0 -0
  21. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/autoalign/eflomal.py +0 -0
  22. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/autoalign/reader.py +0 -0
  23. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/autoalign/runeflomal.py +0 -0
  24. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/autoalign/scorer.py +0 -0
  25. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/autoalign/writer.py +0 -0
  26. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/AlignmentSet.py +0 -0
  27. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/AlignmentType.py +0 -0
  28. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/BadRecord.py +0 -0
  29. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/DiffRecord.py +0 -0
  30. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/__init__.py +0 -0
  31. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/burrito/util.py +0 -0
  32. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/coverage/Coverage.py +0 -0
  33. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/coverage/__init__.py +0 -0
  34. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/coverage/analyzer.py +0 -0
  35. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/coverage/exporter.py +0 -0
  36. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/coverage/filters.py +0 -0
  37. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/interlinear/__init__.py +0 -0
  38. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/interlinear/reverse.py +0 -0
  39. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/interlinear/token.py +0 -0
  40. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/strongs.py +0 -0
  41. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/util/Transfer.py +0 -0
  42. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/util/__init__.py +0 -0
  43. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/util/canonsplit.py +0 -0
  44. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/util/merger.py +0 -0
  45. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/util/tokens_to_chars.py +0 -0
  46. {biblealignlib-0.3.1 → biblealignlib-0.3.2}/biblealignlib/util/vocab.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: biblealignlib
3
- Version: 0.3.1
3
+ Version: 0.3.2
4
4
  Summary: Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments.
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -36,9 +36,13 @@ Description-Content-Type: text/markdown
36
36
 
37
37
  # biblealignlib
38
38
 
39
- Biblica's code for working with Bible alignment data from
39
+ Biblica's Python code for working with Bible alignment data from
40
40
  https://github.com/Clear-Bible/Alignments .
41
41
 
42
+ This code is ©2024-2026 by [Biblica, Inc](http://biblica.com) and is
43
+ licensed under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
44
+
45
+
42
46
  ## Installing extra dependencies
43
47
 
44
48
  ### eflomal
@@ -1,8 +1,12 @@
1
1
  # biblealignlib
2
2
 
3
- Biblica's code for working with Bible alignment data from
3
+ Biblica's Python code for working with Bible alignment data from
4
4
  https://github.com/Clear-Bible/Alignments .
5
5
 
6
+ This code is ©2024-2026 by [Biblica, Inc](http://biblica.com) and is
7
+ licensed under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/).
8
+
9
+
6
10
  ## Installing extra dependencies
7
11
 
8
12
  ### eflomal
@@ -18,9 +18,9 @@ commonly used by automated alignment algorithms.
18
18
  >>> pm.bcv["mappings"]["41004003"]
19
19
  <CorpusMapping: 41004003>
20
20
  >>> pm.bcv["mappings"]["41004003"].source_pairs
21
- [(<Source: n41004003001>, 0), (<Source: n41004003002>, 1), (<Source: n41004003003>, 2), ...
21
+ [(<Source: n41004003001|Ἀκούετε>, 0), (<Source: n41004003002|ἰδοὺ>, 1), (<Source: n41004003003|ἐξῆλθεν>, 2), ...
22
22
  >>> pm.bcv["mappings"]["41004003"].target_pairs
23
- [(<Target: 410040030011>, 0), (<Target: 410040030021>, 1), (<Target: 410040030031>, 2), ...
23
+ [(<Target: 410040030011|Listen>, 0), (<Target: 410040030021|A>, 1), (<Target: 410040030031|sower>, 2), ...
24
24
 
25
25
 
26
26
  """
@@ -283,6 +283,14 @@ class AlignmentRecord:
283
283
  """True if any selectors in references are incomplete."""
284
284
  return any(ref.incomplete for ref in self.references.values())
285
285
 
286
def update_target_selectors(self, selectors: list[str]) -> None:
    """Install a new set of target selectors on this record.

    The selectors are stored in sorted order, matching the behaviour of
    AlignmentReference.__post_init__. A fresh list is stored, so the
    caller's list is not modified.
    """
    ordered = list(selectors)
    ordered.sort()
    target_reference = self.references["target"]
    target_reference.selectors = ordered
293
+
286
294
  def asdict(
287
295
  self, positional: bool = False, withmeta: bool = True, withmaculaprefix: bool = False
288
296
  ) -> dict[str, Any]:
@@ -25,7 +25,7 @@ class BaseToken:
25
25
 
26
26
  def __repr__(self) -> str:
27
27
  """Return a printed representation."""
28
- return f"<{self.__class__.__name__}: {self.id}>"
28
+ return f"<{self.__class__.__name__}: {self.tokenstr}>"
29
29
 
30
30
  #
31
31
  def __hash__(self) -> int:
@@ -156,6 +156,16 @@ class VerseData:
156
156
  for trg in targets:
157
157
  print(f"Target: {trg._display}")
158
158
 
159
def display_record(self, alrec: AlignmentRecord) -> None:
    """Print one alignment record as ``<meta id>: <sources> --- <targets>``.

    Every source selector of ``alrec`` is looked up in ``self.sourceitems``
    and every target selector in ``self.targetitems``; the ``tokenstr`` of
    each item found is joined with ", " to form the two sides of the line.
    """
    source_side = ", ".join(
        self.sourceitems[selector].tokenstr for selector in alrec.source_selectors
    )
    target_side = ", ".join(
        self.targetitems[selector].tokenstr for selector in alrec.target_selectors
    )
    record_id = alrec.meta.id
    print(f"{record_id}: {source_side} --- {target_side}")
168
+
159
169
  def unaligned(self, typeattr: str = "targets", keepexcluded: bool = False) -> None:
160
170
  """Display tokens from typeattr that are _not_ aligned."""
161
171
  assert typeattr in self._typeattrs, f"typeattr should be one of {self._typeattrs}"
@@ -172,13 +182,13 @@ class VerseData:
172
182
  if aligned:
173
183
  for sources, targets in self.alignments:
174
184
  print(
175
- f"{str([src.idtext for src in sources]):{srcwidth}}\t\t{[trg.idtext for trg in targets]}"
185
+ f"{str([src.tokenstr for src in sources]):{srcwidth}}\t\t{[trg.tokenstr for trg in targets]}"
176
186
  )
177
187
  else:
178
188
  # show all sources with their (possibly empty) target alignments
179
189
  for source in self.sources:
180
190
  print(
181
- f"{str(source.idtext):{srcwidth}}\t\t{[trg.idtext for trg in self.get_source_alignments(source)]}"
191
+ f"{str(source.tokenstr):{srcwidth}}\t\t{[trg.tokenstr for trg in self.get_source_alignments(source)]}"
182
192
  )
183
193
 
184
194
  def get_texts(
@@ -301,6 +301,7 @@ def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True)
301
301
  """Write JSON data for an arbitrary group in Scripture Burrito format.
302
302
 
303
303
  Writes some of the JSON by hand to get records on the same line.
304
+ Record meta.id values are assigned sequentially per BCV, e.g. "40001001.01".
304
305
  """
305
306
 
306
307
  def _write_documents(out: TextIO, documents: tuple[Document, Document]) -> None:
@@ -311,19 +312,32 @@ def write_alignment_group(group: AlignmentGroup, f: TextIO, hoist: bool = True)
311
312
  out.write(" ],\n")
312
313
 
313
314
def _write_meta(out: TextIO, meta: Metadata) -> None:
    """Write metadata to out.

    ``meta.asdict()`` is serialized with ``json.dumps`` and emitted as a
    single ``"meta": {...},`` row followed by a newline.
    """
    metarow = '"meta": ' + json.dumps(meta.asdict())
    # Write to the stream that was passed in. The previous code wrote to the
    # enclosing function's ``f`` and silently ignored ``out``; the existing
    # call passes ``f`` as ``out``, so its output is unchanged.
    out.write(f" {metarow},\n")
317
318
 
319
def _record_dict(arec: AlignmentRecord, bcv_counters: dict[str, int]) -> dict[str, Any]:
    """Serialize arec, replacing its meta id with a per-BCV sequence number.

    The opaque ClearAligner ID is swapped for ``<bcv>.<NN>``, where NN is
    a zero-padded running count of records seen so far for that BCV; this
    is an attempt to make written files more diff-able.

    ``bcv_counters`` is updated in place so the count carries over from
    one call to the next.
    """
    verse = arec.source_bcv
    sequence = bcv_counters.get(verse, 0) + 1
    bcv_counters[verse] = sequence
    serialized = arec.asdict()
    serialized["meta"]["id"] = f"{verse}.{sequence:02}"
    return serialized
330
+
318
331
  f.write("{\n")
319
332
  _write_documents(f, group.documents)
320
333
  _write_meta(f, group.meta)
321
334
  f.write(f' "roles": {json.dumps(group.roles)},\n')
322
335
  f.write(f' "type": "{group._type}",\n "records": [\n ')
323
336
  # should sort the records: NIV11 doesn't appear to be sorted
337
+ bcv_counters: dict[str, int] = {}
324
338
  for arec in group.records[:-1]:
325
- json.dump(arec.asdict(), f)
339
+ json.dump(_record_dict(arec, bcv_counters), f)
326
340
  f.write(",\n ")
327
341
  # now the last one without a comma, because JSON
328
- json.dump(group.records[-1].asdict(), f)
342
+ json.dump(_record_dict(group.records[-1], bcv_counters), f)
329
343
  f.write("\n ]}")
@@ -114,6 +114,10 @@ class Manager(UserDict):
114
114
  keepbadrecords=self.keepbadrecords,
115
115
  )
116
116
  self.alignmentsreader.clean_alignments(self.sourceitems, self.targetitems)
117
+ # TODO: upgrade the selectors to use tokenstr. This requires
118
+ # knowing the source and targetitems, but alignmentsreader
119
+ # doesn't have that data
120
+ # self.add_tokenstr_to_records(self)
117
121
  # group records by BCV
118
122
  self.bcv["records"] = groupby_bcv(
119
123
  list(self.alignmentsreader.alignmentgroup.records), lambda r: r.source_bcv
@@ -17,12 +17,11 @@ called from burrito.manager.Manager().
17
17
  5468
18
18
  # dict: token ID -> Source() instance
19
19
  >>> src["n41004003001"]
20
- src["n41004003001"]
21
- <Source: n41004003001>
20
+ <Source: n41004003001|Ἀκούετε>
22
21
  >>> src["n41004003001"].display()
23
22
  n41004003001: Ἀκούετε (Listen, ἀκούω, verb)
24
- >>> src["n41004003001"].idtext
25
- ('n41004003001', 'Ἀκούετε')
23
+ >>> src["n41004003001"].tokenstr
24
+ 'n41004003001|Ἀκούετε'
26
25
  >>> src["n41004003001"].asdict()
27
26
  {'identifier': 'n41004003001',
28
27
  'altId': 'Ἀκούετε-1',
@@ -1,4 +1,14 @@
1
- """Manage the target/translation data for Grape City (gc) alignment data.
1
+ """Manage the target/translation data for alignment data.
2
+
3
+ This typically reads the output of kathairo.
4
+
5
+ Limitations:
6
+
7
+ - Each token is assigned to the relevant source verse, which may be
8
+ different than the verse assignments in the target text. This is
9
+ version-specific, not necessarily a versification issue. So verse
10
+ identifiers may need mapping.
11
+ - Example: SBLGNT for 3JN has v. 15, but all these tokens are in v. 14 in the NIV11.
2
12
 
3
13
  >>> from biblealignlib.burrito import target
4
14
  # Reading is normally done by Manager
@@ -15,6 +25,7 @@
15
25
  # write the tokens out
16
26
  >>> LANGDATAPATH = CLEARROOT / "alignments-eng/data"
17
27
  >>> tr.write_tsv(tokenlist=tr.data.values(), outpath=(LANGDATAPATH / "targets/BSB/new-nt_BSB.tsv"))
28
+
18
29
  """
19
30
 
20
31
  from collections import UserDict, defaultdict
@@ -0,0 +1,168 @@
1
+ """Compare two alignment groups record by record.
2
+
3
+ Both groups must share the same sourceid, targetid, and
4
+ targetlanguage. This is most useful for checking minor changes to
5
+ ensure you haven't introduced errors.
6
+
7
+ Comparison ignores meta.id (which is assigned on write) but reports
8
+ differences in targets and all other meta fields (status, origin, creator, note).
9
+
10
+ >>> from biblealignlib.burrito import CLEARROOT, AlignmentSet
11
+ >>> from biblealignlib.util.DiffAlignments import DiffAlignments
12
+ >>> LANGDATAPATH = CLEARROOT / "alignments-eng/data"
13
+ >>> alset1 = AlignmentSet(sourceid="SBLGNT", targetid="BSB",
14
+ ... targetlanguage="eng", langdatapath=LANGDATAPATH,
15
+ ... alternateid="manual")
16
+ >>> alset2 = AlignmentSet(sourceid="SBLGNT", targetid="BSB",
17
+ ... targetlanguage="eng", langdatapath=LANGDATAPATH,
18
+ ... alternateid="updated")
19
+ >>> da = DiffAlignments(alset1, alset2)
20
+ >>> da.show()
21
+
22
+ """
23
+
24
+ from dataclasses import dataclass, field
25
+
26
+ from ..burrito.AlignmentGroup import AlignmentGroup, AlignmentRecord
27
+ from ..burrito.AlignmentSet import AlignmentSet
28
+ from ..burrito.alignments import AlignmentsReader
29
+
30
+ # Meta fields compared between records (id is intentionally excluded)
31
+ _COMPARED_META_FIELDS = ("creator", "note", "origin", "status")
32
+
33
+
34
def _record_key(rec: AlignmentRecord) -> tuple[str, ...]:
    """Build the key used to pair up records from two groups.

    The key is the record's source selectors in sorted order: records
    are paired on the source side, while their targets are free to differ.
    """
    selectors = list(rec.source_selectors)
    selectors.sort()
    return tuple(selectors)
41
+
42
+
43
def _meta_diffs(rec1: AlignmentRecord, rec2: AlignmentRecord) -> dict[str, tuple[str, str]]:
    """Collect the compared meta fields whose values differ between two records.

    Only the fields named in _COMPARED_META_FIELDS are examined (id is not
    among them). A field missing from a record's meta is treated as "".

    Returns a dict mapping each differing field name to a
    (value_in_rec1, value_in_rec2) pair, both converted with str().
    """
    candidates = (
        (name, getattr(rec1.meta, name, ""), getattr(rec2.meta, name, ""))
        for name in _COMPARED_META_FIELDS
    )
    return {
        name: (str(first), str(second))
        for name, first, second in candidates
        if first != second
    }
55
+
56
+
57
@dataclass
class RecordDiff:
    """Holds what differs between two alignment records paired on their sources."""

    # source selectors the two records were paired on
    source_selectors: tuple[str, ...]
    # target selectors recorded for the first and second record
    targets1: list[str] = field(default_factory=list)
    targets2: list[str] = field(default_factory=list)
    # meta field name -> (value in first record, value in second record)
    meta_diffs: dict[str, tuple[str, str]] = field(default_factory=dict)

    @property
    def targets_differ(self) -> bool:
        """Whether the two target selector lists are unequal."""
        return self.targets1 != self.targets2

    def __repr__(self) -> str:
        """Return a one-line summary listing only the parts that differ."""
        text = "<RecordDiff src=[" + ", ".join(self.source_selectors) + "]"
        if self.targets_differ:
            text += f" targets: {self.targets1} -> {self.targets2}"
        text += "".join(
            f" {name}: {old!r} -> {new!r}" for name, (old, new) in self.meta_diffs.items()
        )
        return text + ">"
82
+
83
+
84
class DiffAlignments:
    """Diff two alignment groups that share a source/target pair.

    Records from the two groups are paired by their source selectors.
    For each pair, differences in target selectors and in metadata
    (excluding id) are collected; unpaired records are listed separately.
    """

    def __init__(self, alset1: AlignmentSet, alset2: AlignmentSet) -> None:
        """Read both groups and compute every difference up front.

        Raises ValueError if the two sets disagree on sourceid, targetid,
        or targetlanguage.
        """
        for attr in ("sourceid", "targetid", "targetlanguage"):
            first_value = getattr(alset1, attr)
            second_value = getattr(alset2, attr)
            if first_value != second_value:
                raise ValueError(
                    f"AlignmentSets differ on {attr!r}: {first_value!r} vs {second_value!r}"
                )
        self.alset1 = alset1
        self.alset2 = alset2
        self.group1: AlignmentGroup = AlignmentsReader(alset1).alignmentgroup
        self.group2: AlignmentGroup = AlignmentsReader(alset2).alignmentgroup

        # Index each group's records by source-selector key. If several
        # records in one group share a key, the last one read is kept.
        self._recs1: dict[tuple[str, ...], AlignmentRecord] = {}
        for record in self.group1.records:
            self._recs1[_record_key(record)] = record
        self._recs2: dict[tuple[str, ...], AlignmentRecord] = {}
        for record in self.group2.records:
            self._recs2[_record_key(record)] = record

        keys1 = set(self._recs1)
        keys2 = set(self._recs2)

        # records whose key appears in only one of the groups
        self.only_in_1: list[AlignmentRecord] = [
            self._recs1[key] for key in sorted(keys1.difference(keys2))
        ]
        self.only_in_2: list[AlignmentRecord] = [
            self._recs2[key] for key in sorted(keys2.difference(keys1))
        ]

        # records whose key appears in both: compare targets and meta
        self.record_diffs: list[RecordDiff] = []
        for key in sorted(keys1.intersection(keys2)):
            left = self._recs1[key]
            right = self._recs2[key]
            left_targets = sorted(left.target_selectors)
            right_targets = sorted(right.target_selectors)
            meta_changes = _meta_diffs(left, right)
            if left_targets == right_targets and not meta_changes:
                continue
            self.record_diffs.append(
                RecordDiff(
                    source_selectors=key,
                    targets1=left_targets,
                    targets2=right_targets,
                    meta_diffs=meta_changes,
                )
            )

    @property
    def has_diffs(self) -> bool:
        """Whether anything at all differs between the two groups."""
        return any((self.only_in_1, self.only_in_2, self.record_diffs))

    def show(self) -> None:
        """Print a readable report of every difference found."""
        label1 = self.alset1.identifier
        label2 = self.alset2.identifier
        count1 = len(self.group1.records)
        count2 = len(self.group2.records)
        print(f"Comparing {label1!r} vs {label2!r}")
        print(f" {count1} records in {label1}, {count2} records in {label2}")
        if not self.has_diffs:
            print(" No differences found.")
            return

        # unpaired records: "-" marks the first group, "+" the second
        for label, marker, records in (
            (label1, "-", self.only_in_1),
            (label2, "+", self.only_in_2),
        ):
            if not records:
                continue
            print(f"\n Records only in {label} ({len(records)}):")
            for rec in records:
                sources = ", ".join(rec.source_selectors)
                print(f" {marker} src=[{sources}] tgt={rec.target_selectors}")

        if not self.record_diffs:
            return
        print(f"\n Records with differences ({len(self.record_diffs)}):")
        for diff in self.record_diffs:
            sources = ", ".join(diff.source_selectors)
            print(f" src=[{sources}]")
            if diff.targets_differ:
                print(f" targets: {diff.targets1}")
                print(f" -> {diff.targets2}")
            for name, (old, new) in diff.meta_diffs.items():
                print(f" {name}: {old!r} -> {new!r}")