biblealignlib 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. biblealignlib-0.1.4/LICENSE +21 -0
  2. biblealignlib-0.1.4/LICENSE.md +92 -0
  3. biblealignlib-0.1.4/PKG-INFO +59 -0
  4. biblealignlib-0.1.4/README.md +28 -0
  5. biblealignlib-0.1.4/biblealignlib/__init__.py +110 -0
  6. biblealignlib-0.1.4/biblealignlib/autoalign/Score.py +144 -0
  7. biblealignlib-0.1.4/biblealignlib/autoalign/__init__.py +23 -0
  8. biblealignlib-0.1.4/biblealignlib/autoalign/corpusmapping.py +73 -0
  9. biblealignlib-0.1.4/biblealignlib/autoalign/eflomal.py +134 -0
  10. biblealignlib-0.1.4/biblealignlib/autoalign/mapper.py +150 -0
  11. biblealignlib-0.1.4/biblealignlib/autoalign/reader.py +212 -0
  12. biblealignlib-0.1.4/biblealignlib/autoalign/runeflomal.py +32 -0
  13. biblealignlib-0.1.4/biblealignlib/autoalign/scorer.py +311 -0
  14. biblealignlib-0.1.4/biblealignlib/autoalign/writer.py +180 -0
  15. biblealignlib-0.1.4/biblealignlib/burrito/AlignmentGroup.py +420 -0
  16. biblealignlib-0.1.4/biblealignlib/burrito/AlignmentSet.py +164 -0
  17. biblealignlib-0.1.4/biblealignlib/burrito/AlignmentType.py +67 -0
  18. biblealignlib-0.1.4/biblealignlib/burrito/BadRecord.py +70 -0
  19. biblealignlib-0.1.4/biblealignlib/burrito/BaseToken.py +85 -0
  20. biblealignlib-0.1.4/biblealignlib/burrito/VerseData.py +242 -0
  21. biblealignlib-0.1.4/biblealignlib/burrito/__init__.py +65 -0
  22. biblealignlib-0.1.4/biblealignlib/burrito/alignments.py +327 -0
  23. biblealignlib-0.1.4/biblealignlib/burrito/manager.py +183 -0
  24. biblealignlib-0.1.4/biblealignlib/burrito/source.py +476 -0
  25. biblealignlib-0.1.4/biblealignlib/burrito/target.py +329 -0
  26. biblealignlib-0.1.4/biblealignlib/burrito/util.py +91 -0
  27. biblealignlib-0.1.4/biblealignlib/strongs.py +67 -0
  28. biblealignlib-0.1.4/pyproject.toml +78 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Clear.Bible
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,92 @@
1
+ # Bible Word Alignments
2
+
3
+ We license all of our own code under an MIT license and all of our own
4
+ data under CC-BY. For details, see sections below for [Code](#code) and [Data](#data).
5
+
6
+ ## Code
7
+
8
+ Code for this project (`../bible_alignments`) is copyright (c) 2023 by
9
+ [Clear Bible, Inc](http://www.clear.bible) and is licensed under the
10
+ terms of the MIT License.
11
+
12
+ ### MIT License
13
+
14
+
15
+ Permission is hereby granted, free of charge, to any person obtaining
16
+ a copy of this software and associated documentation files (the
17
+ “Software”), to deal in the Software without restriction, including
18
+ without limitation the rights to use, copy, modify, merge, publish,
19
+ distribute, sublicense, and/or sell copies of the Software, and to
20
+ permit persons to whom the Software is furnished to do so, subject to
21
+ the following conditions:
22
+
23
+ The above copyright notice and this permission notice shall be
24
+ included in all copies or substantial portions of the Software.
25
+
26
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
27
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
29
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
30
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
31
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
32
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
33
+
34
+ ## Data
35
+
36
+ [Bible Word Alignments](https://github.com/Clear-Bible/Alignments) © 2022 by [Clear Bible, Inc](http://www.clear.bible) is licensed under [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
37
+
38
+ These datasets include:
39
+
40
+ 1. Alignment files (`../data/alignments`) derived from Clear Bible's data.
41
+ 2. Source text files (`../data/sources`) derived from Clear Bible's
42
+ data. Note that any copyright-protected text has been stripped out.
43
+ 3. Target text files (`../data/targets`) derived from Clear Bible's
44
+ data. Note that any copyright-protected text has been stripped out.
45
+ 4. Names files (`../data/targets`) derived from alignment data.
46
+
47
+ Source text files include data from:
48
+
49
+ * Westminster Leningrad Codex - the somewhat informal license states
50
+ that "All biblical Hebrew text, in any format, may be viewed or
51
+ copied without restriction."
52
+
53
+ Target text files include data from:
54
+
55
+ * The text for the Chinese Union Version (Simplified) is in the public
56
+ domain. The Chinese Union Version with Modern Punctuation
57
+ (Simplified) (`CUVMP`) is a derivative work, and is Copyright © 2011
58
+ Global Bible Initiative / © 2011 全球圣经促进会 and in the public
59
+ domain.
60
+ * Young's Literal Translation (`YLT`), by Robert Young (1862, 1887, 1898),
61
+ which is in the public domain.
62
+
63
+ The repository also includes data on strategic languages for Bible
64
+ translation (`../data/languages`) from the [ETEN Innovation
65
+ Lab](https://dev.lab.eten.bible/).
66
+
67
+ ### License
68
+
69
+ #### Creative Commons Attribution 4.0 International (CC BY 4.0)
70
+
71
+ This is a human-readable summary of (and not a substitute for) the [license](http://creativecommons.org/licenses/by/4.0/).
72
+
73
+ ##### You are free to:
74
+
75
+ * **Share** — copy and redistribute the material in any medium or format
76
+ * **Adapt** — remix, transform, and build upon the material
77
+ for any purpose, even commercially.
78
+
79
+ The licensor cannot revoke these freedoms as long as you follow the license terms.
80
+
81
+ ##### Under the following terms:
82
+
83
+ * **Attribution** — You must attribute the work as follows: "MACULA Greek Linguistic Datasets, available at https://github.com/Clear-Bible/macula-greek/". You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
84
+
85
+ **No additional restrictions** — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
86
+
87
+ ##### Notices:
88
+
89
+ You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation.
90
+
91
+ No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material.
92
+
@@ -0,0 +1,59 @@
1
+ Metadata-Version: 2.1
2
+ Name: biblealignlib
3
+ Version: 0.1.4
4
+ Summary: Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments.
5
+ Home-page: https://github.com/Clear-Bible/biblealignlib
6
+ License: MIT
7
+ Keywords: Bible,alignment,Bible alignment
8
+ Author: Sean Boisen
9
+ Author-email: sean.boisen@biblica.com
10
+ Requires-Python: >=3.10,<3.12
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Religion
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Topic :: Religion
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Dist: altair (>=5.5.0,<6.0.0)
21
+ Requires-Dist: biblelib (>=0.3.17,<0.4.0)
22
+ Requires-Dist: jupyter-server-ydoc (>=1.1.0,<2.0.0)
23
+ Requires-Dist: jupyterlab (>=4.3.3,<5.0.0)
24
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
25
+ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
26
+ Requires-Dist: regex (>=2024.11.6,<2025.0.0)
27
+ Requires-Dist: unicodecsv (>=0.14.1,<0.15.0)
28
+ Project-URL: Repository, https://github.com/Clear-Bible/biblealignlib
29
+ Description-Content-Type: text/markdown
30
+
31
+ # biblealignlib
32
+
33
+ Biblica's code for working with Bible alignment data from
34
+ https://github.com/Clear-Bible/Alignments .
35
+
36
+ Currently private but we should move toward a future where it's
37
+ public.
38
+
39
+ ## Installing extra dependencies
40
+
41
+ ### eflomal
42
+ `eflomal` is specified as an extra, so it is not installed with `poetry install`.
43
+
44
+ On macOS, you may need to install additional dependencies before installing:
45
+
46
+ ```
47
+ brew install llvm libomp
48
+ ```
49
+
50
+ You'll need to override the `CFLAGS` and `LDFLAGS` environment variables before installing `eflomal`.
51
+
52
+ ```
53
+ poetry shell
54
+ export CFLAGS="-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include -Ofast -march=native -Wall --std=gnu99 -Wno-unused-function -g"
55
+ export LDFLAGS="-Xpreprocessor -fopenmp -L${HOMEBREW_PREFIX}/opt/libomp/lib -lm -lomp"
56
+ poetry install -E eflomal
57
+ ```
58
+
59
+
@@ -0,0 +1,28 @@
1
+ # biblealignlib
2
+
3
+ Biblica's code for working with Bible alignment data from
4
+ https://github.com/Clear-Bible/Alignments .
5
+
6
+ Currently private but we should move toward a future where it's
7
+ public.
8
+
9
+ ## Installing extra dependencies
10
+
11
+ ### eflomal
12
+ `eflomal` is specified as an extra, so it is not installed with `poetry install`.
13
+
14
+ On macOS, you may need to install additional dependencies before installing:
15
+
16
+ ```
17
+ brew install llvm libomp
18
+ ```
19
+
20
+ You'll need to override the `CFLAGS` and `LDFLAGS` environment variables before installing `eflomal`.
21
+
22
+ ```
23
+ poetry shell
24
+ export CFLAGS="-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include -Ofast -march=native -Wall --std=gnu99 -Wno-unused-function -g"
25
+ export LDFLAGS="-Xpreprocessor -fopenmp -L${HOMEBREW_PREFIX}/opt/libomp/lib -lm -lomp"
26
+ poetry install -E eflomal
27
+ ```
28
+
@@ -0,0 +1,110 @@
1
+ """Internal-only code for working with alignment data."""
2
+
3
+ from enum import Enum
4
+ import os
5
+ from pathlib import Path
6
+ import re
7
+
8
+ import dotenv
9
+
10
+ from .strongs import normalize_strongs
11
+
12
+ # it would be nice to import symbols from burrito and autoalign here:
13
+ # but i don't know how to avoid circular imports, when that code also
14
+ # imports from biblealignlib
15
+
16
+ # set path variables. These assume you have a .env file that locates
17
+ # the directory where Clear-Bible repositories are located, like
18
+ #
19
+ # CLEARROOT=/Users/sboisen/git/Clear-Bible
20
+ #
21
+ # use the CLEARROOT environment variable if it is set, else a default
22
+ if not dotenv.load_dotenv():
23
+ print("No .env file found")
24
+ clearrootenvar = os.getenv("CLEARROOT")
25
+ if clearrootenvar:
26
+ CLEARROOT = Path(clearrootenvar)
27
+ else:
28
+ CLEARROOT = Path.home() / "git/Clear-Bible"
29
+ print(f"No environment variable for CLEARROOT: assuming {CLEARROOT}")
30
+
31
+ # for loading published data. Alignments are here under language
32
+ ALIGNMENTSDATA = CLEARROOT / "Alignments/data"
33
+ # for loading published source TSVs
34
+ SOURCES = ALIGNMENTSDATA / "sources"
35
+
36
+ CANONIDS = {
37
+ "nt",
38
+ "ot",
39
+ # meaning the entire 66 book corpus
40
+ "protestant",
41
+ }
42
+
43
+
44
+ VERSIFICATIONIDS: set[str] = {
45
+ "eng",
46
+ "org",
47
+ "rso",
48
+ # not yet implemented
49
+ # "ethiopian_custom", "lxx", "rsc", "vul"
50
+ }
51
+
52
+
53
class SourceidEnum(str, Enum):
    """Enumerate the source-text identifiers this package recognizes.

    Members are also plain strings, so they compare equal to their
    identifier (e.g. ``SourceidEnum.WLC == "WLC"``).
    """

    BGNT = "BGNT"
    NA27 = "NA27"
    NA28 = "NA28"
    SBLGNT = "SBLGNT"
    WLC = "WLC"
    WLCM = "WLCM"

    @property
    def canon(self) -> str:
        """Return the canon for this source: 'ot' or 'nt'.

        Raises:
            ValueError: if the member belongs to neither list below
                (only possible if a member is added without updating
                this property).
        """
        source = self.value
        if source in ("WLC", "WLCM"):
            return "ot"
        if source in ("BGNT", "NA27", "NA28", "SBLGNT"):
            return "nt"
        raise ValueError(f"Unknown error in SourceidEnum.canon for {self.value}")

    # need to add DC, probably others down the road
    @staticmethod
    def get_canon(sourceid: str) -> str:
        """Return the canon for a source identifier string.

        Unlike the ``canon`` property this never raises ValueError:
        an unrecognized identifier yields the placeholder 'X'.
        """
        try:
            return SourceidEnum(sourceid).canon
        except ValueError:
            # not a known source identifier
            return "X"
83
+
84
+
85
def get_canonid(bcv: str) -> str:
    """Return 'ot' or 'nt' for a BCV-style identifier string.

    Only the leading two characters (the book number) are examined, so
    book, chapter, verse and full word-level identifiers all work.
    Prefixes 00-39 map to 'ot' and 40-66 map to 'nt'.

    Raises:
        ValueError: if the identifier does not start with a two-digit
            book number in one of those ranges.
    """
    if re.match(r"^[0-3][0-9]", bcv):
        return "ot"
    starts_four_to_six = re.match(r"^[4-6][0-9]", bcv)
    # 67-69 start with a valid first digit but are past the last NT book
    past_last_book = re.match(r"^6[7-9]", bcv)
    if starts_four_to_six and not past_last_book:
        return "nt"
    raise ValueError(f"Invalid BCVish id value: {bcv}")
102
+
103
+
104
+ __all__ = [
105
+ "CLEARROOT",
106
+ "SOURCES",
107
+ "SourceidEnum",
108
+ # strongs
109
+ "normalize_strongs",
110
+ ]
@@ -0,0 +1,144 @@
1
+ """Manage scores for alignment data."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Any, Optional
5
+
6
+ from biblelib.word import BCVID
7
+
8
+ from biblealignlib.burrito import Source, Target, VerseData
9
+
10
+
11
def precision(true_positives: int, false_positives: int) -> float:
    """Return true_positives / (true_positives + false_positives).

    Returns 0.0 (a float, matching the annotated return type) when
    both counts are zero, instead of dividing by zero.
    """
    denom = true_positives + false_positives
    return true_positives / denom if denom else 0.0
14
+
15
+
16
def recall(true_positives: int, false_negatives: int) -> float:
    """Return true_positives / (true_positives + false_negatives).

    Returns 0.0 (a float, matching the annotated return type) when
    both counts are zero, instead of dividing by zero.
    """
    denom = true_positives + false_negatives
    return true_positives / denom if denom else 0.0
19
+
20
+
21
def f1(recall: float, precision: float) -> float:
    """Return the harmonic mean of precision and recall.

    Note the parameter order: recall first, then precision. Returns
    0.0 (a float, matching the annotated return type) when both
    inputs are zero, instead of dividing by zero.
    """
    denom = precision + recall
    return ((2 * precision * recall) / denom) if denom else 0.0
24
+
25
+
26
+ @dataclass
27
+ class _BaseScore:
28
+ """Manage base scoring metrics."""
29
+
30
+ identifier: str = ""
31
+ true_positives: int = 0
32
+ # true_negatives: int = 0
33
+ false_positives: int = 0
34
+ false_negatives: int = 0
35
+ precision: float = 0.0
36
+ recall: float = 0.0
37
+ f1: float = 0.0
38
+ aer: float = 0.0
39
+
40
+ def __repr__(self) -> str:
41
+ """Return a string representation of the Score."""
42
+ return f"<{self.__class__.__name__}: {self.identifier}>"
43
+
44
+ def compute_metrics(self) -> None:
45
+ """Compute various metrics."""
46
+ self.precision = precision(self.true_positives, self.false_positives)
47
+ self.aer = 1 - self.precision
48
+ self.recall = recall(self.true_positives, self.false_negatives)
49
+ self.f1 = f1(self.recall, self.precision)
50
+
51
+ # should use summary_dict here
52
+ def summary(self, width: int = 4, brief: bool = True) -> str:
53
+ """Return summary metrics."""
54
+ plabel = "P" if brief else "Precision"
55
+ rlabel = "R" if brief else "Recall"
56
+ return f"{self.identifier}: AER={self.aer:.{width}f}\t{plabel}={self.precision:.{width}f}\t{rlabel}={self.recall:.{width}f}\tF1={self.f1:.{width}f}"
57
+
58
+ def summary_dict(self, width: int = 4) -> dict[str, str]:
59
+ """Return a dict with summary scores."""
60
+ return {
61
+ "AER": f"{self.aer:.{width}f}",
62
+ "F1": f"{self.f1:.{width}f}",
63
+ "Precision": f"{self.precision:.{width}f}",
64
+ "Recall": f"{self.recall:.{width}f}",
65
+ }
66
+
67
+ def asdict(self, ndigits=3) -> dict[str, Any]:
68
+ """Return a dict usable as a dataframe row."""
69
+ scoredict = {
70
+ # this _should_ always be a BCV
71
+ "Identifier": self.identifier,
72
+ # just the verse index
73
+ "Verse": self.identifier[5:],
74
+ "Chapter": self.identifier[:5],
75
+ "Book": self.identifier[:2],
76
+ "Reference": BCVID(self.identifier).to_usfm(),
77
+ "AER": round(self.aer, ndigits),
78
+ "F1": round(self.f1, ndigits),
79
+ "Precision": round(self.precision, ndigits),
80
+ "Recall": round(self.recall, ndigits),
81
+ }
82
+ return scoredict
83
+
84
+
85
@dataclass(repr=False)
class VerseScore(_BaseScore):
    """Score one verse by comparing hypothesis pairs to reference pairs.

    All counts and metrics are computed in __post_init__ from the
    reference and hypothesis data supplied at construction.
    """

    # not really optional, but dataclass inheritance requires defaults
    reference: Optional[VerseData] = None
    hypothesis: Optional[VerseData] = None
    # derived in __post_init__
    n_sources: int = 0
    n_targets: int = 0
    referencepairs: list[tuple[Source, Target]] = field(init=False, default_factory=list)
    hypothesispairs: list[tuple[Source, Target]] = field(init=False, default_factory=list)

    def __post_init__(self) -> None:
        """Fill in identifier, sizes, pair lists, counts and metrics."""
        ref = self.reference
        self.identifier = ref.bcvid
        self.n_sources = len(ref.sources)
        self.n_targets = len(ref.targets)
        # populate referencepairs / hypothesispairs
        self._get_pairs()
        # exact set comparison of pairs: no partial credit
        expected = set(self.referencepairs)
        predicted = set(self.hypothesispairs)
        self.true_positives = len(expected & predicted)
        self.false_positives = len(predicted - expected)
        self.false_negatives = len(expected - predicted)
        # sets precision, recall, f1 and aer
        self.compute_metrics()

    def _get_pairs(self) -> None:
        """Load the pair lists from the reference and hypothesis data."""
        # presumably pharaoh-like: a token recurs once per alignment it is in
        ref_pairs = self.reference.get_pairs()
        self.referencepairs = ref_pairs
        hyp_pairs = self.hypothesis.get_pairs()
        self.hypothesispairs = hyp_pairs
117
+
118
+
119
@dataclass(repr=False)
class EssentialVerseScore(VerseScore):
    """Variant of VerseScore that requests only essential pairs."""

    def _get_pairs(self) -> None:
        """Load the pair lists, passing essential=True to get_pairs()."""
        # presumably pharaoh-like: a token recurs once per alignment it is in
        ref_pairs = self.reference.get_pairs(essential=True)
        self.referencepairs = ref_pairs
        hyp_pairs = self.hypothesis.get_pairs(essential=True)
        self.hypothesispairs = hyp_pairs
128
+
129
+
130
@dataclass(repr=False)
class GroupScore(_BaseScore):
    """Aggregate the counts of several verse scores into one score."""

    verse_scores: list[VerseScore] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Sum the per-verse counts, then compute the group metrics.

        Both identifier and verse_scores must be non-empty; this is
        enforced with assertions.
        """
        assert self.identifier, "Must provide identifier"
        assert self.verse_scores, "Must provide verse_scores."

        hits = spurious = missed = 0
        for verse in self.verse_scores:
            hits += verse.true_positives
            spurious += verse.false_positives
            missed += verse.false_negatives
        self.true_positives = hits
        self.false_positives = spurious
        self.false_negatives = missed
        self.compute_metrics()
@@ -0,0 +1,23 @@
1
+ from biblealignlib import CLEARROOT, SOURCES
2
+
3
+ # when it gets fixed
4
+ # from .eflomal import Eflomal
5
+ from .mapper import PharaohMapper
6
+ from .reader import PharaohReader
7
+ from .scorer import Scorer
8
+ from .writer import PharaohWriter
9
+
10
+ __all__ = [
11
+ "CLEARROOT",
12
+ "SOURCES",
13
+ # # eflomal
14
+ # "Eflomal",
15
+ # mapper
16
+ "PharaohMapper",
17
+ # reader
18
+ "PharaohReader",
19
+ # scorer
20
+ "Scorer",
21
+ # writer
22
+ "PharaohWriter",
23
+ ]
@@ -0,0 +1,73 @@
1
+ """Manages corpus data for auto alignment."""
2
+
3
+ from dataclasses import dataclass, field
4
+
5
+ from biblealignlib.burrito import (
6
+ BaseToken,
7
+ Source,
8
+ Target,
9
+ )
10
+
11
+
12
@dataclass
class CorpusMapping:
    """Map corpus instances to pharaoh-data for a single verse correspondence.

    bcv is based on source versification: targets with different
    versifications should be mapped to comparable source verses in the
    TSV.

    Example: Target:01031005007 corresponds to Source:01032001

    """

    # BCV-format verse reference
    bcv: str
    # Source instances and their pharaoh indices
    source_pairs: list[tuple[Source, int]] = field(default_factory=list)
    # Target instances and their pharaoh indices
    target_pairs: list[tuple[Target, int]] = field(default_factory=list)
    # the only values accepted for the typeattr parameter below
    _typeattrs: tuple = ("sources", "targets")
    # the four maps below are recomputed in __post_init__, so any
    # values passed to the constructor are overwritten
    # dict: index -> Token
    sourceindexmap: dict[int, BaseToken] = field(default_factory=dict)
    targetindexmap: dict[int, BaseToken] = field(default_factory=dict)
    # dict: Token -> index (tokens are dict keys, so they must be hashable)
    sourcetokenmap: dict[BaseToken, int] = field(default_factory=dict)
    targettokenmap: dict[BaseToken, int] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Build the index->token and token->index lookup maps."""
        self.sourceindexmap = {index: item for item, index in self.source_pairs}
        self.targetindexmap = {index: item for item, index in self.target_pairs}
        self.sourcetokenmap = dict(self.source_pairs)
        self.targettokenmap = dict(self.target_pairs)

    def __repr__(self) -> str:
        """Return a string representation of the CorpusMapping."""
        return f"<CorpusMapping: {self.bcv}>"

    def _select_pairs(self, typeattr: str) -> list[tuple[BaseToken, int]]:
        """Return source_pairs or target_pairs, validating typeattr first."""
        assert typeattr in self._typeattrs, f"typeattr should be one of {self._typeattrs}"
        return self.source_pairs if typeattr == "sources" else self.target_pairs

    def tokenids(self, typeattr: str) -> list[str]:
        """Return the list of corpus token ids for typeattr."""
        return [corpus.id for corpus, _ in self._select_pairs(typeattr)]

    def indices(self, typeattr: str) -> list[int]:
        """Return the list of pharaoh indices for typeattr."""
        return [index for _, index in self._select_pairs(typeattr)]

    def tokentexts(self, typeattr: str) -> list[str]:
        """Return the list of corpus token texts for typeattr."""
        return [corpus.text for corpus, _ in self._select_pairs(typeattr)]

    def display(self, typeattr: str) -> None:
        """Print out the index, id and text of each token, for debugging."""
        for token, index in self._select_pairs(typeattr):
            print(f"{index}: {token.id}, {token.text}")