biblealignlib 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblealignlib-0.1.4/LICENSE +21 -0
- biblealignlib-0.1.4/LICENSE.md +92 -0
- biblealignlib-0.1.4/PKG-INFO +59 -0
- biblealignlib-0.1.4/README.md +28 -0
- biblealignlib-0.1.4/biblealignlib/__init__.py +110 -0
- biblealignlib-0.1.4/biblealignlib/autoalign/Score.py +144 -0
- biblealignlib-0.1.4/biblealignlib/autoalign/__init__.py +23 -0
- biblealignlib-0.1.4/biblealignlib/autoalign/corpusmapping.py +73 -0
- biblealignlib-0.1.4/biblealignlib/autoalign/eflomal.py +134 -0
- biblealignlib-0.1.4/biblealignlib/autoalign/mapper.py +150 -0
- biblealignlib-0.1.4/biblealignlib/autoalign/reader.py +212 -0
- biblealignlib-0.1.4/biblealignlib/autoalign/runeflomal.py +32 -0
- biblealignlib-0.1.4/biblealignlib/autoalign/scorer.py +311 -0
- biblealignlib-0.1.4/biblealignlib/autoalign/writer.py +180 -0
- biblealignlib-0.1.4/biblealignlib/burrito/AlignmentGroup.py +420 -0
- biblealignlib-0.1.4/biblealignlib/burrito/AlignmentSet.py +164 -0
- biblealignlib-0.1.4/biblealignlib/burrito/AlignmentType.py +67 -0
- biblealignlib-0.1.4/biblealignlib/burrito/BadRecord.py +70 -0
- biblealignlib-0.1.4/biblealignlib/burrito/BaseToken.py +85 -0
- biblealignlib-0.1.4/biblealignlib/burrito/VerseData.py +242 -0
- biblealignlib-0.1.4/biblealignlib/burrito/__init__.py +65 -0
- biblealignlib-0.1.4/biblealignlib/burrito/alignments.py +327 -0
- biblealignlib-0.1.4/biblealignlib/burrito/manager.py +183 -0
- biblealignlib-0.1.4/biblealignlib/burrito/source.py +476 -0
- biblealignlib-0.1.4/biblealignlib/burrito/target.py +329 -0
- biblealignlib-0.1.4/biblealignlib/burrito/util.py +91 -0
- biblealignlib-0.1.4/biblealignlib/strongs.py +67 -0
- biblealignlib-0.1.4/pyproject.toml +78 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Clear.Bible
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Bible Word Alignments
|
|
2
|
+
|
|
3
|
+
We license all of our own code under an MIT license and all of our own
|
|
4
|
+
data under CC-BY. For details, see sections below for [Code](#code) and [Data](#data).
|
|
5
|
+
|
|
6
|
+
## Code
|
|
7
|
+
|
|
8
|
+
Code for this project (`../bible_alignments`) is copyright (c) 2023 by
|
|
9
|
+
[Clear Bible, Inc](http://www.clear.bible) and is licensed under the
|
|
10
|
+
terms of the MIT License.
|
|
11
|
+
|
|
12
|
+
### MIT License
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
16
|
+
a copy of this software and associated documentation files (the
|
|
17
|
+
“Software”), to deal in the Software without restriction, including
|
|
18
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
19
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
20
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
21
|
+
the following conditions:
|
|
22
|
+
|
|
23
|
+
The above copyright notice and this permission notice shall be
|
|
24
|
+
included in all copies or substantial portions of the Software.
|
|
25
|
+
|
|
26
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
|
|
27
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
28
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
29
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
30
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
31
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
32
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
33
|
+
|
|
34
|
+
## Data
|
|
35
|
+
|
|
36
|
+
[Bible Word Alignments](https://github.com/Clear-Bible/Alignments) © 2022 by [Clear Bible, Inc](http://www.clear.bible) is licensed under [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
|
|
37
|
+
|
|
38
|
+
These datasets include:
|
|
39
|
+
|
|
40
|
+
1. Alignment files (`../data/alignments`) derived from Clear Bible's data.
|
|
41
|
+
2. Source text files (`../data/sources`) derived from Clear Bible's
|
|
42
|
+
data. Note that any copyright-protected text has been stripped out.
|
|
43
|
+
3. Target text files (`../data/targets`) derived from Clear Bible's
|
|
44
|
+
data. Note that any copyright-protected text has been stripped out.
|
|
45
|
+
4. Names files (`../data/targets`) derived from alignment data.
|
|
46
|
+
|
|
47
|
+
Source text files include data from:
|
|
48
|
+
|
|
49
|
+
* Westminster Leningrad Codex - the somewhat informal license states
|
|
50
|
+
that "All biblical Hebrew text, in any format, may be viewed or
|
|
51
|
+
copied without restriction."
|
|
52
|
+
|
|
53
|
+
Target text files include data from:
|
|
54
|
+
|
|
55
|
+
* The text for the Chinese Union Version (Simplified) is in the public
|
|
56
|
+
domain. The Chinese Union Version with Modern Punctuation
|
|
57
|
+
(Simplified) (`CUVMP`) is a derivative work, and is Copyright © 2011
|
|
58
|
+
Global Bible Initiative / © 2011 全球圣经促进会 and in the public
|
|
59
|
+
domain.
|
|
60
|
+
* Young's Literal Translation (`YLT`), by Robert Young (1862, 1887, 1898),
|
|
61
|
+
which is in the public domain.
|
|
62
|
+
|
|
63
|
+
The repository also includes data on strategic languages for Bible
|
|
64
|
+
translation (`../data/languages`) from the [ETEN Innovation
|
|
65
|
+
Lab](https://dev.lab.eten.bible/).
|
|
66
|
+
|
|
67
|
+
### License
|
|
68
|
+
|
|
69
|
+
#### Creative Commons Attribution 4.0 International (CC BY 4.0)
|
|
70
|
+
|
|
71
|
+
This is a human-readable summary of (and not a substitute for) the [license](http://creativecommons.org/licenses/by/4.0/).
|
|
72
|
+
|
|
73
|
+
##### You are free to:
|
|
74
|
+
|
|
75
|
+
* **Share** — copy and redistribute the material in any medium or format
|
|
76
|
+
* **Adapt** — remix, transform, and build upon the material
|
|
77
|
+
for any purpose, even commercially.
|
|
78
|
+
|
|
79
|
+
The licensor cannot revoke these freedoms as long as you follow the license terms.
|
|
80
|
+
|
|
81
|
+
##### Under the following terms:
|
|
82
|
+
|
|
83
|
+
* **Attribution** — You must attribute the work as follows: "MACULA Greek Linguistic Datasets, available at https://github.com/Clear-Bible/macula-greek/". You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
|
|
84
|
+
|
|
85
|
+
* **No additional restrictions** — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
|
|
86
|
+
|
|
87
|
+
##### Notices:
|
|
88
|
+
|
|
89
|
+
You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation.
|
|
90
|
+
|
|
91
|
+
No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material.
|
|
92
|
+
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: biblealignlib
|
|
3
|
+
Version: 0.1.4
|
|
4
|
+
Summary: Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments.
|
|
5
|
+
Home-page: https://github.com/Clear-Bible/biblealignlib
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: Bible,alignment,Bible alignment
|
|
8
|
+
Author: Sean Boisen
|
|
9
|
+
Author-email: sean.boisen@biblica.com
|
|
10
|
+
Requires-Python: >=3.10,<3.12
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Religion
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Topic :: Religion
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Dist: altair (>=5.5.0,<6.0.0)
|
|
21
|
+
Requires-Dist: biblelib (>=0.3.17,<0.4.0)
|
|
22
|
+
Requires-Dist: jupyter-server-ydoc (>=1.1.0,<2.0.0)
|
|
23
|
+
Requires-Dist: jupyterlab (>=4.3.3,<5.0.0)
|
|
24
|
+
Requires-Dist: pandas (>=2.2.3,<3.0.0)
|
|
25
|
+
Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
|
|
26
|
+
Requires-Dist: regex (>=2024.11.6,<2025.0.0)
|
|
27
|
+
Requires-Dist: unicodecsv (>=0.14.1,<0.15.0)
|
|
28
|
+
Project-URL: Repository, https://github.com/Clear-Bible/biblealignlib
|
|
29
|
+
Description-Content-Type: text/markdown
|
|
30
|
+
|
|
31
|
+
# biblealignlib
|
|
32
|
+
|
|
33
|
+
Biblica's code for working with Bible alignment data from
|
|
34
|
+
https://github.com/Clear-Bible/Alignments .
|
|
35
|
+
|
|
36
|
+
Currently private but we should move toward a future where it's
|
|
37
|
+
public.
|
|
38
|
+
|
|
39
|
+
## Installing extra dependencies
|
|
40
|
+
|
|
41
|
+
### eflomal
|
|
42
|
+
`eflomal` is specified as an extra, so it is not installed with `poetry install`.
|
|
43
|
+
|
|
44
|
+
On macOS, you may need to install additional dependencies before installing:
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
brew install llvm libomp
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
You'll need to override the `CFLAGS` and `LDFLAGS` environment variables before installing `eflomal`.
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
poetry shell
|
|
54
|
+
export CFLAGS="-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include -Ofast -march=native -Wall --std=gnu99 -Wno-unused-function -g"
|
|
55
|
+
export LDFLAGS="-Xpreprocessor -fopenmp -L${HOMEBREW_PREFIX}/opt/libomp/lib -lm -lomp"
|
|
56
|
+
poetry install -E eflomal
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# biblealignlib
|
|
2
|
+
|
|
3
|
+
Biblica's code for working with Bible alignment data from
|
|
4
|
+
https://github.com/Clear-Bible/Alignments .
|
|
5
|
+
|
|
6
|
+
Currently private but we should move toward a future where it's
|
|
7
|
+
public.
|
|
8
|
+
|
|
9
|
+
## Installing extra dependencies
|
|
10
|
+
|
|
11
|
+
### eflomal
|
|
12
|
+
`eflomal` is specified as an extra, so it is not installed with `poetry install`.
|
|
13
|
+
|
|
14
|
+
On macOS, you may need to install additional dependencies before installing:
|
|
15
|
+
|
|
16
|
+
```
|
|
17
|
+
brew install llvm libomp
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
You'll need to override the `CFLAGS` and `LDFLAGS` environment variables before installing `eflomal`.
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
poetry shell
|
|
24
|
+
export CFLAGS="-Xpreprocessor -fopenmp -I${HOMEBREW_PREFIX}/opt/libomp/include -Ofast -march=native -Wall --std=gnu99 -Wno-unused-function -g"
|
|
25
|
+
export LDFLAGS="-Xpreprocessor -fopenmp -L${HOMEBREW_PREFIX}/opt/libomp/lib -lm -lomp"
|
|
26
|
+
poetry install -E eflomal
|
|
27
|
+
```
|
|
28
|
+
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""Internal-only code for working with alignment data."""
|
|
2
|
+
|
|
3
|
+
from enum import Enum
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
import dotenv
|
|
9
|
+
|
|
10
|
+
from .strongs import normalize_strongs
|
|
11
|
+
|
|
12
|
+
# it would be nice to import symbols from burrito and autoalign here:
|
|
13
|
+
# but I don't know how to avoid circular imports when that code also
|
|
14
|
+
# imports from biblealignlib
|
|
15
|
+
|
|
16
|
+
# Path configuration.
#
# CLEARROOT names the directory that holds local clones of the
# Clear-Bible repositories. It is read from the environment, which a
# .env file may populate, for example:
#
#   CLEARROOT=/Users/sboisen/git/Clear-Bible
#
# If CLEARROOT is unset or empty, fall back to ~/git/Clear-Bible and
# say so on stdout.
if not dotenv.load_dotenv():
    print("No .env file found")

clearrootenvar = os.getenv("CLEARROOT")
if not clearrootenvar:
    CLEARROOT = Path.home() / "git/Clear-Bible"
    print(f"No environment variable for CLEARROOT: assuming {CLEARROOT}")
else:
    CLEARROOT = Path(clearrootenvar)

# root of the published data: alignments live here, under language
ALIGNMENTSDATA = CLEARROOT / "Alignments/data"
# published source TSVs
SOURCES = ALIGNMENTSDATA / "sources"
|
|
35
|
+
|
|
36
|
+
# Recognized canon identifiers.
CANONIDS: set[str] = {
    "nt",
    "ot",
    # meaning the entire 66 book corpus
    "protestant",
}


# Recognized versification identifiers.
VERSIFICATIONIDS: set[str] = {
    "eng",
    "org",
    "rso",
    # not yet implemented
    # "ethiopian_custom", "lxx", "rsc", "vul"
}
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class SourceidEnum(str, Enum):
    """Valid source identifiers.

    Members are also plain strings, so they compare equal to their
    identifier text (e.g. ``SourceidEnum.WLC == "WLC"``).
    """

    BGNT = "BGNT"
    NA27 = "NA27"
    NA28 = "NA28"
    SBLGNT = "SBLGNT"
    WLC = "WLC"
    WLCM = "WLCM"

    @property
    def canon(self) -> str:
        """Return the canon this source belongs to: 'ot' or 'nt'.

        Raises:
            ValueError: if the member is not assigned to either canon
                (only possible if a member is added without updating
                this property).
        """
        if self.value in ("BGNT", "NA27", "NA28", "SBLGNT"):
            return "nt"
        if self.value in ("WLC", "WLCM"):
            return "ot"
        raise ValueError(f"Unknown error in SourceidEnum.canon for {self.value}")

    # need to add DC, probably others down the road
    @staticmethod
    def get_canon(sourceid: str) -> str:
        """Return the canon string for a recognized source id, else 'X'.

        Unlike ``SourceidEnum(sourceid).canon``, this never raises for
        an unrecognized identifier.
        """
        try:
            return SourceidEnum(sourceid).canon
        except ValueError:
            # unrecognized source
            return "X"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def get_canonid(bcv: str) -> str:
    """Return 'ot' or 'nt' for a BCVish string.

    Only the leading two-digit book number is inspected, so this works
    for book, chapter, verse and full BCVWPID identifiers alike.

    Args:
        bcv: an identifier that starts with a two-digit book number.

    Returns:
        'ot' for book numbers up to 39, 'nt' for 40 through 66.

    Raises:
        ValueError: if the string does not start with two digits, or
            the book number is above 66.
    """
    book = re.match(r"[0-6][0-9]", bcv)
    if book:
        booknum = int(book.group())
        if booknum <= 39:
            return "ot"
        # 67-69 are deliberately excluded from the NT range
        if booknum <= 66:
            return "nt"
    raise ValueError(f"Invalid BCVish id value: {bcv}")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
__all__ = [
|
|
105
|
+
"CLEARROOT",
|
|
106
|
+
"SOURCES",
|
|
107
|
+
"SourceidEnum",
|
|
108
|
+
# strongs
|
|
109
|
+
"normalize_strongs",
|
|
110
|
+
]
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Manage scores for alignment data."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Any, Optional
|
|
5
|
+
|
|
6
|
+
from biblelib.word import BCVID
|
|
7
|
+
|
|
8
|
+
from biblealignlib.burrito import Source, Target, VerseData
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def precision(true_positives: int, false_positives: int) -> float:
    """Return precision: true positives over all predicted positives.

    Args:
        true_positives: count of correct predictions.
        false_positives: count of incorrect predictions.

    Returns:
        The ratio as a float, or 0.0 when both counts are zero (so the
        result is always a float, matching the annotation).
    """
    denom = true_positives + false_positives
    return true_positives / denom if denom else 0.0
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def recall(true_positives: int, false_negatives: int) -> float:
    """Return recall: true positives over all actual positives.

    Args:
        true_positives: count of correct predictions.
        false_negatives: count of missed items.

    Returns:
        The ratio as a float, or 0.0 when both counts are zero (so the
        result is always a float, matching the annotation).
    """
    denom = true_positives + false_negatives
    return true_positives / denom if denom else 0.0
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def f1(recall: float, precision: float) -> float:
    """Return the F1 score, the harmonic mean of precision and recall.

    Args:
        recall: recall value.
        precision: precision value.

    Returns:
        ``2 * precision * recall / (precision + recall)``, or 0.0 when
        the sum is zero (so the result is always a float, matching the
        annotation).
    """
    denom = precision + recall
    return ((2 * precision * recall) / denom) if denom else 0.0
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class _BaseScore:
    """Hold match counts and the scoring metrics derived from them."""

    # label for what is being scored
    identifier: str = ""
    true_positives: int = 0
    # true negatives are not tracked
    false_positives: int = 0
    false_negatives: int = 0
    # derived values: all stay 0.0 until compute_metrics() is called
    precision: float = 0.0
    recall: float = 0.0
    f1: float = 0.0
    aer: float = 0.0

    def __repr__(self) -> str:
        """Return a short representation: class name and identifier."""
        return f"<{type(self).__name__}: {self.identifier}>"

    def compute_metrics(self) -> None:
        """Set precision, aer, recall and f1 from the current counts."""
        # the bare names precision/recall/f1 below resolve to the
        # module-level functions, not to the same-named fields
        hits = self.true_positives
        self.precision = precision(hits, self.false_positives)
        # NOTE: aer is computed here as the complement of precision
        self.aer = 1 - self.precision
        self.recall = recall(hits, self.false_negatives)
        self.f1 = f1(self.recall, self.precision)

    # should use summary_dict here
    def summary(self, width: int = 4, brief: bool = True) -> str:
        """Return a one-line, tab-separated summary of the metrics.

        Args:
            width: number of decimal places for each value.
            brief: use 'P'/'R' as labels rather than 'Precision'/'Recall'.
        """
        plabel, rlabel = ("P", "R") if brief else ("Precision", "Recall")
        parts = [
            f"AER={self.aer:.{width}f}",
            f"{plabel}={self.precision:.{width}f}",
            f"{rlabel}={self.recall:.{width}f}",
            f"F1={self.f1:.{width}f}",
        ]
        return f"{self.identifier}: " + "\t".join(parts)

    def summary_dict(self, width: int = 4) -> dict[str, str]:
        """Return the metrics as strings formatted to ``width`` decimals."""
        metrics = (
            ("AER", self.aer),
            ("F1", self.f1),
            ("Precision", self.precision),
            ("Recall", self.recall),
        )
        return {label: f"{value:.{width}f}" for label, value in metrics}

    def asdict(self, ndigits=3) -> dict[str, Any]:
        """Return a dict usable as a dataframe row.

        Metric values are rounded to ``ndigits`` decimal places.
        """
        # this _should_ always be a BCV
        ident = self.identifier
        scoredict: dict[str, Any] = {
            "Identifier": ident,
            # the slices below assume a BBCCCVVV-style identifier -- TODO confirm
            # just the verse index
            "Verse": ident[5:],
            "Chapter": ident[:5],
            "Book": ident[:2],
            "Reference": BCVID(ident).to_usfm(),
        }
        for label, value in (
            ("AER", self.aer),
            ("F1", self.f1),
            ("Precision", self.precision),
            ("Recall", self.recall),
        ):
            scoredict[label] = round(value, ndigits)
        return scoredict
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
@dataclass(repr=False)
class VerseScore(_BaseScore):
    """Score one verse by comparing hypothesis pairs to reference pairs.

    All counts and metrics are computed on initialization from the
    (source, target) pairs of ``reference`` and ``hypothesis``.
    """

    # Not really optional: the base class fields all have defaults, so
    # dataclass inheritance requires defaults here too. Both must be
    # supplied, or __post_init__ fails.
    reference: Optional[VerseData] = None
    hypothesis: Optional[VerseData] = None
    # computed
    n_sources: int = 0
    n_targets: int = 0
    referencepairs: list[tuple[Source, Target]] = field(init=False, default_factory=list)
    hypothesispairs: list[tuple[Source, Target]] = field(init=False, default_factory=list)

    def __post_init__(self) -> None:
        """Compute identifier, token counts, match counts and metrics."""
        ref = self.reference
        self.identifier = ref.bcvid
        self.n_sources = len(ref.sources)
        self.n_targets = len(ref.targets)
        # decompose into pairs of source and target
        self._get_pairs()
        # set operations on pairs: no partial credit
        refset = set(self.referencepairs)
        hypset = set(self.hypothesispairs)
        self.true_positives = len(refset & hypset)
        self.false_positives = len(hypset - refset)
        self.false_negatives = len(refset - hypset)
        # sets values for P, R, F1, AER
        self.compute_metrics()

    def _get_pairs(self) -> None:
        """Populate referencepairs and hypothesispairs.

        Subclasses override this to select a different set of pairs.
        """
        # these are like pharaoh: tokens are repeated for multiple alignments
        ref, hyp = self.reference, self.hypothesis
        self.referencepairs = ref.get_pairs()
        self.hypothesispairs = hyp.get_pairs()
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@dataclass(repr=False)
class EssentialVerseScore(VerseScore):
    """Like VerseScore, but scored only on essential alignments."""

    def _get_pairs(self) -> None:
        """Populate referencepairs and hypothesispairs with essential pairs."""
        # these are like pharaoh: tokens are repeated for multiple alignments
        ref, hyp = self.reference, self.hypothesis
        self.referencepairs = ref.get_pairs(essential=True)
        self.hypothesispairs = hyp.get_pairs(essential=True)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
@dataclass(repr=False)
class GroupScore(_BaseScore):
    """Manage scoring data for a group of verses.

    Counts are the sums of the counts of ``verse_scores``; the metrics
    are then computed from those summed counts.
    """

    verse_scores: list[VerseScore] = field(default_factory=list)

    def __post_init__(self) -> None:
        """Validate the inputs, then sum the counts and compute metrics.

        Raises:
            AssertionError: if ``identifier`` or ``verse_scores`` is empty.
        """
        # Raised explicitly rather than with `assert`, so the checks are
        # not stripped under `python -O`. AssertionError and the
        # messages are kept for backward compatibility.
        if not self.identifier:
            raise AssertionError("Must provide identifier")
        if not self.verse_scores:
            raise AssertionError("Must provide verse_scores.")

        self.true_positives = sum(v.true_positives for v in self.verse_scores)
        self.false_positives = sum(v.false_positives for v in self.verse_scores)
        self.false_negatives = sum(v.false_negatives for v in self.verse_scores)
        self.compute_metrics()
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from biblealignlib import CLEARROOT, SOURCES
|
|
2
|
+
|
|
3
|
+
# when it gets fixed
|
|
4
|
+
# from .eflomal import Eflomal
|
|
5
|
+
from .mapper import PharaohMapper
|
|
6
|
+
from .reader import PharaohReader
|
|
7
|
+
from .scorer import Scorer
|
|
8
|
+
from .writer import PharaohWriter
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"CLEARROOT",
|
|
12
|
+
"SOURCES",
|
|
13
|
+
# # eflomal
|
|
14
|
+
# "Eflomal",
|
|
15
|
+
# mapper
|
|
16
|
+
"PharaohMapper",
|
|
17
|
+
# reader
|
|
18
|
+
"PharaohReader",
|
|
19
|
+
# scorer
|
|
20
|
+
"Scorer",
|
|
21
|
+
# writer
|
|
22
|
+
"PharaohWriter",
|
|
23
|
+
]
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Manages corpus data for auto alignment."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
from biblealignlib.burrito import (
|
|
6
|
+
BaseToken,
|
|
7
|
+
Source,
|
|
8
|
+
Target,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class CorpusMapping:
    """Map corpus instances to pharaoh-data for a single verse correspondence.

    bcv is based on source versification: targets with different
    versifications should be mapped to comparable source verses in the
    TSV.

    Example: Target:01031005007 corresponds to Source:01032001

    """

    # BCV-format verse reference
    bcv: str
    # Source instances and their pharaoh indices
    source_pairs: list[tuple[Source, int]] = field(default_factory=list)
    # Target instances and their pharaoh indices
    target_pairs: list[tuple[Target, int]] = field(default_factory=list)
    # the values accepted for the typeattr argument of the methods below
    _typeattrs: tuple = ("sources", "targets")
    # these values are computed in __post_init__, which overwrites
    # anything passed in
    # dict: index -> Token
    sourceindexmap: dict[int, BaseToken] = field(default_factory=dict)
    targetindexmap: dict[int, BaseToken] = field(default_factory=dict)
    # dict: Token -> index
    sourcetokenmap: dict[BaseToken, int] = field(default_factory=dict)
    targettokenmap: dict[BaseToken, int] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Build the index->token and token->index maps from the pairs."""
        self.sourceindexmap = {index: item for item, index in self.source_pairs}
        self.targetindexmap = {index: item for item, index in self.target_pairs}
        self.sourcetokenmap = dict(self.source_pairs)
        self.targettokenmap = dict(self.target_pairs)

    def __repr__(self) -> str:
        """Return a string representation of the CorpusMapping."""
        return f"<CorpusMapping: {self.bcv}>"

    def _pairs(self, typeattr: str) -> list:
        """Return source_pairs or target_pairs, selected by typeattr.

        Raises:
            AssertionError: if typeattr is not one of ``_typeattrs``.
        """
        # Raised explicitly rather than with `assert`, so the check is
        # not stripped under `python -O`. AssertionError and the message
        # are kept for backward compatibility.
        if typeattr not in self._typeattrs:
            raise AssertionError(f"typeattr should be one of {self._typeattrs}")
        return self.source_pairs if typeattr == "sources" else self.target_pairs

    def tokenids(self, typeattr: str) -> list[str]:
        """Return the list of corpus token ids for typeattr."""
        return [corpus.id for corpus, _ in self._pairs(typeattr)]

    def indices(self, typeattr: str) -> list[int]:
        """Return the list of pharaoh indices for typeattr."""
        return [index for _, index in self._pairs(typeattr)]

    def tokentexts(self, typeattr: str) -> list[str]:
        """Return the list of corpus token texts for typeattr."""
        return [corpus.text for corpus, _ in self._pairs(typeattr)]

    def display(self, typeattr: str) -> None:
        """Print out the index, id and text of each token, for debugging."""
        for token, index in self._pairs(typeattr):
            print(f"{index}: {token.id}, {token.text}")
|