biblealignlib 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- biblealignlib-0.1.0/LICENSE +21 -0
- biblealignlib-0.1.0/LICENSE.md +92 -0
- biblealignlib-0.1.0/PKG-INFO +30 -0
- biblealignlib-0.1.0/README.md +2 -0
- biblealignlib-0.1.0/biblealignlib/__init__.py +93 -0
- biblealignlib-0.1.0/biblealignlib/burrito/AlignmentGroup.py +420 -0
- biblealignlib-0.1.0/biblealignlib/burrito/AlignmentSet.py +164 -0
- biblealignlib-0.1.0/biblealignlib/burrito/AlignmentType.py +67 -0
- biblealignlib-0.1.0/biblealignlib/burrito/BadRecord.py +70 -0
- biblealignlib-0.1.0/biblealignlib/burrito/BaseToken.py +79 -0
- biblealignlib-0.1.0/biblealignlib/burrito/VerseData.py +242 -0
- biblealignlib-0.1.0/biblealignlib/burrito/__init__.py +61 -0
- biblealignlib-0.1.0/biblealignlib/burrito/alignments.py +327 -0
- biblealignlib-0.1.0/biblealignlib/burrito/manager.py +181 -0
- biblealignlib-0.1.0/biblealignlib/burrito/source.py +477 -0
- biblealignlib-0.1.0/biblealignlib/burrito/target.py +328 -0
- biblealignlib-0.1.0/biblealignlib/burrito/util.py +78 -0
- biblealignlib-0.1.0/biblealignlib/strongs.py +67 -0
- biblealignlib-0.1.0/pyproject.toml +73 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Clear.Bible
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# Bible Word Alignments
|
|
2
|
+
|
|
3
|
+
We license all of our own code under an MIT license and all of our own
|
|
4
|
+
data under CC-BY. For details, see sections below for [Code](#code) and [Data](#data).
|
|
5
|
+
|
|
6
|
+
## Code
|
|
7
|
+
|
|
8
|
+
Code for this project (`../bible_alignments`) is copyright (c) 2023 by
|
|
9
|
+
[Clear Bible, Inc](http://www.clear.bible) and is licensed under the
|
|
10
|
+
terms of the MIT License.
|
|
11
|
+
|
|
12
|
+
### MIT License
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
16
|
+
a copy of this software and associated documentation files (the
|
|
17
|
+
“Software”), to deal in the Software without restriction, including
|
|
18
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
19
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
20
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
21
|
+
the following conditions:
|
|
22
|
+
|
|
23
|
+
The above copyright notice and this permission notice shall be
|
|
24
|
+
included in all copies or substantial portions of the Software.
|
|
25
|
+
|
|
26
|
+
THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND,
|
|
27
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
28
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
29
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
30
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
31
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
32
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
33
|
+
|
|
34
|
+
## Data
|
|
35
|
+
|
|
36
|
+
[Bible Word Alignments](https://github.com/Clear-Bible/Alignments) © 2022 by [Clear Bible, Inc](http://www.clear.bible) is licensed under [CC BY 4.0](http://creativecommons.org/licenses/by/4.0/).
|
|
37
|
+
|
|
38
|
+
These datasets include:
|
|
39
|
+
|
|
40
|
+
1. Alignment files (`../data/alignments`) derived from Clear Bible's data.
|
|
41
|
+
2. Source text files (`../data/sources`) derived from Clear Bible's
|
|
42
|
+
data. Note that any copyright-protected text has been stripped out.
|
|
43
|
+
3. Target text files (`../data/targets`) derived from Clear Bible's
|
|
44
|
+
data. Note that any copyright-protected text has been stripped out.
|
|
45
|
+
4. Names files (`../data/targets`) derived from alignment data.
|
|
46
|
+
|
|
47
|
+
Source text files include data from:
|
|
48
|
+
|
|
49
|
+
* Westminster Leningrad Codex - the somewhat informal license states
|
|
50
|
+
that "All biblical Hebrew text, in any format, may be viewed or
|
|
51
|
+
copied without restriction."
|
|
52
|
+
|
|
53
|
+
Target text files include data from:
|
|
54
|
+
|
|
55
|
+
* The text for the Chinese Union Version (Simplified) is in the public
|
|
56
|
+
domain. The Chinese Union Version with Modern Punctuation
|
|
57
|
+
(Simplified) (`CUVMP`) is a derivative work, and is Copyright © 2011
|
|
58
|
+
Global Bible Initiative / © 2011 全球圣经促进会 and in the public
|
|
59
|
+
domain.
|
|
60
|
+
* Young's Literal Translation (`YLT`), by Robert Young (1862, 1887, 1898),
|
|
61
|
+
which is in the public domain.
|
|
62
|
+
|
|
63
|
+
The repository also includes data on strategic languages for Bible
|
|
64
|
+
translation (`../data/languages`) from the [ETEN Innovation
|
|
65
|
+
Lab](https://dev.lab.eten.bible/).
|
|
66
|
+
|
|
67
|
+
### License
|
|
68
|
+
|
|
69
|
+
#### Creative Commons Attribution 4.0 International (CC BY 4.0)
|
|
70
|
+
|
|
71
|
+
This is a human-readable summary of (and not a substitute for) the [license](http://creativecommons.org/licenses/by/4.0/).
|
|
72
|
+
|
|
73
|
+
##### You are free to:
|
|
74
|
+
|
|
75
|
+
* **Share** — copy and redistribute the material in any medium or format
|
|
76
|
+
* **Adapt** — remix, transform, and build upon the material
|
|
77
|
+
for any purpose, even commercially.
|
|
78
|
+
|
|
79
|
+
The licensor cannot revoke these freedoms as long as you follow the license terms.
|
|
80
|
+
|
|
81
|
+
##### Under the following terms:
|
|
82
|
+
|
|
83
|
+
* **Attribution** — You must attribute the work as follows: "MACULA Greek Linguistic Datasets, available at https://github.com/Clear-Bible/macula-greek/". You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
|
|
84
|
+
|
|
85
|
+
**No additional restrictions** — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
|
|
86
|
+
|
|
87
|
+
##### Notices:
|
|
88
|
+
|
|
89
|
+
You do not have to comply with the license for elements of the material in the public domain or where your use is permitted by an applicable exception or limitation.
|
|
90
|
+
|
|
91
|
+
No warranties are given. The license may not give you all of the permissions necessary for your intended use. For example, other rights such as publicity, privacy, or moral rights may limit how you use the material.
|
|
92
|
+
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: biblealignlib
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Code for managing Word-level alignments for Bibles, including both automatic alignments and manually corrected alignments.
|
|
5
|
+
Home-page: https://github.com/Clear-Bible/biblealignlib
|
|
6
|
+
License: MIT
|
|
7
|
+
Keywords: Bible,alignment,Bible alignment
|
|
8
|
+
Author: Sean Boisen
|
|
9
|
+
Author-email: sean.boisen@biblica.com
|
|
10
|
+
Requires-Python: >=3.10,<3.12
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Religion
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Topic :: Religion
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Dist: biblelib (>=0.3.17,<0.4.0)
|
|
21
|
+
Requires-Dist: jupyterlab (>=4.3.3,<5.0.0)
|
|
22
|
+
Requires-Dist: pandas (>=2.2.3,<3.0.0)
|
|
23
|
+
Requires-Dist: regex (>=2024.11.6,<2025.0.0)
|
|
24
|
+
Requires-Dist: unicodecsv (>=0.14.1,<0.15.0)
|
|
25
|
+
Project-URL: Repository, https://github.com/Clear-Bible/biblealignlib
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# biblealignlib
|
|
29
|
+
Code for working with Bible alignment data
|
|
30
|
+
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Internal-only code for working with alignment data."""
|
|
2
|
+
|
|
3
|
+
from enum import Enum
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from .strongs import normalize_strongs
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Directory containing this module, and the data/src directories beneath it.
ROOT = Path(__file__).parent
DATAPATH = ROOT.joinpath("data")
SRCPATH = ROOT.joinpath("src")

# Sibling directories located next to ROOT.
GRAPECITYDIR = ROOT.parent.joinpath("grapecity-alignments")
# for output
ALIGNMENTSROOT = ROOT.parent.joinpath("Alignments")

# Canon identifiers; "protestant" means the entire 66 book corpus.
CANONIDS = {"nt", "ot", "protestant"}


# Versification identifiers.
# Not yet implemented: "ethiopian_custom", "lxx", "rsc", "vul"
VERSIFICATIONIDS: set[str] = {"eng", "org", "rso"}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SourceidEnum(str, Enum):
    """Enumerate the source text identifiers recognized here."""

    BGNT = "BGNT"
    NA27 = "NA27"
    NA28 = "NA28"
    SBLGNT = "SBLGNT"
    WLC = "WLC"
    WLCM = "WLCM"

    @property
    def canon(self) -> str:
        """Return the canon for this source: 'ot' or 'nt'.

        Raises ValueError for a member that is in neither list.
        """
        ident = self.value
        if ident in ("WLC", "WLCM"):
            return "ot"
        if ident in ("BGNT", "NA27", "NA28", "SBLGNT"):
            return "nt"
        raise ValueError(f"Unknown error in SourceidEnum.canon for {self.value}")

    # need to add DC, probably others down the road
    @staticmethod
    def get_canon(sourceid: str) -> str:
        """Return the canon for a recognized source string, else 'X'."""
        try:
            return SourceidEnum(sourceid).canon
        except ValueError:
            # unrecognized source
            return "X"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def get_canonid(bcv: str) -> str:
    """Return 'ot' or 'nt' for a BCVish string.

    Only the leading two-digit book number is examined, so this works
    for books, chapters, verses and full BCVWPID identifiers.

    Book numbers 01-39 map to 'ot' and 40-66 map to 'nt'.

    Raises:
        ValueError: if bcv does not start with two ASCII digits, or the
            book number is outside 01-66 (this includes "00" and 67-99).

    """
    # re.match anchors at the start of the string; [0-9] keeps this to
    # ASCII digits only.
    bookmatch = re.match(r"[0-9]{2}", bcv)
    if bookmatch:
        booknum = int(bookmatch.group())
        if 1 <= booknum <= 39:
            return "ot"
        if 40 <= booknum <= 66:
            return "nt"
    raise ValueError(f"Invalid BCVish id value: {bcv}")
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# Names exported by a star-import of this module.
__all__ = [
    "ROOT",
    "DATAPATH",
    "SRCPATH",
    "CANONIDS",
    "VERSIFICATIONIDS",
    "SourceidEnum",
    "get_canonid",
    # strongs
    "normalize_strongs",
]
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
"""Manage data for an alignment record.
|
|
2
|
+
|
|
3
|
+
This defines AlignmentGroup and supporting dataclasses.
|
|
4
|
+
|
|
5
|
+
This implements Scripture Burrito Alignment Standard, v 0.3,
|
|
6
|
+
https://docs.google.com/document/d/1zR5gsrm3gIoNiHVBlWz5_BBw3N-Ew1-4M5rMsFrPzSw/.
|
|
7
|
+
|
|
8
|
+
The information model allows some values at multiple levels due to
|
|
9
|
+
'hoisting'. This opinionated code:
|
|
10
|
+
- defines attributes at the lowest relevant class
|
|
11
|
+
- uses maximal hoisting for serialization unless overridden
|
|
12
|
+
|
|
13
|
+
While this aims at generality, the main application supported here is
|
|
14
|
+
publishing Bible alignment data. There may therefore be aspects of the
|
|
15
|
+
spec that are not supported by this code.
|
|
16
|
+
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from dataclasses import dataclass, field, fields
|
|
20
|
+
import datetime as dt
|
|
21
|
+
from functools import total_ordering
|
|
22
|
+
from itertools import groupby
|
|
23
|
+
from typing import Any, Optional
|
|
24
|
+
|
|
25
|
+
from biblelib.word import bcvwpid
|
|
26
|
+
|
|
27
|
+
from biblealignlib import SourceidEnum
|
|
28
|
+
|
|
29
|
+
from .AlignmentType import TranslationType
|
|
30
|
+
from .source import macula_prefixer
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# hoisting means this can be defined at several different levels, so
|
|
34
|
+
# called out as a separate class
|
|
35
|
+
@dataclass
class Document:
    """Hold the identifying data for one document in an alignment."""

    # Best practices for docid:
    # - for source documents: a standard identifier like 'NA28' or
    #   "WLC"
    # - for target documents: a Version Abbreviation value from
    #   digitalbiblelibrary.org is a good choice.
    docid: str
    scheme: str = "BCVWP"
    # Recomputed from docid after initialization; None unless docid is
    # a SourceidEnum value.
    sourceid: Optional[SourceidEnum] = None

    def __post_init__(self) -> None:
        """Derive sourceid from docid and adjust scheme for non-sources."""
        try:
            recognized: Optional[SourceidEnum] = SourceidEnum(self.docid)
        except ValueError:
            recognized = None
        self.sourceid = recognized
        # downgrade BCVWP to BCVW: no subword indices if not source
        if recognized is None and self.scheme == "BCVWP":
            self.scheme = "BCVW"

    def asdict(self) -> dict[str, Any]:
        """Return the docid and scheme as a dict for serialization."""
        return dict(docid=self.docid, scheme=self.scheme)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass(order=True)
class AlignmentReference:
    """Pair a document with the selectors that point into it."""

    # perhaps over-engineered, but Document data can occur at multiple
    # levels in serialization.
    document: Document
    # selectors identify tokens or other units in a document. For most
    # alignment purposes, these are token identifiers.
    selectors: list[str]

    def __post_init__(self) -> None:
        """Replace selectors with a sorted copy."""
        # Empty selectors are allowed: an emptiness check was disabled as
        # "no good for reading alignment hub data".
        self.selectors = sorted(self.selectors)

    def __repr__(self) -> str:
        """Return a short representation: docid plus selectors."""
        return f"<{self.docid}: {self.selectors}>"

    @property
    def docid(self) -> str:
        """Shortcut for document.docid."""
        return self.document.docid

    @property
    def scheme(self) -> str:
        """Shortcut for document.scheme."""
        return self.document.scheme

    @property
    def incomplete(self) -> bool:
        """True if any selector is the string 'MISSING'."""
        return "MISSING" in self.selectors

    def asdict(self, hoist: bool = True) -> dict[str, Any]:
        """Return a dict of values suitable for serialization.

        With hoist (the default) only the selectors are included, on the
        assumption that docid and scheme are specified 'higher up'.
        Without hoist, docid and scheme are included as well.

        """
        if hoist:
            return {"selectors": self.selectors}
        return {"selectors": self.selectors, "docid": self.docid, "scheme": self.scheme}
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
@dataclass
class Metadata:
    """Contains metadata for alignment records.

    While all attributes are optional, the attributes creator and
    created are strongly encouraged.

    Other attributes are taken from Dublin Core terms
    (https://www.dublincore.org/specifications/dublin-core/dcmi-terms/#section-2)
    to encourage standardization. This may also be extended to include
    other attributes.

    Only attributes with truthy values appear in repr() and asdict(),
    always in sorted attribute-name order.

    """

    # these initial attributes mirror DCMI, and typically apply to an
    # AlignmentGroup
    #
    # which version of the alignment spec this conforms to
    conformsTo: str = ""
    # "An entity responsible for making contributions to the
    # resource". Could be used for alignment annotators. If there are
    # multiple values, concatenate with commas.
    contributor: str = ""
    # strongly recommended if available: "Date of creation of the resource."
    created: Optional[dt.datetime] = None
    # strongly recommended if available: "An entity responsible for
    # making the resource."
    # Recommended usage: the organization providing the data.
    creator: str = ""
    # recommended standard values if defined: "NT", "OT", "DC", a USFM
    # abbreviation for Bible book
    coverage: str = ""
    # "An account of the resource."
    description: str = ""
    # for AlignmentRecords: unique identifier
    id: str = ""
    # for AlignmentRecords: how was this alignment originally created?
    # This does *not* capture changes to the original value.
    # common values include 'manual', 'automated' or an algorithm name
    origin: str = ""
    # for ClearAligner to track status. Output here should always be
    # 'created': set in Manager._make_record() so it's not set on
    # AlignmentGroup metadata.
    # Eventually this may need to be separated into two classes, one
    # for Groups and one for Records.
    status: str = ""
    # sorted names of all the fields above; recomputed in __post_init__,
    # so any value passed to the constructor is overwritten
    _fieldnames: tuple[str, ...] = ()

    def __post_init__(self) -> None:
        """Cache the sorted field names, excluding _fieldnames itself."""
        self._fieldnames = tuple(
            sorted(f.name for f in fields(self) if f.name != "_fieldnames")
        )

    def __repr__(self) -> str:
        """Return a printed representation listing the truthy attributes."""
        # A generator (not a set) keeps the sorted order of _fieldnames,
        # so the output is the same from run to run.
        attrstr: str = " ".join(
            f"{name}={value!r}" for name in self._fieldnames if (value := getattr(self, name))
        )
        return f"Metadata({attrstr})"

    # no hoist option here: the caller decides
    def asdict(self) -> dict[str, Any]:
        """Return a dict of the truthy attributes for serialization.

        Keys are in sorted order; attributes whose value is empty or
        None are omitted.

        """
        return {name: value for name in self._fieldnames if (value := getattr(self, name))}
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@dataclass
@total_ordering
class AlignmentRecord:
    """Manage data for an alignment record.

    Equality and ordering compare only the first source selector of
    the two records; hashing uses the record identifier (meta.id).

    NOTE(review): two records can therefore compare equal while having
    different hashes; confirm this is intended before relying on these
    records as set members or dict keys.

    """

    # metadata for this record only
    meta: Metadata
    # keys are roles, corresponding to type.roles
    references: dict[str, AlignmentReference]
    # TranslationType wires roles to 'source' and 'target'
    type: TranslationType = field(default_factory=TranslationType)

    def __post_init__(self) -> None:
        """Check that references has exactly one entry per role."""
        for role in self.roles:
            assert role in self.references, f"role missing from references: {role}"
        assert len(self.roles) == len(self.references), "different numbers of roles and references"

    def __repr__(self) -> str:
        """Return a printed representation."""
        return f"<AlignmentRecord: {repr(self.references)}>"

    def __hash__(self) -> int:
        """Return a hash value for the record, based on its identifier."""
        return hash(self.identifier)

    def __eq__(self, other: object) -> bool:
        """True if both records have the same first source selector.

        Returns NotImplemented for operands that are not
        AlignmentRecords, so Python falls back to its default handling.
        Raises IndexError if either record has no source selectors.

        """
        if not isinstance(other, AlignmentRecord):
            return NotImplemented
        return self.source_selectors[0] == other.source_selectors[0]

    def __lt__(self, other: object) -> bool:
        """True if this record's first source selector sorts first.

        Returns NotImplemented for operands that are not
        AlignmentRecords. Raises IndexError if either record has no
        source selectors.

        """
        if not isinstance(other, AlignmentRecord):
            return NotImplemented
        return self.source_selectors[0] < other.source_selectors[0]

    @property
    def identifier(self) -> str:
        """Return the identifier for this record, for convenience."""
        return self.meta.id

    @property
    def roles(self) -> tuple[str, str]:
        """Return the roles for this type, for convenience."""
        return self.type.roles

    def get_selectors(self, role: str) -> list[str]:
        """Return the list of selectors for role."""
        assert role in self.roles, f"Invalid role: {role}"
        return self.references[role].selectors

    @property
    def source_selectors(self) -> list[str]:
        """Return the source selectors for this record."""
        return self.get_selectors("source")

    @property
    def target_selectors(self) -> list[str]:
        """Return the target selectors for this record."""
        return self.get_selectors("target")

    @property
    def source_bcv(self) -> str:
        """Return the source BCV identifier for this record.

        Returns data for the first selector, though multiples should
        have the same BCV. Returns an empty string if there are no
        source selectors.

        """
        selectors = self.source_selectors
        if not selectors:
            return ""
        # only the first selector is needed, so only convert that one
        firstbcv: str = bcvwpid.to_bcv(selectors[0])
        return firstbcv

    @property
    def incomplete(self) -> bool:
        """True if any selectors in references are incomplete."""
        return any(ref.incomplete for ref in self.references.values())

    def asdict(
        self, positional: bool = False, withmeta: bool = True, withmaculaprefix: bool = False
    ) -> dict[str, Any]:
        """Return a dict of values suitable for serialization.

        With positional=False (the default), returns a dict with
        'source' and 'target' keys whose values are the selector
        lists. With positional=True, the single key 'references' holds
        the items of the references dict; this is only supported
        together with withmaculaprefix=True, otherwise
        NotImplementedError is raised.

        With withmeta=True (the default), includes record-level
        metadata under 'meta': otherwise omits it.

        With withmaculaprefix=True, each source selector is passed
        through macula_prefixer (per the original note, this adds an
        'o' or 'n' prefix depending on canon). With the default
        (False), source selectors are left as is.

        The returned dict has its keys in sorted order.

        """
        recdict: dict[str, Any] = {}
        if positional:
            if not withmaculaprefix:
                raise NotImplementedError(
                    "Positional and not withmaculaprefix is not yet supported."
                )
            else:
                # NOTE(review): this stores a dict view of
                # (role, AlignmentReference) pairs as is; confirm callers
                # can serialize it.
                recdict["references"] = self.references.items()
        else:
            # typical case
            sourcerefs: list[str] = self.references["source"].selectors
            if withmaculaprefix:
                # add back the Macula prefix
                sourcerefs = [macula_prefixer(srcstr) for srcstr in sourcerefs]
            # else leave as is
            recdict["source"] = sourcerefs
            recdict["target"] = self.references["target"].selectors
        if withmeta:
            recdict["meta"] = self.meta.asdict()
        # sort by keys as Mike prefers
        return {k: recdict[k] for k in sorted(recdict)}
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
@dataclass
class AlignmentGroup:
    """Manage a full set of alignment records.

    This is opinionated about the composition of the group:
    - enforces a single type across all records

    """

    # same order and count as roles
    documents: tuple[Document, Document]
    # metadata for the group as a whole
    meta: Metadata
    records: list[AlignmentRecord]
    # keys to AlignmentRecord.references: same order as documents
    roles: tuple[str, str] = ("source", "target")
    # value of the source document's sourceid: computed in __post_init__
    sourcedocid: str = ""
    # canon of the source document (its sourceid.canon): computed in
    # __post_init__
    canon: str = ""
    # the single type shared by all records: computed in __post_init__
    _type: str = ""
    # hoist docid values from reference.document up to this metadata
    _hoist_docid: bool = True

    def __post_init__(self) -> None:
        """Compute values and do checks after initialization.

        Checks that all records share a single type (which also fails
        for an empty list of records), that documents and roles have
        the same length, and that one of the two documents has a
        sourceid.

        """
        # only a single type across all records
        typeset = {rec.type.type for rec in self.records}
        assert len(typeset) == 1, f"Multiple AlignmentRecord types found: {typeset}"
        self._type = typeset.pop()
        assert len(self.documents) == len(
            self.roles
        ), f"Must have same number of documents and roles: {self.documents}, {self.roles}"
        # one of the documents should have a non-null sourceid: use it
        # to set the canon for the group
        sourcedocid = self.documents[0].sourceid or self.documents[1].sourceid
        assert (
            sourcedocid
        ), f"Neither {self.documents[0].docid} nor {self.documents[1].docid} are recognized as source texts:\ncheck src/SourceidEnum for completeness."
        self.canon = sourcedocid.canon
        self.sourcedocid = sourcedocid.value

    def __repr__(self) -> str:
        """Return a printed representation."""
        docids = tuple(doc.docid for doc in self.documents)
        return f"<AlignmentGroup{docids}: {len(self.records)} records>"

    def asdict(self, hoist: bool = True) -> dict[str, Any]:
        """Return a dict of values suitable for serialization.

        This is opinionated about the preferred serialization: hoists
        as much as possible to upper levels. Records are serialized
        non-positionally and without their record-level metadata.

        The hoist parameter is accepted but not currently used.

        """
        # for now
        positional: bool = False
        withmeta: bool = False

        return {
            "meta": self.meta.asdict(),
            "type": self._type,
            "records": [
                rec.asdict(positional=positional, withmeta=withmeta) for rec in self.records
            ],
        }

    def verserecords(self) -> dict[str, list[AlignmentRecord]]:
        """Return a dict mapping source BCV references to their alignment records.

        Records are grouped by their source_bcv value. Keys appear in
        order of first occurrence, and records keep their relative
        order within each list, whether or not self.records is sorted.

        """
        # Accumulate per key rather than using itertools.groupby, which
        # only groups adjacent items: with unsorted records a repeated
        # key would overwrite the records collected earlier.
        grouped: dict[str, list[AlignmentRecord]] = {}
        for rec in self.records:
            grouped.setdefault(rec.source_bcv, []).append(rec)
        return grouped
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
# not for Alignments: 2024-09-09
|
|
376
|
+
@dataclass
class TopLevelGroups:
    """Hold the two AlignmentGroups of a top-level document; both are required."""

    # one group for OT, one group for NT
    groups: tuple[AlignmentGroup, AlignmentGroup]
    format: str = "alignment"
    version: str = "0.3.1"
    # filled in by __post_init__ from the two groups
    sourcedocids: tuple[str, str] = ()
    targetdocid: str = ""

    def __post_init__(self) -> None:
        """Validate the pair of groups and derive the docid attributes."""
        assert len(self.groups) == 2, "There must be two groups."
        # everything below assumes two groups
        first = self.groups[0]
        second = self.groups[1]
        assert first.roles == second.roles, f"Roles must match: {first.roles}, {second.roles}"
        first_spec = first.meta.conformsTo
        second_spec = second.meta.conformsTo
        assert (
            first_spec == second_spec
        ), f"meta.conformsto values must match: {first_spec}, {second_spec}"
        # target documents should also match
        unique_targets = set()
        for group in self.groups:
            target_position = group.roles.index("target")
            unique_targets.add(group.documents[target_position].docid)
        targetdocids = list(unique_targets)
        assert len(targetdocids) == 1, f"OT and NT target docids must match: {targetdocids}"
        self.targetdocid = targetdocids[0]
        # canons must be different
        canons = {first.canon, second.canon}
        assert canons == {"ot", "nt"}, "Both OT and NT canons are required."
        self.sourcedocids = (first.sourcedocid, second.sourcedocid)

    def __repr__(self) -> str:
        """Return a short representation: target docid plus source docids."""
        return f"<TopLevelGroups({self.targetdocid}): {self.sourcedocids}>"

    def asdict(self, hoist: bool = True) -> dict[str, Any]:
        """Return an opinionated dict of values suitable for serialization."""
        serialized = [grp.asdict(hoist=hoist) for grp in (self.groups[0], self.groups[1])]
        return {
            "format": self.format,
            "version": self.version,
            "groups": serialized,
        }
|