mets-to-edm 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mets_to_edm-0.2.1/PKG-INFO +100 -0
- mets_to_edm-0.2.1/README.md +83 -0
- mets_to_edm-0.2.1/mets_to_edm/__init__.py +1 -0
- mets_to_edm-0.2.1/mets_to_edm/__main__.py +33 -0
- mets_to_edm-0.2.1/mets_to_edm/mapper.py +864 -0
- mets_to_edm-0.2.1/mets_to_edm/utilities.py +104 -0
- mets_to_edm-0.2.1/pyproject.toml +29 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: mets-to-edm
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Modular mapping of METS/MODS data to EDM (Europeana Data Model), providing comprehensive transformation of METS/MODS XML structures into Pydantic-based EDM classes and properties.
|
|
5
|
+
License: MIT
|
|
6
|
+
Author: Kulturpool
|
|
7
|
+
Author-email: info@kulturpool.at
|
|
8
|
+
Requires-Python: >=3.12,<4.0
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Requires-Dist: edmlib (>=2.5.2,<3.0.0)
|
|
13
|
+
Requires-Dist: lxml (>=5.2.2,<6.0.0)
|
|
14
|
+
Requires-Dist: requests (>=2.32.3,<3.0.0)
|
|
15
|
+
Requires-Dist: types-lxml (>=2025.8.25,<2026.0.0)
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# mets_to_edm
|
|
19
|
+
|
|
20
|
+
`mets_to_edm` is a Python library for converting METS/MODS XML records into Europeana Data Model (EDM) records.
|
|
21
|
+
It can be used both as a library and from the command line, and it ships with a basic mapping that should work well for most cases.
|
|
22
|
+
But the library also provides a flexible mapping layer to override certain parts.
|
|
23
|
+
|
|
24
|
+
## Features
|
|
25
|
+
|
|
26
|
+
- Converts METS/MODS XML to EDM using [edmlib](https://github.com/kulturpool/EDMLib)
|
|
27
|
+
- Easily extensible: override mapping methods to customize output
|
|
28
|
+
- CLI and Python API
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
Install via Poetry (recommended):
|
|
33
|
+
|
|
34
|
+
```sh
|
|
35
|
+
poetry install
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Or with pip (if you have all dependencies):
|
|
39
|
+
|
|
40
|
+
```sh
|
|
41
|
+
pip install .
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Usage
|
|
45
|
+
|
|
46
|
+
### As a Python Library
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from mets_to_edm import MetsToEdmMapper
|
|
50
|
+
from lxml import etree
|
|
51
|
+
|
|
52
|
+
# Parse your METS/MODS XML file
|
|
53
|
+
xml_tree = etree.parse("example.xml")
|
|
54
|
+
|
|
55
|
+
# Convert to an EDM record
|
|
56
|
+
edmlib_record = MetsToEdmMapper.process_record(xml_tree)
|
|
57
|
+
|
|
58
|
+
# Serialize to EDM XML
|
|
59
|
+
edm_xml = edmlib_record.serialize()
|
|
60
|
+
print(edm_xml)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### From the Command Line
|
|
64
|
+
|
|
65
|
+
```sh
|
|
66
|
+
python -m mets_to_edm example.xml "Provider Name" [--data-provider "Data Provider"]
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
- `"Provider Name"`: the institution name to be filled in as `edm:provider` (the aggregator providing the data to Europeana)
|
|
70
|
+
- `"Data Provider"`: the institution name to be filled in as edm:dataProvider (the Organisation where the data originates from). Optional as it will otherwise be extracted from the amdSec using XPath "mets:rightsMD/mets:mdWrap/mets:xmlData/dv:rights/dv:owner"
|
|
71
|
+
|
|
72
|
+
## Customizing the Mapping
|
|
73
|
+
|
|
74
|
+
To change how specific fields are mapped, subclass `MetsToEdmMapper` and override the relevant class methods (for example, override `get_titles` to change how titles are extracted).
|
|
75
|
+
You can override any method such as:
|
|
76
|
+
- `get_titles`
|
|
77
|
+
- `get_descriptions`
|
|
78
|
+
- `get_publishers`
|
|
79
|
+
- `get_types`
|
|
80
|
+
- `get_languages`
|
|
81
|
+
- ...and more (see `mets_to_edm/mapper.py` for all available hooks)
|
|
82
|
+
|
|
83
|
+
### Example: Overriding the Data Provider
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
class MyMapper(MetsToEdmMapper):
|
|
87
|
+
@classmethod
|
|
88
|
+
def get_data_provider(cls, dmd_sec, amd_sec, default=None):
|
|
89
|
+
return "My Custom Data Provider"
|
|
90
|
+
|
|
91
|
+
# Usage:
|
|
92
|
+
# edmlib_record = MyMapper.process_record(tree)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
For more examples have a look at the examples directory.
|
|
96
|
+
|
|
97
|
+
## Further Information
|
|
98
|
+
|
|
99
|
+
- See the source code in `mets_to_edm/mapper.py` for all overridable methods and mapping logic.
|
|
100
|
+
- For questions or contributions, open an issue or pull request.
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# mets_to_edm
|
|
2
|
+
|
|
3
|
+
`mets_to_edm` is a Python library for converting METS/MODS XML records into Europeana Data Model (EDM) records.
|
|
4
|
+
It can be used both as a library and from the command line, and it ships with a basic mapping that should work well for most cases.
|
|
5
|
+
But the library also provides a flexible mapping layer to override certain parts.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- Converts METS/MODS XML to EDM using [edmlib](https://github.com/kulturpool/EDMLib)
|
|
10
|
+
- Easily extensible: override mapping methods to customize output
|
|
11
|
+
- CLI and Python API
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
Install via Poetry (recommended):
|
|
16
|
+
|
|
17
|
+
```sh
|
|
18
|
+
poetry install
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Or with pip (if you have all dependencies):
|
|
22
|
+
|
|
23
|
+
```sh
|
|
24
|
+
pip install .
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
### As a Python Library
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
from mets_to_edm import MetsToEdmMapper
|
|
33
|
+
from lxml import etree
|
|
34
|
+
|
|
35
|
+
# Parse your METS/MODS XML file
|
|
36
|
+
xml_tree = etree.parse("example.xml")
|
|
37
|
+
|
|
38
|
+
# Convert to an EDM record
|
|
39
|
+
edmlib_record = MetsToEdmMapper.process_record(xml_tree)
|
|
40
|
+
|
|
41
|
+
# Serialize to EDM XML
|
|
42
|
+
edm_xml = edmlib_record.serialize()
|
|
43
|
+
print(edm_xml)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### From the Command Line
|
|
47
|
+
|
|
48
|
+
```sh
|
|
49
|
+
python -m mets_to_edm example.xml "Provider Name" [--data-provider "Data Provider"]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
- `"Provider Name"`: the institution name to be filled in as `edm:provider` (the aggregator providing the data to Europeana)
|
|
53
|
+
- `"Data Provider"`: the institution name to be filled in as edm:dataProvider (the Organisation where the data originates from). Optional as it will otherwise be extracted from the amdSec using XPath "mets:rightsMD/mets:mdWrap/mets:xmlData/dv:rights/dv:owner"
|
|
54
|
+
|
|
55
|
+
## Customizing the Mapping
|
|
56
|
+
|
|
57
|
+
To change how specific fields are mapped, subclass `MetsToEdmMapper` and override the relevant class methods (for example, override `get_titles` to change how titles are extracted).
|
|
58
|
+
You can override any method such as:
|
|
59
|
+
- `get_titles`
|
|
60
|
+
- `get_descriptions`
|
|
61
|
+
- `get_publishers`
|
|
62
|
+
- `get_types`
|
|
63
|
+
- `get_languages`
|
|
64
|
+
- ...and more (see `mets_to_edm/mapper.py` for all available hooks)
|
|
65
|
+
|
|
66
|
+
### Example: Overriding the Data Provider
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
class MyMapper(MetsToEdmMapper):
|
|
70
|
+
@classmethod
|
|
71
|
+
def get_data_provider(cls, dmd_sec, amd_sec, default=None):
|
|
72
|
+
return "My Custom Data Provider"
|
|
73
|
+
|
|
74
|
+
# Usage:
|
|
75
|
+
# edmlib_record = MyMapper.process_record(tree)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
For more examples have a look at the examples directory.
|
|
79
|
+
|
|
80
|
+
## Further Information
|
|
81
|
+
|
|
82
|
+
- See the source code in `mets_to_edm/mapper.py` for all overridable methods and mapping logic.
|
|
83
|
+
- For questions or contributions, open an issue or pull request.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from mets_to_edm.mapper import MetsToEdmMapper
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from lxml import etree
|
|
3
|
+
from mets_to_edm.mapper import MetsToEdmMapper
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def main():
    """Command-line entry point: convert one METS/MODS file to EDM XML.

    Parses the command line (input file, edm:provider name and an optional
    ``--data-provider``), maps the parsed document with
    ``MetsToEdmMapper.process_record`` and prints the serialized record to
    standard output.

    Raises:
        SystemExit: with status 1 when the input file does not exist or is
            not well-formed XML. The error text goes to standard error so it
            cannot be mistaken for (or piped on as) EDM output.
    """
    import sys  # local import: only needed for error reporting

    parser = argparse.ArgumentParser(
        description="Process a file with a specified data provider."
    )
    parser.add_argument("file", type=str, help="Path to the input file")
    parser.add_argument(
        "provider",
        type=str,
        help="Name of the edm:provider (institution providing the data to Europeana)",
    )
    parser.add_argument("--data-provider", type=str, help="Name of the data provider")

    args = parser.parse_args()

    try:
        with open(args.file, "rb") as f:
            tree = etree.parse(f)
            print(
                MetsToEdmMapper.process_record(
                    tree, edm_provider=args.provider, data_provider=args.data_provider
                ).serialize()
            )
    except (etree.XMLSyntaxError, FileNotFoundError) as e:
        print(f"Error parsing the file: {e}", file=sys.stderr)
        sys.exit(1)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
if __name__ == "__main__":
|
|
33
|
+
main()
|
|
@@ -0,0 +1,864 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from typing import Any, Callable, Optional, Type, Dict, List, Tuple, Union
|
|
5
|
+
|
|
6
|
+
from edmlib import (
|
|
7
|
+
MixedValuesList,
|
|
8
|
+
ORE_Aggregation,
|
|
9
|
+
SKOS_Concept,
|
|
10
|
+
EDM_Place,
|
|
11
|
+
EDM_TimeSpan,
|
|
12
|
+
EDM_Agent,
|
|
13
|
+
EDM_WebResource,
|
|
14
|
+
SVCS_Service,
|
|
15
|
+
)
|
|
16
|
+
from edmlib.edm import EDM_Record, EDM_ProvidedCHO, Lit, Ref
|
|
17
|
+
from lxml.etree import _Element
|
|
18
|
+
|
|
19
|
+
from .utilities import (
|
|
20
|
+
METS_MODS_NAMESPACES,
|
|
21
|
+
join_tag_texts_xpath,
|
|
22
|
+
literal_list_from_xpath,
|
|
23
|
+
xpath_first_match,
|
|
24
|
+
mods_ns,
|
|
25
|
+
uri_list_from_xpath,
|
|
26
|
+
ModsNameResultsType,
|
|
27
|
+
first_literal_from_xpath,
|
|
28
|
+
CONTEXT_DICT_TYPE,
|
|
29
|
+
context_dict_to_edm_record_dict,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger("mets-to-edm")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
XSL_FILE = os.path.join(os.path.dirname(__file__), "MODSMETS2EDM.xsl")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def retry_with_host_data(func: Callable[..., Any]) -> Callable[..., Any]:
    """Decorate a mapper method so an empty result is retried on the host record.

    The wrapped callable is first invoked with ``dmd_sec``. If its result is
    empty (a falsy value, or a dict whose values are all falsy) and a
    ``host_dmd_sec`` was supplied, it is invoked a second time with the host
    section in place of ``dmd_sec`` and that second result is returned.

    Args:
        func: Callable taking ``(cls, dmd_sec, *args, **kwargs)``.

    Returns:
        A wrapper with the signature
        ``(cls, dmd_sec, host_dmd_sec=None, *args, **kwargs)``.
    """
    import functools  # local import keeps this block self-contained

    def _has_content(values: Any) -> bool:
        # Dict results count as filled as soon as one value is non-empty.
        if isinstance(values, dict):
            return any(x for x in values.values())
        return bool(values)

    @functools.wraps(func)
    def wrapper_retry_with_host_data(
        cls: Type["MetsToEdmMapper"],
        dmd_sec: _Element,
        host_dmd_sec: Optional[_Element] = None,
        *args: Any,
        **kwargs: Any,
    ) -> Any:
        # dmd_sec is passed positionally: passing it by keyword in front of
        # *args made any extra positional argument collide with it
        # ("got multiple values for argument 'dmd_sec'").
        values = func(cls, dmd_sec, *args, **kwargs)

        # Keep the original result if it is usable or nothing can be retried.
        if _has_content(values) or host_dmd_sec is None:
            return values

        # Retry with host dmd_sec
        return func(cls, host_dmd_sec, *args, **kwargs)

    return wrapper_retry_with_host_data
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class MetsToEdmMapper:
|
|
65
|
+
SUBJECT_SUBELEMENTS_MAPPING = {
|
|
66
|
+
mods_ns("topic"): ("dc_subject", SKOS_Concept),
|
|
67
|
+
mods_ns("geographic"): ("dcterms_spatial", EDM_Place),
|
|
68
|
+
mods_ns("temporal"): ("dcterms_temporal", EDM_TimeSpan),
|
|
69
|
+
mods_ns("titleInfo"): ("dc_subject", SKOS_Concept),
|
|
70
|
+
mods_ns("name"): ("dc_subject", EDM_Agent),
|
|
71
|
+
mods_ns("genre"): ("dc_type", SKOS_Concept),
|
|
72
|
+
mods_ns("cartographics"): (None, None),
|
|
73
|
+
mods_ns("hierarchicalGeographic"): (None, None),
|
|
74
|
+
mods_ns("geographicCode"): (None, None),
|
|
75
|
+
mods_ns("occupation"): (None, None),
|
|
76
|
+
# TODO: maybe also support hierarchicalGeographic, cartographics, geographicCode, occupation
|
|
77
|
+
}
|
|
78
|
+
CREATOR_ROLES = ["aut", "cmp", "art", "pht", "edt"]
|
|
79
|
+
PUBLISHER_ROLES = ["pbl", "isb"]
|
|
80
|
+
SUBJECT_ROLES = ["rcp"]
|
|
81
|
+
OTHER_ROLES = [
|
|
82
|
+
"ctb",
|
|
83
|
+
"trl",
|
|
84
|
+
"prt",
|
|
85
|
+
"oth",
|
|
86
|
+
"egr",
|
|
87
|
+
"cns",
|
|
88
|
+
"ill",
|
|
89
|
+
"chr",
|
|
90
|
+
"wst",
|
|
91
|
+
"dto",
|
|
92
|
+
"asn",
|
|
93
|
+
"lyr",
|
|
94
|
+
]
|
|
95
|
+
IGNORE_ROLES = ["his"]
|
|
96
|
+
|
|
97
|
+
# @classmethod
|
|
98
|
+
# def get_file_from_logical_div(cls,record: _Element, logical_div: _Element):
|
|
99
|
+
# div_id = logical_div.get("ID")
|
|
100
|
+
|
|
101
|
+
@classmethod
def get_main_structmap_div(cls, record: _Element) -> _Element:
    """Return the logical structMap div that describes the record itself.

    Preference is given to the first ``mets:div`` in the LOGICAL structMap
    that carries a ``DMDID`` and has no ``mets:mptr`` child. If there is no
    such div, the first div with a ``DMDID`` is used instead.

    Args:
        record: Element the structMap XPath expressions are evaluated on.

    Returns:
        The selected ``mets:div`` element.

    Raises:
        AssertionError: if no div with a ``DMDID`` exists at all.
    """
    # No filtering on the div TYPE (article, issue, volume, ...) is applied.
    candidates = record.xpath(
        "mets:structMap[@TYPE='LOGICAL']//mets:div[@DMDID and not(mets:mptr)]",
        namespaces=METS_MODS_NAMESPACES,
    )
    if candidates:
        return candidates[0]

    # Fallback: accept divs that only point elsewhere via mets:mptr.
    fallback = record.xpath(
        "(mets:structMap[@TYPE='LOGICAL']//mets:div[@DMDID])[1]",
        namespaces=METS_MODS_NAMESPACES,
    )
    assert len(fallback) > 0, "Could not find starting div in structmap"
    return fallback[0]
|
|
125
|
+
|
|
126
|
+
@classmethod
def get_mods_part(cls, record: _Element, dmdid: str) -> _Element:
    """Return the ``mods:mods`` element of the dmdSec with the given ID.

    Args:
        record: Element the ``mets:dmdSec`` lookup is evaluated on.
        dmdid: Value of the ``ID`` attribute of the wanted ``mets:dmdSec``.

    Returns:
        The first ``mods:mods`` element wrapped in that dmdSec.

    Raises:
        AssertionError: if not exactly one matching element is found.
    """
    # The ID is bound as an XPath variable instead of being formatted into
    # the expression, so IDs containing quotes cannot break or alter it.
    dmd_secs = record.xpath(
        "mets:dmdSec[@ID=$dmdid]/mets:mdWrap/mets:xmlData/mods:mods[1]",
        namespaces=METS_MODS_NAMESPACES,
        dmdid=dmdid,
    )
    assert len(dmd_secs) == 1, f"dmdsec not found or multiples for id {dmdid}"
    return dmd_secs[0]
|
|
134
|
+
|
|
135
|
+
@classmethod
def get_host_dmd_sec(
    cls, record: _Element, dmd_sec: _Element, logical_main_div: _Element
) -> Optional[_Element]:
    """Find the descriptive metadata of the host (parent) of a record.

    Two sources are tried in order: a ``mods:relatedItem[@type='host']``
    inside ``dmd_sec``, then the MODS section referenced by the nearest
    ancestor ``mets:div`` of ``logical_main_div`` that has a ``DMDID``.

    Args:
        record: Element used to resolve the ancestor's ``DMDID``.
        dmd_sec: MODS element of the record itself.
        logical_main_div: Logical structMap div of the record.

    Returns:
        The host element, or ``None`` if neither source yields one.
    """
    related_hosts = dmd_sec.xpath(
        "mods:relatedItem[@type='host']", namespaces=METS_MODS_NAMESPACES
    )
    if related_hosts:
        return related_hosts[0]

    parent_divs = logical_main_div.xpath(
        "ancestor::mets:div[@DMDID][1]", namespaces=METS_MODS_NAMESPACES
    )
    if parent_divs:
        return cls.get_mods_part(record, dmdid=parent_divs[0].get("DMDID"))

    return None
|
|
151
|
+
|
|
152
|
+
@classmethod
def get_amd_part(cls, record: _Element, amdid: str) -> list[_Element]:
    """Return the ``mets:amdSec`` elements matching the given ID.

    Args:
        record: Element the ``mets:amdSec`` lookup is evaluated on.
        amdid: Value of the ``ID`` attribute of the wanted ``mets:amdSec``.

    Returns:
        The XPath result list; empty when no section has that ID.
    """
    # Bind the ID as an XPath variable rather than formatting it into the
    # expression, so quotes inside the ID cannot break or alter the query.
    return record.xpath(
        "mets:amdSec[@ID=$amdid][1]",
        namespaces=METS_MODS_NAMESPACES,
        amdid=amdid,
    )
|
|
158
|
+
|
|
159
|
+
@classmethod
def process_title_tag(cls, title_element: _Element) -> tuple[str, Lit]:
    """Build one title literal from a ``mods:titleInfo``-like element.

    The title is assembled as ``nonSort`` + ``title``, followed by
    ``": " + subTitle``, ``" " + partNumber`` and ``": " + partName`` for
    each of those parts that is present.

    Args:
        title_element: Element containing the MODS title sub-elements.

    Returns:
        A ``(property_name, literal)`` tuple. The property is
        ``"dcterms_alternative"`` when the element has a ``type`` attribute
        and ``"dc_title"`` otherwise.
    """
    # TODO: consider whitespace handling and separators
    title = join_tag_texts_xpath(title_element, "mods:nonSort")
    title += join_tag_texts_xpath(title_element, "mods:title")

    # (xpath, separator between repeated tags, prefix before the joined part)
    optional_parts = (
        ("mods:subTitle", "; ", ": "),
        ("mods:partNumber", ", ", " "),
        ("mods:partName", ", ", ": "),
    )
    for xpath, separator, prefix in optional_parts:
        part = join_tag_texts_xpath(title_element, xpath, separator=separator)
        if part:
            title += prefix + part

    # TODO: languages: either from attrs lang/xml:lang on titleInfo or subtags, or from document language

    edm_property = "dcterms_alternative" if title_element.get("type") else "dc_title"
    return (edm_property, Lit(value=title))
|
|
182
|
+
|
|
183
|
+
@classmethod
def get_titles(
    cls, dmd_sec: _Element, host_dmd_sec: Optional[_Element] = None
) -> Dict[str, List[Lit]]:
    """Collect ``dc_title`` and ``dcterms_alternative`` literals for a record.

    Titles come from three sources:

    1. every ``mods:titleInfo`` of ``dmd_sec``;
    2. if a part suffix (volume/issue/other number, else the first
       ``mods:part/mods:date``) and a host section are available, each host
       title with the suffix appended;
    3. if still nothing was found, the ``LABEL`` of ``mets:mets`` plus the
       suffix, as a last resort.

    Args:
        dmd_sec: MODS element of the record.
        host_dmd_sec: Optional MODS element of the host record.

    Returns:
        Dict with the keys ``"dcterms_alternative"`` and ``"dc_title"``,
        each mapping to a (possibly empty) list of literals.
    """
    title_properties: Dict[str, List[Lit]] = {
        "dcterms_alternative": [],
        "dc_title": [],
    }
    for title_info in dmd_sec.xpath(
        "mods:titleInfo", namespaces=METS_MODS_NAMESPACES
    ):
        title_type, title = cls.process_title_tag(title_info)
        title_properties[title_type].append(title)

    # Derive a suffix from the part numbering (volume / issue / anything else).
    volume = None
    issue = None
    others = []
    detail_numbers = dmd_sec.xpath(
        "mods:part/mods:detail[mods:number]",
        namespaces=METS_MODS_NAMESPACES,
    )
    for detail_number in detail_numbers:
        number = detail_number.find(
            "mods:number", namespaces=METS_MODS_NAMESPACES
        ).text
        if not number:
            # An empty mods:number carries no information; do not let it
            # overwrite or shadow a usable value.
            continue
        detail_type = detail_number.get("type")
        if detail_type == "volume":
            volume = number
        elif detail_type == "issue":
            issue = number
        else:
            others.append(number)

    if volume and issue:
        suffix = f"{volume}/{issue}"
    else:
        suffix = volume or issue or (others[0] if others else None)

    if not suffix:
        date_suffix = dmd_sec.xpath(
            "mods:part/mods:date[1]/text()", namespaces=METS_MODS_NAMESPACES
        )
        suffix = date_suffix[0] if date_suffix else None

    if suffix and host_dmd_sec is not None:
        for host_title_type, host_titles in cls.get_titles(host_dmd_sec).items():
            for host_title in host_titles:
                title_properties[host_title_type].append(
                    Lit(value=host_title.value + " " + suffix)
                )

    if (
        not title_properties["dcterms_alternative"]
        and not title_properties["dc_title"]
    ):
        # if still no title try the mets:mets/@LABEL as last resort
        labels = dmd_sec.xpath("/mets:mets/@LABEL", namespaces=METS_MODS_NAMESPACES)
        # The attribute is optional: indexing the empty result used to raise
        # IndexError instead of returning the (empty) title dict.
        mets_label = labels[0] if labels else None
        if suffix and mets_label:
            title_properties["dc_title"].append(
                Lit(value=mets_label + " " + suffix)
            )
    return title_properties
|
|
247
|
+
|
|
248
|
+
@classmethod
def get_descriptions(cls, dmd_sec: _Element) -> MixedValuesList:
    """Collect description literals from ``mods:note`` and ``mods:abstract``.

    Notes that have a ``type`` attribute are rendered as ``"<type>: <text>"``.

    Args:
        dmd_sec: MODS element of the record.

    Returns:
        Literals for all notes followed by literals for all abstracts.
    """

    def note_string_extract(tag: _Element) -> str:
        # tag.text is None for an empty element; concatenating it used to
        # raise TypeError. An empty note yields an empty string instead.
        # NOTE(review): confirm literal_list_from_xpath drops empty strings.
        text = tag.text
        if not text:
            return ""
        note_type = tag.get("type")
        return f"{note_type}: {text}" if note_type else text

    notes = literal_list_from_xpath(
        dmd_sec, "mods:note", string_extract_function=note_string_extract
    )
    abstracts = literal_list_from_xpath(dmd_sec, "mods:abstract")
    return notes + abstracts
|
|
260
|
+
|
|
261
|
+
@classmethod
def get_identifiers(cls, dmd_sec: _Element) -> List[Lit]:
    """Collect identifier literals of a record.

    Args:
        dmd_sec: MODS element of the record.

    Returns:
        Literals from ``mods:recordInfo/mods:recordIdentifier`` followed by
        literals from ``mods:identifier``.
    """
    record_identifiers = literal_list_from_xpath(
        dmd_sec, "mods:recordInfo/mods:recordIdentifier"
    )
    plain_identifiers = literal_list_from_xpath(dmd_sec, "mods:identifier")
    return record_identifiers + plain_identifiers
|
|
266
|
+
|
|
267
|
+
@classmethod
def get_edm_type(
    cls, dmd_sec: _Element, logical_main_div: Optional[_Element] = None
) -> Lit:
    """Return the edm:type of the record.

    This implementation ignores both arguments and always answers
    ``"TEXT"``.

    Args:
        dmd_sec: MODS element of the record (unused here).
        logical_main_div: Logical structMap div of the record (unused here).

    Returns:
        The literal ``TEXT``.
    """
    edm_type = Lit(value="TEXT")
    return edm_type
|
|
272
|
+
|
|
273
|
+
@classmethod
def parse_mods_subjects(
    cls, dmd_sec: _Element, context_objects: CONTEXT_DICT_TYPE
) -> Dict[str, List[Union[Lit, Ref]]]:
    """Map the children of every ``mods:subject`` to EDM property values.

    Each child is looked up in ``SUBJECT_SUBELEMENTS_MAPPING`` to find the
    target EDM property and the context class. Children with a ``valueURI``
    become context objects (stored in ``context_objects`` under their URI)
    and are referenced; all others are added as plain literals.

    Args:
        dmd_sec: MODS element of the record.
        context_objects: Dict that receives created context objects; it is
            modified in place.

    Returns:
        Dict mapping EDM property names to lists of literals/references.
    """
    subjects = dmd_sec.findall("mods:subject", namespaces=METS_MODS_NAMESPACES)
    edm_values: dict[str, list[Lit | Ref]] = defaultdict(list)
    for subject in subjects:
        for subject_subelement in subject:
            if not isinstance(subject_subelement.tag, str):
                # Comments and processing instructions are not subject data.
                continue
            # Unmapped tags are treated like explicitly unimplemented ones
            # instead of aborting the whole record with a KeyError.
            edm_property, context_class = cls.SUBJECT_SUBELEMENTS_MAPPING.get(
                subject_subelement.tag, (None, None)
            )
            if edm_property is None:
                logger.warning(
                    f"unimplemented mods:subject subelement {subject_subelement.tag}"
                )
                continue
            if subject_subelement.tag == mods_ns("titleInfo"):
                pref_label = cls.process_title_tag(subject_subelement)[1]
            elif subject_subelement.tag == mods_ns("name"):
                person = cls.parse_mods_name(subject_subelement)
                if not person:
                    continue
                elif isinstance(person, EDM_Agent):
                    # Agents are already complete context objects.
                    context_objects[person.id.value] = person
                    edm_values[edm_property].append(person.id)
                    continue
                else:
                    pref_label = Lit(value=person.value)
            else:
                text = subject_subelement.text
                if not text or not text.strip():
                    # Nothing to build a label from.
                    continue
                pref_label = Lit(value=text)

            if subject_subelement.get("valueURI"):
                context_object = context_class(
                    id=Ref(value=subject_subelement.get("valueURI")),
                    skos_prefLabel=[pref_label],
                )
                context_objects[context_object.id.value] = context_object
                edm_values[edm_property].append(context_object.id)
            else:
                edm_values[edm_property].append(pref_label)
    return edm_values
|
|
314
|
+
|
|
315
|
+
@classmethod
def get_subjects(cls, dmd_sec: _Element) -> MixedValuesList:
    """Collect subject literals from the intranda MODS extension.

    Args:
        dmd_sec: MODS element of the record.

    Returns:
        Literals from ``intranda:subjectPerson`` followed by literals from
        ``intranda:Topic``.
    """
    # intranda extension:
    subject_persons = literal_list_from_xpath(
        dmd_sec, "mods:extension/intranda:intranda/intranda:subjectPerson"
    )
    topics = literal_list_from_xpath(dmd_sec, "mods:extension/intranda:Topic")
    return subject_persons + topics
|
|
321
|
+
|
|
322
|
+
@classmethod
def parse_logical_main_div_type(
    cls, logical_main_div: Optional[_Element] = None
) -> List[Lit]:
    """Turn the ``TYPE`` attribute of the logical main div into a literal.

    Args:
        logical_main_div: Logical structMap div of the record, if any.

    Returns:
        A one-element list with the ``TYPE`` value, or an empty list when
        there is no div or the attribute is missing or empty.
    """
    if logical_main_div is None:
        return []
    div_type = logical_main_div.get("TYPE")
    return [Lit(value=div_type)] if div_type else []
|
|
331
|
+
|
|
332
|
+
@classmethod
def get_types(
    cls, dmd_sec: _Element, logical_main_div: Optional[_Element] = None
) -> MixedValuesList:
    """Collect type values of a record.

    Args:
        dmd_sec: MODS element of the record.
        logical_main_div: Logical structMap div of the record, if any.

    Returns:
        Concatenation, in this order, of: intranda ``ObjectType`` values,
        ``mods:form[@type='technique']`` values, ``mods:genre`` values and
        the ``TYPE`` of the logical main div.
    """
    # intranda extension:
    object_types = literal_list_from_xpath(
        dmd_sec, "mods:extension/intranda:ObjectType"
    )
    techniques = literal_list_from_xpath(
        dmd_sec, "mods:physicalDescription/mods:form[@type='technique']"
    )
    genres = literal_list_from_xpath(dmd_sec, "mods:genre")
    div_types = cls.parse_logical_main_div_type(logical_main_div)
    return object_types + techniques + genres + div_types
|
|
345
|
+
|
|
346
|
+
@classmethod
def get_temporals(cls, dmd_sec: _Element) -> MixedValuesList:
    """Collect temporal values from the intranda ``TopicPeriod`` extension.

    Args:
        dmd_sec: MODS element of the record.

    Returns:
        Literals for every ``mods:extension/intranda:TopicPeriod``.
    """
    # intranda extension:
    topic_periods = literal_list_from_xpath(
        dmd_sec, "mods:extension/intranda:TopicPeriod"
    )
    return topic_periods
|
|
350
|
+
|
|
351
|
+
@classmethod
def get_spatials(cls, dmd_sec: _Element) -> MixedValuesList:
    """Collect spatial values of a record.

    Args:
        dmd_sec: MODS element of the record.

    Returns:
        Literals from the intranda ``TopicRoom`` extension followed by the
        textual place terms of ``mods:originInfo``.
    """
    sources = (
        # intranda extension:
        "mods:extension/intranda:TopicRoom",
        "mods:originInfo/mods:place/mods:placeTerm[@type='text']",
    )
    topic_rooms, origin_places = (
        literal_list_from_xpath(dmd_sec, xpath) for xpath in sources
    )
    return topic_rooms + origin_places
|
|
361
|
+
|
|
362
|
+
@classmethod
def get_mediums(cls, dmd_sec: _Element) -> MixedValuesList:
    """Collect medium values from ``mods:physicalDescription/mods:form``.

    Forms typed ``technique`` or ``dimensions`` are excluded by the XPath.

    Args:
        dmd_sec: MODS element of the record.

    Returns:
        Literals for the remaining ``mods:form`` elements.
    """
    # TODO: check if there is a valueURI to create vocabulary references
    form_xpath = (
        "mods:physicalDescription/mods:form"
        "[not(@type='technique') and not(@type='dimensions')]"
    )
    return literal_list_from_xpath(dmd_sec, form_xpath)
|
|
368
|
+
|
|
369
|
+
@classmethod
def get_extent(cls, dmd_sec: _Element) -> MixedValuesList:
    """Collect extent values of a record.

    Args:
        dmd_sec: MODS element of the record.

    Returns:
        Literals from ``mods:physicalDescription/mods:extent`` followed by
        literals from ``mods:form[@type='dimensions']``.
    """
    extents = literal_list_from_xpath(
        dmd_sec, "mods:physicalDescription/mods:extent"
    )
    dimensions = literal_list_from_xpath(
        dmd_sec, "mods:physicalDescription/mods:form[@type='dimensions']"
    )
    return extents + dimensions
|
|
376
|
+
|
|
377
|
+
@classmethod
def get_languages(cls, dmd_sec: _Element) -> List[str]:
    """Return the text of every ``mods:language/mods:languageTerm``.

    The values are returned as found in the record.

    Args:
        dmd_sec: MODS element of the record.

    Returns:
        The XPath text results, in document order.
    """
    # TODO: convert to ISO language codes
    return dmd_sec.xpath(
        "mods:language/mods:languageTerm/text()", namespaces=METS_MODS_NAMESPACES
    )
|
|
384
|
+
|
|
385
|
+
@classmethod
def parse_mods_date(
    cls, dmd_sec: _Element, date_element_name: str
) -> Optional[List[Lit]]:
    """Reduce the MODS date elements matched by an XPath to one literal.

    Elements with ``point="start"`` / ``point="end"`` form a range; among
    the remaining elements the first one is used unless a later one is
    flagged ``keyDate="yes"``.

    Args:
        dmd_sec: MODS element of the record.
        date_element_name: XPath (relative to ``dmd_sec``) selecting the
            date elements, e.g. ``mods:originInfo/mods:dateIssued``.

    Returns:
        ``[Lit("<start>-<end>")]`` when both range ends are present, else
        the single date, else a half-open range (``"<start>-"`` or
        ``"-<end>"``), or ``None`` when nothing usable was found.
    """
    dates = dmd_sec.xpath(date_element_name, namespaces=METS_MODS_NAMESPACES)
    start = ""
    end = ""
    other = ""
    for date in dates:
        # Empty elements have text None; normalise to "" so the string
        # concatenation below cannot raise TypeError.
        text = date.text or ""
        point = date.get("point")
        if point == "start":
            start = text
        elif point == "end":
            end = text
        elif date.get("keyDate") == "yes" or not other:
            other = text
    # TODO: consider other date attributes, like qualifier for approximate/inferred/questionable
    if start and end:
        return [Lit(value=start + "-" + end)]
    elif other:
        return [Lit(value=other)]
    elif start or end:
        return [Lit(value=start + "-" + end)]
    else:
        return None
|
|
410
|
+
|
|
411
|
+
@classmethod
def get_issued(cls, dmd_sec: _Element) -> Optional[List[Lit]]:
    """Return the issue date taken from ``mods:originInfo/mods:dateIssued``.

    Args:
        dmd_sec: MODS element of the record.

    Returns:
        The result of ``parse_mods_date`` for the ``dateIssued`` elements.
    """
    issued_xpath = "mods:originInfo/mods:dateIssued"
    return cls.parse_mods_date(dmd_sec, issued_xpath)
|
|
414
|
+
|
|
415
|
+
@classmethod
def get_created(cls, dmd_sec: _Element) -> Optional[List[Lit]]:
    """Return the creation date taken from ``mods:originInfo/mods:dateCreated``.

    Args:
        dmd_sec: MODS element of the record.

    Returns:
        The result of ``parse_mods_date`` for the ``dateCreated`` elements.
    """
    created_xpath = "mods:originInfo/mods:dateCreated"
    return cls.parse_mods_date(dmd_sec, created_xpath)
|
|
418
|
+
|
|
419
|
+
@classmethod
@retry_with_host_data
def get_publishers(
    cls, dmd_sec: _Element, host_dmd_sec: Optional[_Element] = None
) -> List[Lit]:
    """Collect publisher literals from ``mods:originInfo/mods:publisher``.

    The method is wrapped by ``retry_with_host_data``.

    Args:
        dmd_sec: MODS element of the record.
        host_dmd_sec: Optional MODS element of the host record; not read by
            this body itself.

    Returns:
        Literals for every publisher element found.
    """
    publisher_xpath = "mods:originInfo/mods:publisher"
    return literal_list_from_xpath(dmd_sec, publisher_xpath)
|
|
425
|
+
|
|
426
|
+
@classmethod
def get_full_name_from_name_tag(cls, name_tag: _Element) -> str:
    """Build a display name from a ``mods:name``-like element.

    Sources are tried in order: ``mods:displayForm``; the typed name parts
    (given, family, termsOfAddress) joined by single spaces; finally the
    untyped ``mods:namePart`` elements.

    Args:
        name_tag: Element containing the MODS name sub-elements.

    Returns:
        The assembled name; whatever the untyped lookup yields when no
        other source has a value.
    """
    # first try displayForm
    display_form = name_tag.find(
        "mods:displayForm", namespaces=METS_MODS_NAMESPACES
    )
    if display_form is not None and display_form.text:
        return display_form.text

    # otherwise join nameparts based on type
    typed_parts = [
        join_tag_texts_xpath(
            name_tag, f"mods:namePart[@type='{part_type}']", separator=" "
        )
        for part_type in ("given", "family", "termsOfAddress")
    ]
    # Skip missing parts so a gap (e.g. no family name) does not leave a
    # doubled space inside the name.
    name = " ".join(part for part in typed_parts if part).strip()
    if name:
        return name

    # otherwise use nameparts without type
    return join_tag_texts_xpath(
        name_tag, "mods:namePart[not(@type)]", separator=" "
    )
|
|
453
|
+
|
|
454
|
+
@classmethod
def parse_mods_name(cls, name_tag: _Element) -> Lit | EDM_Agent | None:
    """Convert a ``mods:name`` element into a literal or an agent.

    Args:
        name_tag: The ``mods:name`` element to convert.

    Returns:
        ``None`` when there is neither a name nor a URI; a plain literal
        when there is only a name; otherwise an ``EDM_Agent`` carrying the
        name as ``skos_prefLabel`` and alternative names as
        ``skos_altLabel``.
    """
    uri = name_tag.get("valueURI") or name_tag.get("nameIdentifier")

    # name
    name = cls.get_full_name_from_name_tag(name_tag)
    name_lit = Lit(value=name) if name else None

    # then do alternativeNames as well
    alt_names = []
    for alt_name_tag in name_tag.findall(
        "mods:alternativeName", namespaces=METS_MODS_NAMESPACES
    ):
        alt_name = cls.get_full_name_from_name_tag(alt_name_tag)
        if alt_name:
            alt_names.append(Lit(value=alt_name))
    # TODO: maybe also support altRepGroup in the future

    if not uri:
        if not name_lit:
            return None
        if not alt_names:
            return name_lit
        # NOTE(review): every URI-less agent gets the same id "agent", so
        # such agents may overwrite each other where ids are used as keys.
        uri = "agent"

    return EDM_Agent(
        id=Ref(value=uri),
        skos_prefLabel=[name_lit] if name_lit else None,
        skos_altLabel=alt_names,
    )
|
|
485
|
+
|
|
486
|
+
@classmethod
def get_edm_property_for_roles(cls, roles: List[str]) -> Optional[str]:
    """Choose the EDM property a name should be mapped to from its roles.

    Roles are inspected in order. An ignored role ends the search with
    ``None`` and a creator role ends it with ``"dc_creator"``; publisher
    and subject roles update the pending answer, and unknown roles are
    logged.

    Args:
        roles: Role codes attached to the name.

    Returns:
        The EDM property name (``"dc_contributor"`` when no role decides
        otherwise), or ``None`` if the name must be ignored.
    """
    chosen = "dc_contributor"
    for code in roles:
        if code in cls.IGNORE_ROLES:
            return None
        if code in cls.CREATOR_ROLES:
            return "dc_creator"
        if code in cls.PUBLISHER_ROLES:
            chosen = "dc_publisher"
            continue
        if code in cls.SUBJECT_ROLES:
            chosen = "dc_subject"
            continue
        if code not in cls.OTHER_ROLES:
            logger.warning(
                f'Unknown Role: "{code}", falling back to contributor'
            )
    return chosen
|
|
503
|
+
|
|
504
|
+
@classmethod
@retry_with_host_data
def parse_mods_names(
    cls, dmd_sec: _Element, context_objects: CONTEXT_DICT_TYPE
) -> ModsNameResultsType:
    """Map every ``mods:name`` of a record to EDM properties by role.

    Names that resolve to an ``EDM_Agent`` are stored in ``context_objects``
    under their id and referenced by that id. Names with the role ``fmo``
    additionally produce English and German ``dcterms_provenance``
    literals. The method is wrapped by ``retry_with_host_data``.

    Args:
        dmd_sec: MODS element of the record.
        context_objects: Dict that receives created agents; it is modified
            in place.

    Returns:
        Dict with the keys ``dc_creator``, ``dc_publisher``,
        ``dc_contributor``, ``dcterms_provenance`` and ``dc_subject``.
    """
    name_results = {
        "dc_creator": [],
        "dc_publisher": [],
        "dc_contributor": [],
        "dcterms_provenance": [],
        "dc_subject": [],
    }
    for name_tag in dmd_sec.findall("mods:name", namespaces=METS_MODS_NAMESPACES):
        literal_or_agent = cls.parse_mods_name(name_tag)
        if not literal_or_agent:
            continue
        name_value = literal_or_agent
        if isinstance(literal_or_agent, EDM_Agent):
            context_objects[literal_or_agent.id.value] = literal_or_agent
            name_value = literal_or_agent.id

        roles = [
            r.text
            for r in name_tag.findall(
                "mods:role/mods:roleTerm", namespaces=METS_MODS_NAMESPACES
            )
        ]
        if "fmo" in roles:
            if isinstance(literal_or_agent, EDM_Agent):
                # An agent built from a URI alone has no prefLabel; fall
                # back to its id rather than failing on the missing label.
                labels = literal_or_agent.skos_prefLabel
                former_owner_value = (
                    labels[0].value if labels else literal_or_agent.id.value
                )
            else:
                former_owner_value = literal_or_agent.value
            name_results["dcterms_provenance"] += [
                Lit(
                    value="Former owner: " + former_owner_value,
                    lang="en",
                ),
                Lit(
                    value="Frühere:r Eigentümer:in: " + former_owner_value,
                    lang="de",
                ),
            ]
            # Drop every "fmo" entry, not just the first one.
            roles = [role for role in roles if role != "fmo"]
            if not roles:
                # Only a former owner: nothing more to map for THIS name.
                # (A `break` here silently dropped all following names.)
                continue

        edm_property = cls.get_edm_property_for_roles(roles)
        if edm_property:
            name_results[edm_property].append(name_value)
    return name_results
|
|
555
|
+
|
|
556
|
+
@classmethod
def get_edm_rights(cls, dmd_sec: _Element) -> Ref:
    """Determine the edm:rights reference from ``mods:accessCondition``.

    Sources are tried in order: the ``xlink:href`` attribute, the
    ``mods:valueURI`` attribute, then the text of the first non-empty
    ``mods:accessCondition`` whose ``type`` is not ``hide``. In every case
    ``https://`` is rewritten to ``http://``.

    Args:
        dmd_sec: MODS element of the record.

    Returns:
        The rights statement as a reference.

    Raises:
        Exception: if none of the sources yields a value.
    """

    def as_rights_ref(uri: str) -> Ref:
        return Ref(value=uri.replace("https://", "http://"))

    attribute_xpaths = (
        "mods:accessCondition[@xlink:href][1]/@xlink:href",
        "mods:accessCondition[@mods:valueURI][1]/@mods:valueURI",
    )
    for attribute_xpath in attribute_xpaths:
        uris = dmd_sec.xpath(attribute_xpath, namespaces=METS_MODS_NAMESPACES)
        if uris:
            return as_rights_ref(uris[0])

    access_conditions = dmd_sec.xpath(
        "mods:accessCondition[@type!='hide']", namespaces=METS_MODS_NAMESPACES
    )
    for access_condition in access_conditions:
        # Empty elements have text None; skip them instead of failing with
        # AttributeError, and keep looking for one that holds a value.
        text = (access_condition.text or "").strip()
        if text:
            return as_rights_ref(text)

    raise Exception("no corresponding field for edm:rights found")
|
|
581
|
+
|
|
582
|
+
@classmethod
|
|
583
|
+
def get_data_provider(
    cls, dmd_sec: _Element, amd_sec: _Element, default: Optional[str] = None
) -> Lit:
    """Return the edm:dataProvider literal.

    Args:
        dmd_sec: Descriptive metadata section; unused here, available to overrides.
        amd_sec: Administrative metadata section that is searched for
            ``mets:rightsMD/mets:mdWrap/mets:xmlData/dv:rights/dv:owner``.
        default: If given (and non-empty), returned as-is without looking at the record.

    Returns:
        Lit wrapping ``default`` or the text of the ``dv:owner`` element.

    Raises:
        Exception: If no ``default`` is given and the record has no ``dv:owner`` text.
    """
    if default:
        return Lit(value=default)

    data_provider = amd_sec.find(
        "mets:rightsMD/mets:mdWrap/mets:xmlData/dv:rights/dv:owner",
        namespaces=METS_MODS_NAMESPACES,
    )
    # Fail with an explicit message instead of an AttributeError on ``None.text``.
    if data_provider is None or not data_provider.text:
        raise Exception(
            "no corresponding field for edm:dataProvider found. "
            "Either override get_data_provider or provide a default value to process_record."
        )
    return Lit(value=data_provider.text)
|
|
594
|
+
|
|
595
|
+
@classmethod
|
|
596
|
+
def get_provider(cls, default: Optional[str]) -> Lit:
    """Return the edm:provider literal built from ``default``.

    Args:
        default: Value for edm:provider, as passed to ``process_record``.

    Returns:
        Lit wrapping ``default``.

    Raises:
        AssertionError: If ``default`` is missing or empty.
    """
    # Raised explicitly rather than via ``assert`` so the check is not stripped
    # when Python runs with ``-O``; the exception type and message are unchanged.
    if not default:
        raise AssertionError(
            "Missing value for edm:provider. Either override get_provider or provide a default value to process_record."
        )
    return Lit(value=default)
|
|
601
|
+
|
|
602
|
+
@classmethod
|
|
603
|
+
def get_is_part_of(cls, dmd_sec: _Element) -> List[Any]:
    """Return the dcterms:isPartOf values; this default mapping yields none.

    Args:
        dmd_sec: Descriptive metadata section; not inspected by this implementation.

    Returns:
        A new empty list.
    """
    return list()
|
|
605
|
+
|
|
606
|
+
@classmethod
|
|
607
|
+
def get_referenced_by(
    cls, dmd_sec: _Element, contex_objects: CONTEXT_DICT_TYPE
) -> MixedValuesList:
    """Return the dcterms:isReferencedBy values; this default mapping yields none.

    Args:
        dmd_sec: Descriptive metadata section; not inspected by this implementation.
        contex_objects: Dictionary of contextual objects; not modified by this implementation.

    Returns:
        A new empty list.
    """
    return list()
|
|
611
|
+
|
|
612
|
+
@classmethod
|
|
613
|
+
def get_current_location(cls, dmd_sec: _Element) -> Optional[Lit]:
    """Build edm:currentLocation from the first ``mods:location`` element.

    The physical location and the shelf locators are joined with ``" ; "``.
    Empty parts are left out, so a record that only has shelf locators does not
    produce a value starting with a dangling separator.

    Args:
        dmd_sec: Element the ``mods:location`` queries are evaluated against.

    Returns:
        Lit with the combined location, or None if neither part has text.
    """
    location = join_tag_texts_xpath(
        dmd_sec, "mods:location[1]/mods:physicalLocation[1]"
    )
    shelf_locator = join_tag_texts_xpath(
        dmd_sec, "mods:location[1]/mods:shelfLocator", separator=" ; "
    )
    full_location = " ; ".join(part for part in (location, shelf_locator) if part)
    return Lit(value=full_location) if full_location else None
|
|
622
|
+
|
|
623
|
+
@classmethod
|
|
624
|
+
def get_iiif_image_api_service(cls, url: str) -> Optional[SVCS_Service]:
    """Hook for deriving an IIIF Image API service from a WebResource URL.

    This default implementation never produces a service. Institution specific
    subclasses override it to build the SVCS_Service object for the given url.

    Args:
        url: URL of a WebResource

    Returns:
        SVCS_Service object, or None if no IIIF Image API service can be
        extracted from the url (always None here).
    """
    service: Optional[SVCS_Service] = None
    return service
|
|
634
|
+
|
|
635
|
+
@classmethod
|
|
636
|
+
def get_iiif_manifest_url(cls, amd_sec: _Element) -> Optional[List[Ref]]:
    """Return the IIIF manifest reference found in the administrative metadata.

    Args:
        amd_sec: Element searched for
            ``mets:digiprovMD/mets:mdWrap/mets:xmlData/dv:links/dv:iiif``.

    Returns:
        A one-element list holding the manifest URL as Ref, or None if the
        element is missing or has no text.
    """
    iiif_manifest = amd_sec.find(
        "mets:digiprovMD/mets:mdWrap/mets:xmlData/dv:links/dv:iiif",
        namespaces=METS_MODS_NAMESPACES,
    )
    if iiif_manifest is None:
        return None

    # An empty <dv:iiif/> element has ``text`` None; treat it like a missing
    # element instead of building a Ref without a value.
    manifest_url = (iiif_manifest.text or "").strip()
    if not manifest_url:
        return None
    return [Ref(value=manifest_url)]
|
|
644
|
+
|
|
645
|
+
@classmethod
|
|
646
|
+
def query_url_for_div(
    cls, div: _Element, file_sec: _Element, file_grp: str
) -> Optional[str]:
    """Resolve the file URL that a structMap div points to for one file group.

    The first ``mets:fptr`` child of ``div`` whose FILEID contains ``file_grp``
    is looked up in ``file_sec``, and the ``xlink:href`` of its URL-typed
    ``mets:FLocat`` is returned.

    Args:
        div: ``mets:div`` element whose ``mets:fptr`` children are inspected.
        file_sec: Element searched for the referenced ``mets:file``.
        file_grp: Substring that the FILEID must contain (e.g. "DEFAULT", "PDF").

    Returns:
        The file URL, or None if the div has no matching ``mets:fptr``.

    Raises:
        AssertionError: If a matching ``mets:fptr`` exists but no URL can be
            found for its FILEID.
    """
    fptr_id = xpath_first_match(
        div,
        f"mets:fptr[contains(@FILEID,'{file_grp}')]/@FILEID",
    )
    if not fptr_id:
        return None

    file_url = xpath_first_match(
        file_sec,
        f".//mets:file[@ID='{fptr_id}'][1]/mets:FLocat[@LOCTYPE='URL']/@xlink:href",
    )
    # Raised explicitly rather than via ``assert`` so the consistency check is
    # not stripped when Python runs with ``-O``.
    if not file_url:
        raise AssertionError(f"file with ID {fptr_id} not found")
    return file_url
|
|
662
|
+
|
|
663
|
+
@classmethod
|
|
664
|
+
def query_shownBy_urls(
    cls,
    physical_div: _Element,
    file_sec: _Element,
    xpath_query_pages: str = "mets:div[@TYPE='page']",
    file_grp: str = "DEFAULT",
) -> List[str]:
    """Collect the file URLs of all page divs below the physical structMap div.

    Args:
        physical_div: Div of the physical structMap; may be None, in which case
            no URLs are returned.
        file_sec: Element in which the referenced files are looked up.
        xpath_query_pages: XPath selecting the page divs relative to ``physical_div``.
        file_grp: File group passed on to ``query_url_for_div``.

    Returns:
        URLs in the order the page divs are returned by the XPath query; pages
        without a URL for ``file_grp`` are left out.
    """
    if physical_div is None:
        return []

    pages = physical_div.xpath(xpath_query_pages, namespaces=METS_MODS_NAMESPACES)
    # TODO: consider ORDER attributes on page divs
    return [
        page_url
        for page in pages
        if (page_url := cls.query_url_for_div(page, file_sec, file_grp))
    ]
|
|
682
|
+
|
|
683
|
+
@classmethod
|
|
684
|
+
def get_object(cls, logical_div: _Element, file_sec: _Element) -> Optional[Ref]:
    """Determine the thumbnail (edm:object) reference of a record.

    The file id is taken from the first of these sources that yields a value:

    1. a ``mets:fptr`` of ``logical_div`` whose FILEID contains ``FRONTIMAGE``,
    2. a ``mets:fptr`` of ``logical_div`` whose FILEID contains ``TEASER``,
    3. a file with ``USE='banner'`` in the ``DEFAULT`` file group of ``file_sec``.

    Args:
        logical_div: Div of the logical structMap.
        file_sec: Element in which the file URL is looked up.

    Returns:
        Ref with the thumbnail URL, or None if no file id was found or the file
        has no URL-typed ``mets:FLocat``.
    """
    thumbnail_id = (
        xpath_first_match(
            logical_div, "mets:fptr[contains(@FILEID,'FRONTIMAGE')]/@FILEID"
        )
        or xpath_first_match(
            logical_div, "mets:fptr[contains(@FILEID,'TEASER')]/@FILEID"
        )
        or xpath_first_match(
            file_sec, "mets:fileGrp[@USE='DEFAULT']/mets:file[@USE='banner']/@ID"
        )
    )
    # TODO: last option: get from TitlePage
    if not thumbnail_id:
        return None

    thumbnail_url = xpath_first_match(
        file_sec,
        f".//mets:file[@ID='{thumbnail_id}'][1]/mets:FLocat[@LOCTYPE='URL']/@xlink:href",
    )
    # A dangling file pointer must not produce a Ref without a value.
    if not thumbnail_url:
        return None
    return Ref(value=thumbnail_url)
|
|
704
|
+
|
|
705
|
+
@classmethod
|
|
706
|
+
def get_webresource_urls(
    cls,
    amd_sec: _Element,
    physical_div: _Element,
    logical_div: _Element,
    file_sec: _Element,
    context_objects: CONTEXT_DICT_TYPE,
) -> Dict[str, Any]:
    """Collect the web resource related properties of the aggregation.

    The first page URL (or, if there are no pages, the thumbnail URL) becomes
    edm:isShownBy; every further page URL, the PDF URL and all but the first
    presentation link are added to edm:hasView. The first presentation link
    becomes edm:isShownAt.

    Args:
        amd_sec: Administrative metadata section (IIIF manifest, presentation links).
        physical_div: Div of the physical structMap holding the page divs.
        logical_div: Div of the logical structMap (thumbnail and PDF pointers).
        file_sec: Element in which file URLs are looked up.
        context_objects: Dictionary of contextual objects. SVCS_Service and
            EDM_WebResource objects created here are added to it.

    Returns:
        Dict with the keys ``edm_hasView``, ``edm_isShownBy``, ``edm_isShownAt``
        and ``edm_object``. Values that could not be determined stay None
        (``edm_hasView`` stays an empty list).
    """
    results = {
        "edm_hasView": [],
        "edm_isShownBy": None,
        "edm_isShownAt": None,
        "edm_object": None,
    }

    urls = cls.query_shownBy_urls(physical_div, file_sec)

    edm_object = cls.get_object(logical_div, file_sec)
    results["edm_object"] = edm_object
    # Without page images the thumbnail doubles as the shown resource.
    if edm_object and not urls:
        urls.append(edm_object.value)

    iiif_manifest = cls.get_iiif_manifest_url(amd_sec)

    first = True
    for url in urls:
        url = url.replace(" ", "%20")
        service = cls.get_iiif_image_api_service(url)
        has_service = None
        if service:
            context_objects[service.id.value] = service
            has_service = [service.id]
        # A WebResource object is only created when there is something to say
        # about the URL (a manifest reference or an image service).
        if iiif_manifest or service:
            context_objects[url] = EDM_WebResource(
                id=Ref(value=url),
                dcterms_isReferencedBy=iiif_manifest,
                svcs_has_service=has_service,
            )
        if first:
            results["edm_isShownBy"] = Ref(value=url)
            first = False
        else:
            results["edm_hasView"].append(Ref(value=url))

    if pdf_url := cls.query_url_for_div(logical_div, file_sec, file_grp="PDF"):
        if first:
            results["edm_isShownBy"] = Ref(value=pdf_url)
            first = False
        else:
            results["edm_hasView"].append(Ref(value=pdf_url))

    shown_ats = uri_list_from_xpath(
        amd_sec,
        "mets:digiprovMD/mets:mdWrap/mets:xmlData/dv:links/dv:presentation",
    )
    # Records without a presentation link keep edm_isShownAt as None instead of
    # failing with an IndexError.
    if shown_ats:
        results["edm_isShownAt"] = shown_ats[0]
        results["edm_hasView"] += shown_ats[1:]
    # TODO: maybe if there is a "mods:originInfo/mods:dateCaptured" put it into the WebResource as dcterms:created
    return results
|
|
766
|
+
|
|
767
|
+
@classmethod
|
|
768
|
+
def process_record(
    cls,
    record: _Element,
    edm_provider: Optional[str] = None,
    data_provider: Optional[str] = None,
) -> EDM_Record:
    """Maps a METS/MODS record to EDM using class methods that can be overridden to adapt the mapping logic

    Args:
        record: METS/MODS record as lxml Element or ElementTree
        edm_provider: default value for edm:provider (Institution providing the data to Europeana). Mandatory if not overwritten in a subclass by overriding get_provider.
        data_provider: default value for edm:dataProvider (Institution providing the original data), if not provided in the METS/MODS record. Would otherwise be extracted from the amdSec using "mets:rightsMD/mets:mdWrap/mets:xmlData/dv:rights/dv:owner"

    Returns:
        EDM_Record consisting of the ProvidedCHO, the Aggregation and all
        contextual objects collected while mapping.

    Raises:
        AssertionError: If not exactly one amdSec matches the ADMID of the
            main logical div.
    """

    # Contextual objects created by the individual mapping steps are collected
    # here and turned into EDM_Record keyword arguments at the end.
    context_objects: CONTEXT_DICT_TYPE = {}

    record = record.xpath("//mets:mets", namespaces=METS_MODS_NAMESPACES)[0]
    logical_main_div = cls.get_main_structmap_div(record)
    dmd_sec = cls.get_mods_part(record, dmdid=logical_main_div.get("DMDID"))
    host_dmd_sec = cls.get_host_dmd_sec(record, dmd_sec, logical_main_div)

    amd_sec = cls.get_amd_part(record, amdid=logical_main_div.get("ADMID"))
    # Raised explicitly rather than via ``assert`` so the check is not stripped
    # when Python runs with ``-O``.
    if len(amd_sec) != 1:
        raise AssertionError(
            f'amdsec not found or multiples for id {logical_main_div.get("ADMID")}'
        )
    amd_sec = amd_sec[0]
    physical_main_div = record.find(
        "mets:structMap[@TYPE='PHYSICAL']/mets:div",  # [@TYPE='physSequence']",
        namespaces=METS_MODS_NAMESPACES,
    )

    filesec = record.find("mets:fileSec", namespaces=METS_MODS_NAMESPACES)

    edm_type = cls.get_edm_type(dmd_sec, logical_main_div=logical_main_div)

    languages = cls.get_languages(dmd_sec)
    # TEXT records without any language get the "und" language code.
    if len(languages) == 0 and edm_type.value == "TEXT":
        languages = ["und"]

    titles = cls.get_titles(dmd_sec, host_dmd_sec=host_dmd_sec)

    # TODO: lang tags potentially for all properties

    from_mods_subject = cls.parse_mods_subjects(dmd_sec, context_objects)

    from_mods_name = cls.parse_mods_names(
        dmd_sec=dmd_sec, host_dmd_sec=host_dmd_sec, context_objects=context_objects
    )

    cho = EDM_ProvidedCHO(
        id=Ref(value="1"),  # TODO: id
        **titles,
        dc_description=cls.get_descriptions(dmd_sec),
        edm_type=edm_type,
        dc_language=[Lit(value=lang) for lang in languages],
        dc_type=cls.get_types(dmd_sec, logical_main_div)
        + from_mods_subject["dc_type"],
        dc_subject=cls.get_subjects(dmd_sec)
        + from_mods_subject["dc_subject"]
        + from_mods_name["dc_subject"],
        dcterms_temporal=cls.get_temporals(dmd_sec)
        + from_mods_subject["dcterms_temporal"],
        dcterms_spatial=cls.get_spatials(dmd_sec)
        + from_mods_subject["dcterms_spatial"],
        dc_identifier=cls.get_identifiers(dmd_sec),
        dcterms_medium=cls.get_mediums(dmd_sec),
        dcterms_extent=cls.get_extent(dmd_sec),
        dc_publisher=cls.get_publishers(dmd_sec=dmd_sec, host_dmd_sec=host_dmd_sec)
        + from_mods_name["dc_publisher"],
        dc_creator=from_mods_name["dc_creator"],
        dc_contributor=from_mods_name["dc_contributor"],
        dcterms_provenance=from_mods_name["dcterms_provenance"],
        dcterms_issued=cls.get_issued(dmd_sec),
        dcterms_created=cls.get_created(dmd_sec),
        dcterms_isPartOf=cls.get_is_part_of(dmd_sec),
        dcterms_isReferencedBy=cls.get_referenced_by(dmd_sec, context_objects),
        edm_currentLocation=cls.get_current_location(dmd_sec),
    )

    provider = cls.get_provider(default=edm_provider)
    aggregation = ORE_Aggregation(
        id=Ref(value="2"),  # TODO: id
        edm_rights=cls.get_edm_rights(dmd_sec),
        edm_aggregatedCHO=cho.id,
        edm_dataProvider=cls.get_data_provider(
            dmd_sec, amd_sec, default=data_provider
        ),
        edm_provider=provider,
        **cls.get_webresource_urls(
            amd_sec, physical_main_div, logical_main_div, filesec, context_objects
        ),
    )

    context_classes = context_dict_to_edm_record_dict(context_objects)

    return EDM_Record(provided_cho=cho, aggregation=aggregation, **context_classes)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
from typing import TypedDict, Callable
|
|
2
|
+
|
|
3
|
+
from edmlib import EDM_TimeSpan, EDM_WebResource, MixedValuesList
|
|
4
|
+
from edmlib.edm import Lit, Ref
|
|
5
|
+
from edmlib.edm.base import EDM_BaseClass
|
|
6
|
+
from lxml.etree import _Element
|
|
7
|
+
|
|
8
|
+
# Type of the dictionary in which contextual EDM objects are collected, keyed by a string.
CONTEXT_DICT_TYPE = dict[str, EDM_BaseClass]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def context_dict_to_edm_record_dict(
    context_objects: CONTEXT_DICT_TYPE,
) -> dict[str, list[EDM_BaseClass]]:
    """Group contextual objects into lists keyed by record field name.

    EDM_WebResource and EDM_TimeSpan instances go to ``web_resource`` and
    ``edm_time_span``. Every other object is filed under its lower-cased class
    name, which must be one of the predefined keys.

    Args:
        context_objects: Dictionary whose values are the objects to group.

    Returns:
        Dict with the keys ``edm_agent``, ``edm_place``, ``edm_time_span``,
        ``skos_concept``, ``svcs_service`` and ``web_resource``, each mapped to
        a (possibly empty) list of objects.

    Raises:
        KeyError: If an object's lower-cased class name is not a predefined key.
    """
    field_names = (
        "edm_agent",
        "edm_place",
        "edm_time_span",
        "skos_concept",
        "svcs_service",
        "web_resource",
    )
    grouped = {field_name: [] for field_name in field_names}

    for obj in context_objects.values():
        if isinstance(obj, EDM_WebResource):
            target = "web_resource"
        elif isinstance(obj, EDM_TimeSpan):
            target = "edm_time_span"
        else:
            target = type(obj).__name__.lower()
        grouped[target].append(obj)

    return grouped
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Prefix -> namespace URI mapping passed as ``namespaces`` to the XPath and
# find() calls in this package.
METS_MODS_NAMESPACES = dict(
    mets="http://www.loc.gov/METS/",
    mods="http://www.loc.gov/mods/v3",
    intranda="http://intranda.com/MODS/",
    xlink="http://www.w3.org/1999/xlink",
    dv="http://dfg-viewer.de/",
    vl="http://visuallibrary.net/vl",
    ext="http://ns.vls.io/mods",
)
|
|
43
|
+
|
|
44
|
+
class ModsNameResultsType(TypedDict):
    """Dictionary of value lists, one per EDM property listed below."""

    dc_creator: MixedValuesList
    dc_publisher: MixedValuesList
    dc_contributor: MixedValuesList
    dcterms_provenance: MixedValuesList
    dc_subject: MixedValuesList
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def xpath_first_match(element: _Element, xpath_query):
    """Evaluate an XPath query and return only its first result.

    Args:
        element: Element the query is evaluated against.
        xpath_query: XPath expression; prefixes are resolved via METS_MODS_NAMESPACES.

    Returns:
        The first result of the query, or None if the result is empty.
    """
    matches = element.xpath(xpath_query, namespaces=METS_MODS_NAMESPACES)
    if not matches:
        return None
    return matches[0]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def join_tag_texts(elements: list[_Element], separator=" "):
|
|
62
|
+
if len(elements) > 0:
|
|
63
|
+
return separator.join([element.text for element in elements if element.text])
|
|
64
|
+
else:
|
|
65
|
+
return ""
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def join_tag_texts_xpath(element: _Element, xpath_query, separator=" "):
    """Join the texts of all elements selected by an XPath query.

    Args:
        element: Element the query is evaluated against.
        xpath_query: XPath expression; prefixes are resolved via METS_MODS_NAMESPACES.
        separator: String placed between two texts.

    Returns:
        The joined string as produced by ``join_tag_texts``.
    """
    selected = element.xpath(xpath_query, namespaces=METS_MODS_NAMESPACES)
    return join_tag_texts(selected, separator=separator)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def literal_list_from_xpath(
    element: _Element,
    xpath_query,
    string_extract_function: Callable[[_Element], str] | None = None,
):
    """Create one literal per element selected by an XPath query.

    Args:
        element: Element the query is evaluated against.
        xpath_query: XPath expression; prefixes are resolved via METS_MODS_NAMESPACES.
        string_extract_function: Optional function computing the literal value
            from a selected element. If omitted, the element's ``text`` is used.

    Returns:
        List of Lit objects in result order. Each literal's ``lang`` is the
        element's ``lang`` attribute (None if the attribute is absent).
    """
    return [
        Lit(
            value=string_extract_function(tag) if string_extract_function else tag.text,
            lang=tag.get("lang"),
        )
        for tag in element.xpath(xpath_query, namespaces=METS_MODS_NAMESPACES)
    ]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def first_literal_from_xpath(element: _Element, xpath_query):
    """Return the literal for the first element selected by an XPath query.

    Args:
        element: Element the query is evaluated against.
        xpath_query: XPath expression passed on to ``literal_list_from_xpath``.

    Returns:
        The first literal, or None if the query selects nothing.
    """
    literals = literal_list_from_xpath(element, xpath_query)
    return literals[0] if literals else None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def uri_list_from_xpath(element: _Element, xpath_query):
    """Create one reference per element selected by an XPath query.

    Elements without text (``text`` is None or only whitespace) are skipped,
    so no Ref without a value is produced.

    Args:
        element: Element the query is evaluated against.
        xpath_query: XPath expression; prefixes are resolved via METS_MODS_NAMESPACES.

    Returns:
        List of Ref objects in result order, holding each element's text.
    """
    return [
        Ref(value=tag.text)
        for tag in element.xpath(xpath_query, namespaces=METS_MODS_NAMESPACES)
        if tag.text and tag.text.strip()
    ]
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def mods_ns(tag_name: str):
    """Return ``tag_name`` qualified with the MODS namespace as ``{namespace}tag_name``.

    Args:
        tag_name: Local tag name without prefix.

    Returns:
        The tag name prefixed with the MODS namespace URI in curly braces.
    """
    namespace = METS_MODS_NAMESPACES["mods"]
    return f"{{{namespace}}}{tag_name}"
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "mets-to-edm"
|
|
3
|
+
# handled via git tags:
|
|
4
|
+
version = "0.2.1"
|
|
5
|
+
description = "Modular mapping of METS/MODS data to EDM (Europeana Data Model), providing comprehensive transformation of METS/MODS XML structures into Pydantic-based EDM classes and properties."
|
|
6
|
+
authors = ["Kulturpool <info@kulturpool.at>"]
|
|
7
|
+
license = "MIT"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
packages = [{ include = "./mets_to_edm" }]
|
|
10
|
+
|
|
11
|
+
[tool.poetry.dependencies]
|
|
12
|
+
python = "^3.12"
|
|
13
|
+
lxml = "^5.2.2"
|
|
14
|
+
edmlib = "^2.5.2"
|
|
15
|
+
requests = "^2.32.3"
|
|
16
|
+
types-lxml = "^2025.8.25"
|
|
17
|
+
|
|
18
|
+
[tool.poetry.group.dev.dependencies]
|
|
19
|
+
pytest = "^8.2.2"
|
|
20
|
+
black = "^24.8.0"
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["poetry-core"]
|
|
24
|
+
build-backend = "poetry_dynamic_versioning.backend"
|
|
25
|
+
|
|
26
|
+
[tool.poetry-dynamic-versioning]
|
|
27
|
+
enable = false
|
|
28
|
+
vcs = "git"
|
|
29
|
+
pattern = "^(?P<base>[0-9]+[.][0-9]+[.][0-9]+)$"
|