mets-to-edm 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,100 @@
1
+ Metadata-Version: 2.1
2
+ Name: mets-to-edm
3
+ Version: 0.2.1
4
+ Summary: Modular mapping of METS/MODS data to EDM (Europeana Data Model), providing comprehensive transformation of METS/MODS XML structures into Pydantic-based EDM classes and properties.
5
+ License: MIT
6
+ Author: Kulturpool
7
+ Author-email: info@kulturpool.at
8
+ Requires-Python: >=3.12,<4.0
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Requires-Dist: edmlib (>=2.5.2,<3.0.0)
13
+ Requires-Dist: lxml (>=5.2.2,<6.0.0)
14
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
15
+ Requires-Dist: types-lxml (>=2025.8.25,<2026.0.0)
16
+ Description-Content-Type: text/markdown
17
+
18
+ # mets_to_edm
19
+
20
+ `mets_to_edm` is a Python library for converting METS/MODS XML records into Europeana Data Model (EDM) records.
21
+ It can be used both as a library and from the command line and has a basic Mapping that should work well for most cases.
22
+ But the library also provides a flexible mapping layer to override certain parts.
23
+
24
+ ## Features
25
+
26
+ - Converts METS/MODS XML to EDM using [edmlib](https://github.com/kulturpool/EDMLib)
27
+ - Easily extensible: override mapping methods to customize output
28
+ - CLI and Python API
29
+
30
+ ## Installation
31
+
32
+ Install via Poetry (recommended):
33
+
34
+ ```sh
35
+ poetry install
36
+ ```
37
+
38
+ Or with pip (if you have all dependencies):
39
+
40
+ ```sh
41
+ pip install .
42
+ ```
43
+
44
+ ## Usage
45
+
46
+ ### As a Python Library
47
+
48
+ ```python
49
+ from mets_to_edm import MetsToEdmMapper
50
+ from lxml import etree
51
+
52
+ # Parse your METS/MODS XML file
53
+ xml_tree = etree.parse("example.xml")
54
+
55
+ # Convert to an EDM record
56
+ edmlib_record = MetsToEdmMapper.process_record(xml_tree, edm_provider="Provider Name")
57
+
58
+ # Serialize to EDM XML
59
+ edm_xml = edmlib_record.serialize()
60
+ print(edm_xml)
61
+ ```
62
+
63
+ ### From the Command Line
64
+
65
+ ```sh
66
+ python -m mets_to_edm example.xml "Provider Name" [--data-provider "Data Provider"]
67
+ ```
68
+
69
+ - `"Provider Name"`: the institution name to be filled in as edm:provider (the aggregator providing the data to Europeana)
70
+ - `"Data Provider"`: the institution name to be filled in as edm:dataProvider (the Organisation where the data originates from). Optional as it will otherwise be extracted from the amdSec using XPath "mets:rightsMD/mets:mdWrap/mets:xmlData/dv:rights/dv:owner"
71
+
72
+ ## Customizing the Mapping
73
+
74
+ To change how specific fields are mapped, subclass `MetsToEdmMapper` and override the relevant class methods (for example, `get_titles` to change how titles are extracted).
75
+ You can override any method such as:
76
+ - `get_titles`
77
+ - `get_descriptions`
78
+ - `get_publishers`
79
+ - `get_types`
80
+ - `get_languages`
81
+ - ...and more (see `mets_to_edm/mapper.py` for all available hooks)
82
+
83
+ ### Example: Overriding the Data Provider
84
+
85
+ ```python
86
+ class MyMapper(MetsToEdmMapper):
87
+ @classmethod
88
+ def get_data_provider(cls, dmd_sec, amd_sec, default=None):
89
+ return "My Custom Data Provider"
90
+
91
+ # Usage:
92
+ # edmlib_record = MyMapper.process_record(tree)
93
+ ```
94
+
95
+ For more examples have a look at the examples directory.
96
+
97
+ ## Further Information
98
+
99
+ - See the source code in `mets_to_edm/mapper.py` for all overridable methods and mapping logic.
100
+ - For questions or contributions, open an issue or pull request.
@@ -0,0 +1,83 @@
1
+ # mets_to_edm
2
+
3
+ `mets_to_edm` is a Python library for converting METS/MODS XML records into Europeana Data Model (EDM) records.
4
+ It can be used both as a library and from the command line and has a basic Mapping that should work well for most cases.
5
+ But the library also provides a flexible mapping layer to override certain parts.
6
+
7
+ ## Features
8
+
9
+ - Converts METS/MODS XML to EDM using [edmlib](https://github.com/kulturpool/EDMLib)
10
+ - Easily extensible: override mapping methods to customize output
11
+ - CLI and Python API
12
+
13
+ ## Installation
14
+
15
+ Install via Poetry (recommended):
16
+
17
+ ```sh
18
+ poetry install
19
+ ```
20
+
21
+ Or with pip (if you have all dependencies):
22
+
23
+ ```sh
24
+ pip install .
25
+ ```
26
+
27
+ ## Usage
28
+
29
+ ### As a Python Library
30
+
31
+ ```python
32
+ from mets_to_edm import MetsToEdmMapper
33
+ from lxml import etree
34
+
35
+ # Parse your METS/MODS XML file
36
+ xml_tree = etree.parse("example.xml")
37
+
38
+ # Convert to an EDM record
39
+ edmlib_record = MetsToEdmMapper.process_record(xml_tree, edm_provider="Provider Name")
40
+
41
+ # Serialize to EDM XML
42
+ edm_xml = edmlib_record.serialize()
43
+ print(edm_xml)
44
+ ```
45
+
46
+ ### From the Command Line
47
+
48
+ ```sh
49
+ python -m mets_to_edm example.xml "Provider Name" [--data-provider "Data Provider"]
50
+ ```
51
+
52
+ - `"Provider Name"`: the institution name to be filled in as edm:provider (the aggregator providing the data to Europeana)
53
+ - `"Data Provider"`: the institution name to be filled in as edm:dataProvider (the Organisation where the data originates from). Optional as it will otherwise be extracted from the amdSec using XPath "mets:rightsMD/mets:mdWrap/mets:xmlData/dv:rights/dv:owner"
54
+
55
+ ## Customizing the Mapping
56
+
57
+ To change how specific fields are mapped, subclass `MetsToEdmMapper` and override the relevant class methods (for example, `get_titles` to change how titles are extracted).
58
+ You can override any method such as:
59
+ - `get_titles`
60
+ - `get_descriptions`
61
+ - `get_publishers`
62
+ - `get_types`
63
+ - `get_languages`
64
+ - ...and more (see `mets_to_edm/mapper.py` for all available hooks)
65
+
66
+ ### Example: Overriding the Data Provider
67
+
68
+ ```python
69
+ class MyMapper(MetsToEdmMapper):
70
+ @classmethod
71
+ def get_data_provider(cls, dmd_sec, amd_sec, default=None):
72
+ return "My Custom Data Provider"
73
+
74
+ # Usage:
75
+ # edmlib_record = MyMapper.process_record(tree)
76
+ ```
77
+
78
+ For more examples have a look at the examples directory.
79
+
80
+ ## Further Information
81
+
82
+ - See the source code in `mets_to_edm/mapper.py` for all overridable methods and mapping logic.
83
+ - For questions or contributions, open an issue or pull request.
@@ -0,0 +1 @@
1
+ from mets_to_edm.mapper import MetsToEdmMapper
@@ -0,0 +1,33 @@
1
+ import argparse
2
+ from lxml import etree
3
+ from mets_to_edm.mapper import MetsToEdmMapper
4
+
5
+
6
def main():
    """Command-line entry point: convert one METS/MODS file to EDM XML.

    Reads the positional ``file`` and ``provider`` arguments and the optional
    ``--data-provider`` option, maps the parsed tree with
    ``MetsToEdmMapper.process_record`` and prints the serialized record to
    stdout.

    Raises:
        SystemExit: with status 1 when the input cannot be read or is not
            well-formed XML (argparse raises it too for invalid arguments).
    """
    import sys  # local import: only needed for error reporting

    parser = argparse.ArgumentParser(
        description="Process a file with a specified data provider."
    )
    parser.add_argument("file", type=str, help="Path to the input file")
    parser.add_argument(
        "provider",
        type=str,
        help="Name of the edm:provider (institution providing the data to Europeana)",
    )
    parser.add_argument("--data-provider", type=str, help="Name of the data provider")

    args = parser.parse_args()

    try:
        with open(args.file, "rb") as f:
            tree = etree.parse(f)
        print(
            MetsToEdmMapper.process_record(
                tree, edm_provider=args.provider, data_provider=args.data_provider
            ).serialize()
        )
    except (etree.XMLSyntaxError, OSError) as e:
        # OSError also covers PermissionError / IsADirectoryError, not just a
        # missing file. Report on stderr so stdout only ever carries EDM XML,
        # and signal the failure through the exit status.
        print(f"Error parsing the file: {e}", file=sys.stderr)
        raise SystemExit(1) from e
30
+
31
+
32
+ if __name__ == "__main__":
33
+ main()
@@ -0,0 +1,864 @@
1
+ import logging
2
+ import os
3
+ from collections import defaultdict
4
+ from typing import Any, Callable, Optional, Type, Dict, List, Tuple, Union
5
+
6
+ from edmlib import (
7
+ MixedValuesList,
8
+ ORE_Aggregation,
9
+ SKOS_Concept,
10
+ EDM_Place,
11
+ EDM_TimeSpan,
12
+ EDM_Agent,
13
+ EDM_WebResource,
14
+ SVCS_Service,
15
+ )
16
+ from edmlib.edm import EDM_Record, EDM_ProvidedCHO, Lit, Ref
17
+ from lxml.etree import _Element
18
+
19
+ from .utilities import (
20
+ METS_MODS_NAMESPACES,
21
+ join_tag_texts_xpath,
22
+ literal_list_from_xpath,
23
+ xpath_first_match,
24
+ mods_ns,
25
+ uri_list_from_xpath,
26
+ ModsNameResultsType,
27
+ first_literal_from_xpath,
28
+ CONTEXT_DICT_TYPE,
29
+ context_dict_to_edm_record_dict,
30
+ )
31
+
32
+ logger = logging.getLogger("mets-to-edm")
33
+
34
+
35
+ XSL_FILE = os.path.join(os.path.dirname(__file__), "MODSMETS2EDM.xsl")
36
+
37
+
38
def retry_with_host_data(func: Callable[..., Any]) -> Callable[..., Any]:
    """Decorate a mapper method so that it falls back to the host record.

    The returned wrapper calls ``func`` with ``dmd_sec`` first. If that result
    is empty (a falsy value, or a dict whose values are all falsy) and a
    ``host_dmd_sec`` was supplied, ``func`` is called again with the host
    section in place of ``dmd_sec`` and that second result is returned.

    Note that the wrapper consumes ``host_dmd_sec`` itself (second positional
    argument after ``dmd_sec``); it is never forwarded to ``func``. Any further
    positional or keyword arguments are passed through unchanged.
    """
    import functools  # local import keeps this block self-contained

    @functools.wraps(func)
    def wrapper_retry_with_host_data(
        cls: Type["MetsToEdmMapper"],
        dmd_sec: "_Element",
        host_dmd_sec: Optional["_Element"] = None,
        *args: Any,
        **kwargs: Any,
    ) -> Any:
        # dmd_sec is passed positionally: combining ``dmd_sec=...`` with
        # ``*args`` would bind the first extra positional argument to the
        # same parameter and raise TypeError.
        values = func(cls, dmd_sec, *args, **kwargs)

        # Check if original result was already valid
        if isinstance(values, dict):
            if any(values.values()):
                return values
        elif values:
            return values

        # Retry with host dmd_sec
        if host_dmd_sec is not None:
            return func(cls, host_dmd_sec, *args, **kwargs)

        return values

    return wrapper_retry_with_host_data
62
+
63
+
64
+ class MetsToEdmMapper:
65
# Maps the tag of each mods:subject child element to a tuple of
# (EDM property name, context class used when the element has a valueURI).
# (None, None) marks sub-elements that are recognised but not mapped yet.
SUBJECT_SUBELEMENTS_MAPPING = {
    mods_ns("topic"): ("dc_subject", SKOS_Concept),
    mods_ns("geographic"): ("dcterms_spatial", EDM_Place),
    mods_ns("temporal"): ("dcterms_temporal", EDM_TimeSpan),
    mods_ns("titleInfo"): ("dc_subject", SKOS_Concept),
    mods_ns("name"): ("dc_subject", EDM_Agent),
    mods_ns("genre"): ("dc_type", SKOS_Concept),
    mods_ns("cartographics"): (None, None),
    mods_ns("hierarchicalGeographic"): (None, None),
    mods_ns("geographicCode"): (None, None),
    mods_ns("occupation"): (None, None),
    # TODO: maybe also support hierarchicalGeographic, cartographics, geographicCode, occupation
}
# Role terms consulted by get_edm_property_for_roles (presumably MARC relator
# codes - TODO confirm). Roles listed here map to dc:creator.
CREATOR_ROLES = ["aut", "cmp", "art", "pht", "edt"]
# Roles that map to dc:publisher.
PUBLISHER_ROLES = ["pbl", "isb"]
# Roles that map to dc:subject.
SUBJECT_ROLES = ["rcp"]
# Known roles that stay on the dc:contributor default without a warning.
OTHER_ROLES = [
    "ctb",
    "trl",
    "prt",
    "oth",
    "egr",
    "cns",
    "ill",
    "chr",
    "wst",
    "dto",
    "asn",
    "lyr",
]
# Roles for which the name is not mapped to any property.
IGNORE_ROLES = ["his"]
96
+
97
+ # @classmethod
98
+ # def get_file_from_logical_div(cls,record: _Element, logical_div: _Element):
99
+ # div_id = logical_div.get("ID")
100
+
101
@classmethod
def get_main_structmap_div(cls, record: _Element) -> _Element:
    """Return the logical structMap div that describes the record itself.

    Prefers the first ``mets:div`` with a ``DMDID`` that has no ``mets:mptr``
    child; if none exists, falls back to the first ``mets:div`` carrying a
    ``DMDID`` at all.

    Raises:
        AssertionError: if the logical structMap has no div with a DMDID.
    """
    possible_divs = record.xpath(
        "mets:structMap[@TYPE='LOGICAL']//mets:div[@DMDID and not(mets:mptr)]",
        namespaces=METS_MODS_NAMESPACES,
    )
    # No TYPE filtering is applied: the first candidate wins.
    if possible_divs:
        return possible_divs[0]

    div = record.xpath(
        "(mets:structMap[@TYPE='LOGICAL']//mets:div[@DMDID])[1]",
        namespaces=METS_MODS_NAMESPACES,
    )
    # Raised explicitly (instead of ``assert``) so the check also holds when
    # Python runs with -O.
    if not div:
        raise AssertionError("Could not find starting div in structmap")
    return div[0]
125
+
126
@classmethod
def get_mods_part(cls, record: _Element, dmdid: str) -> _Element:
    """Return the ``mods:mods`` element wrapped by the dmdSec with ID ``dmdid``.

    Raises:
        AssertionError: if no matching element, or more than one, is found.
    """
    # The ID is bound as an XPath variable rather than interpolated into the
    # expression, so IDs containing quotes cannot break or alter the query.
    dmd_secs = record.xpath(
        "mets:dmdSec[@ID=$dmdid]/mets:mdWrap/mets:xmlData/mods:mods[1]",
        namespaces=METS_MODS_NAMESPACES,
        dmdid=dmdid,
    )
    # Raised explicitly (instead of ``assert``) so the check survives -O.
    if len(dmd_secs) != 1:
        raise AssertionError(f"dmdsec not found or multiples for id {dmdid}")
    return dmd_secs[0]
134
+
135
@classmethod
def get_host_dmd_sec(
    cls, record: _Element, dmd_sec: _Element, logical_main_div: _Element
) -> Optional[_Element]:
    """Locate the descriptive metadata of the host (parent) item, if any.

    The first ``mods:relatedItem[@type='host']`` inside ``dmd_sec`` wins.
    Otherwise the nearest ancestor ``mets:div`` with a ``DMDID`` is looked up
    and its MODS part is returned. ``None`` means no host was found.
    """
    related_hosts = dmd_sec.xpath(
        "mods:relatedItem[@type='host']", namespaces=METS_MODS_NAMESPACES
    )
    if related_hosts:
        return related_hosts[0]

    parent_divs = logical_main_div.xpath(
        "ancestor::mets:div[@DMDID][1]", namespaces=METS_MODS_NAMESPACES
    )
    if parent_divs:
        parent_dmdid = parent_divs[0].get("DMDID")
        return cls.get_mods_part(record, dmdid=parent_dmdid)

    return None
151
+
152
@classmethod
def get_amd_part(cls, record: _Element, amdid: str) -> list[_Element]:
    """Return the list of ``mets:amdSec`` elements whose ID equals ``amdid``.

    The result is the raw XPath node list; it is empty when nothing matches.
    """
    # Bind the ID as an XPath variable instead of formatting it into the
    # expression, so unusual characters (e.g. quotes) cannot break the query.
    return record.xpath(
        "mets:amdSec[@ID=$amdid][1]",
        namespaces=METS_MODS_NAMESPACES,
        amdid=amdid,
    )
158
+
159
@classmethod
def process_title_tag(cls, title_element: _Element) -> tuple[str, Lit]:
    """Build one title literal from a ``mods:titleInfo`` element.

    Concatenates nonSort and title, then appends subTitle, partNumber and
    partName when present. Returns ``(property_name, literal)`` where the
    property is ``dcterms_alternative`` if the element has a ``type``
    attribute and ``dc_title`` otherwise.
    """
    # TODO: consider whitespace handling and separators
    text = join_tag_texts_xpath(title_element, "mods:nonSort")
    text += join_tag_texts_xpath(title_element, "mods:title")

    # (xpath, separator between repeated tags, glue placed before the part)
    optional_parts = (
        ("mods:subTitle", "; ", ": "),
        ("mods:partNumber", ", ", " "),
        ("mods:partName", ", ", ": "),
    )
    for xpath, separator, glue in optional_parts:
        part = join_tag_texts_xpath(title_element, xpath, separator=separator)
        if part:
            text += glue + part

    # TODO: languages: either from attrs lang/xml:lang on titleInfo or subtags, or from document language

    edm_property = "dcterms_alternative" if title_element.get("type") else "dc_title"
    return (edm_property, Lit(value=text))
182
+
183
@classmethod
def get_titles(
    cls, dmd_sec: _Element, host_dmd_sec: Optional[_Element] = None
) -> Dict[str, List[Lit]]:
    """Collect ``dc_title`` and ``dcterms_alternative`` literals for a record.

    Steps:
      1. Every ``mods:titleInfo`` of ``dmd_sec`` is converted with
         ``process_title_tag``.
      2. A suffix is derived from ``mods:part`` (volume/issue numbers, any
         other detail number, or the first ``mods:date``).
      3. If a suffix and a host section exist, each host title plus the
         suffix is added as a further title.
      4. If there is still no title at all, the ``LABEL`` of the
         ``mets:mets`` root plus the suffix is used as a last resort.

    Returns:
        Dict with the keys ``dcterms_alternative`` and ``dc_title``.
    """
    title_properties = {"dcterms_alternative": [], "dc_title": []}
    for title_info in dmd_sec.xpath(
        "mods:titleInfo", namespaces=METS_MODS_NAMESPACES
    ):
        title_type, title = cls.process_title_tag(title_info)
        title_properties[title_type].append(title)

    # Derive a suffix from the part details of this record
    volume = None
    issue = None
    others = []
    detail_numbers = dmd_sec.xpath(
        "mods:part/mods:detail[mods:number]",
        namespaces=METS_MODS_NAMESPACES,
    )
    for detail_number in detail_numbers:
        # The XPath predicate guarantees that a mods:number child exists.
        number = detail_number.find(
            "mods:number", namespaces=METS_MODS_NAMESPACES
        ).text
        detail_type = detail_number.get("type")
        if detail_type == "volume":
            volume = number
        elif detail_type == "issue":
            issue = number
        else:
            others.append(number)

    if volume and issue:
        suffix = f"{volume}/{issue}"
    else:
        suffix = volume or issue or (others[0] if others else None)

    if not suffix:
        date_suffix = dmd_sec.xpath(
            "mods:part/mods:date[1]/text()", namespaces=METS_MODS_NAMESPACES
        )
        suffix = date_suffix[0] if date_suffix else None

    if suffix and host_dmd_sec is not None:
        for host_title_type, host_titles in cls.get_titles(host_dmd_sec).items():
            for host_title in host_titles:
                title_properties[host_title_type].append(
                    Lit(value=host_title.value + " " + suffix)
                )

    if not any(title_properties.values()):
        # if still no title try the mets:mets/@LABEL as last resort
        mets_labels = dmd_sec.xpath(
            "/mets:mets/@LABEL", namespaces=METS_MODS_NAMESPACES
        )
        # The LABEL attribute is optional: indexing the empty result
        # directly would raise IndexError instead of returning no title.
        mets_label = mets_labels[0] if mets_labels else None
        if suffix and mets_label:
            title_properties["dc_title"].append(
                Lit(value=mets_label + " " + suffix)
            )
    return title_properties
247
+
248
@classmethod
def get_descriptions(cls, dmd_sec: _Element) -> MixedValuesList:
    """Return description literals built from ``mods:note`` and ``mods:abstract``.

    Notes come first and are prefixed with ``"<type>: "`` when the note has a
    ``type`` attribute; abstracts follow unmodified.
    """

    def note_string_extract(tag: _Element):
        note_type = tag.get("type")
        prefix = note_type + ": " if note_type else ""
        return prefix + tag.text

    notes = literal_list_from_xpath(
        dmd_sec, "mods:note", string_extract_function=note_string_extract
    )
    abstracts = literal_list_from_xpath(dmd_sec, "mods:abstract")
    return notes + abstracts
260
+
261
@classmethod
def get_identifiers(cls, dmd_sec: _Element) -> List[Lit]:
    """Return identifier literals: record identifiers first, then ``mods:identifier``."""
    record_identifiers = literal_list_from_xpath(
        dmd_sec, "mods:recordInfo/mods:recordIdentifier"
    )
    plain_identifiers = literal_list_from_xpath(dmd_sec, "mods:identifier")
    return record_identifiers + plain_identifiers
266
+
267
@classmethod
def get_edm_type(
    cls, dmd_sec: _Element, logical_main_div: Optional[_Element] = None
) -> Lit:
    """Return the edm:type literal; this base mapping always yields ``"TEXT"``.

    Both arguments are unused here and exist for overriding subclasses.
    """
    edm_type = Lit(value="TEXT")
    return edm_type
272
+
273
@classmethod
def parse_mods_subjects(
    cls, dmd_sec: _Element, context_objects: CONTEXT_DICT_TYPE
) -> Dict[str, List[Union[Lit, Ref]]]:
    """Map the children of every ``mods:subject`` to EDM property values.

    Each child is looked up in ``SUBJECT_SUBELEMENTS_MAPPING``. Children with
    a ``valueURI`` (and names parsed into agents) are stored in
    ``context_objects`` keyed by their URI and referenced via ``Ref``; all
    others become plain literals.

    Args:
        dmd_sec: MODS element whose ``mods:subject`` children are read.
        context_objects: dict that is mutated with created context objects.

    Returns:
        Mapping of EDM property name to the collected literals/references.
    """
    subjects = dmd_sec.findall("mods:subject", namespaces=METS_MODS_NAMESPACES)
    edm_values: dict[str, list[Lit | Ref]] = defaultdict(list)
    for subject in subjects:
        for subject_subelement in subject:
            tag = subject_subelement.tag
            if not isinstance(tag, str):
                # XML comments and processing instructions are iterated too,
                # but they do not carry a string tag.
                continue
            # Unknown sub-elements are treated like the explicitly
            # unimplemented ones instead of raising KeyError.
            edm_property, context_class = cls.SUBJECT_SUBELEMENTS_MAPPING.get(
                tag, (None, None)
            )
            if edm_property is None:
                logger.warning(
                    f"unimplemented mods:subject subelement {subject_subelement.tag}"
                )
                continue
            if tag == mods_ns("titleInfo"):
                pref_label = cls.process_title_tag(subject_subelement)[1]
            elif tag == mods_ns("name"):
                person = cls.parse_mods_name(subject_subelement)
                if not person:
                    continue
                elif isinstance(person, EDM_Agent):
                    context_objects[person.id.value] = person
                    edm_values[edm_property].append(person.id)
                    continue
                else:
                    pref_label = Lit(value=person.value)
            else:
                pref_label = Lit(value=subject_subelement.text)

            if subject_subelement.get("valueURI"):
                context_object = context_class(
                    id=Ref(value=subject_subelement.get("valueURI")),
                    skos_prefLabel=[pref_label],
                )
                context_objects[context_object.id.value] = context_object
                edm_values[edm_property].append(context_object.id)
            else:
                edm_values[edm_property].append(pref_label)
    return edm_values
314
+
315
@classmethod
def get_subjects(cls, dmd_sec: _Element) -> MixedValuesList:
    """Return subject literals read from the intranda MODS extension.

    ``intranda:subjectPerson`` values come first, then ``intranda:Topic``.
    """
    # intranda extension:
    subject_persons = literal_list_from_xpath(
        dmd_sec, "mods:extension/intranda:intranda/intranda:subjectPerson"
    )
    topics = literal_list_from_xpath(dmd_sec, "mods:extension/intranda:Topic")
    return subject_persons + topics
321
+
322
@classmethod
def parse_logical_main_div_type(
    cls, logical_main_div: Optional[_Element] = None
) -> List[Lit]:
    """Return the div's ``TYPE`` attribute as a one-element literal list.

    An empty list is returned when no div is given or it has no ``TYPE``.
    """
    if logical_main_div is None:
        return []
    div_type = logical_main_div.get("TYPE")
    if not div_type:
        return []
    return [Lit(value=div_type)]
331
+
332
@classmethod
def get_types(
    cls, dmd_sec: _Element, logical_main_div: Optional[_Element] = None
) -> MixedValuesList:
    """Return dc:type values.

    Order: intranda ``ObjectType``, ``mods:form[@type='technique']``,
    ``mods:genre``, and finally the ``TYPE`` of the logical main div.
    """
    # intranda extension:
    object_types = literal_list_from_xpath(
        dmd_sec, "mods:extension/intranda:ObjectType"
    )
    techniques = literal_list_from_xpath(
        dmd_sec, "mods:physicalDescription/mods:form[@type='technique']"
    )
    genres = literal_list_from_xpath(dmd_sec, "mods:genre")
    div_types = cls.parse_logical_main_div_type(logical_main_div)
    return object_types + techniques + genres + div_types
345
+
346
@classmethod
def get_temporals(cls, dmd_sec: _Element) -> MixedValuesList:
    """Return temporal coverage literals from intranda ``TopicPeriod``."""
    # intranda extension:
    topic_periods = literal_list_from_xpath(
        dmd_sec, "mods:extension/intranda:TopicPeriod"
    )
    return topic_periods
350
+
351
@classmethod
def get_spatials(cls, dmd_sec: _Element) -> MixedValuesList:
    """Return spatial literals.

    intranda ``TopicRoom`` values come first, followed by the textual
    ``mods:placeTerm`` entries of ``mods:originInfo``.
    """
    # intranda extension first, then the places of origin
    xpaths = (
        "mods:extension/intranda:TopicRoom",
        "mods:originInfo/mods:place/mods:placeTerm[@type='text']",
    )
    topic_rooms, places_of_origin = (
        literal_list_from_xpath(dmd_sec, xpath) for xpath in xpaths
    )
    return topic_rooms + places_of_origin
361
+
362
@classmethod
def get_mediums(cls, dmd_sec: _Element) -> MixedValuesList:
    """Return medium literals from ``mods:form``.

    Forms typed ``technique`` or ``dimensions`` are excluded by the query.
    """
    form_xpath = "mods:physicalDescription/mods:form[not(@type='technique') and not(@type='dimensions')]"
    # TODO: check if there is a valueURI to create vocabulary references
    return literal_list_from_xpath(dmd_sec, form_xpath)
368
+
369
@classmethod
def get_extent(cls, dmd_sec: _Element) -> MixedValuesList:
    """Return extent literals: ``mods:extent`` first, then ``mods:form[@type='dimensions']``."""
    extents = literal_list_from_xpath(
        dmd_sec, "mods:physicalDescription/mods:extent"
    )
    dimensions = literal_list_from_xpath(
        dmd_sec, "mods:physicalDescription/mods:form[@type='dimensions']"
    )
    return extents + dimensions
376
+
377
@classmethod
def get_languages(cls, dmd_sec: _Element) -> List[str]:
    """Return the text of every ``mods:languageTerm``, unconverted."""
    language_terms = dmd_sec.xpath(
        "mods:language/mods:languageTerm/text()",
        namespaces=METS_MODS_NAMESPACES,
    )
    # TODO: convert to ISO language codes
    return language_terms
384
+
385
@classmethod
def parse_mods_date(
    cls, dmd_sec: _Element, date_element_name: str
) -> Optional[List[Lit]]:
    """Turn the date elements selected by ``date_element_name`` into one literal.

    Elements with ``point="start"``/``"end"`` form a ``start-end`` range.
    Among the remaining elements the first one is used unless a later one
    has ``keyDate="yes"``. A complete range wins over a single date, which
    wins over a half-open range. Returns ``None`` when nothing was found.
    """
    start = end = other = ""
    for date in dmd_sec.xpath(date_element_name, namespaces=METS_MODS_NAMESPACES):
        point = date.get("point")
        if point == "start":
            start = date.text
        elif point == "end":
            end = date.text
        elif date.get("keyDate") == "yes" or not other:
            other = date.text
    # TODO: consider other date attributes, like qualifier for approximate/inferred/questionable
    if start and end:
        return [Lit(value=start + "-" + end)]
    if other:
        return [Lit(value=other)]
    if start or end:
        return [Lit(value=start + "-" + end)]
    return None
410
+
411
@classmethod
def get_issued(cls, dmd_sec: _Element) -> Optional[List[Lit]]:
    """Return the issue date parsed from ``mods:originInfo/mods:dateIssued``."""
    issued_xpath = "mods:originInfo/mods:dateIssued"
    return cls.parse_mods_date(dmd_sec, issued_xpath)
414
+
415
@classmethod
def get_created(cls, dmd_sec: _Element) -> Optional[List[Lit]]:
    """Return the creation date parsed from ``mods:originInfo/mods:dateCreated``."""
    created_xpath = "mods:originInfo/mods:dateCreated"
    return cls.parse_mods_date(dmd_sec, created_xpath)
418
+
419
@classmethod
@retry_with_host_data
def get_publishers(
    cls, dmd_sec: _Element, host_dmd_sec: Optional[_Element] = None
) -> List[Lit]:
    """Return publisher literals from ``mods:originInfo/mods:publisher``.

    The ``retry_with_host_data`` decorator repeats the lookup on the host
    section when this record yields nothing.
    """
    publishers = literal_list_from_xpath(dmd_sec, "mods:originInfo/mods:publisher")
    return publishers
425
+
426
@classmethod
def get_full_name_from_name_tag(cls, name_tag: _Element) -> str:
    """Return a display name for a ``mods:name``-like element.

    Preference order:
      1. the text of ``mods:displayForm``;
      2. given name, family name and terms of address joined by spaces;
      3. all ``mods:namePart`` elements without a ``type`` attribute.
    """
    # first try displayForm
    display_form = name_tag.find(
        "mods:displayForm", namespaces=METS_MODS_NAMESPACES
    )
    if display_form is not None and display_form.text:
        return display_form.text

    # otherwise join nameparts based on type
    typed_parts = [
        join_tag_texts_xpath(
            name_tag, f"mods:namePart[@type='{part_type}']", separator=" "
        )
        for part_type in ("given", "family", "termsOfAddress")
    ]
    # Skip missing parts so that e.g. given name + terms of address without a
    # family name are not joined with a double space.
    name = " ".join(part for part in typed_parts if part.strip()).strip()
    if name:
        return name

    # otherwise use nameparts without type
    return join_tag_texts_xpath(
        name_tag, "mods:namePart[not(@type)]", separator=" "
    )
453
+
454
@classmethod
def parse_mods_name(cls, name_tag: _Element) -> Lit | EDM_Agent | None:
    """Convert a ``mods:name`` element to a literal or an ``EDM_Agent``.

    Returns:
        ``None`` when neither a name nor a URI is available; a plain ``Lit``
        when there is only a name; otherwise an ``EDM_Agent`` holding the
        preferred and alternative labels. Agents without a URI get the
        placeholder id ``"agent"``.
    """
    uri = name_tag.get("valueURI") or name_tag.get("nameIdentifier")

    # name
    name = cls.get_full_name_from_name_tag(name_tag)
    name_lit = Lit(value=name) if name else None

    # then do alternativeNames as well
    alt_names = []
    for alt_name_tag in name_tag.findall(
        "mods:alternativeName", namespaces=METS_MODS_NAMESPACES
    ):
        alt_name = cls.get_full_name_from_name_tag(alt_name_tag)
        if alt_name:
            alt_names.append(Lit(value=alt_name))
    # TODO: maybe also support altRepGroup in the future

    if not name_lit and not uri:
        return None
    if not alt_names and not uri:
        return name_lit

    agent_id = uri if uri else "agent"
    pref_labels = [name_lit] if name_lit else None
    return EDM_Agent(
        id=Ref(value=agent_id), skos_prefLabel=pref_labels, skos_altLabel=alt_names
    )
485
+
486
@classmethod
def get_edm_property_for_roles(cls, roles: List[str]) -> Optional[str]:
    """Choose the EDM property under which a name with ``roles`` is stored.

    An ignored role returns ``None`` and a creator role returns
    ``"dc_creator"`` as soon as either is met. Publisher and subject roles
    override the ``"dc_contributor"`` default (the last one seen wins).
    Roles found in no list are logged and leave the current choice alone.
    """
    chosen = "dc_contributor"
    for role in roles:
        if role in cls.IGNORE_ROLES:
            return None
        if role in cls.CREATOR_ROLES:
            return "dc_creator"
        if role in cls.PUBLISHER_ROLES:
            chosen = "dc_publisher"
            continue
        if role in cls.SUBJECT_ROLES:
            chosen = "dc_subject"
            continue
        if role not in cls.OTHER_ROLES:
            logger.warning(
                f'Unknown Role: "{role}", falling back to contributor'
            )
    return chosen
503
+
504
@classmethod
@retry_with_host_data
def parse_mods_names(
    cls, dmd_sec: _Element, context_objects: CONTEXT_DICT_TYPE
) -> ModsNameResultsType:
    """Sort every ``mods:name`` of ``dmd_sec`` into EDM properties by role.

    Names that resolve to an ``EDM_Agent`` are stored in ``context_objects``
    (keyed by agent id) and referenced by id; others are kept as literals.
    A ``fmo`` role additionally produces English and German
    ``dcterms_provenance`` statements. The remaining roles decide the target
    property via ``get_edm_property_for_roles``.

    Because of ``retry_with_host_data`` the wrapped method takes
    ``host_dmd_sec`` as second positional argument after ``dmd_sec``, and the
    lookup is repeated on the host when all result lists are empty.

    Returns:
        Dict with the keys ``dc_creator``, ``dc_publisher``,
        ``dc_contributor``, ``dcterms_provenance`` and ``dc_subject``.
    """
    name_results = {
        "dc_creator": [],
        "dc_publisher": [],
        "dc_contributor": [],
        "dcterms_provenance": [],
        "dc_subject": [],
    }
    for name_tag in dmd_sec.findall("mods:name", namespaces=METS_MODS_NAMESPACES):
        literal_or_agent = cls.parse_mods_name(name_tag)
        if not literal_or_agent:
            continue
        name_value = literal_or_agent
        if isinstance(literal_or_agent, EDM_Agent):
            context_objects[literal_or_agent.id.value] = literal_or_agent
            name_value = literal_or_agent.id

        roles = [
            r.text
            for r in name_tag.findall(
                "mods:role/mods:roleTerm", namespaces=METS_MODS_NAMESPACES
            )
        ]
        if "fmo" in roles:
            if isinstance(literal_or_agent, EDM_Agent):
                # An agent built from a URI alone has no preferred label;
                # fall back to its id instead of failing on the index.
                labels = literal_or_agent.skos_prefLabel
                former_owner_value = (
                    labels[0].value if labels else literal_or_agent.id.value
                )
            else:
                former_owner_value = literal_or_agent.value
            name_results["dcterms_provenance"] += [
                Lit(
                    value="Former owner: " + former_owner_value,
                    lang="en",
                ),
                Lit(
                    value="Frühere:r Eigentümer:in: " + former_owner_value,
                    lang="de",
                ),
            ]
            roles.remove("fmo")
            if not roles:
                # Former owner only: nothing else to record for this name.
                # ``continue`` rather than ``break`` so that the remaining
                # names are still processed.
                continue

        edm_property = cls.get_edm_property_for_roles(roles)
        if edm_property:
            name_results[edm_property].append(name_value)
    return name_results
555
+
556
@classmethod
def get_edm_rights(cls, dmd_sec: _Element) -> Ref:
    """Return the edm:rights reference taken from ``mods:accessCondition``.

    Lookup order: the ``xlink:href`` attribute, the ``mods:valueURI``
    attribute, then the text of an access condition whose ``type`` is not
    ``hide``. In every case ``https://`` is replaced by ``http://``.

    Raises:
        Exception: if none of the lookups yields a value.
    """
    # NOTE(review): the second query uses the namespaced attribute
    # @mods:valueURI while other code in this class reads an unqualified
    # "valueURI" attribute - confirm which form the data actually uses.
    attribute_queries = (
        "mods:accessCondition[@xlink:href][1]/@xlink:href",
        "mods:accessCondition[@mods:valueURI][1]/@mods:valueURI",
    )
    for query in attribute_queries:
        uris = dmd_sec.xpath(query, namespaces=METS_MODS_NAMESPACES)
        if uris:
            return Ref(value=uris[0].replace("https://", "http://"))

    access_conditions = dmd_sec.xpath(
        "mods:accessCondition[@type!='hide']", namespaces=METS_MODS_NAMESPACES
    )
    for access_condition in access_conditions:
        # Elements without text content would otherwise fail on .strip();
        # skip them and use the first access condition that has a value.
        text = (access_condition.text or "").strip()
        if text:
            return Ref(value=text.replace("https://", "http://"))

    raise Exception("no corresponding field for edm:rights found")
581
+
582
@classmethod
def get_data_provider(
    cls, dmd_sec: _Element, amd_sec: _Element, default: Optional[str] = None
) -> Lit:
    """Return the edm:dataProvider literal.

    A truthy ``default`` takes precedence. Otherwise the value is read from
    ``mets:rightsMD/mets:mdWrap/mets:xmlData/dv:rights/dv:owner`` below
    ``amd_sec``. ``dmd_sec`` is unused here and exists for overriding
    subclasses.

    Raises:
        Exception: if no default is given and the owner element is missing.
    """
    if default:
        return Lit(value=default)

    data_provider = amd_sec.find(
        "mets:rightsMD/mets:mdWrap/mets:xmlData/dv:rights/dv:owner",
        namespaces=METS_MODS_NAMESPACES,
    )
    if data_provider is None:
        # Fail with an explicit message instead of an AttributeError on
        # ``None.text``.
        raise Exception("no corresponding field for edm:dataProvider found")
    return Lit(value=data_provider.text)
594
+
595
@classmethod
def get_provider(cls, default: Optional[str]) -> Lit:
    """Return the edm:provider literal built from ``default``.

    Raises:
        AssertionError: if ``default`` is empty or ``None``.
    """
    # Raised explicitly (instead of ``assert``) so the check is not removed
    # when Python runs with -O.
    if not default:
        raise AssertionError(
            "Missing value for edm:provider. Either override get_provider or provide a default value to process_record."
        )
    return Lit(value=default)
601
+
602
@classmethod
def get_is_part_of(cls, dmd_sec: _Element) -> List[Any]:
    """Return the isPartOf values; this base mapping always yields an empty list.

    ``dmd_sec`` is not read here, so the method only does something useful
    when a subclass overrides it.
    """
    return []
605
+
606
@classmethod
def get_referenced_by(
    cls, dmd_sec: _Element, contex_objects: CONTEXT_DICT_TYPE
) -> MixedValuesList:
    """Return the isReferencedBy values; this base mapping always yields an empty list.

    Neither argument is read here, so the method only does something useful
    when a subclass overrides it. Note that the second parameter is spelled
    ``contex_objects``.
    """
    return []
611
+
612
@classmethod
def get_current_location(cls, dmd_sec: _Element) -> Optional[Lit]:
    """Return the current location literal from the first ``mods:location``.

    The first ``mods:physicalLocation`` is followed by all shelf locators,
    each joined with ``" ; "``. ``None`` is returned when the result is empty.
    """
    physical_location = join_tag_texts_xpath(
        dmd_sec, "mods:location[1]/mods:physicalLocation[1]"
    )
    shelf_locators = join_tag_texts_xpath(
        dmd_sec, "mods:location[1]/mods:shelfLocator", separator=" ; "
    )
    if shelf_locators:
        full_location = physical_location + (" ; " + shelf_locators)
    else:
        full_location = physical_location + ""

    if not full_location:
        return None
    return Lit(value=full_location)
622
+
623
@classmethod
def get_iiif_image_api_service(cls, url: str) -> Optional[SVCS_Service]:
    """Return the IIIF Image API service for a WebResource URL.

    This base implementation always returns ``None``. Override it in an
    institution-specific subclass to build the ``SVCS_Service`` object from
    the given URL.

    Args:
        url: URL of a WebResource

    Returns:
        SVCS_Service object or None if no IIIF Image API service can be extracted from the url
    """
    return None
634
+
635
@classmethod
def get_iiif_manifest_url(cls, amd_sec: _Element) -> Optional[List[Ref]]:
    """Return the IIIF manifest URL as a one-element list of ``Ref``.

    The URL is read from
    ``mets:digiprovMD/mets:mdWrap/mets:xmlData/dv:links/dv:iiif`` below
    ``amd_sec``. ``None`` is returned when the element is missing or empty.
    """
    iiif_manifest = amd_sec.find(
        "mets:digiprovMD/mets:mdWrap/mets:xmlData/dv:links/dv:iiif",
        namespaces=METS_MODS_NAMESPACES,
    )
    # An empty <dv:iiif/> has text None; treat it like a missing element
    # rather than building a reference without a value.
    if iiif_manifest is None or not iiif_manifest.text:
        return None
    return [Ref(value=iiif_manifest.text)]
644
+
645
@classmethod
def query_url_for_div(
    cls, div: _Element, file_sec: _Element, file_grp: str
) -> Optional[str]:
    """Resolve the URL of the file that ``div`` references in ``file_grp``.

    The first ``mets:fptr`` of ``div`` whose ``FILEID`` contains ``file_grp``
    is looked up in ``file_sec`` and the ``xlink:href`` of its URL-type
    ``mets:FLocat`` is returned.

    Returns:
        The URL, or ``None`` when the div has no matching file pointer.

    Raises:
        AssertionError: if a pointer exists but no URL is found for it.
    """
    fptr_id = xpath_first_match(
        div,
        f"mets:fptr[contains(@FILEID,'{file_grp}')]/@FILEID",
    )
    if not fptr_id:
        return None

    file_url = xpath_first_match(
        file_sec,
        f".//mets:file[@ID='{fptr_id}'][1]/mets:FLocat[@LOCTYPE='URL']/@xlink:href",
    )
    # Raised explicitly (instead of ``assert``) so a dangling pointer is
    # still reported when Python runs with -O.
    if not file_url:
        raise AssertionError(f"file with ID {fptr_id} not found")
    return file_url
662
+
663
@classmethod
def query_shownBy_urls(
    cls,
    physical_div: _Element,
    file_sec: _Element,
    xpath_query_pages: str = "mets:div[@TYPE='page']",
    file_grp: str = "DEFAULT",
) -> List[str]:
    """Return the file URLs of all page divs below ``physical_div``.

    Pages are selected with ``xpath_query_pages`` and resolved through
    ``query_url_for_div`` for ``file_grp``; pages without a URL are skipped.
    An empty list is returned when ``physical_div`` is ``None``.
    """
    if physical_div is None:
        return []

    page_divs = physical_div.xpath(
        xpath_query_pages, namespaces=METS_MODS_NAMESPACES
    )
    # TODO: consider ORDER attributes on page divs
    return [
        page_url
        for page_div in page_divs
        if (page_url := cls.query_url_for_div(page_div, file_sec, file_grp))
    ]
682
+
683
@classmethod
def get_object(cls, logical_div: _Element, file_sec: _Element) -> Optional[Ref]:
    """Return a reference to the thumbnail (edm:object) of the record.

    The file id is taken from the first match among: a ``mets:fptr`` of
    ``logical_div`` whose ``FILEID`` contains ``FRONTIMAGE``, one containing
    ``TEASER``, or the ``banner`` file of the ``DEFAULT`` file group.

    Returns:
        ``Ref`` to the thumbnail URL, or ``None`` when no file id is found or
        the id cannot be resolved to a URL.
    """
    thumbnail_id = (
        xpath_first_match(
            logical_div, "mets:fptr[contains(@FILEID,'FRONTIMAGE')]/@FILEID"
        )
        or xpath_first_match(
            logical_div, "mets:fptr[contains(@FILEID,'TEASER')]/@FILEID"
        )
        or xpath_first_match(
            file_sec, "mets:fileGrp[@USE='DEFAULT']/mets:file[@USE='banner']/@ID"
        )
    )
    # TODO: last option: get from TitlePage
    if not thumbnail_id:
        return None
    thumbnail_url = xpath_first_match(
        file_sec,
        f".//mets:file[@ID='{thumbnail_id}'][1]/mets:FLocat[@LOCTYPE='URL']/@xlink:href",
    )
    # A file id without a URL location must not produce a reference that
    # has no value.
    if not thumbnail_url:
        return None
    return Ref(value=thumbnail_url)
704
+
705
@classmethod
def get_webresource_urls(
    cls,
    amd_sec: _Element,
    physical_div: _Element,
    logical_div: _Element,
    file_sec: _Element,
    context_objects: CONTEXT_DICT_TYPE,
) -> Dict[str, Any]:
    """Collect the web resource links of a record.

    Page image URLs (or, if there are none, the thumbnail URL) and an
    optional PDF fill ``edm_isShownBy`` (first URL) and ``edm_hasView`` (all
    others). ``edm_isShownAt`` is the first ``dv:presentation`` link; further
    presentation links are added to ``edm_hasView``. For every image URL that
    has an IIIF manifest or image service, an ``EDM_WebResource`` (and the
    service) is stored in ``context_objects``.

    Returns:
        Dict with the keys ``edm_hasView``, ``edm_isShownBy``,
        ``edm_isShownAt`` and ``edm_object``.
    """
    results = {
        "edm_hasView": [],
        "edm_isShownBy": None,
        "edm_isShownAt": None,
        "edm_object": None,
    }

    def add_view(ref: Ref) -> None:
        # The first URL becomes edm:isShownBy, every later one an edm:hasView.
        if results["edm_isShownBy"] is None:
            results["edm_isShownBy"] = ref
        else:
            results["edm_hasView"].append(ref)

    urls = cls.query_shownBy_urls(physical_div, file_sec)

    edm_object = cls.get_object(logical_div, file_sec)
    results["edm_object"] = edm_object
    if edm_object and not urls:
        urls.append(edm_object.value)

    iiif_manifest = cls.get_iiif_manifest_url(amd_sec)

    for url in urls:
        url = url.replace(" ", "%20")
        service = cls.get_iiif_image_api_service(url)
        has_service = None
        if service:
            context_objects[service.id.value] = service
            has_service = [service.id]
        if iiif_manifest or service:
            context_objects[url] = EDM_WebResource(
                id=Ref(value=url),
                dcterms_isReferencedBy=iiif_manifest,
                svcs_has_service=has_service,
            )
        add_view(Ref(value=url))

    if pdf_url := cls.query_url_for_div(logical_div, file_sec, file_grp="PDF"):
        add_view(Ref(value=pdf_url))

    shown_ats = uri_list_from_xpath(
        amd_sec,
        "mets:digiprovMD/mets:mdWrap/mets:xmlData/dv:links/dv:presentation",
    )
    # Records without a dv:presentation link keep edm_isShownAt as None
    # instead of failing with IndexError on the empty list.
    if shown_ats:
        results["edm_isShownAt"] = shown_ats[0]
        results["edm_hasView"] += shown_ats[1:]
    # TODO: maybe if there is a "mods:originInfo/mods:dateCaptured" put it into the WebResource as dcterms:created
    return results
766
+
767
+ @classmethod
768
def process_record(
    cls,
    record: _Element,
    edm_provider: Optional[str] = None,
    data_provider: Optional[str] = None,
) -> EDM_Record:
    """Maps a METS/MODS record to EDM using class methods that can be overwritten to adapt the mapping logic

    Args:
        record: METS/MODS record as lxml Element or ElementTree
        edm_provider: default value for edm:provider (Institution providing the data to Europeana). Mandatory if not overwritten in a subclass by overriding get_provider.
        data_provider: default value for edm:dataProvider (Institution providing the original data), if not provided in the METS/MODS record. Would otherwise be extracted from the amdSec using "mets:rightsMD/mets:mdWrap/mets:xmlData/dv:rights/dv:owner"

    Returns:
        The EDM record built from the provided CHO, the aggregation and all
        context objects collected while mapping.

    Raises:
        IndexError: if no ``mets:mets`` element is found in ``record``.
        AssertionError: if not exactly one amdSec matches the ADMID of the
            main logical div.
    """

    # Registry filled as a side effect by several mapping methods below.
    context_objects: CONTEXT_DICT_TYPE = {}

    record = record.xpath("//mets:mets", namespaces=METS_MODS_NAMESPACES)[0]
    logical_main_div = cls.get_main_structmap_div(record)
    dmd_sec = cls.get_mods_part(record, dmdid=logical_main_div.get("DMDID"))
    host_dmd_sec = cls.get_host_dmd_sec(record, dmd_sec, logical_main_div)

    amd_secs = cls.get_amd_part(record, amdid=logical_main_div.get("ADMID"))
    if len(amd_secs) != 1:
        # Raised explicitly (instead of using ``assert``) so the check also
        # runs under ``python -O``; the exception type is kept for callers.
        raise AssertionError(
            f'amdsec not found or multiples for id {logical_main_div.get("ADMID")}'
        )
    amd_sec = amd_secs[0]
    physical_main_div = record.find(
        "mets:structMap[@TYPE='PHYSICAL']/mets:div",  # [@TYPE='physSequence']",
        namespaces=METS_MODS_NAMESPACES,
    )

    filesec = record.find("mets:fileSec", namespaces=METS_MODS_NAMESPACES)

    edm_type = cls.get_edm_type(dmd_sec, logical_main_div=logical_main_div)

    languages = cls.get_languages(dmd_sec)
    if not languages and edm_type.value == "TEXT":
        # TEXT records without any language get "und" (undetermined).
        languages = ["und"]

    titles = cls.get_titles(dmd_sec, host_dmd_sec=host_dmd_sec)

    # TODO: lang tags potentially for all properties

    from_mods_subject = cls.parse_mods_subjects(dmd_sec, context_objects)

    from_mods_name = cls.parse_mods_names(
        dmd_sec=dmd_sec, host_dmd_sec=host_dmd_sec, context_objects=context_objects
    )

    cho = EDM_ProvidedCHO(
        id=Ref(value="1"),  # TODO: id
        **titles,
        dc_description=cls.get_descriptions(dmd_sec),
        edm_type=edm_type,
        dc_language=[Lit(value=lang) for lang in languages],
        dc_type=cls.get_types(dmd_sec, logical_main_div)
        + from_mods_subject["dc_type"],
        dc_subject=cls.get_subjects(dmd_sec)
        + from_mods_subject["dc_subject"]
        + from_mods_name["dc_subject"],
        dcterms_temporal=cls.get_temporals(dmd_sec)
        + from_mods_subject["dcterms_temporal"],
        dcterms_spatial=cls.get_spatials(dmd_sec)
        + from_mods_subject["dcterms_spatial"],
        dc_identifier=cls.get_identifiers(dmd_sec),
        dcterms_medium=cls.get_mediums(dmd_sec),
        dcterms_extent=cls.get_extent(dmd_sec),
        dc_publisher=cls.get_publishers(dmd_sec=dmd_sec, host_dmd_sec=host_dmd_sec)
        + from_mods_name["dc_publisher"],
        dc_creator=from_mods_name["dc_creator"],
        dc_contributor=from_mods_name["dc_contributor"],
        dcterms_provenance=from_mods_name["dcterms_provenance"],
        dcterms_issued=cls.get_issued(dmd_sec),
        dcterms_created=cls.get_created(dmd_sec),
        dcterms_isPartOf=cls.get_is_part_of(dmd_sec),
        dcterms_isReferencedBy=cls.get_referenced_by(dmd_sec, context_objects),
        edm_currentLocation=cls.get_current_location(dmd_sec),
    )

    provider = cls.get_provider(default=edm_provider)
    aggregation = ORE_Aggregation(
        id=Ref(value="2"),  # TODO: id
        edm_rights=cls.get_edm_rights(dmd_sec),
        edm_aggregatedCHO=cho.id,
        edm_dataProvider=cls.get_data_provider(
            dmd_sec, amd_sec, default=data_provider
        ),
        edm_provider=provider,
        **cls.get_webresource_urls(
            amd_sec, physical_main_div, logical_main_div, filesec, context_objects
        ),
    )

    # Must run last: only now does the registry hold every context object.
    context_classes = context_dict_to_edm_record_dict(context_objects)

    return EDM_Record(provided_cho=cho, aggregation=aggregation, **context_classes)
@@ -0,0 +1,104 @@
1
+ from typing import TypedDict, Callable
2
+
3
+ from edmlib import EDM_TimeSpan, EDM_WebResource, MixedValuesList
4
+ from edmlib.edm import Lit, Ref
5
+ from edmlib.edm.base import EDM_BaseClass
6
+ from lxml.etree import _Element
7
+
8
# Type of the "context objects" registry handed around during mapping:
# string keys mapped to EDM objects deriving from EDM_BaseClass.
# NOTE(review): keys look like the object's URI / id value - confirm against callers.
CONTEXT_DICT_TYPE = dict[str, EDM_BaseClass]
9
+
10
+
11
def context_dict_to_edm_record_dict(
    context_objects: CONTEXT_DICT_TYPE,
) -> dict[str, list[EDM_BaseClass]]:
    """Group the collected context objects into per-class lists.

    Web resources and time spans go to the fixed keys ``web_resource`` and
    ``edm_time_span``. Every other object is stored under its lower-cased
    class name, which must be one of the predefined keys (otherwise a
    ``KeyError`` is raised).

    Args:
        context_objects: Registry whose values are grouped; its keys are
            ignored.

    Returns:
        Dict with the keys ``edm_agent``, ``edm_place``, ``edm_time_span``,
        ``skos_concept``, ``svcs_service`` and ``web_resource``, each
        mapping to a (possibly empty) list of objects.
    """
    bucket_names = (
        "edm_agent",
        "edm_place",
        "edm_time_span",
        "skos_concept",
        "svcs_service",
        "web_resource",
    )
    grouped = {bucket: [] for bucket in bucket_names}
    for entry in context_objects.values():
        if isinstance(entry, EDM_WebResource):
            bucket = "web_resource"
        elif isinstance(entry, EDM_TimeSpan):
            bucket = "edm_time_span"
        else:
            # All remaining classes are named like their bucket.
            bucket = type(entry).__name__.lower()
        grouped[bucket].append(entry)
    return grouped
32
+
33
+
34
# Prefix -> namespace URI map handed as ``namespaces=`` to the XPath / find
# calls in this package, so queries can use these prefixes.
METS_MODS_NAMESPACES = {
    "mets": "http://www.loc.gov/METS/",
    "mods": "http://www.loc.gov/mods/v3",
    "intranda": "http://intranda.com/MODS/",
    "xlink": "http://www.w3.org/1999/xlink",
    "dv": "http://dfg-viewer.de/",
    "vl": "http://visuallibrary.net/vl",
    "ext": "http://ns.vls.io/mods",
}

# Typed dict with one MixedValuesList per listed EDM property.
# NOTE(review): presumably the result shape of the MODS name parsing - confirm
# against the mapper that produces it.
ModsNameResultsType = TypedDict(
    "ModsNameResultsType",
    {
        "dc_creator": MixedValuesList,
        "dc_publisher": MixedValuesList,
        "dc_contributor": MixedValuesList,
        "dcterms_provenance": MixedValuesList,
        "dc_subject": MixedValuesList,
    },
)
54
+
55
+
56
def xpath_first_match(element: _Element, xpath_query):
    """Return the first result of ``xpath_query`` on ``element``.

    The query is evaluated with the ``METS_MODS_NAMESPACES`` prefixes.
    ``None`` is returned when the query result is empty.
    """
    matches = element.xpath(xpath_query, namespaces=METS_MODS_NAMESPACES)
    if not matches:
        return None
    return matches[0]
59
+
60
+
61
def join_tag_texts(elements: list[_Element], separator=" "):
    """Join the non-empty ``text`` values of ``elements`` with ``separator``.

    Elements whose ``text`` is falsy (``None`` or empty) are skipped. An
    empty ``elements`` list gives the empty string.
    """
    if len(elements) == 0:
        return ""
    non_empty_texts = (node.text for node in elements if node.text)
    return separator.join(non_empty_texts)
66
+
67
+
68
def join_tag_texts_xpath(element: _Element, xpath_query, separator=" "):
    """Run ``xpath_query`` on ``element`` and join the texts of the matches.

    The query uses the ``METS_MODS_NAMESPACES`` prefixes; joining is
    delegated to ``join_tag_texts``.
    """
    matched = element.xpath(xpath_query, namespaces=METS_MODS_NAMESPACES)
    return join_tag_texts(matched, separator=separator)
73
+
74
+
75
def literal_list_from_xpath(
    element: _Element,
    xpath_query,
    string_extract_function: Callable[[_Element], str] = None,
):
    """Build one ``Lit`` per element matched by ``xpath_query``.

    Args:
        element: Element the query is evaluated against.
        xpath_query: XPath using the ``METS_MODS_NAMESPACES`` prefixes.
        string_extract_function: Optional callable producing the literal
            value from a matched element; the element's ``text`` is used
            when it is not given.

    Returns:
        List of ``Lit`` objects in match order; ``lang`` is the matched
        element's ``lang`` attribute (``None`` when absent).
    """
    literals = []
    for node in element.xpath(xpath_query, namespaces=METS_MODS_NAMESPACES):
        if string_extract_function:
            literal_value = string_extract_function(node)
        else:
            literal_value = node.text
        literals.append(Lit(value=literal_value, lang=node.get("lang")))
    return literals
87
+
88
+
89
def first_literal_from_xpath(element: _Element, xpath_query):
    """Return the first literal ``literal_list_from_xpath`` yields, or ``None`` if there is none."""
    literals = literal_list_from_xpath(element, xpath_query)
    return literals[0] if literals else None
94
+
95
+
96
def uri_list_from_xpath(element: _Element, xpath_query):
    """Build one ``Ref`` from the ``text`` of every element matched by ``xpath_query``.

    The query is evaluated on ``element`` with the ``METS_MODS_NAMESPACES``
    prefixes; the result keeps the match order.
    """
    references = []
    for node in element.xpath(xpath_query, namespaces=METS_MODS_NAMESPACES):
        references.append(Ref(value=node.text))
    return references
101
+
102
+
103
def mods_ns(tag_name: str):
    """Return ``tag_name`` qualified with the MODS namespace as ``{namespace-uri}tag_name``."""
    mods_uri = METS_MODS_NAMESPACES["mods"]
    return "".join(("{", mods_uri, "}", tag_name))
@@ -0,0 +1,29 @@
1
+ [tool.poetry]
2
+ name = "mets-to-edm"
3
+ # handled via git tags:
4
+ version = "0.2.1"
5
+ description = "Modular mapping of METS/MODS data to EDM (Europeana Data Model), providing comprehensive transformation of METS/MODS XML structures into Pydantic-based EDM classes and properties."
6
+ authors = ["Kulturpool <info@kulturpool.at>"]
7
+ license = "MIT"
8
+ readme = "README.md"
9
+ packages = [{ include = "./mets_to_edm" }]
10
+
11
+ [tool.poetry.dependencies]
12
+ python = "^3.12"
13
+ lxml = "^5.2.2"
14
+ edmlib = "^2.5.2"
15
+ requests = "^2.32.3"
16
+ types-lxml = "^2025.8.25"
17
+
18
+ [tool.poetry.group.dev.dependencies]
19
+ pytest = "^8.2.2"
20
+ black = "^24.8.0"
21
+
22
+ [build-system]
23
+ requires = ["poetry-core"]
24
+ build-backend = "poetry_dynamic_versioning.backend"
25
+
26
+ [tool.poetry-dynamic-versioning]
27
+ enable = false
28
+ vcs = "git"
29
+ pattern = "^(?P<base>[0-9]+[.][0-9]+[.][0-9]+)$"