gamslib 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gamslib/__init__.py +6 -0
- gamslib/objectcsv/__init__.py +57 -0
- gamslib/objectcsv/create_csv.py +173 -0
- gamslib/objectcsv/defaultvalues.py +28 -0
- gamslib/objectcsv/dublincore.py +188 -0
- gamslib/objectcsv/manage_csv.py +84 -0
- gamslib/objectcsv/objectcsv.py +315 -0
- gamslib/objectcsv/utils.py +21 -0
- gamslib/objectcsv/xlsx.py +60 -0
- gamslib/projectconfiguration/__init__.py +50 -0
- gamslib/projectconfiguration/configuration.py +108 -0
- gamslib/projectconfiguration/resources/project.toml +23 -0
- gamslib/py.typed +0 -0
- gamslib-0.3.1.dist-info/METADATA +94 -0
- gamslib-0.3.1.dist-info/RECORD +17 -0
- gamslib-0.3.1.dist-info/WHEEL +4 -0
- gamslib-0.3.1.dist-info/licenses/LICENSE +21 -0
gamslib/__init__.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Handle object and datastream metadata in csv files.
|
|
2
|
+
|
|
3
|
+
When creating bags for GAMS, we provide some metadata in csv
|
|
4
|
+
files (which are not part of the bag, btw).
|
|
5
|
+
|
|
6
|
+
The objectcsv package provides tools to handle this metadata.
|
|
7
|
+
|
|
8
|
+
* The ObjectCSV class represents the object and datastream csv data
|
|
9
|
+
for a single object. It is created by providing the path to the
|
|
10
|
+
object directory. It is composed of two classes:
|
|
11
|
+
|
|
12
|
+
* ObjectCSVFile represents the object metadata. It typically holds
|
|
13
|
+
a single ObjectData object, but can hold multiple objects if needed.
|
|
14
|
+
* DatastreamsCSVFile represents the datastream metadata. It holds
|
|
15
|
+
typically multiple DSData objects, one for each datastream.
|
|
16
|
+
* The dublincore module represents the object metadata stored in
|
|
17
|
+
the object's 'DC.xml' file. It provides useful functions for accessing
|
|
18
|
+
DC data, e.g. for preferred languages etc.
|
|
19
|
+
* The create_csv module can be used to initially create the csv files for
|
|
20
|
+
all objects
|
|
21
|
+
* The manage_csv module can be used to collect csv data from all objects
|
|
22
|
+
into a single file, which makes editing the data more efficient.
|
|
23
|
+
It also has a function to update the csv files in the object directories
|
|
24
|
+
based on the collected data.
|
|
25
|
+
* The xlsx module can be used to convert the csv files to xlsx files
|
|
26
|
+
and vice versa. This is useful for editing the data in a spreadsheet
|
|
27
|
+
without the hassles of importing and exporting the csv files, which
|
|
28
|
+
led to encoding problems in the past.
|
|
29
|
+
|
|
30
|
+
The "public" functions and classes from the submodules are directly
|
|
31
|
+
available in the objectcsv package:
|
|
32
|
+
|
|
33
|
+
* ObjectCSV
|
|
34
|
+
* ObjectData
|
|
35
|
+
* DSData
|
|
36
|
+
* create_csv_files
|
|
37
|
+
* collect_csv_data
|
|
38
|
+
* update_csv_files
|
|
39
|
+
* csv_to_xlsx
|
|
40
|
+
* xlsx_to_csv
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
from .objectcsv import ObjectCSV, ObjectData, DSData
|
|
44
|
+
from .create_csv import create_csv_files
|
|
45
|
+
from .manage_csv import collect_csv_data, update_csv_files
|
|
46
|
+
from .xlsx import csv_to_xlsx, xlsx_to_csv
|
|
47
|
+
|
|
48
|
+
__all__ = [
|
|
49
|
+
"ObjectCSV",
|
|
50
|
+
"ObjectData",
|
|
51
|
+
"DSData",
|
|
52
|
+
"create_csv_files",
|
|
53
|
+
"collect_csv_data",
|
|
54
|
+
"update_csv_files",
|
|
55
|
+
"csv_to_xlsx",
|
|
56
|
+
"xlsx_to_csv",
|
|
57
|
+
]
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""Create object.csv and datastreams.csv files.
|
|
2
|
+
|
|
3
|
+
This module creates the object.csv and datastreams.csv files for one or many given
|
|
4
|
+
object folder. It uses data from the DC.xml file and the project configuration
|
|
5
|
+
to fill in the metadata. When not enough information is available, some fields
|
|
6
|
+
will be left blank or filled with default values.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import mimetypes
|
|
11
|
+
import re
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
import warnings
|
|
14
|
+
|
|
15
|
+
from gamslib.projectconfiguration import Configuration
|
|
16
|
+
|
|
17
|
+
from .objectcsv import DSData, ObjectCSV, ObjectData
|
|
18
|
+
from .dublincore import DublinCore
|
|
19
|
+
from .utils import find_object_folders
|
|
20
|
+
from . import defaultvalues
|
|
21
|
+
|
|
22
|
+
# Use a module-specific logger instead of the root logger, so applications can
# filter and configure this module's messages individually. Records still
# propagate to handlers installed on the root logger.
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# XML namespace prefixes for Dublin Core lookups.
NAMESPACES = {"dc": "http://purl.org/dc/elements/1.1/"}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def get_rights(config: Configuration, dc: DublinCore) -> str:
    """Determine the rights statement from the available sources.

    The first non-empty value wins, checked in this order:

      1. the 'rights' element of the Dublin Core data
      2. the rights set in the project configuration metadata
      3. the default value (defaultvalues.DEFAULT_RIGHTS)
    """
    dc_rights = dc.get_element_as_str("rights", default="")
    if dc_rights:
        return dc_rights
    # Nothing usable in the DC data: fall back to configuration, then default.
    if config.metadata.rights:
        return config.metadata.rights
    return defaultvalues.DEFAULT_RIGHTS
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def extract_dsid(datastream: Path | str, keep_extension=True) -> str:
    """Extract and validate the datastream id from a datastream path.

    The id is the file name of ``datastream``. If ``keep_extension`` is False,
    the file extension is removed from the id, but only if the part after the
    last dot looks like an extension: either the ``mimetypes`` module knows it
    as an extension of the guessed mime type, or it consists of letters
    optionally followed by a single word character. Otherwise a UserWarning is
    issued and the name is kept unchanged. File names without any dot are kept
    as they are.

    :param datastream: Path (or path string) of the datastream file.
    :param keep_extension: If False, try to strip the file extension.
    :return: The validated datastream id.
    :raises ValueError: If the resulting id does not start and end with an
        alphanumeric character, is shorter than three characters or contains
        characters other than letters, digits, '-', '.', '%' and '_'.
    """
    if isinstance(datastream, str):
        datastream = Path(datastream)

    pid = datastream.name

    if not keep_extension:
        # not everything after the last dot is an extension :-(
        mtype = mimetypes.guess_type(datastream)[0]
        if mtype is None:
            known_extensions = []
        else:
            known_extensions = mimetypes.guess_all_extensions(mtype)

        if datastream.suffix in known_extensions:
            pid = pid.removesuffix(datastream.suffix)
            logger.debug("Removed extension '%s' for ID: %s", datastream.suffix, pid)
        else:
            parts = pid.split(".")
            if len(parts) > 1:
                if re.match(r"^[a-zA-Z]+\w?$", parts[-1]):
                    pid = ".".join(parts[:-1])
                    # log the resulting id, not just the text before the first dot
                    logger.debug("Removed extension for ID: %s", pid)
                else:
                    # report the rejected suffix, not the last character of the name
                    warnings.warn(
                        f"'{parts[-1]}' does not look like an extension. Keeping it in PID.",
                        UserWarning,
                    )
            # else: a name without any dot has no extension; stripping the
            # "last part" would leave an empty id.

    if re.match(r"^[a-zA-Z0-9]+[-.%_a-zA-Z0-9]+[a-zA-Z0-9]+$", pid) is None:
        raise ValueError(f"Invalid PID: '{pid}'")

    logger.debug(
        "Extracted PID: %s from %s (keep_extension=%s)", pid, datastream, keep_extension
    )
    return pid
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def collect_object_data(pid: str, config: Configuration, dc: DublinCore) -> ObjectData:
    """Assemble the object.csv data for one object from DC data and configuration.

    Title and description come from the Dublin Core data (the title falls back
    to ``pid``), project and creator from the configuration metadata, rights
    from get_rights(), source and object type from the default values.

    This is the place to change the resolving order for data from other sources.
    """
    # Multiple DC values are merged into a single "; "-separated string.
    joined_title = "; ".join(dc.get_element("title", default=pid))
    joined_description = "; ".join(dc.get_element("description", default=""))

    values = {
        "recid": pid,
        "title": joined_title,
        "project": config.metadata.project_id,
        "description": joined_description,
        "creator": config.metadata.creator,
        "rights": get_rights(config, dc),
        "source": defaultvalues.DEFAULT_SOURCE,
        "objectType": defaultvalues.DEFAULT_OBJECT_TYPE,
    }
    return ObjectData(**values)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def collect_datastream_data(
    ds_file: Path, config: Configuration, dc: DublinCore
) -> DSData:
    """Collect the datastreams.csv data for a single datastream file.

    Title and description are left empty here, because they cannot be derived
    from the DC file of the object.
    """
    dsid = extract_dsid(ds_file, config.general.dsid_keep_extension)

    # dspath is relative to the grandparent of the file ("objectsdir")
    objects_dir = ds_file.parents[1]
    relative_path = ds_file.relative_to(objects_dir)
    guessed_type = mimetypes.guess_type(ds_file)[0]

    return DSData(
        dspath=str(relative_path),
        dsid=dsid,
        title="",
        description="",
        mimetype=guessed_type or "",
        creator=config.metadata.creator,
        rights=get_rights(config, dc),
    )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def create_csv(
    object_directory: Path, configuration: Configuration, force_overwrite: bool = False
) -> ObjectCSV | None:
    """Generate the csv files containing the preliminary metadata for a single object.

    Existing csv files will not be touched unless 'force_overwrite' is True.

    :param object_directory: Directory of the object; must contain 'DC.xml'.
    :param configuration: The project configuration used to fill in metadata.
    :param force_overwrite: If True, existing csv files are removed and re-created.
    :return: The ObjectCSV that was written, or None if csv files already
        existed and were left untouched.
    """
    objectcsv = ObjectCSV(object_directory)

    # Avoid that existing (and potentially already edited) metadata is replaced
    if not objectcsv.is_new():
        if not force_overwrite:
            logger.info(
                "CSV files for object '%s' already exist. Will not be re-created.",
                objectcsv.object_id,
            )
            return None
        objectcsv.clear()
        # is_new() is already False if only one of the two files exists, so
        # the other one may be missing.
        objectcsv.obj_csv_file.unlink(missing_ok=True)
        objectcsv.ds_csv_file.unlink(missing_ok=True)

    dc = DublinCore(object_directory / "DC.xml")
    objectcsv.add_objectdata(
        collect_object_data(objectcsv.object_id, configuration, dc)
    )

    # Every regular file in the object directory is a datastream, except the
    # csv metadata files themselves.
    csv_names = (ObjectCSV.OBJECT_CSV_FILENAME, ObjectCSV.DATASTREAM_CSV_FILENAME)
    for ds_file in object_directory.glob("*"):
        if ds_file.is_file() and ds_file.name not in csv_names:
            objectcsv.add_datastream(
                collect_datastream_data(ds_file, configuration, dc)
            )
    objectcsv.write()
    return objectcsv
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def create_csv_files(
    root_folder: Path, config: Configuration, force_overwrite: bool = False
) -> list[ObjectCSV]:
    """Create the CSV files for all objects below root_folder.

    Returns the ObjectCSV objects of all objects for which csv files were
    written; objects skipped by create_csv() are not included.
    """
    results = (
        create_csv(folder, config, force_overwrite)
        for folder in find_object_folders(root_folder)
    )
    return [created for created in results if created is not None]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Default values for the datastream metadata."""
|
|
2
|
+
|
|
3
|
+
# Fallback values for individual metadata fields.
DEFAULT_CREATOR = "Unknown"
DEFAULT_MIMETYPE = "application/octet-stream"
DEFAULT_OBJECT_TYPE = "text"
DEFAULT_RIGHTS = "Creative Commons Attribution-NonCommercial 4.0 (https://creativecommons.org/licenses/by-nc/4.0/)"
DEFAULT_SOURCE = "local"

# Mapping of well-known datastream file names to default metadata values.
# Each entry provides the keys "title" and "description".
# Add a new entry here to provide defaults for another file name.
FILENAME_MAP = {
    "DC.xml": {
        "title": "Dublin Core Metadata",
        "description": "Dublin Core Metadata in XML format for this content file.",
    },
    "TEI.xml": {
        "title": "Main TEI file",
        "description": "The central TEI File for this object",
    },
    "LIDO.xml": {
        "title": "Main LIDO file",
        "description": "The central LIDO file of this object",
    },
    "RDF.xml": {
        "title": "RDF Statements",
        "description": "",
    },
}
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""Represents data from DC.xml and provides some methods to access it."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any
|
|
6
|
+
from xml.etree import ElementTree as ET
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
# Names (without namespace prefix) of the Dublin Core elements that are read
# from DC.xml.
DC_ELEMENTS = [
    "contributor", "coverage", "creator", "date", "description",
    "format", "identifier", "language", "publisher", "relation",
    "rights", "source", "subject", "title", "type",
]
|
|
27
|
+
|
|
28
|
+
## DC_TERMS and DCMI_TYPES are not used in the current implementation,
|
|
29
|
+
## but are kept here for future reference.
|
|
30
|
+
# DC_TERMS = [
|
|
31
|
+
# "abstract",
|
|
32
|
+
# "accessRights",
|
|
33
|
+
# "accrualMethod",
|
|
34
|
+
# "accrualPeriodicity",
|
|
35
|
+
# "accrualPolicy",
|
|
36
|
+
# "alternative",
|
|
37
|
+
# "audience",
|
|
38
|
+
# "available",
|
|
39
|
+
# "bibliographicCitation",
|
|
40
|
+
# "conformsTo",
|
|
41
|
+
# "created",
|
|
42
|
+
# "dateAccepted",
|
|
43
|
+
# "dateCopyrighted",
|
|
44
|
+
# "dateSubmitted",
|
|
45
|
+
# "educationLevel",
|
|
46
|
+
# "extent",
|
|
47
|
+
# "hasFormat",
|
|
48
|
+
# "hasPart",
|
|
49
|
+
# "hasVersion",
|
|
50
|
+
# "instructionalMethod",
|
|
51
|
+
# "isFormatOf",
|
|
52
|
+
# "isPartOf",
|
|
53
|
+
# "isReferencedBy",
|
|
54
|
+
# "isReplacedBy",
|
|
55
|
+
# "isRequiredBy",
|
|
56
|
+
# "issued",
|
|
57
|
+
# "isVersionOf",
|
|
58
|
+
# "license",
|
|
59
|
+
# "mediator",
|
|
60
|
+
# "medium",
|
|
61
|
+
# "modified",
|
|
62
|
+
# "provenance",
|
|
63
|
+
# "references",
|
|
64
|
+
# "replaces",
|
|
65
|
+
# "requires",
|
|
66
|
+
# "rightsHolder",
|
|
67
|
+
# "spatial",
|
|
68
|
+
# "tableOfContents",
|
|
69
|
+
# "temporal",
|
|
70
|
+
# "valid",
|
|
71
|
+
# ]
|
|
72
|
+
|
|
73
|
+
# DCMI_TYPES = [
|
|
74
|
+
# "Collection",
|
|
75
|
+
# "Dataset",
|
|
76
|
+
# "Event",
|
|
77
|
+
# "Image",
|
|
78
|
+
# "InteractiveResource",
|
|
79
|
+
# "MovingImage",
|
|
80
|
+
# "PhysicalObject",
|
|
81
|
+
# "Service",
|
|
82
|
+
# "Software",
|
|
83
|
+
# "Sound",
|
|
84
|
+
# "StillImage",
|
|
85
|
+
# "Text",
|
|
86
|
+
# ]
|
|
87
|
+
|
|
88
|
+
# XML namespace prefixes usable in ElementTree lookups.
NAMESPACES = {
    "dc": "http://purl.org/dc/elements/1.1/",
    "dcterms": "http://purl.org/dc/terms/",
    "dcmitype": "http://purl.org/dc/dcmitype/",
    # the RDF namespace URI was truncated ("…/22-rdf"), which can never match
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "xml": "http://www.w3.org/XML/1998/namespace",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "oai_dc": "http://www.openarchives.org/OAI/2.0/oai_dc/",
}
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class DublinCore:
    """Represents data from DC.xml and provides some methods to access it.

    The file is parsed once during construction. Values are stored per element
    name and language; elements without an 'xml:lang' attribute are stored
    under the pseudo language "unspecified".
    """

    def __init__(
        self, path: Path, lookup_order: tuple = ("en", "de", "fr", "es", "it")
    ):
        """Parse the DC file at ``path``.

        :param path: Path to the DC.xml file.
        :param lookup_order: Languages to try, in this order, when a value is
            not available in the preferred language.
        """
        self.path: Path = path
        self.lookup_order: list[str] = list(lookup_order)
        self._data: dict[str, Any] = {}  # [element][lang] = [text, ...]
        self._parse(path)

    def _parse(self, path: Path):
        """Read all known DC elements that are direct children of the root element."""
        root = ET.parse(path).getroot()
        lang_attribute = f"{{{NAMESPACES['xml']}}}lang"

        for elem in DC_ELEMENTS:
            for child in root.findall(f"dc:{elem}", namespaces=NAMESPACES):
                lang = child.attrib.get(lang_attribute, "unspecified")
                # Register element and language even if the element has no
                # text; the list of values simply stays empty then.
                values = self._data.setdefault(elem, {}).setdefault(lang, [])
                if child.text is not None:
                    values.append(child.text)
        # TODO: Add DC_TERMS and DCMI_TYPES?

    def get_element(
        self, name: str, preferred_lang: str = "en", default: str = ""
    ) -> list[str]:
        """Return the value of a Dublin Core element as list of strings.

        This method is able to deal with multiple values of the same element in
        different languages. Returns always a list of strings, because in DC an
        element can have multiple values.

        :param str name: The name of the element without namespace (e.g. "title").
        :param str preferred_lang: The preferred language of the element (eg. "de").
            Default value is "en". If no entry in this language is available,
            the languages of the lookup_order set during object creation are
            tried in order, then entries without an 'xml:lang' attribute.
        :param str default: The default value to return if no value is found.
            Be aware that this value is returned as the only element of a list.
        :return: The value(s) of the element as list of strings.
        :raises ValueError: If ``name`` is not a Dublin Core element.
        """
        if name not in DC_ELEMENTS:
            raise ValueError(f"Element {name} is not a Dublin Core element.")

        values_by_lang = self._data.get(name)
        # element not in DC.xml
        if values_by_lang is None:
            logger.debug(
                "Element %s not found in %s. Returning default value: [%s]",
                name,
                self.path,
                default,
            )
            return [default]

        if preferred_lang in values_by_lang:
            return values_by_lang[preferred_lang]

        for lang in self.lookup_order + ["unspecified"]:
            if lang in values_by_lang:
                logger.debug(
                    "Preferred language %s not found in %s. Using %s instead.",
                    preferred_lang,
                    self.path,
                    lang,
                )
                return values_by_lang[lang]

        # The element exists, but only in languages that are neither preferred
        # nor part of the lookup order: return the default instead of failing
        # with a KeyError.
        logger.debug(
            "Element %s has no value in a supported language in %s. "
            "Returning default value: [%s]",
            name,
            self.path,
            default,
        )
        return [default]

    def get_element_as_str(
        self, name: str, preferred_lang: str = "en", default: str = ""
    ) -> str:
        """Return the value(s) of a Dublin Core element as string.

        This is a wrapper around get_element() which returns a single
        string instead of a list.

        It is able to distinguish between different elements
        (eg. rights is handled differently than title): exactly two 'rights'
        values are rendered as "<first> (<second>)", all other multiple values
        are joined with "; ".
        """
        values = self.get_element(name, preferred_lang, default)
        if len(values) == 1:
            return values[0]
        if name == "rights" and len(values) == 2:
            # we expect the licence name first, followed by the url
            return f"{values[0]} ({values[1]})"
        return "; ".join(values)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Functions to collect and update csv files."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from gamslib.objectcsv.objectcsv import ObjectCSV
|
|
7
|
+
|
|
8
|
+
from .utils import find_object_folders
|
|
9
|
+
|
|
10
|
+
# Use a module-specific logger instead of the root logger, so applications can
# filter and configure this module's messages individually. Records still
# propagate to handlers installed on the root logger.
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def collect_csv_data(
    object_root_dir: Path,
    object_csv_path: Path,
    datastream_csv_path: Path,
) -> ObjectCSV:
    """Collect csv data from all object folders below object_root_dir.

    The object and datastream metadata of every object folder found below
    object_root_dir is gathered, sorted and written to the two files
    'object_csv_path' and 'datastream_csv_path'.

    Returns an ObjectCSV object containing all object and datastream metadata.
    """
    # This is where we put all collected data
    combined = ObjectCSV(object_root_dir)

    for folder in find_object_folders(object_root_dir):
        single = ObjectCSV(folder)

        for object_row in single.get_objectdata():
            combined.add_objectdata(object_row)
        for datastream_row in single.get_datastreamdata():
            combined.add_datastream(datastream_row)

    combined.sort()
    combined.write(object_csv_path, datastream_csv_path)
    return combined
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def update_csv_files(
    object_root_dir: Path,
    input_dir: Path | None = None,
    object_csv_filename: str = "all_objects.csv",
    ds_csv_filename: str = "all_datastreams.csv",
) -> tuple[int, int]:
    """Update csv metadata files with data from the combined csv data.

    If input_dir is None, we assume that the directory containing the combined
    csv data is the current working directory.

    `object_csv_filename` and `ds_csv_filename` are the names of the combined
    csv files. They must only be set if the names differ from the defaults.

    In other words: this function updates all object and datastream
    metadata with data changed in the central csv files.

    Returns a tuple of ints: number of updated objects and number of updated
    datastreams.
    """
    source_dir = Path.cwd() if input_dir is None else input_dir
    combined = ObjectCSV(source_dir, object_csv_filename, ds_csv_filename)

    objects_updated = 0
    datastreams_updated = 0

    for folder in find_object_folders(object_root_dir):
        target = ObjectCSV(folder)
        # Drop the data currently held for this object; it is re-filled from
        # the combined data below. (clear() is defined outside this view.)
        target.clear()

        for object_row in combined.get_objectdata(target.object_id):
            target.add_objectdata(object_row)
            objects_updated += 1

        for datastream_row in combined.get_datastreamdata(target.object_id):
            target.add_datastream(datastream_row)
            datastreams_updated += 1

        target.write()

    return objects_updated, datastreams_updated
|
|
@@ -0,0 +1,315 @@
|
|
|
1
|
+
"""Provides classes to handle object and datastream data in csv files.
|
|
2
|
+
|
|
3
|
+
The central class is ObjectCSV, which represents the object and datastream data.
|
|
4
|
+
|
|
5
|
+
ObjectCSV is directly accessible from the objectcsv package.
|
|
6
|
+
"""
|
|
7
|
+
# pylint: disable=too-many-instance-attributes
|
|
8
|
+
# pylint: disable=invalid-name
|
|
9
|
+
|
|
10
|
+
import csv
|
|
11
|
+
from dataclasses import asdict, dataclass, fields
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Generator
|
|
14
|
+
from . import defaultvalues
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class ObjectData:
    """Represents csv data for a single object."""

    recid: str
    title: str = ""
    project: str = ""
    description: str = ""
    creator: str = ""
    rights: str = ""
    publisher: str = ""
    source: str = ""
    objectType: str = ""

    def validate(self):
        """Validate the object data.

        Raises a ValueError for the first of recid, title, rights, source and
        objectType (checked in this order) that is empty.
        """
        # TODO: Needs discussion
        if not self.recid:
            raise ValueError("recid must not be empty")
        for required in ("title", "rights", "source", "objectType"):
            if not getattr(self, required):
                raise ValueError(f"{self.recid}: {required} must not be empty")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class DSData:
    """Represents csv data for a single datastream of a single object.

    Missing values (mimetype, title, description, rights, creator) are filled
    with defaults right after construction, where defaults are available.
    """

    dspath: str
    dsid: str = ""
    title: str = ""
    description: str = ""
    mimetype: str = ""
    creator: str = ""
    rights: str = ""
    lang: str = ""

    def __post_init__(self):
        """Add missing values if applicable."""
        self._guess_mimetype()
        self._guess_missing_values()

    @property
    def object_id(self):
        """Return the object id of the object the datastream is part of.

        This is the first component of dspath.
        """
        return Path(self.dspath).parts[0]

    def validate(self):
        """Validate the datastream data.

        Raises a ValueError if dspath, dsid, mimetype or rights is empty or
        consists of whitespace only.
        """
        if not self.dspath.strip():
            raise ValueError(f"{self.dsid}: dspath must not be empty")
        if not self.dsid.strip():
            raise ValueError(f"{self.dspath}: dsid must not be empty")
        if not self.mimetype.strip():
            raise ValueError(f"{self.dspath}: mimetype must not be empty")
        if not self.rights.strip():
            raise ValueError(f"{self.dspath}: rights must not be empty")

    def _guess_mimetype(self):
        """Set the default mimetype if the mimetype is empty."""
        # TODO!
        if not self.mimetype:
            self.mimetype = defaultvalues.DEFAULT_MIMETYPE

    def _guess_missing_values(self):
        """Fill empty title, description, rights and creator with defaults."""
        filename = Path(self.dspath).name
        if not self.title:
            if filename in defaultvalues.FILENAME_MAP:
                # Index with the file name that was tested above. The dsid can
                # differ from the file name (e.g. without extension), which
                # raised a KeyError before.
                self.title = defaultvalues.FILENAME_MAP[filename]["title"]
            elif self.mimetype.startswith("image/"):
                self.title = f"Image: {self.dsid}"
            elif self.mimetype.startswith("audio/"):
                self.title = f"Audio: {self.dsid}"
            elif self.mimetype.startswith("video/"):
                self.title = f"Video: {self.dsid}"

        if not self.description:
            if filename in defaultvalues.FILENAME_MAP:
                self.description = defaultvalues.FILENAME_MAP[filename]["description"]
        if not self.rights:
            self.rights = defaultvalues.DEFAULT_RIGHTS
        if not self.creator:
            self.creator = defaultvalues.DEFAULT_CREATOR
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# NOTE(review): the class declares no dataclass fields, so @dataclass only adds
# a generated __repr__/__eq__ which ignore the stored data. Kept for backward
# compatibility -- confirm whether the decorator can be dropped.
@dataclass
class ObjectCSVFile:
    """Represents the object metadata rows of a csv file.

    Holds typically a single ObjectData object, but can hold multiple objects
    (e.g. for data collected from many objects).
    """

    def __init__(self):
        self._objectdata: list[ObjectData] = []

    def add_objectdata(self, objectdata: ObjectData):
        """Add an ObjectData object."""
        self._objectdata.append(objectdata)

    def get_data(self, pid: str | None = None) -> Generator[ObjectData, None, None]:
        """Return the objectdata objects for a given object pid.

        If pid is None, return all objectdata objects.
        Filtering by pid is only needed if we have data from multiple objects.
        """
        for objdata in self._objectdata:
            if pid is None or objdata.recid == pid:
                yield objdata

    @classmethod
    def from_csv(cls, csv_file: Path) -> "ObjectCSVFile":
        """Load the object data from a csv file.

        The column names of the csv file must match the field names of
        ObjectData, as every row is passed to ObjectData as keyword arguments.
        """
        # Use cls instead of the hard-coded class, so subclasses get an
        # instance of their own type.
        obj_csv_file = cls()
        with csv_file.open(encoding="utf-8", newline="") as f:
            for row in csv.DictReader(f):
                obj_csv_file.add_objectdata(ObjectData(**row))
        return obj_csv_file

    def to_csv(self, csv_file: Path) -> None:
        """Save the object data to a csv file (header row plus one row per object)."""
        with csv_file.open("w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(
                f, fieldnames=[field.name for field in fields(ObjectData)]
            )
            writer.writeheader()
            for objdata in self._objectdata:
                writer.writerow(asdict(objdata))

    def __len__(self):
        """Return the number of objectdata objects."""
        return len(self._objectdata)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class DatastreamsCSVFile:
    """Represents the datastream metadata rows of a csv file.

    Holds typically multiple DSData objects, one for each datastream.
    """

    def __init__(self):
        self._datastreams: list[DSData] = []

    def add_datastream(self, dsdata: DSData):
        """Add a datastream to the datastreams."""
        self._datastreams.append(dsdata)

    def get_data(self, pid: str | None = None) -> Generator[DSData, None, None]:
        """Return the datastream objects for a given object pid.

        If pid is None, return all datastream objects.
        Filtering by pid is only needed if we have data from multiple objects.
        """
        for dsdata in self._datastreams:
            # TODO: this is not object_id!
            if pid is None or dsdata.object_id == pid:
                yield dsdata

    @classmethod
    def from_csv(cls, csv_file: Path) -> "DatastreamsCSVFile":
        """Load the datastream data from a csv file.

        The column names of the csv file must match the field names of DSData,
        as every row is passed to DSData as keyword arguments.
        """
        # Use cls instead of the hard-coded class, so subclasses get an
        # instance of their own type.
        ds_csv_file = cls()
        with csv_file.open(encoding="utf-8", newline="") as f:
            for row in csv.DictReader(f):
                ds_csv_file.add_datastream(DSData(**row))
        return ds_csv_file

    def to_csv(self, csv_file: Path):
        """Save the datastream data to a csv file.

        The stored datastreams are sorted by dspath (in place) before writing.
        """
        self._datastreams.sort(key=lambda x: x.dspath)
        with csv_file.open("w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(
                f, fieldnames=[field.name for field in fields(DSData)]
            )
            writer.writeheader()
            for dsdata in self._datastreams:
                writer.writerow(asdict(dsdata))

    def __len__(self):
        """Return the number of datastreams."""
        return len(self._datastreams)
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
@dataclass
class ObjectCSV:
    """Represents the object and datastream data for a single object.

    The constructor expects the path to the object directory. A plain
    string is accepted as well and converted to a ``Path``.
    If the csv file names are not set, the default file names are used:
    object.csv and datastreams.csv.

    Raises:
        FileNotFoundError: if the object directory does not exist.
    """

    OBJECT_CSV_FILENAME = "object.csv"
    DATASTREAM_CSV_FILENAME = "datastreams.csv"

    object_dir: Path
    object_file: str = OBJECT_CSV_FILENAME
    datastream_file: str = DATASTREAM_CSV_FILENAME

    def __post_init__(self):
        """Check that the object directory exists and load both csv files.

        A csv file that does not exist yet is represented by an empty
        container, so data can be added and written later.
        """
        # Be forgiving about callers passing a str instead of a Path;
        # otherwise the is_dir() call below fails with an AttributeError.
        self.object_dir = Path(self.object_dir)
        if not self.object_dir.is_dir():
            raise FileNotFoundError(
                f"Object directory '{self.object_dir}' does not exist."
            )

        self.obj_csv_file = self.object_dir / self.object_file
        self.ds_csv_file = self.object_dir / self.datastream_file

        if self.obj_csv_file.is_file():
            self.object_data = ObjectCSVFile.from_csv(self.obj_csv_file)
        else:
            self.object_data = ObjectCSVFile()

        if self.ds_csv_file.is_file():
            self.datastream_data = DatastreamsCSVFile.from_csv(self.ds_csv_file)
        else:
            self.datastream_data = DatastreamsCSVFile()

    def is_new(self):
        """Return True if neither the object csv nor the datastream csv exists."""
        return not (self.obj_csv_file.exists() or self.ds_csv_file.exists())

    def add_datastream(self, dsdata: DSData):
        """Add a datastream entry to the datastream data."""
        self.datastream_data.add_datastream(dsdata)

    def add_objectdata(self, objectdata: ObjectData):
        """Add an object entry to the object data."""
        self.object_data.add_objectdata(objectdata)

    def get_objectdata(
        self, pid: str | None = None
    ) -> Generator[ObjectData, None, None]:
        """Return the object data for a given object pid.

        If pid is None, return all object data.
        """
        return self.object_data.get_data(pid)

    def get_datastreamdata(
        self, pid: str | None = None
    ) -> Generator[DSData, None, None]:
        """Return the datastream data for a given object pid.

        If pid is None, return all datastream data.
        """
        return self.datastream_data.get_data(pid)

    def sort(self):
        """Sort object data by recid and datastream data by dspath (in place)."""
        # NOTE: sorts the underlying lists of the two containers directly.
        self.object_data._objectdata.sort(key=lambda x: x.recid)
        self.datastream_data._datastreams.sort(key=lambda x: x.dspath)

    def write(
        self,
        object_csv_path: Path | None = None,
        datastream_csv_path: Path | None = None
    ):
        """Save the object and datastream data to csv files.

        If no explicit output files are set, the csv paths determined
        during initialization (inside object_dir) are used.
        """
        if object_csv_path is None:
            object_csv_path = self.obj_csv_file
        if datastream_csv_path is None:
            datastream_csv_path = self.ds_csv_file
        self.object_data.to_csv(object_csv_path)
        self.datastream_data.to_csv(datastream_csv_path)

    def count_objects(self) -> int:
        """Return the number of object data objects."""
        return len(self.object_data)

    def count_datastreams(self) -> int:
        """Return the number of datastream data objects."""
        return len(self.datastream_data)

    def clear(self):
        """Replace the object and datastream data with empty containers."""
        self.object_data = ObjectCSVFile()
        self.datastream_data = DatastreamsCSVFile()

    @property
    def object_id(self):
        """Return the object id, which is the name of the object directory."""
        return self.object_dir.name
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Utility functions for the objectcsv module."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Generator
|
|
6
|
+
import warnings
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger()
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def find_object_folders(root_directory: Path) -> Generator[Path, None, None]:
    """Yield every directory below root_directory that contains a DC.xml entry.

    All entries below root_directory are visited recursively. Directories
    without a direct child named DC.xml are skipped with a UserWarning.
    """
    for candidate in root_directory.rglob("*"):
        if not candidate.is_dir():
            continue
        child_names = {child.name for child in candidate.iterdir()}
        if "DC.xml" not in child_names:
            warnings.warn(
                f"Skipping '{candidate}' as folder does not contain a DC.xml file.",
                UserWarning
            )
            continue
        yield candidate
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Module to convert csv to xlsx files."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import pylightxl as xl
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def read_csv(csvfile: Path, skip_header: bool = True) -> list[list[str]]:
    """Read a csv file and return its rows as a list of string lists.

    Args:
        csvfile: path to the csv file; it is read as UTF-8.
        skip_header: if True, the first row is dropped from the result.

    Returns:
        All (remaining) rows of the file. An empty file yields an empty
        list, regardless of skip_header.
    """
    with open(csvfile, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        if skip_header:
            # The default keeps next() from raising StopIteration when the
            # file is empty and there is no header row to skip.
            next(reader, None)
        return list(reader)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def csv_to_xlsx(object_csv: Path, ds_csv: Path, output_file: Path) -> Path:
    """Combine the two csv files into a single xlsx workbook.

    The rows of object_csv (header included) go to the worksheet
    "Object Metadata", the rows of ds_csv to "Datastream Metadata".
    The workbook is written to output_file.

    Returns Path to output file.
    """
    # Both csv files are read completely before the workbook is created.
    sheets = (
        ("Object Metadata", read_csv(object_csv, skip_header=False)),
        ("Datastream Metadata", read_csv(ds_csv, skip_header=False)),
    )

    workbook = xl.Database()
    for sheet_name, rows in sheets:
        workbook.add_ws(sheet_name)
        # Row and column indexes passed to update_index are 1-based here.
        for row_number, cells in enumerate(rows, start=1):
            for column_number, cell in enumerate(cells, start=1):
                workbook.ws(ws=sheet_name).update_index(
                    row=row_number, col=column_number, val=cell
                )
    xl.writexl(fn=output_file, db=workbook)
    return output_file
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def xlsx_to_csv(xlsx_path: Path, obj_csv_path: Path, ds_csv_path: Path)-> tuple[Path, Path]:
    """Split a xlsx metadata file into an object csv and a datastream csv.

    The rows of the worksheet "Object Metadata" are written to obj_csv_path,
    the rows of "Datastream Metadata" to ds_csv_path (both UTF-8 encoded).

    Return Paths to the csv files as tuple (obj_csv_path, ds_csv_path).
    """
    workbook = xl.readxl(xlsx_path)

    # Read both worksheets before any csv file is opened for writing.
    sheet_rows = [
        list(workbook.ws(ws=sheet_name).rows)
        for sheet_name in ("Object Metadata", "Datastream Metadata")
    ]

    for target_path, rows in zip((obj_csv_path, ds_csv_path), sheet_rows):
        with open(target_path, "w", encoding="utf-8", newline="") as handle:
            csv.writer(handle).writerows(rows)
    return obj_csv_path, ds_csv_path
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Handle project configuration.
|
|
2
|
+
|
|
3
|
+
The projectconfiguration package contains the classes and functions to
|
|
4
|
+
manage the configuration of a GAMS project.
|
|
5
|
+
|
|
6
|
+
It tries to find a configuration, validates the configuration and provides
|
|
7
|
+
all configuration inline tables as Python objects.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from importlib import resources as impresources
|
|
12
|
+
import shutil
|
|
13
|
+
import logging
|
|
14
|
+
import warnings
|
|
15
|
+
from .configuration import Configuration, find_project_toml
|
|
16
|
+
|
|
17
|
+
__all__ = ["Configuration"]
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger()
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def create_configuration(objects_dir: Path) -> Path | None:
|
|
23
|
+
"""Create a project.toml template file in the objects_dir directory.
|
|
24
|
+
|
|
25
|
+
It is assumed that the objects_dir is the root directory of a GAMS project
|
|
26
|
+
and that the directory exists.
|
|
27
|
+
The template file will not be created if a project.toml file already exists.
|
|
28
|
+
Return the path to the created project.toml file or None if the file already exists.
|
|
29
|
+
"""
|
|
30
|
+
toml_file = objects_dir / "project.toml"
|
|
31
|
+
if toml_file.exists():
|
|
32
|
+
warnings.warn(f"'{toml_file}' already exists. Will not be re-created.", UserWarning)
|
|
33
|
+
return None
|
|
34
|
+
toml_template_file = str(
|
|
35
|
+
impresources.files("gamslib")
|
|
36
|
+
/ "projectconfiguration"
|
|
37
|
+
/ "resources"
|
|
38
|
+
/ "project.toml"
|
|
39
|
+
)
|
|
40
|
+
shutil.copy(toml_template_file, toml_file)
|
|
41
|
+
return objects_dir / "project.toml"
|
|
42
|
+
|
|
43
|
+
def load_configuration(object_root: Path, config_file: Path | str | None = None) -> Configuration:
    """Load the project configuration and return it as a Configuration object.

    If config_file is not given, the project.toml file is searched via
    find_project_toml(object_root). A config_file given as string is
    converted to a Path.
    """
    if config_file is None:
        resolved_file = find_project_toml(object_root)
    else:
        resolved_file = config_file
    if isinstance(resolved_file, str):
        resolved_file = Path(resolved_file)

    return Configuration.from_toml(resolved_file)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"Provides a configuration class"
|
|
2
|
+
|
|
3
|
+
# pylint: disable=too-many-arguments
|
|
4
|
+
# pylint: disable=too-few-public-methods
|
|
5
|
+
# pylint: disable=too-many-positional-arguments
|
|
6
|
+
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import tomllib
|
|
9
|
+
|
|
10
|
+
from typing import Annotated, Literal
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
from pydantic import BaseModel, ValidationError, StringConstraints
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def find_project_toml(start_dir: Path) -> Path:
    """Locate the project.toml file, starting the search at start_dir.

    start_dir itself is checked first, then each of its parent folders.
    If none of them contains a project.toml, the current working
    directory is checked as a last resort.

    Return a path object to the project.toml file.
    If no project.toml file is found, raise a FileNotFoundError.
    """
    # start_dir comes first, followed by its ancestors from nearest to farthest.
    for folder in (start_dir, *start_dir.parents):
        candidate = folder / "project.toml"
        if candidate.exists():
            return candidate

    # Nothing was found in start_dir or above, so fall back to the
    # current working directory.
    fallback = Path.cwd() / "project.toml"
    if not fallback.exists():
        raise FileNotFoundError("No project.toml file found in or above the start_dir.")
    return fallback
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Metadata(BaseModel, validate_assignment=True):
    """Represent the 'metadata' section of the configuration file.

    project_id, creator and publisher are required; rights is optional.
    """

    # Required; must be at least 2 characters long.
    project_id: Annotated[str, StringConstraints(min_length=2)]
    # Required; must be at least 3 characters long.
    creator: Annotated[str, StringConstraints(min_length=3)]
    # Required; must be at least 3 characters long.
    publisher: Annotated[str, StringConstraints(min_length=3)]
    # Optional; defaults to an empty string.
    rights: str = ""
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class General(BaseModel, validate_assignment=True):
    """Represent the 'general' section of the configuration file.

    Both fields are optional and have default values.
    """

    # Defaults to True.
    dsid_keep_extension: bool = True
    # Only the listed level names are accepted; defaults to "info".
    loglevel: Literal["debug", "info", "warning", "error", "critical"] = "info"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class Configuration(BaseModel):
    """Represent the configuration from the project toml file.

    Properties can be accessed as attributes of the object and sub object:
    eg.: metadata.rights
    """

    toml_file: Path
    metadata: Metadata
    general: General

    @classmethod
    def _make_readable_message(cls, cfgfile, error_type: str, loc: tuple) -> str | None:
        """Return a readable error message or None.

        Helper function which creates a readable error message from the
        'type' and 'loc' entries of a pydantic validation error.

        Returns None if error_type is not known by this function.
        """
        # There are many more types which could be handled, but are not needed yet.
        # See: https://docs.pydantic.dev/latest/errors/validation_errors/
        reasons = {
            "missing": "missing required field",
            "string_too_short": "value is too short",
            "bool_type": "value is not a boolean",
            "bool_parsing": "value is not a boolean",
            "literal_error": "value is not allowed here",
        }

        loc_str = ".".join([str(e) for e in loc])
        reason = reasons.get(error_type)
        if reason is None:
            return None
        return f"Error in project TOML file '{cfgfile}'. {reason}: '{loc_str}'"

    @classmethod
    def from_toml(cls, toml_file: Path) -> "Configuration":
        """Create a configuration object from a toml file.

        Raises:
            FileNotFoundError: if toml_file does not exist.
            tomllib.TOMLDecodeError: if the file is not valid TOML.
            ValueError: if the content does not pass validation. Only the
                first validation error is reported.
        """
        try:
            with toml_file.open("r", encoding="utf-8", newline="") as tf:
                data = tomllib.loads(tf.read())
            data["toml_file"] = toml_file
            return cls(**data)
        except FileNotFoundError as e:
            # Report the missing file itself, not its parent directory.
            raise FileNotFoundError(
                f"Configuration file '{toml_file}' not found."
            ) from e
        except tomllib.TOMLDecodeError as e:
            raise tomllib.TOMLDecodeError(
                f"Error in project TOML file '{toml_file}': {e}"
            ) from e
        except ValidationError as e:
            first_error = e.errors()[0]
            msg = cls._make_readable_message(
                toml_file, first_error["type"], first_error["loc"]
            )
            if msg is None:
                # Error type without a short description: fall back to the
                # validation error text instead of raising ValueError(None).
                msg = f"Error in project TOML file '{toml_file}': {e}"
            raise ValueError(msg) from e
|
|
108
|
+
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
|
|
2
|
+
# [metadata] contains general metadata for the GAMS project
|
|
3
|
+
[metadata]
|
|
4
|
+
# the id of the project (eg.: hsa)
|
|
5
|
+
project_id = ""
|
|
6
|
+
|
|
7
|
+
# The project creator (like in Dublin Core)
|
|
8
|
+
creator = ""
|
|
9
|
+
|
|
10
|
+
# You might want to keep this value ("GAMS")
|
|
11
|
+
publisher = "GAMS"
|
|
12
|
+
|
|
13
|
+
# Set the default license for the project.
|
|
14
|
+
# Can be overridden in object.csv and datastreams.csv for single objects/data streams
|
|
15
|
+
rights = "Creative Commons Attribution-NonCommercial 4.0 (https://creativecommons.org/licenses/by-nc/4.0/)"
|
|
16
|
+
|
|
17
|
+
[general]
|
|
18
|
+
# Normally dsid is the full filename (with extension). Set to false to strip extension for dsid.
|
|
19
|
+
dsid_keep_extension = true
|
|
20
|
+
|
|
21
|
+
# debug, info, warning, error or critical
|
|
22
|
+
loglevel = "info"
|
|
23
|
+
|
gamslib/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gamslib
|
|
3
|
+
Version: 0.3.1
|
|
4
|
+
Summary: Modules and subpackages used in various GAMS5 related projects
|
|
5
|
+
Project-URL: Homepage, https://github.com/DHGraz/gamslib
|
|
6
|
+
Project-URL: Repository, https://github.com/DHGraz/gamslib
|
|
7
|
+
Project-URL: Issues, https://github.com/DHGraz/gamslib/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/DHGraz/gamslib/blob/main/CHANGELOG.md
|
|
9
|
+
Author-email: Gunter Vasold <gunter.vasold@uni-graz.at>, Fabio Tosques <fabio.tosques@uni-graz.at>
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: GAMS
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
19
|
+
Requires-Python: >=3.11
|
|
20
|
+
Requires-Dist: pydantic>=2.10.1
|
|
21
|
+
Requires-Dist: pylightxl>=1.61
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
|
|
24
|
+
# gamslib
|
|
25
|
+
|
|
26
|
+
Gamslib is a collection of GAMS related modules and packages, which are used in multiple other packages.
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
gamslib is available on pypi.org and can be installed via pip:
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
pip install gamslib
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Usage
|
|
37
|
+
|
|
38
|
+
As gamslib is a library, it can only be used with other code.
|
|
39
|
+
|
|
40
|
+
The main purpose is to make code reusable in other GAMS5 projects and to have
|
|
41
|
+
a unique way of doing things. If you are not working on GAMS5 related code
|
|
42
|
+
(which is very likely), this library will be useless for you.
|
|
43
|
+
|
|
44
|
+
Currently these subpackages are available (more to come):
|
|
45
|
+
|
|
46
|
+
## objectcsv
|
|
47
|
+
|
|
48
|
+
Handle object and datastream metadata in object csv files.
|
|
49
|
+
|
|
50
|
+
When creating bags for GAMS, we provide some metadata in csv
|
|
51
|
+
files (which are not part of the bag, btw).
|
|
52
|
+
|
|
53
|
+
The objectcsv package provides tools to handle this metadata.
|
|
54
|
+
|
|
55
|
+
* The ObjectCSV class represents the object and datastream csv data for a
|
|
56
|
+
single object. It is created by providing the path to the object
|
|
57
|
+
directory.
|
|
58
|
+
* The manage_csv module can be used to collect csv data from all objects into
|
|
59
|
+
a single file, which makes editing the data more efficient. It also has a
|
|
60
|
+
function to update the csv files in the object directories based on the
|
|
61
|
+
collected data.
|
|
62
|
+
* The xlsx module can be used to convert the csv files to xlsx files and vice
|
|
63
|
+
versa. This is useful for editing the data in a spreadsheet without the
|
|
64
|
+
hassles of importing and exporting the csv files, which led to encoding
|
|
65
|
+
problems in the past.
|
|
66
|
+
|
|
67
|
+
## projectconfiguration
|
|
68
|
+
|
|
69
|
+
This package contains a central class `Configuration` that represents the
|
|
70
|
+
project configuration. To create this object, the function
|
|
71
|
+
`load_configuration(OBJECT_ROOT, [PATH_TO_TOML_FILE])` should be used.
|
|
72
|
+
|
|
73
|
+
The function tries to find the project configuration file, validates its
|
|
74
|
+
content, and creates the central Configuration object with all sub-objects
|
|
75
|
+
(Each TOML inline table is provided as its own sub-object). These sub-objects
|
|
76
|
+
are currently:
|
|
77
|
+
|
|
78
|
+
* general
|
|
79
|
+
* metadata
|
|
80
|
+
|
|
81
|
+
A basic configuration file can be generated via the `create_configuration()`
|
|
82
|
+
function.
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
## Contributing
|
|
86
|
+
|
|
87
|
+
The GitHub repository is meant to be a read-only mirror of the work repository
|
|
88
|
+
hosted on our institutional private Github server. You can use the bug tracker
|
|
89
|
+
on Github, but everything else should happen in the Zimlab Github repo.
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
## License
|
|
93
|
+
|
|
94
|
+
[MIT](https://choosealicense.com/licenses/mit/)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
gamslib/__init__.py,sha256=8FpAlbzlpTpbNmRd4PRcmEDb-6UxuEAFpuLTvcPoSfI,230
|
|
2
|
+
gamslib/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
gamslib/objectcsv/__init__.py,sha256=B8QoKtHmtHvxfUaDr16EzSLXeJ77LJYXRuf3oKlhA1A,2097
|
|
4
|
+
gamslib/objectcsv/create_csv.py,sha256=eC6Rg-04N6St9h32z_rWTQoUqFbQxt2o0hD0Lxh6640,5936
|
|
5
|
+
gamslib/objectcsv/defaultvalues.py,sha256=EEjdP7xs6IS3KdOt1o9-8U7xrnJ380LDGwHfovVfkVw,913
|
|
6
|
+
gamslib/objectcsv/dublincore.py,sha256=SOSlxkC5ldWKDi62ai2-5dkIZOAQf2RR6238RAQWaZU,5984
|
|
7
|
+
gamslib/objectcsv/manage_csv.py,sha256=pCuT6RGvGRBmc4H41ZpRKV8FJJ4N8pmYSkTCPUtV9e0,2856
|
|
8
|
+
gamslib/objectcsv/objectcsv.py,sha256=cTWqRcs1495YqfgBMppTydvGWUwqjlD8JYSFraQG6WA,10763
|
|
9
|
+
gamslib/objectcsv/utils.py,sha256=CNnVK6SlpwIQxc0jC1Uayb1SSnFuu3_0glBHLTB0u9A,667
|
|
10
|
+
gamslib/objectcsv/xlsx.py,sha256=6sbtlJzMu8Lbt1di2L0hKCdIOQw1Tv224vz0B_Nfdrk,2064
|
|
11
|
+
gamslib/projectconfiguration/__init__.py,sha256=dhG8PKVGNpIMMzmz2L0a1uKt8juAU_hP2EpGxSZgFIU,1751
|
|
12
|
+
gamslib/projectconfiguration/configuration.py,sha256=_rDLaKfxDQ_wN2f5W0s_xplIY5EohKlxvINK24pheaE,3851
|
|
13
|
+
gamslib/projectconfiguration/resources/project.toml,sha256=rKsB6jEs4gbqtAaJZeMboXbuWZ7c0lLuuD2QAR_3P88,686
|
|
14
|
+
gamslib-0.3.1.dist-info/METADATA,sha256=S1zLYFX9-viVcm0RnTQu7sy42JUcttRuvS-TVH99FQA,3383
|
|
15
|
+
gamslib-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
16
|
+
gamslib-0.3.1.dist-info/licenses/LICENSE,sha256=exKVUgS3rp03ninfpZsfMw6x7VfLkXKaN2vQfgEICCw,1085
|
|
17
|
+
gamslib-0.3.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Fabio Tosques, Gunter Vasold
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|