ghga-transpiler 2.3.2__tar.gz → 3.0.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {ghga_transpiler-2.3.2/src/ghga_transpiler.egg-info → ghga_transpiler-3.0.0rc1}/PKG-INFO +5 -6
  2. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/pyproject.toml +11 -13
  3. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/cli.py +107 -0
  4. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/config.py +136 -0
  5. {ghga_transpiler-2.3.2/src/ghga_transpiler/config → ghga_transpiler-3.0.0rc1/src/ghga_transpiler}/exceptions.py +14 -2
  6. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/metasheet_parser.py +125 -0
  7. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/models.py +98 -0
  8. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler/transformations.py +1 -1
  9. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/transpile.py +53 -0
  10. ghga_transpiler-2.3.2/src/ghga_transpiler/io.py → ghga_transpiler-3.0.0rc1/src/ghga_transpiler/transpiler_io.py +20 -21
  11. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/workbook_parser.py +177 -0
  12. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1/src/ghga_transpiler.egg-info}/PKG-INFO +5 -6
  13. ghga_transpiler-3.0.0rc1/src/ghga_transpiler.egg-info/SOURCES.txt +23 -0
  14. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler.egg-info/requires.txt +2 -3
  15. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/tests/test_convert_workbook.py +4 -6
  16. ghga_transpiler-3.0.0rc1/tests/test_create_workbook_config.py +46 -0
  17. ghga_transpiler-3.0.0rc1/tests/test_io.py +106 -0
  18. ghga_transpiler-2.3.2/src/ghga_transpiler/cli.py +0 -79
  19. ghga_transpiler-2.3.2/src/ghga_transpiler/config/__init__.py +0 -20
  20. ghga_transpiler-2.3.2/src/ghga_transpiler/config/config.py +0 -106
  21. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/0.10.yaml +0 -135
  22. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/1.0.yaml +0 -135
  23. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/1.1.yaml +0 -135
  24. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/2.0.yaml +0 -170
  25. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/2.1.yaml +0 -172
  26. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/__init__.py +0 -16
  27. ghga_transpiler-2.3.2/src/ghga_transpiler/core.py +0 -155
  28. ghga_transpiler-2.3.2/src/ghga_transpiler.egg-info/SOURCES.txt +0 -28
  29. ghga_transpiler-2.3.2/tests/test_create_config.py +0 -43
  30. ghga_transpiler-2.3.2/tests/test_io.py +0 -59
  31. ghga_transpiler-2.3.2/tests/test_process_workbook.py +0 -47
  32. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/LICENSE +0 -0
  33. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/README.md +0 -0
  34. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/setup.cfg +0 -0
  35. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler/__init__.py +0 -0
  36. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler/__main__.py +0 -0
  37. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler.egg-info/dependency_links.txt +0 -0
  38. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler.egg-info/entry_points.txt +0 -0
  39. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler.egg-info/top_level.txt +0 -0
@@ -1,11 +1,11 @@
  Metadata-Version: 2.4
  Name: ghga_transpiler
- Version: 2.3.2
+ Version: 3.0.0rc1
  Summary: GHGA-Transpiler - excel to JSON converter
  Author-email: "German Human Genome Phenome Archive (GHGA)" <contact@ghga.de>
  License: Apache 2.0
  Project-URL: Repository, https://github.com/ghga-de/ghga-transpiler
- Classifier: Development Status :: 1 - Planning
+ Classifier: Development Status :: 5 - Production/Stable
  Classifier: Operating System :: POSIX :: Linux
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.12
@@ -13,16 +13,15 @@ Classifier: License :: OSI Approved :: Apache Software License
  Classifier: Topic :: Internet :: WWW/HTTP :: HTTP Servers
  Classifier: Topic :: Software Development :: Libraries
  Classifier: Intended Audience :: Developers
- Requires-Python: >=3.9
+ Requires-Python: >=3.12
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: typer>=0.12
  Requires-Dist: openpyxl==3.*,>=3.1.2
  Requires-Dist: defusedxml==0.*,>=0.7
- Requires-Dist: pydantic<3,>=2.6
+ Requires-Dist: pydantic<3,>=2
  Requires-Dist: PyYAML~=6.0
  Requires-Dist: semver==3.*
- Requires-Dist: click~=8.1.0
+ Requires-Dist: schemapack==2.0.0
  Dynamic: license-file
@@ -1,6 +1,6 @@
  [build-system]
  requires = [
-     "setuptools>=69",
+     "setuptools>=80.3",
  ]
  build-backend = "setuptools.build_meta"

@@ -9,9 +9,9 @@ readme = "README.md"
  authors = [
      { name = "German Human Genome Phenome Archive (GHGA)", email = "contact@ghga.de" },
  ]
- requires-python = ">=3.9"
+ requires-python = ">=3.12"
  classifiers = [
-     "Development Status :: 1 - Planning",
+     "Development Status :: 5 - Production/Stable",
      "Operating System :: POSIX :: Linux",
      "Programming Language :: Python :: 3",
      "Programming Language :: Python :: 3.12",
@@ -21,16 +21,15 @@ classifiers = [
      "Intended Audience :: Developers",
  ]
  name = "ghga_transpiler"
- version = "2.3.2"
+ version = "3.0.0-rc.1"
  description = "GHGA-Transpiler - excel to JSON converter"
  dependencies = [
-     "typer >= 0.12",
      "openpyxl >= 3.1.2, == 3.*",
      "defusedxml >= 0.7, == 0.*",
-     "pydantic >=2.6, <3",
+     "pydantic >=2, <3",
      "PyYAML ~= 6.0",
      "semver == 3.*",
-     "click ~= 8.1.0",
+     "schemapack == 2.0.0",
  ]

  [project.license]
@@ -47,11 +46,6 @@ where = [
      "src",
  ]

- [tool.setuptools.package-data]
- "ghga_transpiler.configs" = [
-     "*.yaml",
- ]
-
  [tool.ruff]
  exclude = [
      ".git",
@@ -67,12 +61,14 @@ src = [
      "examples",
      "scripts",
  ]
+ target-version = "py312"

  [tool.ruff.lint]
  fixable = [
      "UP",
      "I",
      "D",
+     "RUF022",
  ]
  ignore = [
      "E111",
@@ -92,6 +88,7 @@ ignore = [
      "D206",
      "D300",
      "UP040",
+     "PLC0206",
  ]
  select = [
      "C90",
@@ -151,8 +148,9 @@ check_untyped_defs = true
  no_site_packages = false

  [tool.pytest.ini_options]
- minversion = "8.0"
+ minversion = "8.3"
  asyncio_mode = "strict"
+ asyncio_default_fixture_loop_scope = "function"

  [tool.coverage.paths]
  source = [
@@ -0,0 +1,107 @@
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
+ # for the German Human Genome-Phenome Archive (GHGA)
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ #
+ """CLI-specific wrappers around core functions."""
+
+ from __future__ import annotations
+
+ import sys
+ from enum import Enum
+ from pathlib import Path
+ from typing import Annotated
+
+ import typer
+
+ from . import __version__, transpiler_io
+ from .transpile import transpile
+
+ cli = typer.Typer()
+
+
+ def version_callback(value: bool):
+     """Prints the package version"""
+     if value:
+         print(__version__)
+         raise typer.Exit()
+
+
+ def format_callback(value: str):
+     """Validates the user input for format parameter"""
+     if value not in ["json", "yaml"]:
+         raise typer.BadParameter("Only 'json' or 'yaml' is allowed.")
+     return value
+
+
+ class Format(str, Enum):
+     """Enum class for output format types"""
+
+     json = "json"
+     yaml = "yaml"
+
+
+ @cli.command()
+ def main(
+     spread_sheet: Annotated[
+         Path,
+         typer.Argument(
+             exists=True,
+             help="The path to input file (XLSX)",
+             dir_okay=False,
+             readable=True,
+         ),
+     ],
+     output_file: Annotated[
+         Path | None,
+         typer.Argument(help="The path to output file (JSON).", dir_okay=False),
+     ] = None,
+     format: Annotated[
+         Format,
+         typer.Option(
+             "--format",
+             "-t",
+             help="Output format: 'json' or 'yaml'",
+             callback=format_callback,
+             is_eager=True,
+         ),
+     ] = Format.json,
+     force: Annotated[
+         bool, typer.Option("--force", "-f", help="Override output file if it exists.")
+     ] = False,
+     version: Annotated[
+         bool,
+         typer.Option(
+             "--version",
+             "-v",
+             callback=version_callback,
+             is_eager=True,
+             help="Print package version",
+         ),
+     ] = False,
+ ):
+     """ghga-transpiler is a command line utility to transpile the official GHGA
+     metadata XLSX workbooks to JSON. TODO Validation
+     """
+     try:
+         ghga_datapack = transpile(spread_sheet)
+     except SyntaxError as exc:
+         sys.exit(f"Unable to parse input file '{spread_sheet}': {exc}")
+     yaml_format = format == "yaml"
+     try:
+         transpiler_io.write_datapack(
+             data=ghga_datapack, path=output_file, yaml_format=yaml_format, force=force
+         )
+     except FileExistsError as exc:
+         sys.exit(f"ERROR: {exc}")
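
The new cli.py exposes a single typer command that wraps the transpile flow. As a quick, illustrative sketch only (submission.xlsx/submission.yaml are placeholder paths, and the input must actually exist for the exists=True check to pass), the command can be exercised with typer's CliRunner:

from typer.testing import CliRunner

from ghga_transpiler.cli import cli

runner = CliRunner()
# Transpile a workbook to YAML, overwriting any existing output file.
result = runner.invoke(
    cli, ["submission.xlsx", "submission.yaml", "--format", "yaml", "--force"]
)
print(result.exit_code, result.output)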
@@ -0,0 +1,136 @@
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
+ # for the German Human Genome-Phenome Archive (GHGA)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Module to process config file"""
+
+ from collections import Counter
+ from collections.abc import Callable
+ from typing import NamedTuple
+
+ from pydantic import (
+     BaseModel,
+     ConfigDict,
+     Field,
+     model_validator,
+ )
+
+ from .exceptions import DuplicatedName
+ from .transformations import to_attributes, to_list, to_snake_case, to_snake_case_list
+
+
+ class RelationMeta(NamedTuple):
+     """A data model for relation properties of a column"""
+
+     name: str
+     target_class: str | None
+
+
+ class ColumnMeta(BaseModel):
+     """A data model for column properties"""
+
+     model_config = ConfigDict(populate_by_name=True, frozen=True)
+
+     sheet_name: str = Field(..., alias="sheet")
+     column_name: str = Field(..., alias="column")
+     multivalued: bool
+     type: str
+     ref_class: str | None
+     ref_id: str | None = Field(..., alias="ref_class_id_property")
+     enum: bool
+     required: bool
+
+     def transformation(self) -> Callable | None:
+         """Assigns transformation function based on column properties"""
+         if self.enum:
+             return to_snake_case_list() if self.multivalued else to_snake_case()
+         if self.multivalued:
+             return to_attributes() if self.type == "object" else to_list()
+         return lambda value: value
+
+     def is_relation(self) -> bool:
+         """Return whether this is a relation column"""
+         return bool(self.ref_class)
+
+
+ class SheetMeta(BaseModel):
+     """A data model for worksheet settings"""
+
+     model_config = ConfigDict(populate_by_name=True, frozen=True)
+
+     name: str = Field(..., validation_alias="sheet")
+     header_row: int
+     start_row: int = Field(..., validation_alias="data_start")
+     start_column: int = 1
+     end_column: int = Field(..., validation_alias="n_cols")
+     primary_key: str
+
+
+ class WorksheetSettings(BaseModel):
+     """A data model for a worksheet"""
+
+     model_config = ConfigDict(frozen=True)
+
+     settings: SheetMeta
+     columns: tuple[ColumnMeta, ...]
+
+     def get_transformations(self) -> dict:
+         """Merges the transformation of a worksheet"""
+         return {
+             column.column_name: column.transformation()
+             for column in self.columns
+             if column.transformation() is not None
+         }
+
+     def get_relations(self) -> list[RelationMeta]:
+         """Returns relations of a worksheet where column_name is considered as the
+         relation name and the ref_class as the relation's target class
+         """
+         return [
+             RelationMeta(column.column_name, column.ref_class)
+             for column in self.columns
+             if column.is_relation()
+         ]
+
+
+ class WorkbookConfig(BaseModel):
+     """A data model containing transpiler configurations"""
+
+     worksheets: dict[str, WorksheetSettings]
+
+     @model_validator(mode="after")
+     def check_name(cls, values):  # noqa
+         """Ensure that each worksheet has a unique sheet_name and name attributes."""
+         # Check for duplicate worksheet names
+         ws_counter = Counter(values.worksheets.keys())
+         dup_ws_names = [name for name, count in ws_counter.items() if count > 1]
+         if dup_ws_names:
+             raise DuplicatedName(
+                 "Duplicate worksheet names: " + ", ".join(dup_ws_names)
+             )
+
+         # Check for duplicate attribute names
+         attrs_counter = Counter(
+             f"{column.sheet_name}.{column.column_name}"
+             for ws in values.worksheets.values()
+             for column in ws.columns
+         )
+         dup_attrs = [name for name, count in attrs_counter.items() if count > 1]
+         if dup_attrs:
+             raise DuplicatedName(
+                 "Duplicate target attribute names: " + ", ".join(dup_attrs)
+             )
+
+         return values
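
To illustrate how these models are meant to be fed, a minimal WorkbookConfig can be validated from plain dictionaries thanks to the field aliases; the sheet and column values below are invented for the sketch, not taken from a real GHGA workbook:

from ghga_transpiler.config import WorkbookConfig

config = WorkbookConfig.model_validate(
    {
        "worksheets": {
            "samples": {
                "settings": {
                    "sheet": "samples",  # mapped to SheetMeta.name via validation_alias
                    "header_row": 1,
                    "data_start": 2,     # mapped to start_row
                    "n_cols": 2,         # mapped to end_column
                    "primary_key": "alias",
                },
                "columns": [
                    {
                        "sheet": "samples",
                        "column": "condition",
                        "multivalued": False,
                        "type": "string",
                        "ref_class": None,
                        "ref_class_id_property": None,
                        "enum": True,
                        "required": True,
                    }
                ],
            }
        }
    }
)
print(config.worksheets["samples"].get_relations())  # [] since no ref_class is set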
@@ -26,5 +26,17 @@ class MissingWorkbookContent(KeyError):
      """Raised when any worksheet given in the config yaml does not exist in the spreadsheet"""


- class UnknownVersionError(RuntimeError):
-     """Raised when the version encountered in the workbook is unknown"""
+ class WorkbookNotFound(FileNotFoundError):
+     """Raised when path to the workbook file not found on a path."""
+
+
+ class MetaColumnNotFound(KeyError):
+     """Raised when the 'sheet' column holding the sheet names on the meta_sheets
+     (__column_meta, __sheet_meta) does not exist.
+     """
+
+
+ class MetaColumnNotUnique(ValueError):
+     """Raised when the 'sheet' column holding the sheet names on the meta_sheets
+     (__column_meta, __sheet_meta) is not unique.
+     """
@@ -0,0 +1,125 @@
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
+ # for the German Human Genome-Phenome Archive (GHGA)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Helper functions to parse the configuration sheets in a workbook"""
+
+ from collections import defaultdict
+
+ from openpyxl import Workbook
+ from pydantic import BaseModel, Field
+
+ from .config import WorkbookConfig
+ from .exceptions import MetaColumnNotFound, MetaColumnNotUnique
+
+
+ class MetaInfo(BaseModel):
+     """Class with constants that are required to parse the configuration worksheets
+     of a workbook.
+     """
+
+     column_meta: str = Field(
+         default="__column_meta",
+         description="Name of a sheet that"
+         + " consists of column settings of the individual"
+         + " worksheets in a workbook.",
+     )
+     sheet_meta: str = Field(
+         default="__sheet_meta",
+         description="Name of a sheet that"
+         + " consists of general settings of individual worksheets"
+         + " (e.g. header_row, start_column) in a workbook.",
+     )
+     name_column: str = Field(
+         default="sheet",
+         description="The name of the column in"
+         + " column_meta and sheet_meta worksheets that holds the"
+         + " names of the worksheets in the workbook that the settings"
+         + " are applied to.",
+     )
+
+
+ def read_meta_information(workbook: Workbook, meta_sheet_name: str):
+     """Reads the content of a worksheet"""
+     if meta_sheet_name in workbook.sheetnames:
+         sheet_meta_header = [cell.value for cell in workbook[meta_sheet_name][1]]
+         sheet_meta_values = workbook[meta_sheet_name].iter_rows(
+             min_row=2, values_only=True
+         )
+         return [
+             dict(zip(sheet_meta_header, val, strict=True)) for val in sheet_meta_values
+         ]
+     raise SyntaxError(
+         f"Unable to extract the sheet {meta_sheet_name} from the workbook."
+     )
+
+
+ def reshape_columns_meta(column_meta: list, name_column: str) -> dict[str, list]:
+     """Reshapes column metadata into a dictionary where keys are worksheet
+     names and values are lists of column metadata dictionaries. Worksheet names comes
+     from the column 'name_column'.
+     """
+     worksheet_columns: dict[str, list[dict]] = defaultdict(list)
+     for item in column_meta:
+         try:
+             sheet_name = item.get(name_column)
+         except KeyError as err:
+             raise MetaColumnNotFound(
+                 f"{name_column} column not found in column meta sheet"
+             ) from err
+         worksheet_columns[sheet_name].append(item)
+     return worksheet_columns
+
+
+ def reshape_settings_meta(settings_meta: list, name_column: str) -> dict[str, dict]:
+     """Reshapes settings metadata into a dictionary where keys
+     are worksheet names and values are worksheet settings dictionaries.
+     Worksheet names comes from the column 'name_column'.
+     """
+     worksheet_settings: dict = {}
+     for item in settings_meta:
+         try:
+             sheet_name = item.get(name_column)
+         except KeyError as err:
+             raise MetaColumnNotFound(
+                 f"{name_column} column not found in settings meta sheet"
+             ) from err
+         if sheet_name in worksheet_settings:
+             raise MetaColumnNotUnique(
+                 f"Duplicate sheet name {sheet_name} in settings meta column {
+                     name_column
+                 }"
+             )
+         worksheet_settings[sheet_name] = item
+     return worksheet_settings
+
+
+ def worksheet_meta_information(
+     workbook: Workbook, meta_info: MetaInfo = MetaInfo()
+ ) -> dict[str, dict]:
+     """Creates a dictionary containing both settings and columns metadata for each worksheet"""
+     settings = read_meta_information(workbook, meta_info.sheet_meta)
+     columns = read_meta_information(workbook, meta_info.column_meta)
+     reshaped_settings = reshape_settings_meta(settings, meta_info.name_column)
+     reshaped_columns = reshape_columns_meta(columns, meta_info.name_column)
+     return {
+         key: {"settings": reshaped_settings[key], "columns": reshaped_columns[key]}
+         for key in reshaped_settings
+     }
+
+
+ def get_workbook_config(workbook: Workbook) -> WorkbookConfig:
+     """Gets workbook configurations from the worksheet __sheet_meta"""
+     worksheet_meta = worksheet_meta_information(workbook)
+     return WorkbookConfig.model_validate({"worksheets": worksheet_meta})
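
A rough sketch of the reshaping step, using made-up meta rows rather than content from a real __sheet_meta/__column_meta pair:

from ghga_transpiler.metasheet_parser import reshape_columns_meta, reshape_settings_meta

settings_rows = [
    {"sheet": "samples", "header_row": 1, "data_start": 2, "n_cols": 2, "primary_key": "alias"},
]
column_rows = [
    {"sheet": "samples", "column": "alias", "multivalued": False, "type": "string",
     "ref_class": None, "ref_class_id_property": None, "enum": False, "required": True},
]

# Both results are keyed by worksheet name, i.e. the "settings" and "columns"
# shape that WorkbookConfig.model_validate expects per worksheet.
print(reshape_settings_meta(settings_rows, name_column="sheet"))
print(reshape_columns_meta(column_rows, name_column="sheet"))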
@@ -0,0 +1,98 @@
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
+ # for the German Human Genome-Phenome Archive (GHGA)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """This module contains the models describing a GHGA Workbook."""
+
+ from collections import Counter
+
+ from pydantic import BaseModel, Field, model_serializer, model_validator
+
+ from .exceptions import DuplicatedName
+
+
+ class GHGAWorksheetRow(BaseModel):
+     """A model defining a row in a worksheet encompassing a content and the relations
+     keeping the references to other classes.
+     """
+
+     relations: dict = Field(
+         ...,
+         description="A dictionary mapping resource identifiers to their"
+         + " corresponding classes. This field details the resources referenced within"
+         + " the worksheet row.",
+     )
+
+     content: dict = Field(
+         ...,
+         description="A dictionary containing key-value pairs where keys"
+         + " represent the properties of the data fields, and values represent"
+         + " the corresponding data. This field does not include information"
+         + " about the relations.",
+     )
+
+
+ class GHGAWorksheet(BaseModel):
+     """A model defining a GHGA worksheet."""
+
+     worksheet: dict[str, dict[str, GHGAWorksheetRow]] = Field(
+         ...,
+         description="A nested dictionary representing a GHGA worksheet."
+         + " The outer dictionary maps worksheet names (strings) to inner dictionaries."
+         + " Each inner dictionary maps row primary key values (strings) to their"
+         + " corresponding `GHGAWorksheetRow` instances.",
+     )
+
+     @model_serializer()
+     def serialize_model(self):
+         """Custom serializer method that returns a dictionary representation of the
+         worksheet, omitting the attribute name 'worksheet' from the serialized output.
+         """
+         return {key: value for key, value in self.worksheet.items()}
+
+
+ class GHGAWorkbook(BaseModel):
+     """A model defining a GHGA workbook consists of multiple worksheets."""
+
+     workbook: tuple[GHGAWorksheet, ...] = Field(
+         ...,
+         description="A tuple of `GHGAWorksheet` instances."
+         + "Each `GHGAWorksheet` represents a worksheet within the workbook.",
+     )
+
+     @model_validator(mode="after")
+     def check_name(cls, values):  # noqa
+         """Function to ensure that workbook consists of worksheets with unique names."""
+         attrs_counter = Counter(
+             key for ws in values.workbook for key, _ in ws.worksheet.items()
+         )
+         dup_ws_names = [name for name, count in attrs_counter.items() if count > 1]
+         if dup_ws_names:
+             raise DuplicatedName(
+                 "Duplicate worksheet names:: " + ", ".join(dup_ws_names)
+             )
+         return values
+
+     @model_serializer()
+     def serialize_model(self):
+         """Custom serializer method that returns a dictionary representation of the
+         workbook, omitting the attribute name 'workbook' from the serialized output and
+         returning a flattened dictionary instead of a tuple of worksheets.
+         """
+         return {
+             key: value
+             for worksheet in self.workbook
+             for key, value in worksheet.worksheet.items()
+         }
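
The two custom serializers are what let the dumped models slot into a datapack's resources: the 'workbook' and 'worksheet' wrapper keys are dropped on dump. A toy sketch with invented identifiers:

from ghga_transpiler.models import GHGAWorkbook, GHGAWorksheet, GHGAWorksheetRow

row = GHGAWorksheetRow(relations={"study": "STUDY_1"}, content={"title": "Example"})
sheet = GHGAWorksheet(worksheet={"samples": {"SAMPLE_1": row}})
workbook = GHGAWorkbook(workbook=(sheet,))

# The top-level keys of the dump are the worksheet names, not 'workbook'/'worksheet'.
print(list(workbook.model_dump().keys()))  # ['samples']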
@@ -36,7 +36,7 @@ def to_attributes() -> Callable:
      def split_one(value: str) -> dict:
          """Returns a dictionary with key, value as keys, splitted string as values"""
          splitted = (elem.strip() for elem in value.split("="))
-         return dict(zip(("key", "value"), splitted))
+         return dict(zip(("key", "value"), splitted, strict=True))

      def split_mult(value: str) -> list[dict]:
          """Converts string to attributes"""
@@ -0,0 +1,53 @@
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
+ # for the German Human Genome-Phenome Archive (GHGA)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """This module contains functionalities for processing excel sheets into json object."""
+
+ from pathlib import Path
+
+ from arcticfreeze import FrozenDict
+ from openpyxl import Workbook
+ from schemapack.spec.datapack import DataPack
+
+ from .config import WorkbookConfig
+ from .metasheet_parser import get_workbook_config
+ from .models import GHGAWorkbook
+ from .transpiler_io import read_workbook
+ from .workbook_parser import GHGAWorkbookParser
+
+
+ def parse_workbook(workbook: Workbook, config: WorkbookConfig) -> GHGAWorkbook:
+     """Converts a workbook into GHGAWorkbook"""
+     return GHGAWorkbookParser(config=config, workbook=workbook).parse()
+
+
+ def transpile_to_datapack(workbook: GHGAWorkbook) -> DataPack:
+     """Convert GHAWorkbook into a Datapack instance."""
+     return DataPack(
+         datapack="0.3.0",
+         resources=FrozenDict(workbook.model_dump()),
+         rootResource=None,
+         rootClass=None,
+     )
+
+
+ def transpile(spread_sheet: Path) -> DataPack:
+     """The main flow with the steps to transpile a spreadsheet into a datapack."""
+     workbook = read_workbook(spread_sheet)
+     workbook_config = get_workbook_config(workbook)
+     ghga_workbook = parse_workbook(workbook, workbook_config)
+     ghga_datapack = transpile_to_datapack(ghga_workbook)
+     return ghga_datapack
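
Putting the new pieces together, the 3.0.0rc1 flow is read_workbook → get_workbook_config → parse_workbook → transpile_to_datapack, wrapped by transpile(). A minimal sketch mirroring what cli.main does, with placeholder input and output paths:

from pathlib import Path

from ghga_transpiler import transpiler_io
from ghga_transpiler.transpile import transpile

# Transpile a GHGA metadata workbook and write the resulting datapack as JSON.
datapack = transpile(Path("submission.xlsx"))
transpiler_io.write_datapack(
    data=datapack, path=Path("submission.json"), yaml_format=False, force=True
)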