ghga-transpiler 2.3.2__py3-none-any.whl → 3.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ghga_transpiler/cli.py CHANGED
@@ -16,15 +16,17 @@
16
16
  #
17
17
  """CLI-specific wrappers around core functions."""
18
18
 
19
+ from __future__ import annotations
20
+
19
21
  import sys
22
+ from enum import Enum
20
23
  from pathlib import Path
21
- from typing import Optional
24
+ from typing import Annotated
22
25
 
23
26
  import typer
24
27
 
25
- from . import __version__, io
26
- from .config.exceptions import UnknownVersionError
27
- from .core import InvalidSematicVersion, convert_workbook
28
+ from . import __version__, transpiler_io
29
+ from .transpile import transpile
28
30
 
29
31
  cli = typer.Typer()
30
32
 
@@ -36,44 +38,70 @@ def version_callback(value: bool):
36
38
  raise typer.Exit()
37
39
 
38
40
 
41
def format_callback(value: str):
    """Validate the value passed for the --format option.

    Returns the value unchanged when it names a supported format; otherwise
    aborts option parsing with a typer.BadParameter error.
    """
    if value in ("json", "yaml"):
        return value
    raise typer.BadParameter("Only 'json' or 'yaml' is allowed.")
46
+
47
+
48
class Format(str, Enum):
    """Supported serialization formats for the transpiled output."""

    # Member values double as the user-facing strings accepted on the CLI;
    # being str-based, members compare equal to their plain string values.
    json = "json"
    yaml = "yaml"
53
+
54
+
39
55
@cli.command()
def main(
    spread_sheet: Annotated[
        Path,
        typer.Argument(
            exists=True,
            help="The path to input file (XLSX)",
            dir_okay=False,
            readable=True,
        ),
    ],
    output_file: Annotated[
        Path | None,
        # Help text kept format-neutral: the output may be JSON or YAML
        # depending on the --format option.
        typer.Argument(help="The path to the output file.", dir_okay=False),
    ] = None,
    format: Annotated[
        Format,
        typer.Option(
            "--format",
            "-t",
            help="Output format: 'json' or 'yaml'",
            callback=format_callback,
            is_eager=True,
        ),
    ] = Format.json,
    force: Annotated[
        bool, typer.Option("--force", "-f", help="Override output file if it exists.")
    ] = False,
    version: Annotated[
        bool,
        typer.Option(
            "--version",
            "-v",
            callback=version_callback,
            is_eager=True,
            help="Print package version",
        ),
    ] = False,
):
    """ghga-transpiler is a command line utility to transpile the official GHGA
    metadata XLSX workbooks to JSON or YAML. Please note that ghga-transpiler
    does not itself validate that the provided metadata is compliant with the
    GHGA Metadata Schema; run ghga-validator on the generated output for that.
    """
    try:
        ghga_datapack = transpile(spread_sheet)
    except SyntaxError as exc:
        sys.exit(f"Unable to parse input file '{spread_sheet}': {exc}")
    # Format is a str-based enum, so comparing against the plain string works.
    yaml_format = format == "yaml"
    try:
        transpiler_io.write_datapack(
            data=ghga_datapack, path=output_file, yaml_format=yaml_format, force=force
        )
    except FileExistsError as exc:
        sys.exit(f"ERROR: {exc}")
@@ -0,0 +1,136 @@
1
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
2
+ # for the German Human Genome-Phenome Archive (GHGA)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """Module to process config file"""
18
+
19
+ from collections import Counter
20
+ from collections.abc import Callable
21
+ from typing import NamedTuple
22
+
23
+ from pydantic import (
24
+ BaseModel,
25
+ ConfigDict,
26
+ Field,
27
+ model_validator,
28
+ )
29
+
30
+ from .exceptions import DuplicatedName
31
+ from .transformations import to_attributes, to_list, to_snake_case, to_snake_case_list
32
+
33
+
34
+ class RelationMeta(NamedTuple):
35
+ """A data model for relation properties of a column"""
36
+
37
+ name: str
38
+ target_class: str | None
39
+
40
+
41
class ColumnMeta(BaseModel):
    """A data model for column properties"""

    model_config = ConfigDict(populate_by_name=True, frozen=True)

    sheet_name: str = Field(..., alias="sheet")
    column_name: str = Field(..., alias="column")
    multivalued: bool
    # NOTE: shadows the builtin name 'type'; kept for schema/alias compatibility.
    type: str
    ref_class: str | None
    ref_id: str | None = Field(..., alias="ref_class_id_property")
    enum: bool
    required: bool

    def transformation(self) -> Callable:
        """Return the value-transformation function for this column.

        A callable is always returned (the return annotation was previously
        `Callable | None`, but no branch ever yields None); plain
        single-valued, non-enum columns get the identity function.
        """
        if self.enum:
            # Enum values are normalized to snake case, element-wise when
            # the column holds multiple values.
            return to_snake_case_list() if self.multivalued else to_snake_case()
        if self.multivalued:
            # Multivalued object columns become key/value attribute dicts;
            # other multivalued columns become plain lists.
            return to_attributes() if self.type == "object" else to_list()
        return lambda value: value

    def is_relation(self) -> bool:
        """Return whether this column references another class."""
        return bool(self.ref_class)
66
+
67
+
68
class SheetMeta(BaseModel):
    """General settings of a single worksheet (where its header and data live)."""

    model_config = ConfigDict(populate_by_name=True, frozen=True)

    # Worksheet name; populated from the "sheet" key of the meta sheet.
    name: str = Field(..., validation_alias="sheet")
    # Row index of the header row.
    header_row: int
    # Row index where the data begins ("data_start" in the meta sheet).
    start_row: int = Field(..., validation_alias="data_start")
    # First column to read; defaults to the leftmost column.
    start_column: int = 1
    # Reading ends here ("n_cols" in the meta sheet) — presumably the last
    # column index; confirm against the worksheet parser.
    end_column: int = Field(..., validation_alias="n_cols")
    # Name of the column that identifies each row.
    primary_key: str
79
+
80
+
81
class WorksheetSettings(BaseModel):
    """A data model pairing a worksheet's settings with its column metadata"""

    model_config = ConfigDict(frozen=True)

    settings: SheetMeta
    columns: tuple[ColumnMeta, ...]

    def get_transformations(self) -> dict:
        """Map each column name to its transformation function.

        The transformation callable is computed only once per column.
        """
        transformations = {}
        for column in self.columns:
            transformation = column.transformation()
            if transformation is not None:
                transformations[column.column_name] = transformation
        return transformations

    def get_relations(self) -> list[RelationMeta]:
        """Return the relations of this worksheet.

        The column name is taken as the relation name and the column's
        ref_class as the relation's target class.
        """
        return [
            RelationMeta(column.column_name, column.ref_class)
            for column in self.columns
            if column.is_relation()
        ]
106
+
107
+
108
class WorkbookConfig(BaseModel):
    """A data model containing transpiler configurations"""

    worksheets: dict[str, WorksheetSettings]

    @model_validator(mode="after")
    def check_name(cls, values):  # noqa
        """Ensure target attribute names are unique across worksheets.

        Worksheet names are the keys of the 'worksheets' dict and therefore
        unique by construction, so only the sheet-name/column-name
        combinations need an explicit check.
        """
        attrs_counter = Counter(
            f"{column.sheet_name}.{column.column_name}"
            for ws in values.worksheets.values()
            for column in ws.columns
        )
        dup_attrs = [name for name, count in attrs_counter.items() if count > 1]
        if dup_attrs:
            raise DuplicatedName(
                "Duplicate target attribute names: " + ", ".join(dup_attrs)
            )

        return values
@@ -26,5 +26,17 @@ class MissingWorkbookContent(KeyError):
26
26
  """Raised when any worksheet given in the config yaml does not exist in the spreadsheet"""
27
27
 
28
28
 
29
- class UnknownVersionError(RuntimeError):
30
- """Raised when the version encountered in the workbook is unknown"""
29
class WorkbookNotFound(FileNotFoundError):
    """Raised when no workbook file exists at the given path."""


class MetaColumnNotFound(KeyError):
    """Raised when the 'sheet' column holding the sheet names on the meta
    sheets (__column_meta, __sheet_meta) does not exist.
    """


class MetaColumnNotUnique(ValueError):
    """Raised when the 'sheet' column holding the sheet names on the meta
    sheets (__column_meta, __sheet_meta) is not unique.
    """
@@ -0,0 +1,125 @@
1
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
2
+ # for the German Human Genome-Phenome Archive (GHGA)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Helper functions to parse the configuration sheets in a workbook"""
17
+
18
+ from collections import defaultdict
19
+
20
+ from openpyxl import Workbook
21
+ from pydantic import BaseModel, Field
22
+
23
+ from .config import WorkbookConfig
24
+ from .exceptions import MetaColumnNotFound, MetaColumnNotUnique
25
+
26
+
27
class MetaInfo(BaseModel):
    """Constants needed to locate and parse the configuration worksheets
    of a workbook.
    """

    # Sheet holding per-column settings of the individual worksheets.
    column_meta: str = Field(
        default="__column_meta",
        description=(
            "Name of a sheet that"
            " consists of column settings of the individual"
            " worksheets in a workbook."
        ),
    )
    # Sheet holding general per-worksheet settings.
    sheet_meta: str = Field(
        default="__sheet_meta",
        description=(
            "Name of a sheet that"
            " consists of general settings of individual worksheets"
            " (e.g. header_row, start_column) in a workbook."
        ),
    )
    # Column (present in both meta sheets) naming the worksheet each row
    # of settings applies to.
    name_column: str = Field(
        default="sheet",
        description=(
            "The name of the column in"
            " column_meta and sheet_meta worksheets that holds the"
            " names of the worksheets in the workbook that the settings"
            " are applied to."
        ),
    )
51
+
52
+
53
+ def read_meta_information(workbook: Workbook, meta_sheet_name: str):
54
+ """Reads the content of a worksheet"""
55
+ if meta_sheet_name in workbook.sheetnames:
56
+ sheet_meta_header = [cell.value for cell in workbook[meta_sheet_name][1]]
57
+ sheet_meta_values = workbook[meta_sheet_name].iter_rows(
58
+ min_row=2, values_only=True
59
+ )
60
+ return [
61
+ dict(zip(sheet_meta_header, val, strict=True)) for val in sheet_meta_values
62
+ ]
63
+ raise SyntaxError(
64
+ f"Unable to extract the sheet {meta_sheet_name} from the workbook."
65
+ )
66
+
67
+
68
def reshape_columns_meta(column_meta: list, name_column: str) -> dict[str, list]:
    """Reshape column metadata into a dict keyed by worksheet name.

    Keys are the worksheet names taken from the 'name_column' entry of each
    row; values are the lists of column-metadata dicts for that worksheet.

    Raises:
        MetaColumnNotFound: if a row lacks the 'name_column' entry.
    """
    worksheet_columns: dict[str, list[dict]] = defaultdict(list)
    for item in column_meta:
        try:
            # Index access (not dict.get) so a missing column actually raises;
            # .get would silently group such rows under a None key.
            sheet_name = item[name_column]
        except KeyError as err:
            raise MetaColumnNotFound(
                f"{name_column} column not found in column meta sheet"
            ) from err
        worksheet_columns[sheet_name].append(item)
    return worksheet_columns
83
+
84
+
85
def reshape_settings_meta(settings_meta: list, name_column: str) -> dict[str, dict]:
    """Reshape settings metadata into a dict keyed by worksheet name.

    Keys are worksheet names taken from the 'name_column' entry of each row;
    values are the corresponding worksheet-settings dicts.

    Raises:
        MetaColumnNotFound: if a row lacks the 'name_column' entry.
        MetaColumnNotUnique: if two rows name the same worksheet.
    """
    worksheet_settings: dict = {}
    for item in settings_meta:
        try:
            # Index access (not dict.get) so a missing column actually raises;
            # .get would silently file such rows under a None key.
            sheet_name = item[name_column]
        except KeyError as err:
            raise MetaColumnNotFound(
                f"{name_column} column not found in settings meta sheet"
            ) from err
        if sheet_name in worksheet_settings:
            raise MetaColumnNotUnique(
                f"Duplicate sheet name {sheet_name} in settings meta column {name_column}"
            )
        worksheet_settings[sheet_name] = item
    return worksheet_settings
106
+
107
+
108
def worksheet_meta_information(
    workbook: Workbook, meta_info: MetaInfo = MetaInfo()
) -> dict[str, dict]:
    """Combine settings and column metadata per worksheet.

    Returns a dict keyed by worksheet name whose values bundle that
    worksheet's "settings" and "columns" metadata.
    """
    settings_by_sheet = reshape_settings_meta(
        read_meta_information(workbook, meta_info.sheet_meta), meta_info.name_column
    )
    columns_by_sheet = reshape_columns_meta(
        read_meta_information(workbook, meta_info.column_meta), meta_info.name_column
    )
    # NOTE(review): a sheet listed in the settings meta but absent from the
    # column meta raises KeyError here — confirm this is intended.
    combined: dict[str, dict] = {}
    for sheet_name in settings_by_sheet:
        combined[sheet_name] = {
            "settings": settings_by_sheet[sheet_name],
            "columns": columns_by_sheet[sheet_name],
        }
    return combined
120
+
121
+
122
def get_workbook_config(workbook: Workbook) -> WorkbookConfig:
    """Build the WorkbookConfig from the workbook's meta worksheets."""
    return WorkbookConfig.model_validate(
        {"worksheets": worksheet_meta_information(workbook)}
    )
@@ -0,0 +1,98 @@
1
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
2
+ # for the German Human Genome-Phenome Archive (GHGA)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """This module contains the models describing a GHGA Workbook."""
18
+
19
+ from collections import Counter
20
+
21
+ from pydantic import BaseModel, Field, model_serializer, model_validator
22
+
23
+ from .exceptions import DuplicatedName
24
+
25
+
26
class GHGAWorksheetRow(BaseModel):
    """One row of a worksheet: its plain content plus the relations that
    reference other classes.
    """

    relations: dict = Field(
        ...,
        description=(
            "A dictionary mapping resource identifiers to their"
            " corresponding classes. This field details the resources referenced within"
            " the worksheet row."
        ),
    )

    content: dict = Field(
        ...,
        description=(
            "A dictionary containing key-value pairs where keys"
            " represent the properties of the data fields, and values represent"
            " the corresponding data. This field does not include information"
            " about the relations."
        ),
    )
45
+
46
+
47
class GHGAWorksheet(BaseModel):
    """A model defining a GHGA worksheet."""

    worksheet: dict[str, dict[str, GHGAWorksheetRow]] = Field(
        ...,
        description=(
            "A nested dictionary representing a GHGA worksheet."
            " The outer dictionary maps worksheet names (strings) to inner dictionaries."
            " Each inner dictionary maps row primary key values (strings) to their"
            " corresponding `GHGAWorksheetRow` instances."
        ),
    )

    @model_serializer()
    def serialize_model(self):
        """Serialize to a plain dict, dropping the wrapping 'worksheet'
        attribute name from the output.
        """
        # dict() makes the same shallow copy as the previous identity
        # dict comprehension (ruff C416), one C-level call instead.
        return dict(self.worksheet)
64
+
65
+
66
class GHGAWorkbook(BaseModel):
    """A model defining a GHGA workbook that consists of multiple worksheets."""

    workbook: tuple[GHGAWorksheet, ...] = Field(
        ...,
        description=(
            "A tuple of `GHGAWorksheet` instances."
            " Each `GHGAWorksheet` represents a worksheet within the workbook."
        ),
    )

    @model_validator(mode="after")
    def check_name(cls, values):  # noqa
        """Ensure that the workbook's worksheets have unique names."""
        # Only the keys are needed, so iterate the dicts directly rather
        # than .items() (PERF102).
        name_counter = Counter(
            key for ws in values.workbook for key in ws.worksheet
        )
        duplicates = [name for name, count in name_counter.items() if count > 1]
        if duplicates:
            raise DuplicatedName(
                "Duplicate worksheet names: " + ", ".join(duplicates)
            )
        return values

    @model_serializer()
    def serialize_model(self):
        """Serialize to one flattened dictionary merging all worksheets,
        dropping the wrapping 'workbook' attribute name from the output.
        """
        return {
            key: value
            for worksheet in self.workbook
            for key, value in worksheet.worksheet.items()
        }
@@ -36,7 +36,7 @@ def to_attributes() -> Callable:
36
36
  def split_one(value: str) -> dict:
37
37
  """Returns a dictionary with key, value as keys, splitted string as values"""
38
38
  splitted = (elem.strip() for elem in value.split("="))
39
- return dict(zip(("key", "value"), splitted))
39
+ return dict(zip(("key", "value"), splitted, strict=True))
40
40
 
41
41
  def split_mult(value: str) -> list[dict]:
42
42
  """Converts string to attributes"""
@@ -0,0 +1,53 @@
1
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
2
+ # for the German Human Genome-Phenome Archive (GHGA)
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ """This module contains functionalities for processing excel sheets into json object."""
18
+
19
+ from pathlib import Path
20
+
21
+ from arcticfreeze import FrozenDict
22
+ from openpyxl import Workbook
23
+ from schemapack.spec.datapack import DataPack
24
+
25
+ from .config import WorkbookConfig
26
+ from .metasheet_parser import get_workbook_config
27
+ from .models import GHGAWorkbook
28
+ from .transpiler_io import read_workbook
29
+ from .workbook_parser import GHGAWorkbookParser
30
+
31
+
32
def parse_workbook(workbook: Workbook, config: WorkbookConfig) -> GHGAWorkbook:
    """Parse an openpyxl workbook into a GHGAWorkbook using the given config."""
    parser = GHGAWorkbookParser(config=config, workbook=workbook)
    return parser.parse()
35
+
36
+
37
def transpile_to_datapack(
    workbook: GHGAWorkbook, datapack_version: str = "0.3.0"
) -> DataPack:
    """Convert a GHGAWorkbook into a DataPack instance.

    Args:
        workbook: The parsed workbook to convert.
        datapack_version: The datapack spec version to stamp on the output;
            defaults to the previously hard-coded "0.3.0".
    """
    return DataPack(
        datapack=datapack_version,
        resources=FrozenDict(workbook.model_dump()),
        rootResource=None,
        rootClass=None,
    )
45
+
46
+
47
def transpile(spread_sheet: Path) -> DataPack:
    """Run the full pipeline: spreadsheet file -> parsed workbook -> datapack."""
    workbook = read_workbook(spread_sheet)
    config = get_workbook_config(workbook)
    return transpile_to_datapack(parse_workbook(workbook, config))
@@ -19,37 +19,36 @@
19
19
 
20
20
  from __future__ import annotations
21
21
 
22
- import json
23
22
  import sys
24
- from importlib import resources
25
23
  from pathlib import Path
26
- from typing import TextIO
27
24
 
28
- from openpyxl import load_workbook
25
+ from openpyxl import Workbook, load_workbook
26
+ from schemapack import dumps_datapack
27
+ from schemapack.spec.datapack import DataPack
29
28
 
30
- from .core import GHGAWorkbook
29
+ from .exceptions import WorkbookNotFound
31
30
 
32
31
 
33
def read_workbook(path: Path) -> Workbook:
    """Load the spreadsheet at the given path as an openpyxl Workbook.

    Raises:
        WorkbookNotFound: when no file exists at the given path.
    """
    try:
        workbook = load_workbook(path)
    except FileNotFoundError as err:
        raise WorkbookNotFound(f"Spreadsheet file not found on {path}") from err
    return workbook
38
+
39
+
40
+ def write_datapack(
41
+ data: DataPack, path: Path | None, yaml_format: bool, force: bool
42
+ ) -> None:
43
+ """Writes data as JSON to the specified output path or
44
+ to stdout if the path is None, or overwrites an existing output file if
45
+ 'force' is True.
48
46
  """
47
+ datapack = dumps_datapack(data, yaml_format=yaml_format)
49
48
  if path is None:
50
- _write_json(data, sys.stdout)
49
+ sys.stdout.write(datapack)
51
50
  elif path.exists() and not force:
52
51
  raise FileExistsError(f"File already exists: {path}")
53
52
  else:
54
53
  with open(file=path, mode="w", encoding="utf8") as outfile:
55
- _write_json(data, outfile)
54
+ outfile.write(datapack)