ghga-transpiler 2.3.2__tar.gz → 3.0.0rc1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {ghga_transpiler-2.3.2/src/ghga_transpiler.egg-info → ghga_transpiler-3.0.0rc1}/PKG-INFO +5 -6
  2. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/pyproject.toml +11 -13
  3. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/cli.py +107 -0
  4. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/config.py +136 -0
  5. {ghga_transpiler-2.3.2/src/ghga_transpiler/config → ghga_transpiler-3.0.0rc1/src/ghga_transpiler}/exceptions.py +14 -2
  6. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/metasheet_parser.py +125 -0
  7. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/models.py +98 -0
  8. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler/transformations.py +1 -1
  9. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/transpile.py +53 -0
  10. ghga_transpiler-2.3.2/src/ghga_transpiler/io.py → ghga_transpiler-3.0.0rc1/src/ghga_transpiler/transpiler_io.py +20 -21
  11. ghga_transpiler-3.0.0rc1/src/ghga_transpiler/workbook_parser.py +177 -0
  12. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1/src/ghga_transpiler.egg-info}/PKG-INFO +5 -6
  13. ghga_transpiler-3.0.0rc1/src/ghga_transpiler.egg-info/SOURCES.txt +23 -0
  14. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler.egg-info/requires.txt +2 -3
  15. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/tests/test_convert_workbook.py +4 -6
  16. ghga_transpiler-3.0.0rc1/tests/test_create_workbook_config.py +46 -0
  17. ghga_transpiler-3.0.0rc1/tests/test_io.py +106 -0
  18. ghga_transpiler-2.3.2/src/ghga_transpiler/cli.py +0 -79
  19. ghga_transpiler-2.3.2/src/ghga_transpiler/config/__init__.py +0 -20
  20. ghga_transpiler-2.3.2/src/ghga_transpiler/config/config.py +0 -106
  21. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/0.10.yaml +0 -135
  22. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/1.0.yaml +0 -135
  23. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/1.1.yaml +0 -135
  24. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/2.0.yaml +0 -170
  25. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/2.1.yaml +0 -172
  26. ghga_transpiler-2.3.2/src/ghga_transpiler/configs/__init__.py +0 -16
  27. ghga_transpiler-2.3.2/src/ghga_transpiler/core.py +0 -155
  28. ghga_transpiler-2.3.2/src/ghga_transpiler.egg-info/SOURCES.txt +0 -28
  29. ghga_transpiler-2.3.2/tests/test_create_config.py +0 -43
  30. ghga_transpiler-2.3.2/tests/test_io.py +0 -59
  31. ghga_transpiler-2.3.2/tests/test_process_workbook.py +0 -47
  32. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/LICENSE +0 -0
  33. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/README.md +0 -0
  34. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/setup.cfg +0 -0
  35. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler/__init__.py +0 -0
  36. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler/__main__.py +0 -0
  37. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler.egg-info/dependency_links.txt +0 -0
  38. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler.egg-info/entry_points.txt +0 -0
  39. {ghga_transpiler-2.3.2 → ghga_transpiler-3.0.0rc1}/src/ghga_transpiler.egg-info/top_level.txt +0 -0
@@ -1,11 +1,11 @@
  Metadata-Version: 2.4
  Name: ghga_transpiler
- Version: 2.3.2
+ Version: 3.0.0rc1
  Summary: GHGA-Transpiler - excel to JSON converter
  Author-email: "German Human Genome Phenome Archive (GHGA)" <contact@ghga.de>
  License: Apache 2.0
  Project-URL: Repository, https://github.com/ghga-de/ghga-transpiler
- Classifier: Development Status :: 1 - Planning
+ Classifier: Development Status :: 5 - Production/Stable
  Classifier: Operating System :: POSIX :: Linux
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.12
@@ -13,16 +13,15 @@ Classifier: License :: OSI Approved :: Apache Software License
  Classifier: Topic :: Internet :: WWW/HTTP :: HTTP Servers
  Classifier: Topic :: Software Development :: Libraries
  Classifier: Intended Audience :: Developers
- Requires-Python: >=3.9
+ Requires-Python: >=3.12
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: typer>=0.12
  Requires-Dist: openpyxl==3.*,>=3.1.2
  Requires-Dist: defusedxml==0.*,>=0.7
- Requires-Dist: pydantic<3,>=2.6
+ Requires-Dist: pydantic<3,>=2
  Requires-Dist: PyYAML~=6.0
  Requires-Dist: semver==3.*
- Requires-Dist: click~=8.1.0
+ Requires-Dist: schemapack==2.0.0
  Dynamic: license-file
@@ -1,6 +1,6 @@
  [build-system]
  requires = [
-     "setuptools>=69",
+     "setuptools>=80.3",
  ]
  build-backend = "setuptools.build_meta"

@@ -9,9 +9,9 @@ readme = "README.md"
  authors = [
      { name = "German Human Genome Phenome Archive (GHGA)", email = "contact@ghga.de" },
  ]
- requires-python = ">=3.9"
+ requires-python = ">=3.12"
  classifiers = [
-     "Development Status :: 1 - Planning",
+     "Development Status :: 5 - Production/Stable",
      "Operating System :: POSIX :: Linux",
      "Programming Language :: Python :: 3",
      "Programming Language :: Python :: 3.12",
@@ -21,16 +21,15 @@ classifiers = [
      "Intended Audience :: Developers",
  ]
  name = "ghga_transpiler"
- version = "2.3.2"
+ version = "3.0.0-rc.1"
  description = "GHGA-Transpiler - excel to JSON converter"
  dependencies = [
-     "typer >= 0.12",
      "openpyxl >= 3.1.2, == 3.*",
      "defusedxml >= 0.7, == 0.*",
-     "pydantic >=2.6, <3",
+     "pydantic >=2, <3",
      "PyYAML ~= 6.0",
      "semver == 3.*",
-     "click ~= 8.1.0",
+     "schemapack == 2.0.0",
  ]

  [project.license]
@@ -47,11 +46,6 @@ where = [
      "src",
  ]

- [tool.setuptools.package-data]
- "ghga_transpiler.configs" = [
-     "*.yaml",
- ]
-
  [tool.ruff]
  exclude = [
      ".git",
@@ -67,12 +61,14 @@ src = [
      "examples",
      "scripts",
  ]
+ target-version = "py312"

  [tool.ruff.lint]
  fixable = [
      "UP",
      "I",
      "D",
+     "RUF022",
  ]
  ignore = [
      "E111",
@@ -92,6 +88,7 @@ ignore = [
      "D206",
      "D300",
      "UP040",
+     "PLC0206",
  ]
  select = [
      "C90",
@@ -151,8 +148,9 @@ check_untyped_defs = true
  no_site_packages = false

  [tool.pytest.ini_options]
- minversion = "8.0"
+ minversion = "8.3"
  asyncio_mode = "strict"
+ asyncio_default_fixture_loop_scope = "function"

  [tool.coverage.paths]
  source = [
@@ -0,0 +1,107 @@
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
+ # for the German Human Genome-Phenome Archive (GHGA)
+
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+
+ # http://www.apache.org/licenses/LICENSE-2.0
+
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+ #
+ """CLI-specific wrappers around core functions."""
+
+ from __future__ import annotations
+
+ import sys
+ from enum import Enum
+ from pathlib import Path
+ from typing import Annotated
+
+ import typer
+
+ from . import __version__, transpiler_io
+ from .transpile import transpile
+
+ cli = typer.Typer()
+
+
+ def version_callback(value: bool):
+     """Prints the package version"""
+     if value:
+         print(__version__)
+         raise typer.Exit()
+
+
+ def format_callback(value: str):
+     """Validates the user input for format parameter"""
+     if value not in ["json", "yaml"]:
+         raise typer.BadParameter("Only 'json' or 'yaml' is allowed.")
+     return value
+
+
+ class Format(str, Enum):
+     """Enum class for output format types"""
+
+     json = "json"
+     yaml = "yaml"
+
+
+ @cli.command()
+ def main(
+     spread_sheet: Annotated[
+         Path,
+         typer.Argument(
+             exists=True,
+             help="The path to input file (XLSX)",
+             dir_okay=False,
+             readable=True,
+         ),
+     ],
+     output_file: Annotated[
+         Path | None,
+         typer.Argument(help="The path to output file (JSON).", dir_okay=False),
+     ] = None,
+     format: Annotated[
+         Format,
+         typer.Option(
+             "--format",
+             "-t",
+             help="Output format: 'json' or 'yaml'",
+             callback=format_callback,
+             is_eager=True,
+         ),
+     ] = Format.json,
+     force: Annotated[
+         bool, typer.Option("--force", "-f", help="Override output file if it exists.")
+     ] = False,
+     version: Annotated[
+         bool,
+         typer.Option(
+             "--version",
+             "-v",
+             callback=version_callback,
+             is_eager=True,
+             help="Print package version",
+         ),
+     ] = False,
+ ):
+     """ghga-transpiler is a command line utility to transpile the official GHGA
+     metadata XLSX workbooks to JSON. TODO Validation
+     """
+     try:
+         ghga_datapack = transpile(spread_sheet)
+     except SyntaxError as exc:
+         sys.exit(f"Unable to parse input file '{spread_sheet}': {exc}")
+     yaml_format = format == "yaml"
+     try:
+         transpiler_io.write_datapack(
+             data=ghga_datapack, path=output_file, yaml_format=yaml_format, force=force
+         )
+     except FileExistsError as exc:
+         sys.exit(f"ERROR: {exc}")
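
The new cli.py exposes a single typer command that wraps the transpile flow. As a quick, illustrative sketch only (submission.xlsx/submission.yaml are placeholder paths, and the input must actually exist for the exists=True check to pass), the command can be exercised with typer's CliRunner:

from typer.testing import CliRunner

from ghga_transpiler.cli import cli

runner = CliRunner()
# Transpile a workbook to YAML, overwriting any existing output file.
result = runner.invoke(
    cli, ["submission.xlsx", "submission.yaml", "--format", "yaml", "--force"]
)
print(result.exit_code, result.output)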
@@ -0,0 +1,136 @@
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
+ # for the German Human Genome-Phenome Archive (GHGA)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Module to process config file"""
+
+ from collections import Counter
+ from collections.abc import Callable
+ from typing import NamedTuple
+
+ from pydantic import (
+     BaseModel,
+     ConfigDict,
+     Field,
+     model_validator,
+ )
+
+ from .exceptions import DuplicatedName
+ from .transformations import to_attributes, to_list, to_snake_case, to_snake_case_list
+
+
+ class RelationMeta(NamedTuple):
+     """A data model for relation properties of a column"""
+
+     name: str
+     target_class: str | None
+
+
+ class ColumnMeta(BaseModel):
+     """A data model for column properties"""
+
+     model_config = ConfigDict(populate_by_name=True, frozen=True)
+
+     sheet_name: str = Field(..., alias="sheet")
+     column_name: str = Field(..., alias="column")
+     multivalued: bool
+     type: str
+     ref_class: str | None
+     ref_id: str | None = Field(..., alias="ref_class_id_property")
+     enum: bool
+     required: bool
+
+     def transformation(self) -> Callable | None:
+         """Assigns transformation function based on column properties"""
+         if self.enum:
+             return to_snake_case_list() if self.multivalued else to_snake_case()
+         if self.multivalued:
+             return to_attributes() if self.type == "object" else to_list()
+         return lambda value: value
+
+     def is_relation(self) -> bool:
+         """Return whether this is a relation column"""
+         return bool(self.ref_class)
+
+
+ class SheetMeta(BaseModel):
+     """A data model for worksheet settings"""
+
+     model_config = ConfigDict(populate_by_name=True, frozen=True)
+
+     name: str = Field(..., validation_alias="sheet")
+     header_row: int
+     start_row: int = Field(..., validation_alias="data_start")
+     start_column: int = 1
+     end_column: int = Field(..., validation_alias="n_cols")
+     primary_key: str
+
+
+ class WorksheetSettings(BaseModel):
+     """A data model for a worksheet"""
+
+     model_config = ConfigDict(frozen=True)
+
+     settings: SheetMeta
+     columns: tuple[ColumnMeta, ...]
+
+     def get_transformations(self) -> dict:
+         """Merges the transformation of a worksheet"""
+         return {
+             column.column_name: column.transformation()
+             for column in self.columns
+             if column.transformation() is not None
+         }
+
+     def get_relations(self) -> list[RelationMeta]:
+         """Returns relations of a worksheet where column_name is considered as the
+         relation name and the ref_class as the relation's target class
+         """
+         return [
+             RelationMeta(column.column_name, column.ref_class)
+             for column in self.columns
+             if column.is_relation()
+         ]
+
+
+ class WorkbookConfig(BaseModel):
+     """A data model containing transpiler configurations"""
+
+     worksheets: dict[str, WorksheetSettings]
+
+     @model_validator(mode="after")
+     def check_name(cls, values):  # noqa
+         """Ensure that each worksheet has a unique sheet_name and name attributes."""
+         # Check for duplicate worksheet names
+         ws_counter = Counter(values.worksheets.keys())
+         dup_ws_names = [name for name, count in ws_counter.items() if count > 1]
+         if dup_ws_names:
+             raise DuplicatedName(
+                 "Duplicate worksheet names: " + ", ".join(dup_ws_names)
+             )
+
+         # Check for duplicate attribute names
+         attrs_counter = Counter(
+             f"{column.sheet_name}.{column.column_name}"
+             for ws in values.worksheets.values()
+             for column in ws.columns
+         )
+         dup_attrs = [name for name, count in attrs_counter.items() if count > 1]
+         if dup_attrs:
+             raise DuplicatedName(
+                 "Duplicate target attribute names: " + ", ".join(dup_attrs)
+             )
+
+         return values
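
To illustrate how these models are meant to be fed, a minimal WorkbookConfig can be validated from plain dictionaries thanks to the field aliases; the sheet and column values below are invented for the sketch, not taken from a real GHGA workbook:

from ghga_transpiler.config import WorkbookConfig

config = WorkbookConfig.model_validate(
    {
        "worksheets": {
            "samples": {
                "settings": {
                    "sheet": "samples",  # mapped to SheetMeta.name via validation_alias
                    "header_row": 1,
                    "data_start": 2,     # mapped to start_row
                    "n_cols": 2,         # mapped to end_column
                    "primary_key": "alias",
                },
                "columns": [
                    {
                        "sheet": "samples",
                        "column": "condition",
                        "multivalued": False,
                        "type": "string",
                        "ref_class": None,
                        "ref_class_id_property": None,
                        "enum": True,
                        "required": True,
                    }
                ],
            }
        }
    }
)
print(config.worksheets["samples"].get_relations())  # [] since no ref_class is set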
@@ -26,5 +26,17 @@ class MissingWorkbookContent(KeyError):
      """Raised when any worksheet given in the config yaml does not exist in the spreadsheet"""


- class UnknownVersionError(RuntimeError):
-     """Raised when the version encountered in the workbook is unknown"""
+ class WorkbookNotFound(FileNotFoundError):
+     """Raised when path to the workbook file not found on a path."""
+
+
+ class MetaColumnNotFound(KeyError):
+     """Raised when the 'sheet' column holding the sheet names on the meta_sheets
+     (__column_meta, __sheet_meta) does not exist.
+     """
+
+
+ class MetaColumnNotUnique(ValueError):
+     """Raised when the 'sheet' column holding the sheet names on the meta_sheets
+     (__column_meta, __sheet_meta) is not unique.
+     """
@@ -0,0 +1,125 @@
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
+ # for the German Human Genome-Phenome Archive (GHGA)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Helper functions to parse the configuration sheets in a workbook"""
+
+ from collections import defaultdict
+
+ from openpyxl import Workbook
+ from pydantic import BaseModel, Field
+
+ from .config import WorkbookConfig
+ from .exceptions import MetaColumnNotFound, MetaColumnNotUnique
+
+
+ class MetaInfo(BaseModel):
+     """Class with constants that are required to parse the configuration worksheets
+     of a workbook.
+     """
+
+     column_meta: str = Field(
+         default="__column_meta",
+         description="Name of a sheet that"
+         + " consists of column settings of the individual"
+         + " worksheets in a workbook.",
+     )
+     sheet_meta: str = Field(
+         default="__sheet_meta",
+         description="Name of a sheet that"
+         + " consists of general settings of individual worksheets"
+         + " (e.g. header_row, start_column) in a workbook.",
+     )
+     name_column: str = Field(
+         default="sheet",
+         description="The name of the column in"
+         + " column_meta and sheet_meta worksheets that holds the"
+         + " names of the worksheets in the workbook that the settings"
+         + " are applied to.",
+     )
+
+
+ def read_meta_information(workbook: Workbook, meta_sheet_name: str):
+     """Reads the content of a worksheet"""
+     if meta_sheet_name in workbook.sheetnames:
+         sheet_meta_header = [cell.value for cell in workbook[meta_sheet_name][1]]
+         sheet_meta_values = workbook[meta_sheet_name].iter_rows(
+             min_row=2, values_only=True
+         )
+         return [
+             dict(zip(sheet_meta_header, val, strict=True)) for val in sheet_meta_values
+         ]
+     raise SyntaxError(
+         f"Unable to extract the sheet {meta_sheet_name} from the workbook."
+     )
+
+
+ def reshape_columns_meta(column_meta: list, name_column: str) -> dict[str, list]:
+     """Reshapes column metadata into a dictionary where keys are worksheet
+     names and values are lists of column metadata dictionaries. Worksheet names comes
+     from the column 'name_column'.
+     """
+     worksheet_columns: dict[str, list[dict]] = defaultdict(list)
+     for item in column_meta:
+         try:
+             sheet_name = item.get(name_column)
+         except KeyError as err:
+             raise MetaColumnNotFound(
+                 f"{name_column} column not found in column meta sheet"
+             ) from err
+         worksheet_columns[sheet_name].append(item)
+     return worksheet_columns
+
+
+ def reshape_settings_meta(settings_meta: list, name_column: str) -> dict[str, dict]:
+     """Reshapes settings metadata into a dictionary where keys
+     are worksheet names and values are worksheet settings dictionaries.
+     Worksheet names comes from the column 'name_column'.
+     """
+     worksheet_settings: dict = {}
+     for item in settings_meta:
+         try:
+             sheet_name = item.get(name_column)
+         except KeyError as err:
+             raise MetaColumnNotFound(
+                 f"{name_column} column not found in settings meta sheet"
+             ) from err
+         if sheet_name in worksheet_settings:
+             raise MetaColumnNotUnique(
+                 f"Duplicate sheet name {sheet_name} in settings meta column {
+                     name_column
+                 }"
+             )
+         worksheet_settings[sheet_name] = item
+     return worksheet_settings
+
+
+ def worksheet_meta_information(
+     workbook: Workbook, meta_info: MetaInfo = MetaInfo()
+ ) -> dict[str, dict]:
+     """Creates a dictionary containing both settings and columns metadata for each worksheet"""
+     settings = read_meta_information(workbook, meta_info.sheet_meta)
+     columns = read_meta_information(workbook, meta_info.column_meta)
+     reshaped_settings = reshape_settings_meta(settings, meta_info.name_column)
+     reshaped_columns = reshape_columns_meta(columns, meta_info.name_column)
+     return {
+         key: {"settings": reshaped_settings[key], "columns": reshaped_columns[key]}
+         for key in reshaped_settings
+     }
+
+
+ def get_workbook_config(workbook: Workbook) -> WorkbookConfig:
+     """Gets workbook configurations from the worksheet __sheet_meta"""
+     worksheet_meta = worksheet_meta_information(workbook)
+     return WorkbookConfig.model_validate({"worksheets": worksheet_meta})
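
A rough sketch of the reshaping step, using made-up meta rows rather than content from a real __sheet_meta/__column_meta pair:

from ghga_transpiler.metasheet_parser import reshape_columns_meta, reshape_settings_meta

settings_rows = [
    {"sheet": "samples", "header_row": 1, "data_start": 2, "n_cols": 2, "primary_key": "alias"},
]
column_rows = [
    {"sheet": "samples", "column": "alias", "multivalued": False, "type": "string",
     "ref_class": None, "ref_class_id_property": None, "enum": False, "required": True},
]

# Both results are keyed by worksheet name, i.e. the "settings" and "columns"
# shape that WorkbookConfig.model_validate expects per worksheet.
print(reshape_settings_meta(settings_rows, name_column="sheet"))
print(reshape_columns_meta(column_rows, name_column="sheet"))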
@@ -0,0 +1,98 @@
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
+ # for the German Human Genome-Phenome Archive (GHGA)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """This module contains the models describing a GHGA Workbook."""
+
+ from collections import Counter
+
+ from pydantic import BaseModel, Field, model_serializer, model_validator
+
+ from .exceptions import DuplicatedName
+
+
+ class GHGAWorksheetRow(BaseModel):
+     """A model defining a row in a worksheet encompassing a content and the relations
+     keeping the references to other classes.
+     """
+
+     relations: dict = Field(
+         ...,
+         description="A dictionary mapping resource identifiers to their"
+         + " corresponding classes. This field details the resources referenced within"
+         + " the worksheet row.",
+     )
+
+     content: dict = Field(
+         ...,
+         description="A dictionary containing key-value pairs where keys"
+         + " represent the properties of the data fields, and values represent"
+         + " the corresponding data. This field does not include information"
+         + " about the relations.",
+     )
+
+
+ class GHGAWorksheet(BaseModel):
+     """A model defining a GHGA worksheet."""
+
+     worksheet: dict[str, dict[str, GHGAWorksheetRow]] = Field(
+         ...,
+         description="A nested dictionary representing a GHGA worksheet."
+         + " The outer dictionary maps worksheet names (strings) to inner dictionaries."
+         + " Each inner dictionary maps row primary key values (strings) to their"
+         + " corresponding `GHGAWorksheetRow` instances.",
+     )
+
+     @model_serializer()
+     def serialize_model(self):
+         """Custom serializer method that returns a dictionary representation of the
+         worksheet, omitting the attribute name 'worksheet' from the serialized output.
+         """
+         return {key: value for key, value in self.worksheet.items()}
+
+
+ class GHGAWorkbook(BaseModel):
+     """A model defining a GHGA workbook consists of multiple worksheets."""
+
+     workbook: tuple[GHGAWorksheet, ...] = Field(
+         ...,
+         description="A tuple of `GHGAWorksheet` instances."
+         + "Each `GHGAWorksheet` represents a worksheet within the workbook.",
+     )
+
+     @model_validator(mode="after")
+     def check_name(cls, values):  # noqa
+         """Function to ensure that workbook consists of worksheets with unique names."""
+         attrs_counter = Counter(
+             key for ws in values.workbook for key, _ in ws.worksheet.items()
+         )
+         dup_ws_names = [name for name, count in attrs_counter.items() if count > 1]
+         if dup_ws_names:
+             raise DuplicatedName(
+                 "Duplicate worksheet names:: " + ", ".join(dup_ws_names)
+             )
+         return values
+
+     @model_serializer()
+     def serialize_model(self):
+         """Custom serializer method that returns a dictionary representation of the
+         workbook, omitting the attribute name 'workbook' from the serialized output and
+         returning a flattened dictionary instead of a tuple of worksheets.
+         """
+         return {
+             key: value
+             for worksheet in self.workbook
+             for key, value in worksheet.worksheet.items()
+         }
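
The two custom serializers are what let the dumped models slot into a datapack's resources: the 'workbook' and 'worksheet' wrapper keys are dropped on dump. A toy sketch with invented identifiers:

from ghga_transpiler.models import GHGAWorkbook, GHGAWorksheet, GHGAWorksheetRow

row = GHGAWorksheetRow(relations={"study": "STUDY_1"}, content={"title": "Example"})
sheet = GHGAWorksheet(worksheet={"samples": {"SAMPLE_1": row}})
workbook = GHGAWorkbook(workbook=(sheet,))

# The top-level keys of the dump are the worksheet names, not 'workbook'/'worksheet'.
print(list(workbook.model_dump().keys()))  # ['samples']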
@@ -36,7 +36,7 @@ def to_attributes() -> Callable:
      def split_one(value: str) -> dict:
          """Returns a dictionary with key, value as keys, splitted string as values"""
          splitted = (elem.strip() for elem in value.split("="))
-         return dict(zip(("key", "value"), splitted))
+         return dict(zip(("key", "value"), splitted, strict=True))

      def split_mult(value: str) -> list[dict]:
          """Converts string to attributes"""
@@ -0,0 +1,53 @@
+ # Copyright 2021 - 2025 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln
+ # for the German Human Genome-Phenome Archive (GHGA)
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """This module contains functionalities for processing excel sheets into json object."""
+
+ from pathlib import Path
+
+ from arcticfreeze import FrozenDict
+ from openpyxl import Workbook
+ from schemapack.spec.datapack import DataPack
+
+ from .config import WorkbookConfig
+ from .metasheet_parser import get_workbook_config
+ from .models import GHGAWorkbook
+ from .transpiler_io import read_workbook
+ from .workbook_parser import GHGAWorkbookParser
+
+
+ def parse_workbook(workbook: Workbook, config: WorkbookConfig) -> GHGAWorkbook:
+     """Converts a workbook into GHGAWorkbook"""
+     return GHGAWorkbookParser(config=config, workbook=workbook).parse()
+
+
+ def transpile_to_datapack(workbook: GHGAWorkbook) -> DataPack:
+     """Convert GHAWorkbook into a Datapack instance."""
+     return DataPack(
+         datapack="0.3.0",
+         resources=FrozenDict(workbook.model_dump()),
+         rootResource=None,
+         rootClass=None,
+     )
+
+
+ def transpile(spread_sheet: Path) -> DataPack:
+     """The main flow with the steps to transpile a spreadsheet into a datapack."""
+     workbook = read_workbook(spread_sheet)
+     workbook_config = get_workbook_config(workbook)
+     ghga_workbook = parse_workbook(workbook, workbook_config)
+     ghga_datapack = transpile_to_datapack(ghga_workbook)
+     return ghga_datapack
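
Putting the new pieces together, the 3.0.0rc1 flow is read_workbook → get_workbook_config → parse_workbook → transpile_to_datapack, wrapped by transpile(). A minimal sketch mirroring what cli.main does, with placeholder input and output paths:

from pathlib import Path

from ghga_transpiler import transpiler_io
from ghga_transpiler.transpile import transpile

# Transpile a GHGA metadata workbook and write the resulting datapack as JSON.
datapack = transpile(Path("submission.xlsx"))
transpiler_io.write_datapack(
    data=datapack, path=Path("submission.json"), yaml_format=False, force=True
)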