semantic-transformers 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ """
2
+ semantic-transformers
3
+ =====================
4
+ Converters and machine-file parsers for semantic schema pipelines.
5
+
6
+ Public API
7
+ ----------
8
+ ParseResult: normalised parser output (simplified_json + DataFrame)
9
+ Parser: protocol that all parsers must satisfy
10
+ Transformer: runs parsing → JSONata transform → RDF
11
+ TransformResult: everything produced by Transformer.run()
12
+ QuickMapper: turns any tabular file into RDF with a simple YAML mapping
13
+ """
14
+
15
+ from .parser import Parser, ParseResult
16
+ from .transformer import Transformer, TransformResult
17
+ from .quick_mapper import QuickMapper
18
+
19
+ __all__ = [
20
+ "Parser", "ParseResult",
21
+ "Transformer", "TransformResult",
22
+ "QuickMapper",
23
+ ]
@@ -0,0 +1,83 @@
1
+ """
2
+ Parser protocol and result type.
3
+
4
+ A Parser reads a machine file (any format, any internal structure) and
5
+ returns a ParseResult with two outputs:
6
+
7
+ simplified_json: a plain dict matching the target schema's example.input.json
8
+ format, ready to be fed into the JSONata transform.
9
+
10
+ timeseries: a pandas DataFrame of the raw measurement columns, or None
11
+ if the file contains no time-series data.
12
+
13
+ column_iris: maps each DataFrame column name to an ontology class IRI.
14
+ Only the descriptor goes into the knowledge graph; the
15
+ numeric values stay in the DataFrame.
16
+
17
+ column_units: maps each DataFrame column name to a QUDT unit IRI.
18
+
19
+ Parsers are schema- and machine-specific: one parser per (machine model,
20
+ schema) combination. They live in the parsers/ directory alongside the
21
+ schema they serve, not in this library.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from dataclasses import dataclass, field
27
+ from pathlib import Path
28
+ from typing import Protocol, runtime_checkable
29
+
30
+ import pandas as pd
31
+
32
+
33
@dataclass
class ParseResult:
    """Normalised output produced by any Parser.

    Attributes
    ----------
    simplified_json:
        Flat dict matching the target schema's ``example.input.json``;
        fed directly into the JSONata transform.
    timeseries:
        Raw time-series data, or None when the file contains no tabular
        measurements.
    column_iris:
        Column name -> ontology class IRI
        (e.g. "https://w3id.org/pmd/tto/TestTime").
    column_units:
        Column name -> QUDT unit IRI
        (e.g. "http://qudt.org/vocab/unit/SEC").
    """

    simplified_json: dict
    timeseries: pd.DataFrame | None = None
    column_iris: dict[str, str] = field(default_factory=dict)
    column_units: dict[str, str] = field(default_factory=dict)
49
+
50
+
51
@runtime_checkable
class Parser(Protocol):
    """Structural interface for file parsers.

    Any object exposing a ``parse(path) -> ParseResult`` method satisfies
    this protocol; no subclassing is required.
    """

    def parse(self, path: Path) -> ParseResult: ...
56
+
57
+
58
class SchemaAwareParser:
    """
    Optional mixin for parsers that can exploit the input schema for type
    coercion.

    Implement this alongside the Parser protocol when a parser needs to
    cast field values to the types declared in ``schema.simplified.json``.
    When an ``input_schema`` is available, ``Transformer`` invokes
    ``configure(schema)`` automatically right after construction, so users
    never have to hand the schema path to the parser themselves.

    Example
    -------
    class MyParser(SchemaAwareParser):
        def configure(self, schema: dict) -> None:
            self._field_types = {
                name: prop.get("type", "string")
                for name, prop in schema.get("properties", {}).items()
            }

        def parse(self, path: Path) -> ParseResult:
            ...
    """

    def configure(self, schema: dict) -> None:
        """Receive the loaded input schema dict from Transformer."""
        ...
@@ -0,0 +1,255 @@
1
+ """
2
+ QuickMapper: turn any tabular file into RDF with a simple mapping config.
3
+
4
+ No schema, no JSONata transform, no custom parser required. The user
5
+ provides a YAML config that names the columns and points each one at an
6
+ ontology class IRI and an optional QUDT unit. Everything else is automatic.
7
+
8
+ Supported file formats
9
+ ----------------------
10
+ .csv Comma-separated values
11
+ .tsv / .tab Tab-separated values
12
+ .txt Auto-sniffed (separator detected from content)
13
+ .xlsx / .xls Excel workbook (requires openpyxl)
14
+ .parquet Apache Parquet (requires pyarrow or fastparquet)
15
+ .json JSON (array of records or any orient supported by pandas)
16
+
17
+ Mapping config format
18
+ ---------------------
19
+ # root_type is optional (defaults to dcat:Dataset)
20
+ root_type: "http://www.w3.org/ns/dcat#Dataset"
21
+
22
+ # label is optional (defaults to the file stem)
23
+ label: "Hardness profile, sample 42"
24
+
25
+ # file reading options (all optional)
26
+ file:
27
+ format: auto # auto | csv | tsv | excel | parquet | json
28
+ separator: "," # csv/tsv only; sniffed when omitted
29
+ skip_rows: 0 # rows to skip before the header row
30
+ header_row: 0 # which row (after skipping) contains column names
31
+ encoding: utf-8
32
+ sheet: 0 # Excel only: sheet name or 0-based index
33
+
34
+ # column annotations (only annotated columns get ontology triples)
35
+ columns:
36
+ Force:
37
+ iri: "https://w3id.org/pmd/tto/StandardForce"
38
+ unit: "http://qudt.org/vocab/unit/N" # optional
39
+ Temperature:
40
+ iri: "https://example.org/vocab/Temperature"
41
+
42
+ Usage
43
+ -----
44
+ from semantic_transformers import QuickMapper
45
+
46
+ mapper = QuickMapper("mapping.yaml")
47
+ result = mapper.run("my_data.xlsx")
48
+
49
+ print(result.graph.serialize(format="turtle"))
50
+ print(result.dataframe.head())
51
+ """
52
+
53
+ from __future__ import annotations
54
+
55
+ import csv as _csv
56
+ import io
57
+ from pathlib import Path
58
+ from typing import Union
59
+
60
+ import rdflib
61
+ import yaml
62
+
63
+ from .transformer import TransformResult
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Namespaces
67
+ # ---------------------------------------------------------------------------
68
+ _DCAT = rdflib.Namespace("http://www.w3.org/ns/dcat#")
69
+ _DCT = rdflib.Namespace("http://purl.org/dc/terms/")
70
+ _QUDT = rdflib.Namespace("http://qudt.org/schema/qudt/")
71
+ _RDFS = rdflib.RDFS
72
+ _RDF = rdflib.RDF
73
+ _XSD = rdflib.XSD
74
+
75
+ _DEFAULT_ROOT_TYPE = "http://www.w3.org/ns/dcat#Dataset"
76
+ _DEFAULT_BASE = "https://example.org/datasets/"
77
+
78
+
79
class QuickMapper:
    """
    Converts any tabular file into an RDF graph using a lightweight YAML
    mapping config. Returns a :class:`TransformResult` so it is a drop-in
    companion to :class:`Transformer`.

    Parameters
    ----------
    mapping:
        Path to a YAML mapping file, or a plain dict with the same structure.
    """

    def __init__(self, mapping: Union[str, Path, dict]) -> None:
        # Accept either an in-memory config dict or a path to a YAML file.
        if isinstance(mapping, dict):
            self._config: dict = mapping
        else:
            self._config = yaml.safe_load(
                Path(mapping).read_text(encoding="utf-8")
            )

    # ------------------------------------------------------------------
    def run(self, file_path: Union[str, Path], **overrides) -> TransformResult:
        """
        Convert *file_path* to RDF.

        Keyword arguments override the corresponding top-level keys in the
        mapping config (e.g. ``label="Custom name"``).

        Returns
        -------
        TransformResult
            Same type as :meth:`Transformer.run`: graph, oold_doc, dataframe,
            column_iris, column_units.
        """
        path = Path(file_path)
        config = {**self._config, **overrides}

        # ── 1. Read the file into a DataFrame ────────────────────────
        # "or {}" guards against an empty "file:" key, which YAML loads as None.
        df = self._read_file(path, config.get("file") or {})

        # ── 2. Collect column annotations ────────────────────────────
        # "(cfg or {})" guards against YAML entries with no mapping body:
        # a bare "Temperature:" line parses to None, not to an empty dict.
        columns_cfg = config.get("columns") or {}
        column_iris = {
            col: cfg["iri"]
            for col, cfg in columns_cfg.items()
            if "iri" in (cfg or {})
        }
        column_units = {
            col: cfg["unit"]
            for col, cfg in columns_cfg.items()
            if "unit" in (cfg or {})
        }

        # ── 3. Build the RDF graph ────────────────────────────────────
        root_type = config.get("root_type", _DEFAULT_ROOT_TYPE)
        label = config.get("label", path.stem)
        base = config.get("base", _DEFAULT_BASE)
        dataset_id = rdflib.URIRef(base + path.stem)

        g = rdflib.Dataset()
        # NOTE: rdflib's Dataset exposes the default graph as
        # ``default_context`` — there is no ``default_graph`` attribute.
        # Triples added to the Dataset without an explicit context land in
        # the default graph, so we add to ``g`` directly.
        g.add((dataset_id, _RDF.type, rdflib.URIRef(root_type)))
        g.add((dataset_id, _RDFS.label, rdflib.Literal(label)))
        g.add((dataset_id, _DCT.title, rdflib.Literal(label)))
        g.add((dataset_id, _DCT.source, rdflib.Literal(str(path.name))))

        for col_name, col_iri in column_iris.items():
            safe = col_name.replace(" ", "_")
            col_uri = rdflib.URIRef(str(dataset_id) + "/" + safe)

            g.add((dataset_id, _DCAT.distribution, col_uri))
            g.add((col_uri, _RDF.type, rdflib.URIRef(col_iri)))
            g.add((col_uri, _RDFS.label, rdflib.Literal(col_name)))

            unit_iri = column_units.get(col_name)
            if unit_iri:
                g.add((col_uri, _QUDT.hasUnit, rdflib.URIRef(unit_iri)))

        # ── 4. Build a lightweight summary doc ───────────────────────
        oold_doc = {
            "id": str(dataset_id),
            "type": root_type,
            "label": label,
            "source": str(path.name),
            "columns": {
                col: {
                    "iri": iri,
                    **({"unit": column_units[col]} if col in column_units else {}),
                }
                for col, iri in column_iris.items()
            },
        }

        return TransformResult(
            graph=g,
            oold_doc=oold_doc,
            dataframe=df,
            column_iris=column_iris,
            column_units=column_units,
        )

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _read_file(self, path: Path, file_cfg: dict):
        """Read *path* into a pandas DataFrame using *file_cfg* hints.

        Raises
        ------
        ValueError
            If the (explicit or detected) format is not supported.
        """
        # Imported lazily so importing the package does not require pandas
        # to be loaded until a file is actually read.
        import pandas as pd

        fmt = file_cfg.get("format", "auto")
        if fmt == "auto":
            fmt = _detect_format(path)

        skip = file_cfg.get("skip_rows", 0)
        header = file_cfg.get("header_row", 0)
        enc = file_cfg.get("encoding", "utf-8")

        if fmt in ("csv", "tsv", "txt"):
            sep = file_cfg.get("separator")
            if sep is None:
                # No separator configured: sniff it from the file content.
                sep = _sniff_separator(path, enc)
            return pd.read_csv(
                path,
                sep=sep,
                skiprows=skip,
                header=header,
                encoding=enc,
            )

        if fmt == "excel":
            sheet = file_cfg.get("sheet", 0)
            return pd.read_excel(
                path,
                sheet_name=sheet,
                skiprows=skip,
                header=header,
            )

        if fmt == "parquet":
            return pd.read_parquet(path)

        if fmt == "json":
            orient = file_cfg.get("orient", None)
            return pd.read_json(path, orient=orient)

        raise ValueError(
            f"Unsupported format '{fmt}'. "
            "Supported: csv, tsv, txt, excel, parquet, json."
        )
226
+
227
+
228
+ # ---------------------------------------------------------------------------
229
+ # Module-level helpers
230
+ # ---------------------------------------------------------------------------
231
+
232
+ def _detect_format(path: Path) -> str:
233
+ suffix = path.suffix.lower()
234
+ mapping = {
235
+ ".csv": "csv",
236
+ ".tsv": "tsv",
237
+ ".tab": "tsv",
238
+ ".txt": "txt",
239
+ ".xlsx": "excel",
240
+ ".xls": "excel",
241
+ ".xlsm": "excel",
242
+ ".parquet": "parquet",
243
+ ".json": "json",
244
+ }
245
+ return mapping.get(suffix, "csv")
246
+
247
+
248
+ def _sniff_separator(path: Path, encoding: str) -> str:
249
+ """Read the first 4 KB and ask csv.Sniffer to detect the delimiter."""
250
+ try:
251
+ sample = path.read_bytes()[:4096].decode(encoding, errors="replace")
252
+ dialect = _csv.Sniffer().sniff(sample, delimiters=",;\t|")
253
+ return dialect.delimiter
254
+ except _csv.Error:
255
+ return "," # safe fallback
@@ -0,0 +1,353 @@
1
+ """
2
+ Transformer: parser output → OO-LD → RDF + DataFrame.
3
+
4
+ Usage: shorthand (recommended)
5
+ --------------------------------
6
+ from semantic_transformers import Transformer
7
+ from zwick_parser import ZwickParser
8
+
9
+ # Pass the schema folder; all three file paths are derived automatically.
10
+ # Works with a local path or a GitHub tree URL:
11
+ transformer = Transformer(
12
+ parser = ZwickParser(),
13
+ semantic_schema = "https://github.com/org/semantic-schemas/tree/main/schemas/domain/Ontology/",
14
+ )
15
+
16
+ # Or for a locally cloned schema repository:
17
+ transformer = Transformer(
18
+ parser = ZwickParser(),
19
+ semantic_schema = Path("../semantic-schemas/schemas/domain/Ontology/"),
20
+ )
21
+
22
+ Usage: explicit paths (full control / non-standard layouts)
23
+ -------------------------------------------------------------
24
+ transformer = Transformer(
25
+ parser = ZwickParser(),
26
+ jsonata = "specs/transform.simplified.jsonata",
27
+ oold_schema = "specs/schema.oold.yaml",
28
+ input_schema = "specs/schema.simplified.json", # optional
29
+ )
30
+
31
+ result = transformer.run("my_file.csv")
32
+ print(result.graph.serialize(format="turtle"))
33
+ print(result.dataframe)
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import json
39
+ import re
40
+ import urllib.request
41
+ from dataclasses import dataclass
42
+ from pathlib import Path
43
+ from typing import Optional
44
+
45
+ import jsonschema
46
+ import rdflib
47
+ import yaml
48
+ from jsonata.jsonata import Jsonata
49
+
50
+ from .parser import Parser, ParseResult, SchemaAwareParser
51
+
52
+ # Namespaces used when generating timeseries descriptor triples.
53
+ _DCAT = rdflib.Namespace("http://www.w3.org/ns/dcat#")
54
+ _QUDT = rdflib.Namespace("http://qudt.org/schema/qudt/")
55
+ _OBI = rdflib.Namespace("http://purl.obolibrary.org/obo/OBI_")
56
+ _RDFS = rdflib.RDFS
57
+ _RDF = rdflib.RDF
58
+
59
+ # Standard file paths relative to a schema folder root.
60
+ _JSONATA_REL = "specs/transform.simplified.jsonata"
61
+ _OOLD_SCHEMA_REL = "specs/schema.oold.yaml"
62
+ _INPUT_SCHEMA_REL = "specs/schema.simplified.json"
63
+
64
+
65
+ def _read_text(source: str | Path) -> str:
66
+ """Read text from a local file path or an HTTP(S) URL."""
67
+ s = str(source)
68
+ if s.startswith("http://") or s.startswith("https://"):
69
+ with urllib.request.urlopen(s) as resp:
70
+ return resp.read().decode("utf-8")
71
+ return Path(source).read_text(encoding="utf-8")
72
+
73
+
74
+ def _github_tree_to_raw(url: str) -> str:
75
+ """
76
+ Convert a GitHub ``tree/`` URL to a raw.githubusercontent.com base URL.
77
+
78
+ Example
79
+ -------
80
+ https://github.com/org/repo/tree/main/schemas/domain/Ontology/
81
+ → https://raw.githubusercontent.com/org/repo/main/schemas/domain/Ontology
82
+ """
83
+ url = url.rstrip("/")
84
+ url = url.replace("https://github.com/", "https://raw.githubusercontent.com/", 1)
85
+ url = re.sub(r"/tree/", "/", url, count=1)
86
+ return url
87
+
88
+
89
def _resolve_semantic_schema(
    semantic_schema: str | Path,
) -> tuple[str | Path, str | Path, str | Path]:
    """
    Derive the three schema file locations from a folder root.

    Accepts either a local ``Path`` or a GitHub ``tree/`` URL string and
    returns ``(jsonata, oold_schema, input_schema)`` following the standard
    schema folder layout.
    """
    root = str(semantic_schema)
    if root.startswith(("http://", "https://")):
        base = _github_tree_to_raw(root)
        return (
            f"{base}/{_JSONATA_REL}",
            f"{base}/{_OOLD_SCHEMA_REL}",
            f"{base}/{_INPUT_SCHEMA_REL}",
        )
    folder = Path(semantic_schema)
    return (
        folder / _JSONATA_REL,
        folder / _OOLD_SCHEMA_REL,
        folder / _INPUT_SCHEMA_REL,
    )
112
+
113
+
114
@dataclass
class TransformResult:
    """Everything produced by a single Transformer run.

    Attributes
    ----------
    graph:
        RDF dataset holding the semantic metadata and the timeseries
        descriptor triples.
    oold_doc:
        Intermediate OO-LD document (after the JSONata transform, before
        RDF serialisation).
    dataframe:
        Raw measurement data (a ``pd.DataFrame``) or None when the file
        had no tabular section. Typed ``object`` to avoid importing
        pandas at module level.
    column_iris:
        Column name -> ontology class IRI (same as ParseResult.column_iris).
    column_units:
        Column name -> QUDT unit IRI (same as ParseResult.column_units).
    """

    graph: rdflib.Dataset
    oold_doc: dict
    dataframe: object
    column_iris: dict[str, str]
    column_units: dict[str, str]
132
+
133
+
134
class Transformer:
    """
    Connects a machine-specific Parser to an OO-LD schema, producing an RDF
    graph and a pandas DataFrame in one call.

    Parameters
    ----------
    parser:
        Any object implementing the Parser protocol.

    semantic_schema:
        Shorthand: the root folder of the schema, either a local ``Path``
        or a GitHub ``tree/`` URL. Derives all three file paths using the
        standard schema folder layout. Any explicitly provided ``jsonata``,
        ``oold_schema``, or ``input_schema`` value takes precedence over the
        derived path.

    jsonata:
        Path or URL to the schema's ``specs/transform.simplified.jsonata`` file.

    oold_schema:
        Path or URL to the schema's ``specs/schema.oold.yaml`` file (contains
        the JSON-LD ``@context`` used to convert OO-LD output to RDF).

    input_schema:
        Optional path or URL to the schema's ``specs/schema.simplified.json``
        file. When provided, the parser's output (after caller overrides) is
        validated for type correctness before being passed to the JSONata
        transform. Catches field-name mismatches between a parser and its
        target schema early. Required-field completeness is intentionally not
        enforced here; SHACL validation handles that downstream.

    Examples
    --------
    Shorthand with a GitHub URL (no local clone needed)::

        transformer = Transformer(
            parser=ZwickParser(),
            semantic_schema="https://github.com/org/semantic-schemas/tree/main/schemas/domain/Ontology/",
        )

    Shorthand with a local path::

        transformer = Transformer(
            parser=ZwickParser(),
            semantic_schema=Path("../semantic-schemas/schemas/domain/Ontology/"),
        )

    Explicit paths (non-standard layout, or to override one file)::

        transformer = Transformer(
            parser=ZwickParser(),
            jsonata="specs/transform.simplified.jsonata",
            oold_schema="specs/schema.oold.yaml",
            input_schema="specs/schema.simplified.json",
        )
    """

    def __init__(
        self,
        parser: Parser,
        jsonata: Optional[str | Path] = None,
        oold_schema: Optional[str | Path] = None,
        input_schema: Optional[str | Path] = None,
        *,
        semantic_schema: Optional[str | Path] = None,
    ) -> None:
        self.parser = parser

        # Resolve the shorthand first; explicit arguments win over derived paths.
        if semantic_schema is not None:
            derived_jsonata, derived_oold, derived_input = _resolve_semantic_schema(
                semantic_schema
            )
            jsonata = jsonata or derived_jsonata
            oold_schema = oold_schema or derived_oold
            input_schema = input_schema or derived_input

        if jsonata is None:
            raise ValueError(
                "Provide either 'semantic_schema' (shorthand) or 'jsonata' explicitly."
            )
        if oold_schema is None:
            raise ValueError(
                "Provide either 'semantic_schema' (shorthand) or 'oold_schema' explicitly."
            )

        self._transform_src = _read_text(jsonata)
        raw = yaml.safe_load(_read_text(oold_schema))
        self._context = raw["@context"]
        self._base = self._context.get("@base", "")

        self._input_schema: dict | None = (
            json.loads(_read_text(input_schema))
            if input_schema is not None
            else None
        )

        # Share the loaded schema with the parser if it supports it.
        if self._input_schema is not None and isinstance(parser, SchemaAwareParser):
            parser.configure(self._input_schema)

    # ------------------------------------------------------------------
    def run(self, file_path: str | Path, **overrides) -> TransformResult:
        """
        Process *file_path* end-to-end.

        Any keyword arguments (e.g. ``test_name``, ``specimen_iri``) are
        merged into the parsed simplified JSON, overriding whatever the
        parser produced. Use this to supply values that cannot be read
        from the file itself.

        Returns
        -------
        TransformResult
        """
        parsed = self.parser.parse(Path(file_path))

        # Merge: parser output first, then caller overrides.
        simplified = {**parsed.simplified_json, **overrides}

        # Validate against the input schema (if provided). 'required' is
        # stripped before validating: fields that cannot be parsed from the
        # file (e.g. specimen_iri, which must be supplied by the caller) are
        # legitimately absent here. The goal is to catch type mismatches and
        # unknown field names, not to enforce completeness — SHACL validation
        # downstream will flag any missing required triples.
        if self._input_schema is not None:
            schema_for_validation = {**self._input_schema, "required": []}
            jsonschema.validate(instance=simplified, schema=schema_for_validation)

        # JSONata transform: simplified dict → OO-LD document.
        oold_doc = Jsonata(self._transform_src).evaluate(simplified)

        # OO-LD → RDF via the schema's JSON-LD @context.
        g = rdflib.Dataset()
        g.parse(
            data=json.dumps({"@context": self._context, **oold_doc}),
            format="json-ld",
        )

        # Timeseries descriptor triples (only when the parser produced both
        # a DataFrame and column annotations).
        if parsed.timeseries is not None and parsed.column_iris:
            test_iri = self._resolve_test_iri(g, oold_doc)
            if test_iri:
                self._add_timeseries_nodes(g, test_iri, parsed)

        return TransformResult(
            graph=g,
            oold_doc=oold_doc,
            dataframe=parsed.timeseries,
            column_iris=parsed.column_iris,
            column_units=parsed.column_units,
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _resolve_test_iri(
        self, g: rdflib.Dataset, oold_doc: dict
    ) -> rdflib.URIRef | None:
        """
        Find the root test node's IRI in the parsed graph.

        We look it up rather than constructing it from ``@base + id`` because
        JSON-LD follows RFC 3986, which strips any fragment from the base URI
        before resolving relative references. Naive string concatenation would
        therefore produce the wrong IRI when the schema context uses a
        ``@base`` that ends with ``#``.
        """
        test_id = oold_doc.get("id", "")
        if not test_id:
            return None
        if test_id.startswith("http"):
            return rdflib.URIRef(test_id)
        for s, _p, _o, _c in g.quads():
            if isinstance(s, rdflib.URIRef) and str(s).endswith(test_id):
                return s
        return None

    def _add_timeseries_nodes(
        self,
        g: rdflib.Dataset,
        test_iri: rdflib.URIRef,
        parsed: ParseResult,
    ) -> None:
        """
        Add a dcat:Dataset node for the time series and one descriptor node
        per column. Only IRIs and units go into the graph (not the values).

        Graph pattern added
        -------------------
        <test_iri> obi:has_specified_output <test_iri/timeseries> .

        <test_iri/timeseries>
            a dcat:Dataset ;
            rdfs:label "Raw time series" ;
            dcat:distribution <test_iri/timeseries/ColumnName>, ... .

        <test_iri/timeseries/ColumnName>
            a <column_class_iri> ;
            rdfs:label "ColumnName" ;
            qudt:hasUnit <unit_iri> .
        """
        # BUGFIX: rdflib's Dataset has no ``default_graph`` attribute (its
        # default graph is ``default_context``), so the previous
        # ``g.default_graph`` raised AttributeError. Triples added to the
        # Dataset without an explicit context land in the default graph,
        # so we add to ``g`` directly.
        ds_iri = rdflib.URIRef(str(test_iri) + "/timeseries")

        g.add((test_iri, _OBI["0000299"], ds_iri))  # has_specified_output
        g.add((ds_iri, _RDF.type, _DCAT.Dataset))
        g.add((ds_iri, _RDFS.label, rdflib.Literal("Raw time series")))

        for col_name, col_class in parsed.column_iris.items():
            safe = col_name.replace(" ", "_")
            col_uri = rdflib.URIRef(str(ds_iri) + "/" + safe)

            g.add((ds_iri, _DCAT.distribution, col_uri))
            g.add((col_uri, _RDF.type, rdflib.URIRef(col_class)))
            g.add((col_uri, _RDFS.label, rdflib.Literal(col_name)))

            unit_iri = parsed.column_units.get(col_name)
            if unit_iri:
                g.add((col_uri, _QUDT.hasUnit, rdflib.URIRef(unit_iri)))
@@ -0,0 +1,189 @@
1
+ Metadata-Version: 2.4
2
+ Name: semantic-transformers
3
+ Version: 0.1.0
4
+ Summary: Machine-file extractors and transformers for semantic schema pipelines
5
+ Author: Semantic Dataspace contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Semantic-Dataspace/semantic-transformers
8
+ Project-URL: Repository, https://github.com/Semantic-Dataspace/semantic-transformers
9
+ Project-URL: Bug Tracker, https://github.com/Semantic-Dataspace/semantic-transformers/issues
10
+ Keywords: materials science,ontology,linked data,rdf,etl,parsers
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: pandas
23
+ Requires-Dist: rdflib
24
+ Requires-Dist: pyyaml
25
+ Requires-Dist: jsonata-python
26
+ Requires-Dist: jsonschema
27
+ Provides-Extra: excel
28
+ Requires-Dist: openpyxl; extra == "excel"
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest; extra == "dev"
31
+ Requires-Dist: nbmake; extra == "dev"
32
+ Dynamic: license-file
33
+
34
+ # semantic-transformers
35
+
36
+ A library and a curated collection of parsers that bridge raw instrument output
37
+ files and the [semantic-schemas](../semantic-schemas/) knowledge graph pipeline.
38
+
39
+ ## What this repository contains
40
+
41
+ ```text
42
+ semantic-transformers/
43
+ src/semantic_transformers/ Python library (Transformer, QuickMapper, …)
44
+ parsers/ Machine-specific file parsers
45
+ <domain>/ Mirrors the semantic-schemas folder structure
46
+ <specialisation>/
47
+ <machine>/ One folder per instrument model
48
+ <machine>_parser.py Reads the instrument file
49
+ column_mapping.json Maps column names to ontology class IRIs and units
50
+ README.md Quick-start, schema compatibility, and known limitations
51
+ docs/ Guides for users and contributors
52
+ ```
53
+
54
+ ## The two parts
55
+
56
+ ### 1. The library (`src/semantic_transformers/`)
57
+
58
+ | Class | Role |
59
+ |---|---|
60
+ | `Parser` | Protocol to implement when adding support for a new instrument |
61
+ | `ParseResult` | What every parser returns: simplified JSON + DataFrame |
62
+ | `Transformer` | Runs parsing → JSONata transform → RDF graph |
63
+ | `TransformResult` | What `Transformer.run()` returns: RDF graph + DataFrame |
64
+ | `QuickMapper` | Turns any tabular file into RDF using a simple YAML mapping (no parser needed) |
65
+
66
+ ### 2. The parsers (`parsers/`)
67
+
68
+ Each parser targets a specific instrument model. The folder path mirrors the
69
+ `schemas/` tree in `semantic-schemas`:
70
+
71
+ | Schema | Instrument | Parser path |
72
+ |---|---|---|
73
+ | `characterization/tensile-test/TTO` | Zwick/Roell (testXpert III) | `parsers/characterization/tensile-test/zwick/` |
74
+
75
+ ## Installation
76
+
77
+ ### Using pip (recommended)
78
+
79
+ ```bash
80
+ # Install the transformers library
81
+ pip install semantic-transformers
82
+
83
+ # Optionally install extra feature dependencies
84
+ pip install semantic-transformers[excel] # for Excel file support
85
+ pip install semantic-transformers[dev] # for development and testing
86
+ ```
87
+
88
+ ### Development installation
89
+
90
+ Both repositories are designed to be cloned as siblings under a shared folder:
91
+
92
+ ```bash
93
+ mkdir semantic-dataspace && cd semantic-dataspace
94
+
95
+ git clone https://github.com/Semantic-Dataspace/semantic-schemas
96
+ git clone https://github.com/Semantic-Dataspace/semantic-transformers
97
+
98
+ python3 -m venv .venv
99
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
100
+
101
+ pip install -e semantic-transformers/
102
+ pip install jupyterlab # only needed for the interactive notebooks
103
+ ```
104
+
105
+ ## Two ways to use this library
106
+
107
+ ### Option A: you have a supported instrument
108
+
109
+ Use a ready-made parser and the matching schema notebook. For a Zwick/Roell
110
+ tensile test:
111
+
112
+ ```bash
113
+ jupyter lab semantic-schemas/schemas/characterization/tensile-test/TTO/docs/2_tensile_test_csv_workflow.ipynb
114
+ ```
115
+
116
+ Edit **Step 0** (one line, point to your file) and run all cells. Done.
117
+
118
+ ### Option B: you have a tabular file with no existing parser
119
+
120
+ Use `QuickMapper`. Provide a short YAML that names the columns and points each
121
+ one at an ontology class IRI:
122
+
123
+ ```python
124
+ from semantic_transformers import QuickMapper
125
+
126
+ mapping = {
127
+ "label": "my experiment",
128
+ "columns": {
129
+ "Force": {
130
+ "iri": "https://w3id.org/pmd/tto/StandardForce",
131
+ "unit": "http://qudt.org/vocab/unit/N",
132
+ },
133
+ "Extension": {
134
+ "iri": "https://w3id.org/pmd/tto/Extension",
135
+ },
136
+ },
137
+ }
138
+
139
+ result = QuickMapper(mapping).run("my_data.csv")
140
+ print(result.graph.serialize(format="turtle"))
141
+ print(result.dataframe.head())
142
+ ```
143
+
144
+ Supported file formats: CSV, TSV, Excel (.xlsx), Parquet, JSON.
145
+ See the [QuickMapper notebook](docs/3_quickstart-mapping.ipynb) for a guided walkthrough.
146
+
147
+ ## Development
148
+
149
+ ### Running the tests
150
+
151
+ ```bash
152
+ python3 -m venv .venv
153
+ source .venv/bin/activate
154
+ pip install -e ".[dev]"
155
+ pytest -v
156
+ ```
157
+
158
+ ### Refreshing notebook outputs (for documentation)
159
+
160
+ Notebooks are committed with their output cells so that GitHub renders them as
161
+ readable documentation. After changing a parser or the library, re-execute all
162
+ notebooks in-place to update the stored outputs before committing:
163
+
164
+ ```bash
165
+ find docs -name "*.ipynb" ! -path "*/.ipynb_checkpoints/*" \
166
+ | xargs jupyter nbconvert \
167
+ --to notebook \
168
+ --execute \
169
+ --inplace \
170
+ --ExecutePreprocessor.timeout=300
171
+ ```
172
+
173
+ Run this from the repository root. Commit the resulting `*.ipynb` changes
174
+ together with any code changes so that the rendered output on GitHub stays
175
+ in sync.
176
+
177
+ > **Tip.** To refresh a single notebook only, pass its path directly:
178
+ >
179
+ > ```bash
180
+ > jupyter nbconvert --to notebook --execute --inplace \
181
+ > --ExecutePreprocessor.timeout=300 \
182
+ > docs/3_quickstart-mapping.ipynb
183
+ > ```
184
+
185
+ ## Documentation
186
+
187
+ - [Getting started](docs/1_getting-started.md): convert your first instrument file
188
+ - [QuickMapper walkthrough](docs/3_quickstart-mapping.ipynb): turn any tabular file into RDF
189
+ - [Adding a parser](docs/2_adding-a-parser.md): support a new instrument or handle file variants
@@ -0,0 +1,9 @@
1
+ semantic_transformers/__init__.py,sha256=BT2JaGozDaPcdWNGBzKQU8lVESnjaOfMM2Pu8r98xAg,718
2
+ semantic_transformers/parser.py,sha256=2HVYeNlQHnTPspN9HjPpYekldifxgoIw7r_Yklu76jw,2860
3
+ semantic_transformers/quick_mapper.py,sha256=GeUpJPobWStqf0cg0_bb8SCYJmK7qyKUkCpqC3ipcKk,8972
4
+ semantic_transformers/transformer.py,sha256=bs5BXK65b_1VnBkCBxMZtFiU-_KD00IWYtJSro9Xh-M,13330
5
+ semantic_transformers-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
6
+ semantic_transformers-0.1.0.dist-info/METADATA,sha256=77V1nzt15PBIKe49uTXe2aAcQrIUQMDzS5vtteCj-RY,6307
7
+ semantic_transformers-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ semantic_transformers-0.1.0.dist-info/top_level.txt,sha256=bwBVVBWMiRPhMwHs-l5rnapEIQdkXMfqZMiiEQZP_QI,22
9
+ semantic_transformers-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
@@ -0,0 +1 @@
1
+ semantic_transformers