semantic-transformers 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ """
2
+ semantic-transformers
3
+ =====================
4
+ Converters and machine-file parsers for semantic schema pipelines.
5
+
6
+ Public API
7
+ ----------
8
+ ParseResult: normalised parser output (simplified_json + DataFrame)
9
+ Parser: protocol that all parsers must satisfy
10
+ Transformer: runs parsing → JSONata transform → RDF
11
+ TransformResult: everything produced by Transformer.run()
12
+ QuickMapper: turns any tabular file into RDF with a simple YAML mapping
13
+ """
14
+
15
+ from .parser import Parser, ParseResult
16
+ from .transformer import Transformer, TransformResult
17
+ from .quick_mapper import QuickMapper
18
+
19
+ __all__ = [
20
+ "Parser", "ParseResult",
21
+ "Transformer", "TransformResult",
22
+ "QuickMapper",
23
+ ]
@@ -0,0 +1,83 @@
1
+ """
2
+ Parser protocol and result type.
3
+
4
+ A Parser reads a machine file (any format, any internal structure) and
5
+ returns a ParseResult with two outputs:
6
+
7
+ simplified_json: a plain dict matching the target schema's example.input.json
8
+ format, ready to be fed into the JSONata transform.
9
+
10
+ timeseries: a pandas DataFrame of the raw measurement columns, or None
11
+ if the file contains no time-series data.
12
+
13
+ column_iris: maps each DataFrame column name to an ontology class IRI.
14
+ Only the descriptor goes into the knowledge graph; the
15
+ numeric values stay in the DataFrame.
16
+
17
+ column_units: maps each DataFrame column name to a QUDT unit IRI.
18
+
19
+ Parsers are schema- and machine-specific: one parser per (machine model,
20
+ schema) combination. They live in the parsers/ directory alongside the
21
+ schema they serve, not in this library.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from dataclasses import dataclass, field
27
+ from pathlib import Path
28
+ from typing import Protocol, runtime_checkable
29
+
30
+ import pandas as pd
31
+
32
+
33
@dataclass
class ParseResult:
    """Normalised output produced by any Parser.

    Attributes
    ----------
    simplified_json:
        Flat dict matching the target schema's ``example.input.json``;
        fed directly into the JSONata transform.
    timeseries:
        Raw time-series data, or None when the file contains no tabular
        measurements.
    column_iris:
        Column name -> ontology class IRI
        (e.g. "https://w3id.org/pmd/tto/TestTime").
    column_units:
        Column name -> QUDT unit IRI
        (e.g. "http://qudt.org/vocab/unit/SEC").
    """

    simplified_json: dict
    timeseries: pd.DataFrame | None = None
    column_iris: dict[str, str] = field(default_factory=dict)
    column_units: dict[str, str] = field(default_factory=dict)
49
+
50
+
51
@runtime_checkable
class Parser(Protocol):
    """Structural interface for file parsers.

    Any object exposing a ``parse(path) -> ParseResult`` method satisfies
    this protocol; no subclassing is required.
    """

    def parse(self, path: Path) -> ParseResult: ...
56
+
57
+
58
class SchemaAwareParser:
    """
    Optional mixin for parsers that can exploit the input schema for type
    coercion.

    Implement this alongside the Parser protocol when a parser needs to
    cast field values to the types declared in ``schema.simplified.json``.
    When an ``input_schema`` is available, ``Transformer`` invokes
    ``configure(schema)`` automatically right after construction, so users
    never have to hand the schema path to the parser themselves.

    Example
    -------
    class MyParser(SchemaAwareParser):
        def configure(self, schema: dict) -> None:
            self._field_types = {
                name: prop.get("type", "string")
                for name, prop in schema.get("properties", {}).items()
            }

        def parse(self, path: Path) -> ParseResult:
            ...
    """

    def configure(self, schema: dict) -> None:
        """Receive the loaded input schema dict from Transformer."""
        ...
@@ -0,0 +1,255 @@
1
+ """
2
+ QuickMapper: turn any tabular file into RDF with a simple mapping config.
3
+
4
+ No schema, no JSONata transform, no custom parser required. The user
5
+ provides a YAML config that names the columns and points each one at an
6
+ ontology class IRI and an optional QUDT unit. Everything else is automatic.
7
+
8
+ Supported file formats
9
+ ----------------------
10
+ .csv Comma-separated values
11
+ .tsv / .tab Tab-separated values
12
+ .txt Auto-sniffed (separator detected from content)
13
+ .xlsx / .xls Excel workbook (requires openpyxl)
14
+ .parquet Apache Parquet (requires pyarrow or fastparquet)
15
+ .json JSON (array of records or any orient supported by pandas)
16
+
17
+ Mapping config format
18
+ ---------------------
19
+ # root_type is optional (defaults to dcat:Dataset)
20
+ root_type: "http://www.w3.org/ns/dcat#Dataset"
21
+
22
+ # label is optional (defaults to the file stem)
23
+ label: "Hardness profile, sample 42"
24
+
25
+ # file reading options (all optional)
26
+ file:
27
+ format: auto # auto | csv | tsv | excel | parquet | json
28
+ separator: "," # csv/tsv only; sniffed when omitted
29
+ skip_rows: 0 # rows to skip before the header row
30
+ header_row: 0 # which row (after skipping) contains column names
31
+ encoding: utf-8
32
+ sheet: 0 # Excel only: sheet name or 0-based index
33
+
34
+ # column annotations (only annotated columns get ontology triples)
35
+ columns:
36
+ Force:
37
+ iri: "https://w3id.org/pmd/tto/StandardForce"
38
+ unit: "http://qudt.org/vocab/unit/N" # optional
39
+ Temperature:
40
+ iri: "https://example.org/vocab/Temperature"
41
+
42
+ Usage
43
+ -----
44
+ from semantic_transformers import QuickMapper
45
+
46
+ mapper = QuickMapper("mapping.yaml")
47
+ result = mapper.run("my_data.xlsx")
48
+
49
+ print(result.graph.serialize(format="turtle"))
50
+ print(result.dataframe.head())
51
+ """
52
+
53
+ from __future__ import annotations
54
+
55
+ import csv as _csv
56
+ import io
57
+ from pathlib import Path
58
+ from typing import Union
59
+
60
+ import rdflib
61
+ import yaml
62
+
63
+ from .transformer import TransformResult
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Namespaces
67
+ # ---------------------------------------------------------------------------
68
+ _DCAT = rdflib.Namespace("http://www.w3.org/ns/dcat#")
69
+ _DCT = rdflib.Namespace("http://purl.org/dc/terms/")
70
+ _QUDT = rdflib.Namespace("http://qudt.org/schema/qudt/")
71
+ _RDFS = rdflib.RDFS
72
+ _RDF = rdflib.RDF
73
+ _XSD = rdflib.XSD
74
+
75
+ _DEFAULT_ROOT_TYPE = "http://www.w3.org/ns/dcat#Dataset"
76
+ _DEFAULT_BASE = "https://example.org/datasets/"
77
+
78
+
79
class QuickMapper:
    """
    Converts any tabular file into an RDF graph using a lightweight YAML
    mapping config. Returns a :class:`TransformResult` so it is a drop-in
    companion to :class:`Transformer`.

    Parameters
    ----------
    mapping:
        Path to a YAML mapping file, or a plain dict with the same structure.
    """

    def __init__(self, mapping: Union[str, Path, dict]) -> None:
        # Accept either an in-memory config dict or a path to a YAML file.
        if isinstance(mapping, dict):
            self._config: dict = mapping
        else:
            self._config = yaml.safe_load(
                Path(mapping).read_text(encoding="utf-8")
            )

    # ------------------------------------------------------------------
    def run(self, file_path: Union[str, Path], **overrides) -> TransformResult:
        """
        Convert *file_path* to RDF.

        Keyword arguments override the corresponding top-level keys in the
        mapping config (e.g. ``label="Custom name"``).

        Returns
        -------
        TransformResult
            Same type as :meth:`Transformer.run`: graph, oold_doc, dataframe,
            column_iris, column_units.
        """
        path = Path(file_path)
        config = {**self._config, **overrides}

        # ── 1. Read the file into a DataFrame ────────────────────────
        # "or {}" guards against an empty "file:" key, which YAML loads as None.
        df = self._read_file(path, config.get("file") or {})

        # ── 2. Collect column annotations ────────────────────────────
        # "(cfg or {})" guards against YAML entries with no mapping body:
        # a bare "Temperature:" line parses to None, not to an empty dict.
        columns_cfg = config.get("columns") or {}
        column_iris = {
            col: cfg["iri"]
            for col, cfg in columns_cfg.items()
            if "iri" in (cfg or {})
        }
        column_units = {
            col: cfg["unit"]
            for col, cfg in columns_cfg.items()
            if "unit" in (cfg or {})
        }

        # ── 3. Build the RDF graph ────────────────────────────────────
        root_type = config.get("root_type", _DEFAULT_ROOT_TYPE)
        label = config.get("label", path.stem)
        base = config.get("base", _DEFAULT_BASE)
        dataset_id = rdflib.URIRef(base + path.stem)

        g = rdflib.Dataset()
        # NOTE: rdflib's Dataset exposes the default graph as
        # ``default_context`` — there is no ``default_graph`` attribute.
        # Triples added to the Dataset without an explicit context land in
        # the default graph, so we add to ``g`` directly.
        g.add((dataset_id, _RDF.type, rdflib.URIRef(root_type)))
        g.add((dataset_id, _RDFS.label, rdflib.Literal(label)))
        g.add((dataset_id, _DCT.title, rdflib.Literal(label)))
        g.add((dataset_id, _DCT.source, rdflib.Literal(str(path.name))))

        for col_name, col_iri in column_iris.items():
            safe = col_name.replace(" ", "_")
            col_uri = rdflib.URIRef(str(dataset_id) + "/" + safe)

            g.add((dataset_id, _DCAT.distribution, col_uri))
            g.add((col_uri, _RDF.type, rdflib.URIRef(col_iri)))
            g.add((col_uri, _RDFS.label, rdflib.Literal(col_name)))

            unit_iri = column_units.get(col_name)
            if unit_iri:
                g.add((col_uri, _QUDT.hasUnit, rdflib.URIRef(unit_iri)))

        # ── 4. Build a lightweight summary doc ───────────────────────
        oold_doc = {
            "id": str(dataset_id),
            "type": root_type,
            "label": label,
            "source": str(path.name),
            "columns": {
                col: {
                    "iri": iri,
                    **({"unit": column_units[col]} if col in column_units else {}),
                }
                for col, iri in column_iris.items()
            },
        }

        return TransformResult(
            graph=g,
            oold_doc=oold_doc,
            dataframe=df,
            column_iris=column_iris,
            column_units=column_units,
        )

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _read_file(self, path: Path, file_cfg: dict):
        """Read *path* into a pandas DataFrame using *file_cfg* hints.

        Raises
        ------
        ValueError
            If the (explicit or detected) format is not supported.
        """
        # Imported lazily so importing the package does not require pandas
        # to be loaded until a file is actually read.
        import pandas as pd

        fmt = file_cfg.get("format", "auto")
        if fmt == "auto":
            fmt = _detect_format(path)

        skip = file_cfg.get("skip_rows", 0)
        header = file_cfg.get("header_row", 0)
        enc = file_cfg.get("encoding", "utf-8")

        if fmt in ("csv", "tsv", "txt"):
            sep = file_cfg.get("separator")
            if sep is None:
                # No separator configured: sniff it from the file content.
                sep = _sniff_separator(path, enc)
            return pd.read_csv(
                path,
                sep=sep,
                skiprows=skip,
                header=header,
                encoding=enc,
            )

        if fmt == "excel":
            sheet = file_cfg.get("sheet", 0)
            return pd.read_excel(
                path,
                sheet_name=sheet,
                skiprows=skip,
                header=header,
            )

        if fmt == "parquet":
            return pd.read_parquet(path)

        if fmt == "json":
            orient = file_cfg.get("orient", None)
            return pd.read_json(path, orient=orient)

        raise ValueError(
            f"Unsupported format '{fmt}'. "
            "Supported: csv, tsv, txt, excel, parquet, json."
        )
226
+
227
+
228
+ # ---------------------------------------------------------------------------
229
+ # Module-level helpers
230
+ # ---------------------------------------------------------------------------
231
+
232
+ def _detect_format(path: Path) -> str:
233
+ suffix = path.suffix.lower()
234
+ mapping = {
235
+ ".csv": "csv",
236
+ ".tsv": "tsv",
237
+ ".tab": "tsv",
238
+ ".txt": "txt",
239
+ ".xlsx": "excel",
240
+ ".xls": "excel",
241
+ ".xlsm": "excel",
242
+ ".parquet": "parquet",
243
+ ".json": "json",
244
+ }
245
+ return mapping.get(suffix, "csv")
246
+
247
+
248
+ def _sniff_separator(path: Path, encoding: str) -> str:
249
+ """Read the first 4 KB and ask csv.Sniffer to detect the delimiter."""
250
+ try:
251
+ sample = path.read_bytes()[:4096].decode(encoding, errors="replace")
252
+ dialect = _csv.Sniffer().sniff(sample, delimiters=",;\t|")
253
+ return dialect.delimiter
254
+ except _csv.Error:
255
+ return "," # safe fallback
@@ -0,0 +1,353 @@
1
+ """
2
+ Transformer: parser output → OO-LD → RDF + DataFrame.
3
+
4
+ Usage: shorthand (recommended)
5
+ --------------------------------
6
+ from semantic_transformers import Transformer
7
+ from zwick_parser import ZwickParser
8
+
9
+ # Pass the schema folder; all three file paths are derived automatically.
10
+ # Works with a local path or a GitHub tree URL:
11
+ transformer = Transformer(
12
+ parser = ZwickParser(),
13
+ semantic_schema = "https://github.com/org/semantic-schemas/tree/main/schemas/domain/Ontology/",
14
+ )
15
+
16
+ # Or for a locally cloned schema repository:
17
+ transformer = Transformer(
18
+ parser = ZwickParser(),
19
+ semantic_schema = Path("../semantic-schemas/schemas/domain/Ontology/"),
20
+ )
21
+
22
+ Usage: explicit paths (full control / non-standard layouts)
23
+ -------------------------------------------------------------
24
+ transformer = Transformer(
25
+ parser = ZwickParser(),
26
+ jsonata = "specs/transform.simplified.jsonata",
27
+ oold_schema = "specs/schema.oold.yaml",
28
+ input_schema = "specs/schema.simplified.json", # optional
29
+ )
30
+
31
+ result = transformer.run("my_file.csv")
32
+ print(result.graph.serialize(format="turtle"))
33
+ print(result.dataframe)
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ import json
39
+ import re
40
+ import urllib.request
41
+ from dataclasses import dataclass
42
+ from pathlib import Path
43
+ from typing import Optional
44
+
45
+ import jsonschema
46
+ import rdflib
47
+ import yaml
48
+ from jsonata.jsonata import Jsonata
49
+
50
+ from .parser import Parser, ParseResult, SchemaAwareParser
51
+
52
+ # Namespaces used when generating timeseries descriptor triples.
53
+ _DCAT = rdflib.Namespace("http://www.w3.org/ns/dcat#")
54
+ _QUDT = rdflib.Namespace("http://qudt.org/schema/qudt/")
55
+ _OBI = rdflib.Namespace("http://purl.obolibrary.org/obo/OBI_")
56
+ _RDFS = rdflib.RDFS
57
+ _RDF = rdflib.RDF
58
+
59
+ # Standard file paths relative to a schema folder root.
60
+ _JSONATA_REL = "specs/transform.simplified.jsonata"
61
+ _OOLD_SCHEMA_REL = "specs/schema.oold.yaml"
62
+ _INPUT_SCHEMA_REL = "specs/schema.simplified.json"
63
+
64
+
65
+ def _read_text(source: str | Path) -> str:
66
+ """Read text from a local file path or an HTTP(S) URL."""
67
+ s = str(source)
68
+ if s.startswith("http://") or s.startswith("https://"):
69
+ with urllib.request.urlopen(s) as resp:
70
+ return resp.read().decode("utf-8")
71
+ return Path(source).read_text(encoding="utf-8")
72
+
73
+
74
+ def _github_tree_to_raw(url: str) -> str:
75
+ """
76
+ Convert a GitHub ``tree/`` URL to a raw.githubusercontent.com base URL.
77
+
78
+ Example
79
+ -------
80
+ https://github.com/org/repo/tree/main/schemas/domain/Ontology/
81
+ → https://raw.githubusercontent.com/org/repo/main/schemas/domain/Ontology
82
+ """
83
+ url = url.rstrip("/")
84
+ url = url.replace("https://github.com/", "https://raw.githubusercontent.com/", 1)
85
+ url = re.sub(r"/tree/", "/", url, count=1)
86
+ return url
87
+
88
+
89
def _resolve_semantic_schema(
    semantic_schema: str | Path,
) -> tuple[str | Path, str | Path, str | Path]:
    """
    Derive the three schema file locations from a folder root.

    Accepts either a local ``Path`` or a GitHub ``tree/`` URL string and
    returns ``(jsonata, oold_schema, input_schema)`` following the standard
    schema folder layout.
    """
    root = str(semantic_schema)
    if root.startswith(("http://", "https://")):
        base = _github_tree_to_raw(root)
        return (
            f"{base}/{_JSONATA_REL}",
            f"{base}/{_OOLD_SCHEMA_REL}",
            f"{base}/{_INPUT_SCHEMA_REL}",
        )
    folder = Path(semantic_schema)
    return (
        folder / _JSONATA_REL,
        folder / _OOLD_SCHEMA_REL,
        folder / _INPUT_SCHEMA_REL,
    )
112
+
113
+
114
@dataclass
class TransformResult:
    """Everything produced by a single Transformer run.

    Attributes
    ----------
    graph:
        RDF dataset holding the semantic metadata and the timeseries
        descriptor triples.
    oold_doc:
        Intermediate OO-LD document (after the JSONata transform, before
        RDF serialisation).
    dataframe:
        Raw measurement data (a ``pd.DataFrame``) or None when the file
        had no tabular section. Typed ``object`` to avoid importing
        pandas at module level.
    column_iris:
        Column name -> ontology class IRI (same as ParseResult.column_iris).
    column_units:
        Column name -> QUDT unit IRI (same as ParseResult.column_units).
    """

    graph: rdflib.Dataset
    oold_doc: dict
    dataframe: object
    column_iris: dict[str, str]
    column_units: dict[str, str]
132
+
133
+
134
class Transformer:
    """
    Connects a machine-specific Parser to an OO-LD schema, producing an RDF
    graph and a pandas DataFrame in one call.

    Parameters
    ----------
    parser:
        Any object implementing the Parser protocol.

    semantic_schema:
        Shorthand: the root folder of the schema, either a local ``Path``
        or a GitHub ``tree/`` URL. Derives all three file paths using the
        standard schema folder layout. Any explicitly provided ``jsonata``,
        ``oold_schema``, or ``input_schema`` value takes precedence over the
        derived path.

    jsonata:
        Path or URL to the schema's ``specs/transform.simplified.jsonata`` file.

    oold_schema:
        Path or URL to the schema's ``specs/schema.oold.yaml`` file (contains
        the JSON-LD ``@context`` used to convert OO-LD output to RDF).

    input_schema:
        Optional path or URL to the schema's ``specs/schema.simplified.json``
        file. When provided, the parser's output (after caller overrides) is
        validated for type correctness before being passed to the JSONata
        transform. Catches field-name mismatches between a parser and its
        target schema early. Required-field completeness is intentionally not
        enforced here; SHACL validation handles that downstream.

    Examples
    --------
    Shorthand with a GitHub URL (no local clone needed)::

        transformer = Transformer(
            parser=ZwickParser(),
            semantic_schema="https://github.com/org/semantic-schemas/tree/main/schemas/domain/Ontology/",
        )

    Shorthand with a local path::

        transformer = Transformer(
            parser=ZwickParser(),
            semantic_schema=Path("../semantic-schemas/schemas/domain/Ontology/"),
        )

    Explicit paths (non-standard layout, or to override one file)::

        transformer = Transformer(
            parser=ZwickParser(),
            jsonata="specs/transform.simplified.jsonata",
            oold_schema="specs/schema.oold.yaml",
            input_schema="specs/schema.simplified.json",
        )
    """

    def __init__(
        self,
        parser: Parser,
        jsonata: Optional[str | Path] = None,
        oold_schema: Optional[str | Path] = None,
        input_schema: Optional[str | Path] = None,
        *,
        semantic_schema: Optional[str | Path] = None,
    ) -> None:
        self.parser = parser

        # Resolve the shorthand first; explicit arguments win over derived paths.
        if semantic_schema is not None:
            derived_jsonata, derived_oold, derived_input = _resolve_semantic_schema(
                semantic_schema
            )
            jsonata = jsonata or derived_jsonata
            oold_schema = oold_schema or derived_oold
            input_schema = input_schema or derived_input

        if jsonata is None:
            raise ValueError(
                "Provide either 'semantic_schema' (shorthand) or 'jsonata' explicitly."
            )
        if oold_schema is None:
            raise ValueError(
                "Provide either 'semantic_schema' (shorthand) or 'oold_schema' explicitly."
            )

        self._transform_src = _read_text(jsonata)
        raw = yaml.safe_load(_read_text(oold_schema))
        self._context = raw["@context"]
        self._base = self._context.get("@base", "")

        self._input_schema: dict | None = (
            json.loads(_read_text(input_schema))
            if input_schema is not None
            else None
        )

        # Share the loaded schema with the parser if it supports it.
        if self._input_schema is not None and isinstance(parser, SchemaAwareParser):
            parser.configure(self._input_schema)

    # ------------------------------------------------------------------
    def run(self, file_path: str | Path, **overrides) -> TransformResult:
        """
        Process *file_path* end-to-end.

        Any keyword arguments (e.g. ``test_name``, ``specimen_iri``) are
        merged into the parsed simplified JSON, overriding whatever the
        parser produced. Use this to supply values that cannot be read
        from the file itself.

        Returns
        -------
        TransformResult
        """
        parsed = self.parser.parse(Path(file_path))

        # Merge: parser output first, then caller overrides.
        simplified = {**parsed.simplified_json, **overrides}

        # Validate against the input schema (if provided). 'required' is
        # stripped before validating: fields that cannot be parsed from the
        # file (e.g. specimen_iri, which must be supplied by the caller) are
        # legitimately absent here. The goal is to catch type mismatches and
        # unknown field names, not to enforce completeness — SHACL validation
        # downstream will flag any missing required triples.
        if self._input_schema is not None:
            schema_for_validation = {**self._input_schema, "required": []}
            jsonschema.validate(instance=simplified, schema=schema_for_validation)

        # JSONata transform: simplified dict → OO-LD document.
        oold_doc = Jsonata(self._transform_src).evaluate(simplified)

        # OO-LD → RDF via the schema's JSON-LD @context.
        g = rdflib.Dataset()
        g.parse(
            data=json.dumps({"@context": self._context, **oold_doc}),
            format="json-ld",
        )

        # Timeseries descriptor triples (only when the parser produced both
        # a DataFrame and column annotations).
        if parsed.timeseries is not None and parsed.column_iris:
            test_iri = self._resolve_test_iri(g, oold_doc)
            if test_iri:
                self._add_timeseries_nodes(g, test_iri, parsed)

        return TransformResult(
            graph=g,
            oold_doc=oold_doc,
            dataframe=parsed.timeseries,
            column_iris=parsed.column_iris,
            column_units=parsed.column_units,
        )

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _resolve_test_iri(
        self, g: rdflib.Dataset, oold_doc: dict
    ) -> rdflib.URIRef | None:
        """
        Find the root test node's IRI in the parsed graph.

        We look it up rather than constructing it from ``@base + id`` because
        JSON-LD follows RFC 3986, which strips any fragment from the base URI
        before resolving relative references. Naive string concatenation would
        therefore produce the wrong IRI when the schema context uses a
        ``@base`` that ends with ``#``.
        """
        test_id = oold_doc.get("id", "")
        if not test_id:
            return None
        if test_id.startswith("http"):
            return rdflib.URIRef(test_id)
        for s, _p, _o, _c in g.quads():
            if isinstance(s, rdflib.URIRef) and str(s).endswith(test_id):
                return s
        return None

    def _add_timeseries_nodes(
        self,
        g: rdflib.Dataset,
        test_iri: rdflib.URIRef,
        parsed: ParseResult,
    ) -> None:
        """
        Add a dcat:Dataset node for the time series and one descriptor node
        per column. Only IRIs and units go into the graph (not the values).

        Graph pattern added
        -------------------
        <test_iri> obi:has_specified_output <test_iri/timeseries> .

        <test_iri/timeseries>
            a dcat:Dataset ;
            rdfs:label "Raw time series" ;
            dcat:distribution <test_iri/timeseries/ColumnName>, ... .

        <test_iri/timeseries/ColumnName>
            a <column_class_iri> ;
            rdfs:label "ColumnName" ;
            qudt:hasUnit <unit_iri> .
        """
        # BUGFIX: rdflib's Dataset has no ``default_graph`` attribute (its
        # default graph is ``default_context``), so the previous
        # ``g.default_graph`` raised AttributeError. Triples added to the
        # Dataset without an explicit context land in the default graph,
        # so we add to ``g`` directly.
        ds_iri = rdflib.URIRef(str(test_iri) + "/timeseries")

        g.add((test_iri, _OBI["0000299"], ds_iri))  # has_specified_output
        g.add((ds_iri, _RDF.type, _DCAT.Dataset))
        g.add((ds_iri, _RDFS.label, rdflib.Literal("Raw time series")))

        for col_name, col_class in parsed.column_iris.items():
            safe = col_name.replace(" ", "_")
            col_uri = rdflib.URIRef(str(ds_iri) + "/" + safe)

            g.add((ds_iri, _DCAT.distribution, col_uri))
            g.add((col_uri, _RDF.type, rdflib.URIRef(col_class)))
            g.add((col_uri, _RDFS.label, rdflib.Literal(col_name)))

            unit_iri = parsed.column_units.get(col_name)
            if unit_iri:
                g.add((col_uri, _QUDT.hasUnit, rdflib.URIRef(unit_iri)))
@@ -0,0 +1,189 @@
1
+ Metadata-Version: 2.4
2
+ Name: semantic-transformers
3
+ Version: 0.1.0
4
+ Summary: Machine-file extractors and transformers for semantic schema pipelines
5
+ Author: Semantic Dataspace contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Semantic-Dataspace/semantic-transformers
8
+ Project-URL: Repository, https://github.com/Semantic-Dataspace/semantic-transformers
9
+ Project-URL: Bug Tracker, https://github.com/Semantic-Dataspace/semantic-transformers/issues
10
+ Keywords: materials science,ontology,linked data,rdf,etl,parsers
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: pandas
23
+ Requires-Dist: rdflib
24
+ Requires-Dist: pyyaml
25
+ Requires-Dist: jsonata-python
26
+ Requires-Dist: jsonschema
27
+ Provides-Extra: excel
28
+ Requires-Dist: openpyxl; extra == "excel"
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest; extra == "dev"
31
+ Requires-Dist: nbmake; extra == "dev"
32
+ Dynamic: license-file
33
+
34
+ # semantic-transformers
35
+
36
+ A library and a curated collection of parsers that bridge raw instrument output
37
+ files and the [semantic-schemas](../semantic-schemas/) knowledge graph pipeline.
38
+
39
+ ## What this repository contains
40
+
41
+ ```text
42
+ semantic-transformers/
43
+ src/semantic_transformers/ Python library (Transformer, QuickMapper, …)
44
+ parsers/ Machine-specific file parsers
45
+ <domain>/ Mirrors the semantic-schemas folder structure
46
+ <specialisation>/
47
+ <machine>/ One folder per instrument model
48
+ <machine>_parser.py Reads the instrument file
49
+ column_mapping.json Maps column names to ontology class IRIs and units
50
+ README.md Quick-start, schema compatibility, and known limitations
51
+ docs/ Guides for users and contributors
52
+ ```
53
+
54
+ ## The two parts
55
+
56
+ ### 1. The library (`src/semantic_transformers/`)
57
+
58
+ | Class | Role |
59
+ |---|---|
60
+ | `Parser` | Protocol to implement when adding support for a new instrument |
61
+ | `ParseResult` | What every parser returns: simplified JSON + DataFrame |
62
+ | `Transformer` | Runs parsing → JSONata transform → RDF graph |
63
+ | `TransformResult` | What `Transformer.run()` returns: RDF graph + DataFrame |
64
+ | `QuickMapper` | Turns any tabular file into RDF using a simple YAML mapping (no parser needed) |
65
+
66
+ ### 2. The parsers (`parsers/`)
67
+
68
+ Each parser targets a specific instrument model. The folder path mirrors the
69
+ `schemas/` tree in `semantic-schemas`:
70
+
71
+ | Schema | Instrument | Parser path |
72
+ |---|---|---|
73
+ | `characterization/tensile-test/TTO` | Zwick/Roell (testXpert III) | `parsers/characterization/tensile-test/zwick/` |
74
+
75
+ ## Installation
76
+
77
+ ### Using pip (recommended)
78
+
79
+ ```bash
80
+ # Install the transformers library
81
+ pip install semantic-transformers
82
+
83
+ # Optionally install extra feature dependencies
84
+ pip install semantic-transformers[excel] # for Excel file support
85
+ pip install semantic-transformers[dev] # for development and testing
86
+ ```
87
+
88
+ ### Development installation
89
+
90
+ Both repositories are designed to be cloned as siblings under a shared folder:
91
+
92
+ ```bash
93
+ mkdir semantic-dataspace && cd semantic-dataspace
94
+
95
+ git clone https://github.com/Semantic-Dataspace/semantic-schemas
96
+ git clone https://github.com/Semantic-Dataspace/semantic-transformers
97
+
98
+ python3 -m venv .venv
99
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
100
+
101
+ pip install -e semantic-transformers/
102
+ pip install jupyterlab # only needed for the interactive notebooks
103
+ ```
104
+
105
+ ## Two ways to use this library
106
+
107
+ ### Option A: you have a supported instrument
108
+
109
+ Use a ready-made parser and the matching schema notebook. For a Zwick/Roell
110
+ tensile test:
111
+
112
+ ```bash
113
+ jupyter lab semantic-schemas/schemas/characterization/tensile-test/TTO/docs/2_tensile_test_csv_workflow.ipynb
114
+ ```
115
+
116
+ Edit **Step 0** (one line, point to your file) and run all cells. Done.
117
+
118
+ ### Option B: you have a tabular file with no existing parser
119
+
120
+ Use `QuickMapper`. Provide a short YAML that names the columns and points each
121
+ one at an ontology class IRI:
122
+
123
+ ```python
124
+ from semantic_transformers import QuickMapper
125
+
126
+ mapping = {
127
+ "label": "my experiment",
128
+ "columns": {
129
+ "Force": {
130
+ "iri": "https://w3id.org/pmd/tto/StandardForce",
131
+ "unit": "http://qudt.org/vocab/unit/N",
132
+ },
133
+ "Extension": {
134
+ "iri": "https://w3id.org/pmd/tto/Extension",
135
+ },
136
+ },
137
+ }
138
+
139
+ result = QuickMapper(mapping).run("my_data.csv")
140
+ print(result.graph.serialize(format="turtle"))
141
+ print(result.dataframe.head())
142
+ ```
143
+
144
+ Supported file formats: CSV, TSV, Excel (.xlsx), Parquet, JSON.
145
+ See the [QuickMapper notebook](docs/3_quickstart-mapping.ipynb) for a guided walkthrough.
146
+
147
+ ## Development
148
+
149
+ ### Running the tests
150
+
151
+ ```bash
152
+ python3 -m venv .venv
153
+ source .venv/bin/activate
154
+ pip install -e ".[dev]"
155
+ pytest -v
156
+ ```
157
+
158
+ ### Refreshing notebook outputs (for documentation)
159
+
160
+ Notebooks are committed with their output cells so that GitHub renders them as
161
+ readable documentation. After changing a parser or the library, re-execute all
162
+ notebooks in-place to update the stored outputs before committing:
163
+
164
+ ```bash
165
+ find docs -name "*.ipynb" ! -path "*/.ipynb_checkpoints/*" \
166
+ | xargs jupyter nbconvert \
167
+ --to notebook \
168
+ --execute \
169
+ --inplace \
170
+ --ExecutePreprocessor.timeout=300
171
+ ```
172
+
173
+ Run this from the repository root. Commit the resulting `*.ipynb` changes
174
+ together with any code changes so that the rendered output on GitHub stays
175
+ in sync.
176
+
177
+ > **Tip.** To refresh a single notebook only, pass its path directly:
178
+ >
179
+ > ```bash
180
+ > jupyter nbconvert --to notebook --execute --inplace \
181
+ > --ExecutePreprocessor.timeout=300 \
182
+ > docs/3_quickstart-mapping.ipynb
183
+ > ```
184
+
185
+ ## Documentation
186
+
187
+ - [Getting started](docs/1_getting-started.md): convert your first instrument file
188
+ - [QuickMapper walkthrough](docs/3_quickstart-mapping.ipynb): turn any tabular file into RDF
189
+ - [Adding a parser](docs/2_adding-a-parser.md): support a new instrument or handle file variants
@@ -0,0 +1,9 @@
1
+ semantic_transformers/__init__.py,sha256=BT2JaGozDaPcdWNGBzKQU8lVESnjaOfMM2Pu8r98xAg,718
2
+ semantic_transformers/parser.py,sha256=2HVYeNlQHnTPspN9HjPpYekldifxgoIw7r_Yklu76jw,2860
3
+ semantic_transformers/quick_mapper.py,sha256=GeUpJPobWStqf0cg0_bb8SCYJmK7qyKUkCpqC3ipcKk,8972
4
+ semantic_transformers/transformer.py,sha256=bs5BXK65b_1VnBkCBxMZtFiU-_KD00IWYtJSro9Xh-M,13330
5
+ semantic_transformers-0.1.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
6
+ semantic_transformers-0.1.0.dist-info/METADATA,sha256=77V1nzt15PBIKe49uTXe2aAcQrIUQMDzS5vtteCj-RY,6307
7
+ semantic_transformers-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ semantic_transformers-0.1.0.dist-info/top_level.txt,sha256=bwBVVBWMiRPhMwHs-l5rnapEIQdkXMfqZMiiEQZP_QI,22
9
+ semantic_transformers-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
@@ -0,0 +1 @@
1
+ semantic_transformers