PyPI - querygraph - Versions diffs - 0.2.0__py3-none-any.whl - Mend

querygraph 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

querygraph/__init__.py +14 -0
querygraph/__main__.py +4 -0
querygraph/agents.py +89 -0
querygraph/base58.py +15 -0
querygraph/cdif.py +205 -0
querygraph/cli.py +123 -0
querygraph/codata.py +38 -0
querygraph/croissant.py +86 -0
querygraph/dataverse.py +155 -0
querygraph/did.py +51 -0
querygraph/lakehouse.py +115 -0
querygraph/lineage.py +106 -0
querygraph/navigator.py +141 -0
querygraph/odrl.py +60 -0
querygraph/odrl_rights.py +50 -0
querygraph/osi.py +155 -0
querygraph/qglake.py +99 -0
querygraph/rbac.py +31 -0
querygraph/typedid.py +211 -0
querygraph/validation.py +41 -0
querygraph-0.2.0.dist-info/METADATA +172 -0
querygraph-0.2.0.dist-info/RECORD +24 -0
querygraph-0.2.0.dist-info/WHEEL +4 -0
querygraph-0.2.0.dist-info/entry_points.txt +2 -0

querygraph/dataverse.py ADDED Viewed

@@ -0,0 +1,155 @@
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Any
+from urllib.request import urlopen
+from pydantic import BaseModel, Field
+from querygraph.croissant import CroissantDataset, Field as CroissantField, FileObject, RecordSet
+class DataverseFile(BaseModel):
+    """One file entry attached to a Dataverse dataset."""
+    # File identifier; either an int or a string.
+    id: int | str
+    # Display label; DataverseDataset.from_native_api fills it from "filename" or "label".
+    label: str
+    # URL the file is exposed under (Croissant "content_url" in to_croissant).
+    download_url: str
+    # Content type; defaults to the generic binary type when none is supplied.
+    content_type: str = "application/octet-stream"
+class DataverseDataset(BaseModel):
+    """Dataset-level metadata parsed from a Dataverse native-API JSON payload.
+
+    Instances can be built from a decoded payload, a JSON file or a URL, and
+    converted to a CroissantDataset with to_croissant().
+    """
+    id: int | str
+    persistent_id: str
+    title: str
+    description: str = ""
+    landing_page: str
+    subjects: list[str] = Field(default_factory=list)
+    keywords: list[str] = Field(default_factory=list)
+    files: list[DataverseFile] = Field(default_factory=list)
+    @classmethod
+    def from_native_api(cls, payload: dict[str, Any]) -> "DataverseDataset":
+        """Build a dataset from a decoded native-API response.
+
+        Args:
+            payload: Either the full response (dataset under "data") or the
+                bare dataset object.
+
+        Returns:
+            The populated dataset. Missing values fall back to the defaults
+            visible in the constructor call below.
+        """
+        data = payload.get("data", payload)
+        # Citation metadata and files are read from "latestVersion" when it is
+        # present; a missing or null entry falls back to the dataset object.
+        version = data.get("latestVersion")
+        if version is None:
+            version = data
+        citation = version.get("metadataBlocks", {}).get("citation", {})
+        values = {
+            _field_name(field): _field_value(field)
+            for field in citation.get("fields", [])
+        }
+        files = []
+        for file_entry in version.get("files", []):
+            # File details may be nested under "dataFile" or sit on the entry itself.
+            data_file = file_entry.get("dataFile", file_entry)
+            file_id = data_file.get("id", file_entry.get("id", "file"))
+            files.append(
+                DataverseFile(
+                    id=file_id,
+                    label=data_file.get("filename", file_entry.get("label", str(file_id))),
+                    # Without an explicit URL, assume the Harvard Dataverse access endpoint.
+                    download_url=data_file.get(
+                        "downloadUrl",
+                        f"https://dataverse.harvard.edu/api/access/datafile/{file_id}",
+                    ),
+                    content_type=data_file.get("contentType", "application/octet-stream"),
+                )
+            )
+        return cls(
+            id=data.get("id", values.get("datasetId", "dataset")),
+            persistent_id=data.get("persistentId", values.get("persistentId", "")),
+            title=values.get("title", data.get("title", "Dataverse dataset")),
+            description=_first_text(values.get("dsDescription")) or data.get("description", ""),
+            landing_page=data.get("persistentUrl", data.get("url", "")),
+            subjects=_as_text_list(values.get("subject")),
+            keywords=_keyword_values(values.get("keyword")),
+            files=files,
+        )
+    @classmethod
+    def from_json_file(cls, path: str | Path) -> "DataverseDataset":
+        """Load a saved native-API payload from *path* and parse it."""
+        # JSON interchange text is UTF-8; do not depend on the locale encoding.
+        text = Path(path).read_text(encoding="utf-8")
+        return cls.from_native_api(json.loads(text))
+    @classmethod
+    def fetch(cls, url: str) -> "DataverseDataset":
+        """Download a native-API payload from *url* and parse it.
+
+        The URL is opened as given, so callers are responsible for passing a
+        location they trust.
+        """
+        with urlopen(url) as response:  # nosec - user-supplied CLI/library URL.
+            return cls.from_native_api(json.loads(response.read().decode("utf-8")))
+    def to_croissant(self) -> CroissantDataset:
+        """Describe this dataset as a CroissantDataset.
+
+        The dataset id derives from the landing page, or from a
+        "urn:dataverse:" URN when no landing page is known. License and
+        creators are fixed values, not read from the Dataverse metadata.
+        """
+        dataset_id = f"{self.landing_page.rstrip('/')}/#dataset" if self.landing_page else f"urn:dataverse:{self.id}"
+        return CroissantDataset(
+            id=dataset_id,
+            name=self.title,
+            description=self.description,
+            license="https://creativecommons.org/licenses/by/4.0/",
+            creators=["Dataverse"],
+            files=[
+                FileObject(
+                    id=f"{dataset_id}/file/{file.id}",
+                    name=file.label,
+                    content_url=file.download_url,
+                    encoding_format=file.content_type,
+                )
+                for file in self.files
+            ],
+            record_sets=[
+                RecordSet(
+                    id=f"{dataset_id}/recordset/files",
+                    name="Dataverse files",
+                    fields=[
+                        CroissantField(
+                            "dataset_persistent_id",
+                            "sc:Text",
+                            "Dataverse persistent dataset identifier.",
+                        ).semantic_type("https://schema.org/identifier"),
+                        CroissantField(
+                            "file_name",
+                            "sc:Text",
+                            "Dataverse file name.",
+                        ).semantic_type("https://schema.org/name"),
+                        CroissantField(
+                            "download_url",
+                            "sc:URL",
+                            "Dataverse file download URL.",
+                        ).semantic_type("https://schema.org/contentUrl"),
+                    ],
+                )
+            ],
+            keywords=[*self.subjects, *self.keywords],
+        )
+def _field_name(field: dict[str, Any]) -> str:
+    """Return the field's "typeName" (else "name", else "") as a string."""
+    fallback = field.get("name", "")
+    name = field.get("typeName", fallback)
+    return str(name)
+def _field_value(field: dict[str, Any]) -> Any:
+    """Return the entry under "value" if that key exists, else the one under "values"."""
+    if "value" in field:
+        return field["value"]
+    return field.get("values")
+def _first_text(value: Any) -> str | None:
+    """Return the description text held in a Dataverse field value.
+
+    Accepts a list (only its first element is used), a single dict, or a
+    scalar. Dict entries are read from "dsDescriptionValue", then "value".
+
+    Returns:
+        The text as a string, or None for an empty list or a falsy scalar.
+    """
+    if isinstance(value, list) and value:
+        value = value[0]
+        if not isinstance(value, dict):
+            return str(value)
+    if isinstance(value, dict):
+        text = value.get("dsDescriptionValue", value.get("value", ""))
+        # The native API wraps each compound child as
+        # {"typeName": ..., "value": ...}; unwrap it so the caller gets the
+        # text rather than the repr of that wrapper dict.
+        if isinstance(text, dict):
+            text = text.get("value", "")
+        return str(text)
+    if value:
+        return str(value)
+    return None
+def _as_text_list(value: Any) -> list[str]:
+    """Coerce *value* to a list of strings; None becomes an empty list."""
+    if isinstance(value, list):
+        return list(map(str, value))
+    return [] if value is None else [str(value)]
+def _keyword_values(value: Any) -> list[str]:
+    """Extract keyword strings from a Dataverse "keyword" field value.
+
+    Non-list input is delegated to _as_text_list. In a list, dict entries
+    are read from "keywordValue" (else "value") and skipped when that is
+    empty; any other entry is stringified as-is.
+    """
+    if not isinstance(value, list):
+        return _as_text_list(value)
+    keywords = []
+    for item in value:
+        if not isinstance(item, dict):
+            keywords.append(str(item))
+            continue
+        keyword = item.get("keywordValue") or item.get("value")
+        # The native API wraps each compound child as
+        # {"typeName": ..., "value": ...}; unwrap it so the keyword text is
+        # kept rather than the repr of that wrapper dict.
+        if isinstance(keyword, dict):
+            keyword = keyword.get("value")
+        if keyword:
+            keywords.append(str(keyword))
+    return keywords

querygraph/did.py ADDED Viewed

@@ -0,0 +1,51 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from hashlib import sha256
+from querygraph.base58 import b58encode
+@dataclass(frozen=True)
+class DidDocument:
+    """Immutable DID document holding an id, controller, key and optional endpoint."""
+    id: str
+    controller: str
+    public_key_multibase: str
+    context: list[str] | None = None
+    service_endpoint: str | None = None
+    @classmethod
+    def new_oyd(cls, seed: bytes | str, controller: str) -> "DidDocument":
+        """Derive a "did:oyd:" document from *seed* (str seeds are encoded first)."""
+        raw_seed = seed if not isinstance(seed, str) else seed.encode()
+        seed_digest = sha256(raw_seed).digest()
+        # Identifier: the two bytes 0x12 0x20 followed by the SHA-256 digest,
+        # base58-encoded and prefixed with "z".
+        identifier = "did:oyd:z" + b58encode(bytes([0x12, 0x20]) + seed_digest)
+        contexts = [
+            "https://www.w3.org/ns/did/v1",
+            "https://w3id.org/security/suites/ed25519-2020/v1",
+        ]
+        return cls(
+            id=identifier,
+            controller=controller,
+            public_key_multibase="z" + b58encode(seed_digest),
+            context=contexts,
+        )
+    def with_service_endpoint(self, endpoint: str) -> "DidDocument":
+        """Return a copy of this document whose service endpoint is *endpoint*."""
+        return DidDocument(
+            self.id,
+            self.controller,
+            self.public_key_multibase,
+            self.context,
+            endpoint,
+        )
+    def to_json(self) -> dict:
+        """Return the document as a dict; "@context" is added only when set."""
+        payload = dict(
+            id=self.id,
+            controller=self.controller,
+            public_key_multibase=self.public_key_multibase,
+            service_endpoint=self.service_endpoint,
+        )
+        if self.context is None:
+            return payload
+        payload["@context"] = self.context
+        return payload

querygraph/lakehouse.py ADDED Viewed

@@ -0,0 +1,115 @@
+from __future__ import annotations
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+@dataclass(frozen=True)
+class TableSpec:
+    """One table named in the load manifest plus where its Parquet data was found."""
+    # Table name exactly as written in the manifest.
+    logical_name: str
+    # Name after the first "." of logical_name (the whole name when it has no dot).
+    bare_name: str
+    # Row count recorded in the manifest; register_lakehouse compares it to the observed count.
+    rows: int
+    # Directory returned by find_latest_parquet_dir for bare_name.
+    location: Path
+def load_table_specs(
+    manifest: str | Path,
+    warehouse: str | Path,
+) -> list[TableSpec]:
+    """Read the load manifest and locate each listed table in the warehouse.
+
+    Args:
+        manifest: Path to a JSON report with a "datasets" list whose items
+            carry a "files" list.
+        warehouse: Directory that holds the per-table Parquet directories.
+
+    Returns:
+        One TableSpec per file entry that has both "table" and "rows".
+
+    Raises:
+        FileNotFoundError: If a listed table has no matching directory.
+    """
+    # JSON interchange text is UTF-8; do not depend on the locale encoding.
+    report = json.loads(Path(manifest).read_text(encoding="utf-8"))
+    warehouse_path = Path(warehouse).resolve()
+    specs: list[TableSpec] = []
+    for dataset in report.get("datasets", []):
+        for file in dataset.get("files", []):
+            table = file.get("table")
+            rows = file.get("rows")
+            # Entries without a table name or a row count are skipped.
+            if not table or rows is None:
+                continue
+            # Drop everything up to the first "." to get the bare table name.
+            bare = table.split(".", 1)[-1]
+            specs.append(
+                TableSpec(
+                    logical_name=table,
+                    bare_name=bare,
+                    rows=int(rows),
+                    location=find_latest_parquet_dir(warehouse_path, bare),
+                )
+            )
+    return specs
+def find_latest_parquet_dir(warehouse: Path, table: str) -> Path:
+    """Return the most recently modified "<table>-*" directory in *warehouse*.
+
+    Raises:
+        FileNotFoundError: If no directory name starts with "<table>-".
+    """
+    prefix = f"{table}-"
+    candidates = (
+        entry
+        for entry in warehouse.iterdir()
+        if entry.is_dir() and entry.name.startswith(prefix)
+    )
+    # Newest by modification time; the first one seen wins a tie.
+    newest = max(candidates, key=lambda entry: entry.stat().st_mtime, default=None)
+    if newest is None:
+        raise FileNotFoundError(f"no Parquet directory found for {table} in {warehouse}")
+    return newest
+def spark_session(remote: str = "sc://127.0.0.1:50051"):
+    """Return a Spark session connected to *remote*, creating one if needed.
+
+    Raises:
+        RuntimeError: If PySpark cannot be imported.
+    """
+    try:
+        from pyspark.sql import SparkSession
+    except ImportError as exc:  # pragma: no cover - depends on optional extra.
+        raise RuntimeError("Install querygraph[lakehouse] to use PySpark helpers.") from exc
+    builder = SparkSession.builder.remote(remote)
+    return builder.getOrCreate()
+def register_lakehouse(
+    *,
+    manifest: str | Path = ".querygraph/lakehouse/manifest/load-report.json",
+    warehouse: str | Path = "spark-warehouse",
+    remote: str = "sc://127.0.0.1:50051",
+    create_global_temp: bool = True,
+) -> list[dict[str, Any]]:
+    """Register every manifest table as a Spark temp view and report row counts.
+
+    Returns:
+        One dict per table with the observed and expected row counts and a
+        "status" of "ok" or "mismatch".
+    """
+    session = spark_session(remote)
+    report: list[dict[str, Any]] = []
+    for spec in load_table_specs(manifest, warehouse):
+        view_name = spec.bare_name
+        location = str(spec.location)
+        frame = session.read.parquet(location)
+        frame.createOrReplaceTempView(view_name)
+        if create_global_temp:
+            frame.createOrReplaceGlobalTempView(view_name)
+        row_count = frame.count()
+        status = "mismatch" if row_count != spec.rows else "ok"
+        report.append(
+            {
+                "table": view_name,
+                "logicalName": spec.logical_name,
+                "rows": row_count,
+                "expectedRows": spec.rows,
+                "location": location,
+                "status": status,
+            }
+        )
+    return report
+def register_audit(
+    *,
+    warehouse: str | Path = "spark-warehouse",
+    remote: str = "sc://127.0.0.1:50051",
+    create_global_temp: bool = True,
+    tables: tuple[str, ...] = ("openlineage_events", "openlineage_attestations"),
+) -> list[dict[str, Any]]:
+    """Register the named audit tables as Spark temp views.
+
+    Returns:
+        One dict per table with its name, row count and Parquet location.
+
+    Raises:
+        FileNotFoundError: If a table has no matching warehouse directory.
+    """
+    session = spark_session(remote)
+    root = Path(warehouse).resolve()
+    report: list[dict[str, Any]] = []
+    for name in tables:
+        source = find_latest_parquet_dir(root, name)
+        frame = session.read.parquet(str(source))
+        frame.createOrReplaceTempView(name)
+        if create_global_temp:
+            frame.createOrReplaceGlobalTempView(name)
+        entry = {"table": name, "rows": frame.count(), "location": str(source)}
+        report.append(entry)
+    return report
+def example_queries(scope: str = "global_temp") -> list[str]:
+    """Return four sample SQL statements against tables qualified with *scope*."""
+    county = f"{scope}.government_finance__countydata"
+    codata = f"{scope}.codata_constants_2022__codata_constants_2022"
+    events = f"{scope}.openlineage_events"
+    queries = []
+    queries.append(f"SELECT COUNT(*) AS rows FROM {county}")
+    queries.append(f"SELECT COUNT(*) AS rows FROM {codata}")
+    queries.append(f"SELECT quantity, value, unit FROM {codata} LIMIT 5")
+    queries.append(f"SELECT event_hash, event_type, job_name FROM {events} LIMIT 10")
+    return queries

querygraph/lineage.py ADDED Viewed

@@ -0,0 +1,106 @@
+from __future__ import annotations
+import json
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+from pydantic import BaseModel, Field
+from querygraph.typedid import TypeDidEnvelope, sha256_hex
+class OpenLineageRunEvent(BaseModel):
+    """Run event record with OpenLineage-style field names."""
+    eventType: str = "COMPLETE"
+    eventTime: datetime = Field(default_factory=lambda: datetime.now(UTC))
+    run: dict[str, Any]
+    job: dict[str, Any]
+    inputs: list[dict[str, Any]] = Field(default_factory=list)
+    outputs: list[dict[str, Any]] = Field(default_factory=list)
+    producer: str = "https://querygraph.ai/qg-python"
+    schemaURL: str = "https://openlineage.io/spec/2-0-2/OpenLineage.json"
+    @classmethod
+    def for_agent_run(
+        cls,
+        *,
+        request: TypeDidEnvelope,
+        job_name: str,
+        inputs: list[str],
+        outputs: list[str],
+        namespace: str = "querygraph.python",
+    ) -> "OpenLineageRunEvent":
+        """Build the event for one agent run described by *request*.
+
+        Input names are placed in the "sail" namespace and output names in
+        the "querygraph" namespace.
+        """
+        # The run id reuses the last 12 characters of the request signature.
+        # NOTE(review): this is not a UUID - confirm consumers accept it.
+        run_id = f"querygraph-python-{request.signature[-12:]}"
+        typedid_facet = {
+            "_producer": "https://querygraph.ai/qg-python",
+            "_schemaURL": "https://querygraph.ai/schemas/openlineage/querygraph-typedid-facet/0.1.0.json",
+            "protocol": request.protocol,
+            "conversationId": request.conversation_id,
+            "payloadSha256": request.payload_sha256,
+            "signature": request.signature,
+        }
+        run = {"runId": run_id, "facets": {"queryGraph_typeDid": typedid_facet}}
+        input_refs = [{"namespace": "sail", "name": name} for name in inputs]
+        output_refs = [{"namespace": "querygraph", "name": name} for name in outputs]
+        return cls(
+            run=run,
+            job={"namespace": namespace, "name": job_name},
+            inputs=input_refs,
+            outputs=output_refs,
+        )
+    def event_hash(self) -> str:
+        """Return sha256_hex of this event's JSON dump (None values excluded)."""
+        serialized = self.model_dump_json(exclude_none=True)
+        return sha256_hex(serialized)
+class LineageAttestation(BaseModel):
+    """Hash-based attestation record for one lineage event hash."""
+    issuer: str
+    subject: str
+    event_hash: str
+    merkle_root: str
+    signature_type: str = "QueryGraphDemoSha256Signature"
+    verification_method: str
+    signature: str
+    signed_payload_sha256: str
+    created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
+    @classmethod
+    def from_event(
+        cls,
+        *,
+        issuer: str,
+        subject: str,
+        event_hash: str,
+    ) -> "LineageAttestation":
+        """Create an attestation for *event_hash*, timestamped with the current UTC time.
+
+        The "signature" is the sha256_hex digest of the newline-joined
+        payload with a "sha256:" prefix; no key material is involved.
+        """
+        created_at = datetime.now(UTC)
+        merkle_root = sha256_hex(f"querygraph-lineage\n{event_hash}")
+        labelled = (
+            ("issuer", issuer),
+            ("subject", subject),
+            ("event_hash", event_hash),
+            ("merkle_root", merkle_root),
+            ("created_at", created_at.isoformat()),
+        )
+        lines = ["querygraph-lineage-attestation-v1"]
+        lines.extend(f"{label}:{text}" for label, text in labelled)
+        payload_digest = sha256_hex("\n".join(lines))
+        return cls(
+            issuer=issuer,
+            subject=subject,
+            event_hash=event_hash,
+            merkle_root=merkle_root,
+            verification_method=f"{issuer}#querygraph-demo-key",
+            signature=f"sha256:{payload_digest}",
+            signed_payload_sha256=payload_digest,
+            created_at=created_at,
+        )
+def append_jsonl(path: str | Path, value: BaseModel | dict[str, Any]) -> Path:
+    """Append *value* as one key-sorted JSON line to *path* and return the path.
+
+    Missing parent directories are created. Models are dumped in JSON mode
+    first; dicts are written as given.
+    """
+    destination = Path(path)
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    if isinstance(value, BaseModel):
+        record = value.model_dump(mode="json")
+    else:
+        record = value
+    with destination.open("a", encoding="utf-8") as sink:
+        sink.write(json.dumps(record, sort_keys=True) + "\n")
+    return destination

querygraph/navigator.py ADDED Viewed

@@ -0,0 +1,141 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from datetime import UTC, datetime
+from querygraph.cdif import CdifResource
+from querygraph.croissant import CroissantDataset, Field, FileObject, RecordSet
+from querygraph.did import DidDocument
+from querygraph.odrl import Action, Policy, Rule
+@dataclass(frozen=True)
+class NavigatorInput:
+    """Caller-supplied description of the dataset that AiNavigator.build processes."""
+    # Used as the Croissant dataset name and as part of the DID seed.
+    dataset_name: str
+    # Used as the Croissant dataset description.
+    description: str
+    # Base of the dataset id; also set as the DID service endpoint.
+    landing_page: str
+    # Content URL of the single "source-data" file object.
+    data_url: str
+    # Sole entry in the Croissant creators list; also part of the DID seed.
+    creator: str
+    # DID controller; also part of the DID seed.
+    agent_name: str
+@dataclass(frozen=True)
+class NavigatorOutput:
+    """Everything AiNavigator.build produces for one dataset."""
+    # UTC timestamp taken while building the bundle.
+    generated_at: datetime
+    # Result of CroissantDataset.to_json_ld().
+    croissant: dict
+    # Result of CdifResource.to_json_ld(), with the ODRL policy attached.
+    cdif: dict
+    # The DID document object itself (not its JSON form).
+    did: DidDocument
+    # Result of Policy.to_json_ld().
+    odrl: dict
+    # Combined dict carrying the context, identity and all four layers.
+    bundle: dict
+class AiNavigator:
+    """Builds the Croissant, CDIF, DID and ODRL layers for a single dataset."""
+    def build(self, input: NavigatorInput) -> NavigatorOutput:
+        """Produce every layer plus the combined bundle for *input*."""
+        landing_page = input.landing_page
+        dataset_id = f"{landing_page.rstrip('/')}/#dataset"
+        seed = f"{input.agent_name}:{input.creator}:{input.dataset_name}"
+        did = DidDocument.new_oyd(seed, input.agent_name).with_service_endpoint(landing_page)
+        dataset = self._croissant_dataset(input, dataset_id)
+        policy = self._default_policy(dataset_id, did.id)
+        odrl_json = policy.to_json_ld()
+        cdif_resource = CdifResource.from_croissant(dataset, landing_page, input.data_url)
+        cdif = cdif_resource.with_odrl_policy(policy.id, odrl_json)
+        croissant_json = dataset.to_json_ld()
+        cdif_json = cdif.to_json_ld()
+        generated_at = datetime.now(UTC)
+        did_json = did.to_json()
+        namespaces = {
+            "schema": "https://schema.org/",
+            "cr": "http://mlcommons.org/croissant/",
+            "cdif": "https://cdif.codata.org/",
+            "dcat": "http://www.w3.org/ns/dcat#",
+            "dct": "http://purl.org/dc/terms/",
+            "odrl": "http://www.w3.org/ns/odrl/2/",
+            "querygraph": "https://querygraph.ai/ns#",
+        }
+        layers = {
+            "semanticCroissant": croissant_json,
+            "cdif": cdif_json,
+            "did": did_json,
+            "odrl": odrl_json,
+        }
+        bundle = {
+            "@context": namespaces,
+            "@type": "querygraph:AiNavigatorSemanticBundle",
+            # Render the UTC offset as the "Z" suffix.
+            "generatedAt": generated_at.isoformat().replace("+00:00", "Z"),
+            "identity": did_json,
+            "layers": layers,
+        }
+        return NavigatorOutput(
+            generated_at=generated_at,
+            croissant=croissant_json,
+            cdif=cdif_json,
+            did=did,
+            odrl=odrl_json,
+            bundle=bundle,
+        )
+    @staticmethod
+    def _croissant_dataset(spec: NavigatorInput, dataset_id: str) -> CroissantDataset:
+        """Describe the dataset with one source file and one default record set."""
+        columns = (
+            ("subject", "sc:Text", "Primary entity or observation subject", "https://schema.org/about"),
+            ("value", "sc:Text", "Observed value, label, or narrative", "https://schema.org/value"),
+            ("source", "sc:URL", "Evidence or provenance URL", "https://schema.org/citation"),
+        )
+        source_file = FileObject(
+            id=f"{dataset_id}/file/source",
+            name="source-data",
+            content_url=spec.data_url,
+            encoding_format="application/octet-stream",
+        )
+        record_set = RecordSet(
+            id=f"{dataset_id}/recordset/default",
+            name="default observations",
+            fields=[
+                Field(name, data_type, text).semantic_type(iri)
+                for name, data_type, text, iri in columns
+            ],
+        )
+        return CroissantDataset(
+            id=dataset_id,
+            name=spec.dataset_name,
+            description=spec.description,
+            license="https://creativecommons.org/licenses/by/4.0/",
+            creators=[spec.creator],
+            files=[source_file],
+            record_sets=[record_set],
+            keywords=["AI Navigator", "Croissant", "CDIF", "DID", "ODRL"],
+        )
+    @staticmethod
+    def _default_policy(dataset_id: str, did_id: str) -> Policy:
+        """Default policy: public read, indexing by the DID, no public derive."""
+        permissions = [
+            Rule(
+                action=Action.READ,
+                assignee="public",
+                constraint="attribution required",
+            ),
+            Rule(
+                action=Action.INDEX,
+                assignee=did_id,
+                constraint="local semantic indexing for AI Navigator",
+            ),
+        ]
+        prohibitions = [
+            Rule(
+                action=Action.DERIVE,
+                assignee="public",
+                constraint="no model training without separate agreement",
+            )
+        ]
+        return Policy(
+            id=f"{dataset_id}/policy/default",
+            target=dataset_id,
+            assigner=did_id,
+            permissions=permissions,
+            prohibitions=prohibitions,
+        )

querygraph/odrl.py ADDED Viewed

@@ -0,0 +1,60 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from enum import Enum
+class Action(Enum):
+    """Actions a policy rule can name; each value is a prefixed IRI string."""
+    # Actions in the "odrl:" prefix.
+    USE = "odrl:use"
+    READ = "odrl:read"
+    DERIVE = "odrl:derive"
+    # Actions in the project's own "querygraph:" prefix.
+    TRANSLATE = "querygraph:translate"
+    INDEX = "querygraph:index"
+    def iri(self) -> str:
+        """Return the member's value, i.e. its prefixed IRI string."""
+        return self.value
+@dataclass(frozen=True)
+class Rule:
+    """A single permission or prohibition entry of a Policy."""
+    # The action this rule covers.
+    action: Action
+    # Party the rule applies to; Policy.allows compares it by plain equality.
+    assignee: str
+    # Optional free-text constraint, emitted as "odrl:constraint" (None when absent).
+    constraint: str | None = None
+@dataclass(frozen=True)
+class Policy:
+    """A policy over one target: lists of permission and prohibition rules."""
+    id: str
+    target: str
+    assigner: str
+    permissions: list[Rule]
+    prohibitions: list[Rule]
+    def allows(self, assignee: str, action: Action) -> bool:
+        """Return True when a permission matches and no prohibition does.
+
+        A rule matches only when both its assignee and its action equal the
+        arguments.
+        """
+        def matches(rule) -> bool:
+            return rule.assignee == assignee and rule.action == action
+        # A matching prohibition always wins over any permission.
+        for rule in self.prohibitions:
+            if matches(rule):
+                return False
+        for rule in self.permissions:
+            if matches(rule):
+                return True
+        return False
+    def to_json_ld(self) -> dict:
+        """Return the policy as a JSON-LD style dict with "odrl:" keys."""
+        permission_nodes = [_rule_json(rule) for rule in self.permissions]
+        prohibition_nodes = [_rule_json(rule) for rule in self.prohibitions]
+        return {
+            "@type": "odrl:Policy",
+            "@id": self.id,
+            "odrl:target": self.target,
+            "odrl:assigner": self.assigner,
+            "odrl:permission": permission_nodes,
+            "odrl:prohibition": prohibition_nodes,
+        }
+def _rule_json(rule: Rule) -> dict:
+    """Return *rule* as a dict keyed by "odrl:action", "odrl:assignee" and "odrl:constraint"."""
+    node = {}
+    node["odrl:action"] = rule.action.iri()
+    node["odrl:assignee"] = rule.assignee
+    node["odrl:constraint"] = rule.constraint
+    return node

querygraph/odrl_rights.py ADDED Viewed

@@ -0,0 +1,50 @@
+from __future__ import annotations
+from pydantic import BaseModel
+from querygraph.odrl import Action, Policy
+from querygraph.rbac import RbacPolicy
+from querygraph.typedid import AccessReceipt
+class OdrlDecision(BaseModel):
+    # Outcome of OdrlRightsLayer.decide for one (principal, resource, action) request.
+    # Who asked.
+    principal: str
+    # What was asked for.
+    resource: str
+    # IRI string of the requested action.
+    action: str
+    # Verdict of the RBAC policy on its own.
+    rbac_allowed: bool
+    # Verdict of the ODRL policy on its own.
+    odrl_allowed: bool
+    # Final verdict: True only when both verdicts above are True.
+    allowed: bool
+    # Access receipt recording the final verdict, its reason and the policy id.
+    receipt: AccessReceipt
+class OdrlRightsLayer(BaseModel):
+    """ODRL policy evaluation with RBAC and QueryGraph access receipts."""
+    rbac: RbacPolicy
+    odrl: Policy
+    def decide(self, principal: str, resource: str, action: Action) -> OdrlDecision:
+        """Evaluate both policies for the request and return the decision.
+
+        Access is granted only when the RBAC policy and the ODRL policy both
+        allow it; the receipt records the combined verdict.
+        """
+        action_iri = action.iri()
+        rbac_ok = self.rbac.allows(principal, resource, action.value)
+        odrl_ok = self.odrl.allows(principal, action)
+        granted = rbac_ok and odrl_ok
+        if granted:
+            reason = "RBAC and ODRL permitted action"
+        else:
+            reason = "RBAC or ODRL denied action"
+        receipt = AccessReceipt(
+            principal=principal,
+            resource=resource,
+            action=action_iri,
+            allowed=granted,
+            reason=reason,
+            policy_id=self.odrl.id,
+        )
+        return OdrlDecision(
+            principal=principal,
+            resource=resource,
+            action=action_iri,
+            rbac_allowed=rbac_ok,
+            odrl_allowed=odrl_ok,
+            allowed=granted,
+            receipt=receipt,
+        )