querygraph 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
querygraph/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ from querygraph.navigator import AiNavigator, NavigatorInput, NavigatorOutput
2
+ from querygraph.osi import OsiDocument
3
+ from querygraph.typedid import TypeDidAgent, TypeDidEnvelope
4
+ from querygraph.odrl_rights import OdrlRightsLayer
5
+
6
# Public API of the package, kept in alphabetical order so additions are easy
# to spot in review.
__all__ = [
    "AiNavigator",
    "NavigatorInput",
    "NavigatorOutput",
    "OdrlRightsLayer",
    "OsiDocument",
    "TypeDidAgent",
    "TypeDidEnvelope",
]
querygraph/__main__.py ADDED
@@ -0,0 +1,4 @@
1
+ from querygraph.cli import main
2
+
3
# Entry point for ``python -m querygraph``: run the CLI and use the value
# returned by ``main`` as the process exit status.
if __name__ == "__main__":
    exit_status = main()
    raise SystemExit(exit_status)
querygraph/agents.py ADDED
@@ -0,0 +1,89 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, Callable
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+ from querygraph.typedid import AgentResponse, GovernedPrompt, TypeDidAgent
8
+
9
+
10
# Pydantic model bundling a supervisor agent, its specialist agents, the
# governed prompt, and the responses collected for that prompt.
# (Documented with comments rather than a class docstring so the model's
# generated JSON schema is unaffected.)
class TypeDidAgentRun(BaseModel):
    supervisor: TypeDidAgent
    specialists: list[TypeDidAgent]
    prompt: GovernedPrompt
    responses: list[AgentResponse] = Field(default_factory=list)

    def aggregate(self) -> dict[str, Any]:
        """Summarise the collected responses as a plain dictionary.

        Returns:
            A dict with the supervisor name, the prompt question, the
            summaries of responses whose status is ``"allowed"``, the
            summaries of responses whose status is ``"denied"``, and the
            envelope payload hash of every response (in response order,
            regardless of status).
        """
        allowed_summaries: list[Any] = []
        denial_summaries: list[Any] = []
        evidence_hashes: list[Any] = []

        for reply in self.responses:
            # Every response contributes a hash; only the two known statuses
            # contribute a summary.
            evidence_hashes.append(reply.envelope.payload_sha256)
            if reply.status == "allowed":
                allowed_summaries.append(reply.summary)
            elif reply.status == "denied":
                denial_summaries.append(reply.summary)

        return {
            "supervisor": self.supervisor.name,
            "question": self.prompt.question,
            "allowedSummaries": allowed_summaries,
            "denials": denial_summaries,
            "evidenceHashes": evidence_hashes,
        }
28
+
29
+
30
def deterministic_specialist(
    agent: TypeDidAgent,
    *,
    summary: str,
    status: str = "allowed",
    evidence: list[str] | None = None,
    redactions: list[str] | None = None,
) -> Callable[[dict[str, Any]], AgentResponse]:
    """Build a handler that always answers with a fixed summary and status.

    Args:
        agent: The specialist agent that receives the request and answers it.
        summary: Summary text placed in every answer.
        status: ``"allowed"`` yields an allowed answer; any other value is
            reported as ``"denied"``.
        evidence: Evidence list for the answer. When ``None`` or empty, the
            payload's ``"resource"`` (default ``"qg_lakehouse"``) is used.
        redactions: Redaction list for the answer; ``None`` or empty becomes
            an empty list.

    Returns:
        A callable taking a payload dict and returning the agent's response.
    """
    # Anything that is not exactly "allowed" collapses to "denied".
    answer_status = "allowed" if status == "allowed" else "denied"

    def invoke(payload: dict[str, Any]) -> AgentResponse:
        resource = payload.get("resource", "qg_lakehouse")
        action = payload.get("action", "summarize")

        # A new supervisor agent is created for every invocation.
        supervisor = TypeDidAgent.new("SupervisorAgent")
        request = supervisor.request(
            agent,
            action=action,
            resource=resource,
            payload=payload,
        )

        answer_evidence = evidence if evidence else [resource]
        answer_redactions = redactions if redactions else []
        return agent.answer(
            request,
            status=answer_status,
            summary=summary,
            evidence=answer_evidence,
            redactions=answer_redactions,
        )

    return invoke
55
+
56
+
57
class TypeDidLangChainToolAdapter:
    """Small adapter that exposes a TypeDID agent as a LangChain StructuredTool."""

    def __init__(
        self,
        agent: TypeDidAgent,
        handler: Callable[[dict[str, Any]], AgentResponse],
    ) -> None:
        """Store the agent (used for the tool name) and its payload handler."""
        self.handler = handler
        self.agent = agent

    def as_tool(self):
        """Wrap the handler in a LangChain ``StructuredTool``.

        The tool takes a ``question`` and an optional ``resource``, forwards
        them to the handler with the action ``"summarize"``, and returns the
        handler's response dumped in JSON mode.

        Raises:
            RuntimeError: If ``langchain_core`` cannot be imported.
        """
        try:
            from langchain_core.tools import StructuredTool
        except ImportError as exc:  # pragma: no cover - depends on optional extra.
            message = "Install querygraph[agents] to use LangChain tool adapters."
            raise RuntimeError(message) from exc

        def run(question: str, resource: str = "qg_lakehouse") -> dict[str, Any]:
            payload = {
                "question": question,
                "resource": resource,
                "action": "summarize",
            }
            return self.handler(payload).model_dump(mode="json")

        tool_name = self.agent.name
        tool_description = (
            f"Governed TypeDID tool for {self.agent.name}; returns a signed "
            "summary or denial."
        )
        return StructuredTool.from_function(
            func=run,
            name=tool_name,
            description=tool_description,
        )
querygraph/base58.py ADDED
@@ -0,0 +1,15 @@
1
# Base58 alphabet: digits and letters without the look-alikes 0, O, I and l.
ALPHABET = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"


def b58encode(data: bytes) -> str:
    """Return the Base58 encoding of *data*.

    The bytes are read as one big-endian integer and written in base 58 using
    ``ALPHABET``. Each leading zero byte is emitted as a leading ``"1"``
    (the alphabet's zero digit), since leading zeros vanish in the integer
    conversion. Empty input encodes to the empty string.
    """
    if not data:
        return ""

    value = int.from_bytes(data, "big")

    # Collect digits least-significant first and join once at the end,
    # instead of prepending to a string on every iteration.
    digits: list[str] = []
    while value:
        value, remainder = divmod(value, 58)
        digits.append(ALPHABET[remainder])

    leading_zeroes = len(data) - len(data.lstrip(b"\0"))
    return "1" * leading_zeroes + "".join(reversed(digits))
querygraph/cdif.py ADDED
@@ -0,0 +1,205 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+
6
+ from querygraph.croissant import CroissantDataset
7
+
8
+
9
class CdifProfile(Enum):
    """CDIF profiles a resource can declare; each member's value is its IRI."""

    DISCOVERY = "https://cdif.codata.org/profile/discovery"
    MANIFEST = "https://cdif.codata.org/profile/manifest"
    DATA_DESCRIPTION = "https://cdif.codata.org/profile/data-description"
    DATA_ACCESS = "https://cdif.codata.org/profile/data-access"
    ACCESS_RIGHTS = "https://cdif.codata.org/profile/access-rights"
    CONTROLLED_VOCABULARIES = "https://cdif.codata.org/profile/controlled-vocabularies"
    DATA_INTEGRATION = "https://cdif.codata.org/profile/data-integration"
    UNIVERSALS = "https://cdif.codata.org/profile/universals"
    PROVENANCE = "https://cdif.codata.org/profile/provenance"

    def iri(self) -> str:
        """Return the profile IRI (the member's value)."""
        profile_iri = self.value
        return profile_iri
22
+
23
+
24
@dataclass(frozen=True)
class CdifDistribution:
    """Immutable description of one distribution of a CDIF resource."""

    id: str  # emitted as the distribution's "@id"
    name: str  # emitted as "dct:title"
    content_url: str  # emitted as "dcat:downloadURL"
    encoding_format: str  # emitted as "dcat:mediaType"
30
+
31
+
32
@dataclass(frozen=True)
class CdifDataElement:
    """Immutable description of one data element (field) of a CDIF resource."""

    id: str  # emitted as the element's "@id"
    name: str  # emitted as "dct:title"
    data_type: str  # emitted as "cdif:dataType"
    description: str  # emitted as "dct:description"
    semantic_type: str | None  # emitted as "cdif:semanticType"; may be None
    record_set: str  # emitted as "cdif:recordSet"
40
+
41
+
42
@dataclass(frozen=True)
class CdifAccessRights:
    """Immutable access-rights statement attached to a CDIF resource."""

    license: str  # emitted as "dct:license"
    policy_id: str | None = None  # emitted as the rights statement's "@id"
    rights_statement: str | None = None  # emitted as "dct:description"
    odrl_policy: dict | None = None  # emitted as "odrl:policy"
48
+
49
+
50
@dataclass(frozen=True)
class CdifResource:
    """Immutable CDIF description of a dataset, serialisable to JSON-LD."""

    dataset_id: str
    title: str
    description: str
    profiles: list[CdifProfile]
    landing_page: str
    access_service: str
    distributions: list[CdifDistribution] = field(default_factory=list)
    data_elements: list[CdifDataElement] = field(default_factory=list)
    access_rights: CdifAccessRights | None = None
    temporal_coverage: str | None = None
    spatial_coverage: str | None = None
    units: list[str] = field(default_factory=list)
    vocabularies: list[str] = field(default_factory=list)
    keywords: list[str] = field(default_factory=list)

    @classmethod
    def from_croissant(
        cls, dataset: CroissantDataset, landing_page: str, access_service: str
    ) -> "CdifResource":
        """Derive a CDIF resource from a Croissant dataset.

        Each Croissant file becomes a distribution and each record-set field
        becomes a data element whose id is ``"<record set id>/field/<name>"``.
        The vocabularies are the distinct, non-``None`` semantic types of the
        data elements in first-seen order.

        Args:
            dataset: Source Croissant dataset.
            landing_page: Landing page URL recorded on the resource.
            access_service: Access service endpoint recorded on the resource.
        """
        distributions = [
            CdifDistribution(
                id=file.id,
                name=file.name,
                content_url=file.content_url,
                encoding_format=file.encoding_format,
            )
            for file in dataset.files
        ]
        # The loop variable is named ``column`` so it does not shadow the
        # ``dataclasses.field`` helper used in the class body.
        data_elements = [
            CdifDataElement(
                id=f"{record_set.id}/field/{column.name}",
                name=column.name,
                data_type=column.data_type,
                description=column.description,
                semantic_type=column.semantic_type_value,
                record_set=record_set.id,
            )
            for record_set in dataset.record_sets
            for column in record_set.fields
        ]
        # dict.fromkeys drops repeated IRIs while keeping first-seen order, so
        # a semantic type shared by several fields is listed only once.
        vocabularies = list(
            dict.fromkeys(
                element.semantic_type
                for element in data_elements
                if element.semantic_type is not None
            )
        )
        return cls(
            dataset_id=dataset.id,
            title=dataset.name,
            description=dataset.description,
            profiles=[
                CdifProfile.DISCOVERY,
                CdifProfile.MANIFEST,
                CdifProfile.DATA_DESCRIPTION,
                CdifProfile.DATA_ACCESS,
                CdifProfile.ACCESS_RIGHTS,
                CdifProfile.CONTROLLED_VOCABULARIES,
                CdifProfile.DATA_INTEGRATION,
                CdifProfile.UNIVERSALS,
            ],
            landing_page=landing_page,
            access_service=access_service,
            distributions=distributions,
            data_elements=data_elements,
            access_rights=CdifAccessRights(
                license=dataset.license,
                rights_statement=(
                    "Access and usage must satisfy the attached ODRL/TypeSec "
                    "policy before agent use."
                ),
            ),
            vocabularies=vocabularies,
            # Copy so later mutation of the dataset's keyword list does not
            # leak into this resource (and vice versa).
            keywords=list(dataset.keywords),
        )

    def with_odrl_policy(self, policy_id: str, policy: dict) -> "CdifResource":
        """Return a copy of this resource with an ODRL policy attached.

        The existing license and rights statement are kept; when the resource
        has no access rights yet, the license becomes the empty string.

        Args:
            policy_id: Identifier stored as the access rights' ``policy_id``.
            policy: Policy document stored as the access rights' ``odrl_policy``.
        """
        rights = self.access_rights or CdifAccessRights(license="")
        return CdifResource(
            dataset_id=self.dataset_id,
            title=self.title,
            description=self.description,
            profiles=self.profiles,
            landing_page=self.landing_page,
            access_service=self.access_service,
            distributions=self.distributions,
            data_elements=self.data_elements,
            access_rights=CdifAccessRights(
                license=rights.license,
                policy_id=policy_id,
                rights_statement=rights.rights_statement,
                odrl_policy=policy,
            ),
            temporal_coverage=self.temporal_coverage,
            spatial_coverage=self.spatial_coverage,
            units=self.units,
            vocabularies=self.vocabularies,
            keywords=self.keywords,
        )

    def to_json_ld(self) -> dict:
        """Serialise the resource as a ``dcat:Dataset`` JSON-LD dictionary.

        Optional values that are unset are emitted as ``None``; the access
        rights node is ``None`` when the resource has no access rights.
        """
        distribution_nodes = [
            {
                "@type": "dcat:Distribution",
                "@id": distribution.id,
                "dct:title": distribution.name,
                "dcat:downloadURL": distribution.content_url,
                "dcat:mediaType": distribution.encoding_format,
            }
            for distribution in self.distributions
        ]
        element_nodes = [
            {
                "@type": "cdif:DataElement",
                "@id": element.id,
                "dct:title": element.name,
                "dct:description": element.description,
                "cdif:dataType": element.data_type,
                "cdif:semanticType": element.semantic_type,
                "cdif:recordSet": element.record_set,
            }
            for element in self.data_elements
        ]

        rights_node = None
        if self.access_rights is not None:
            rights_node = {
                "@type": "dct:RightsStatement",
                "@id": self.access_rights.policy_id,
                "dct:license": self.access_rights.license,
                "dct:description": self.access_rights.rights_statement,
                "odrl:policy": self.access_rights.odrl_policy,
            }

        return {
            "@context": {
                "cdif": "https://cdif.codata.org/",
                "dcat": "http://www.w3.org/ns/dcat#",
                "dct": "http://purl.org/dc/terms/",
                "odrl": "http://www.w3.org/ns/odrl/2/",
            },
            "@type": "dcat:Dataset",
            "@id": self.dataset_id,
            "dct:title": self.title,
            "dct:description": self.description,
            "cdif:profile": [profile.iri() for profile in self.profiles],
            "dcat:landingPage": self.landing_page,
            "dcat:accessService": {
                "@type": "dcat:DataService",
                "endpointURL": self.access_service,
            },
            "dcat:distribution": distribution_nodes,
            "cdif:dataElement": element_nodes,
            "dct:accessRights": rights_node,
            "dct:temporal": self.temporal_coverage,
            "dct:spatial": self.spatial_coverage,
            "cdif:unit": self.units,
            "cdif:controlledVocabulary": self.vocabularies,
            "dcat:keyword": self.keywords,
        }
querygraph/cli.py ADDED
@@ -0,0 +1,123 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ from dataclasses import asdict, is_dataclass
6
+ from typing import Any
7
+
8
+ from querygraph.codata import CodataOdrlClient
9
+ from querygraph.lakehouse import example_queries, register_audit, register_lakehouse
10
+ from querygraph.navigator import AiNavigator, NavigatorInput
11
+ from querygraph.qglake import build_python_qglake_story
12
+
13
+
14
def _build_parser() -> argparse.ArgumentParser:
    """Create the ``querygraph`` argument parser with all of its subcommands."""
    parser = argparse.ArgumentParser(
        prog="querygraph", description="AI Navigator semantic layer CLI"
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    navigator = subparsers.add_parser(
        "navigator",
        help="Build a four-layer semantic bundle: Croissant, CDIF, DID, and ODRL.",
    )
    navigator.add_argument("--dataset-name", required=True)
    navigator.add_argument("--description", required=True)
    navigator.add_argument("--landing-page", required=True)
    navigator.add_argument("--data-url", required=True)
    navigator.add_argument("--creator", default="QueryGraph")
    navigator.add_argument("--agent-name", default="AI Navigator")

    anchor_url = subparsers.add_parser(
        "anchor-url", help="Reproduce the CODATA ODRL demo's URL-to-DID anchoring call."
    )
    anchor_url.add_argument("--url", default="https://querygraph.ai/resources/")
    anchor_url.add_argument("--endpoint", default="https://odrl.dev.codata.org")

    qglake_story = subparsers.add_parser(
        "qglake-story",
        help="Run the Python TypeDID/Pydantic QG Lakehouse agent story.",
    )
    qglake_story.add_argument("--pretty", action="store_true")

    lakehouse_register = subparsers.add_parser(
        "lakehouse-register",
        help="Register QueryGraph Sail lakehouse Parquet tables in a Spark Connect session.",
    )
    lakehouse_register.add_argument("--remote", default="sc://127.0.0.1:50051")
    lakehouse_register.add_argument(
        "--manifest", default=".querygraph/lakehouse/manifest/load-report.json"
    )
    lakehouse_register.add_argument("--warehouse", default="spark-warehouse")
    lakehouse_register.add_argument("--session-temp", action="store_true")

    audit_register = subparsers.add_parser(
        "audit-register",
        help="Register QueryGraph OpenLineage audit Parquet tables in a Spark Connect session.",
    )
    audit_register.add_argument("--remote", default="sc://127.0.0.1:50051")
    audit_register.add_argument("--warehouse", default="spark-warehouse")
    audit_register.add_argument("--session-temp", action="store_true")

    pyspark_examples = subparsers.add_parser(
        "pyspark-examples",
        help="Print example PySpark SQL queries for the registered Sail warehouse.",
    )
    pyspark_examples.add_argument("--scope", default="global_temp")

    return parser


def main(argv: list[str] | None = None) -> int:
    """Run the ``querygraph`` command-line interface.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        ``0`` after the selected subcommand has printed its output.
    """
    args = _build_parser().parse_args(argv)

    if args.command == "navigator":
        output = AiNavigator().build(
            NavigatorInput(
                dataset_name=args.dataset_name,
                description=args.description,
                landing_page=args.landing_page,
                data_url=args.data_url,
                creator=args.creator,
                agent_name=args.agent_name,
            )
        )
        print(json.dumps(output.bundle, indent=2))
        return 0

    if args.command == "qglake-story":
        indent = 2 if args.pretty else None
        print(json.dumps(build_python_qglake_story(), indent=indent))
        return 0

    if args.command == "lakehouse-register":
        rows = register_lakehouse(
            manifest=args.manifest,
            warehouse=args.warehouse,
            remote=args.remote,
            # --session-temp turns off creation of global temp views.
            create_global_temp=not args.session_temp,
        )
        print(json.dumps(rows, indent=2))
        return 0

    if args.command == "audit-register":
        rows = register_audit(
            warehouse=args.warehouse,
            remote=args.remote,
            create_global_temp=not args.session_temp,
        )
        print(json.dumps(rows, indent=2))
        return 0

    if args.command == "pyspark-examples":
        print("\n".join(example_queries(args.scope)))
        return 0

    # Only "anchor-url" remains: the subparser is required, so argparse has
    # already rejected any command that is not registered above.
    anchored = CodataOdrlClient(args.endpoint).create_did_from_url(args.url)
    print(json.dumps(_to_json(anchored), indent=2))
    return 0
114
+
115
+
116
def _to_json(value: Any) -> Any:
    """Recursively convert dataclass instances into plain dictionaries.

    Lists and dicts are walked so dataclass instances nested inside them are
    converted too; every other value is returned unchanged.
    """
    # is_dataclass() is also true for dataclass *types*, but asdict() accepts
    # only instances and raises TypeError on a class, so classes are excluded
    # here and fall through to be returned unchanged.
    if is_dataclass(value) and not isinstance(value, type):
        return {key: _to_json(item) for key, item in asdict(value).items()}
    if isinstance(value, list):
        return [_to_json(item) for item in value]
    if isinstance(value, dict):
        return {key: _to_json(item) for key, item in value.items()}
    return value
querygraph/codata.py ADDED
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass
5
+ from urllib.parse import urlencode
6
+ from urllib.request import urlopen
7
+
8
+
9
@dataclass(frozen=True)
class StoredPayload:
    """Immutable ``stored_payload`` section of a DID-creation response.

    Every attribute is optional and defaults to ``None``.
    """

    url: str | None = None
    timestamp: str | None = None
    title: str | None = None
    is_rdf: bool | None = None
15
+
16
+
17
@dataclass(frozen=True)
class AnchoredDid:
    """Immutable result of anchoring a URL: the DID plus optional details."""

    did: str  # taken from the response's required "did" key
    doc: dict | None = None  # the response's "doc" value, if any
    stored_payload: StoredPayload | None = None  # parsed "stored_payload", if any
22
+
23
+
24
class CodataOdrlClient:
    """Minimal HTTP client for the DID-creation endpoint of an ODRL service."""

    def __init__(
        self,
        base_url: str = "https://odrl.dev.codata.org",
        timeout: float | None = 30.0,
    ) -> None:
        """Configure the client.

        Args:
            base_url: Service root; trailing slashes are removed.
            timeout: Seconds to wait on the HTTP request before giving up.
                ``None`` disables the timeout (the request may block
                indefinitely).
        """
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    def create_did_from_url(self, url: str) -> AnchoredDid:
        """Ask the service to create a DID for *url* and parse the reply.

        Args:
            url: URL to anchor; sent URL-encoded as the ``url`` query parameter.

        Returns:
            The anchored DID with its optional document and stored payload.

        Raises:
            KeyError: If the response JSON has no ``"did"`` key.
            urllib.error.URLError: If the request fails; per the standard
                library documentation this includes timeouts during connect.
        """
        # Local import keeps the new dependency inside this block.
        from dataclasses import fields

        query = urlencode({"url": url})
        endpoint = f"{self.base_url}/api/did/create_from_url?{query}"
        # Without a timeout an unresponsive server would block the caller forever.
        with urlopen(endpoint, timeout=self.timeout) as response:
            payload = json.loads(response.read().decode())

        stored_payload = payload.get("stored_payload")
        stored = None
        if stored_payload:
            # Keep only keys StoredPayload declares, so a new server-side key
            # does not make the constructor raise TypeError.
            known = {spec.name for spec in fields(StoredPayload)}
            stored = StoredPayload(
                **{key: item for key, item in stored_payload.items() if key in known}
            )

        return AnchoredDid(
            did=payload["did"],
            doc=payload.get("doc"),
            stored_payload=stored,
        )
querygraph/croissant.py ADDED
@@ -0,0 +1,86 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+
5
+
6
@dataclass(frozen=True)
class Field:
    """Immutable description of one field of a record set."""

    name: str
    data_type: str
    description: str
    semantic_type_value: str | None = None

    def semantic_type(self, semantic_type: str) -> "Field":
        """Return a copy of this field with its semantic type set.

        The instance is frozen, so a new ``Field`` is built rather than
        modifying this one.
        """
        return Field(
            name=self.name,
            data_type=self.data_type,
            description=self.description,
            semantic_type_value=semantic_type,
        )
15
+
16
+
17
@dataclass(frozen=True)
class FileObject:
    """Immutable description of one file belonging to a dataset."""

    id: str  # emitted as the file's "@id"
    name: str  # emitted as "name"
    content_url: str  # emitted as "contentUrl"
    encoding_format: str  # emitted as "encodingFormat"
23
+
24
+
25
@dataclass(frozen=True)
class RecordSet:
    """Immutable named group of fields within a dataset."""

    id: str
    name: str
    # default_factory gives every instance its own empty list.
    fields: list[Field] = field(default_factory=list)
30
+
31
+
32
@dataclass(frozen=True)
class CroissantDataset:
    """Immutable Croissant dataset description, serialisable to JSON-LD."""

    id: str
    name: str
    description: str
    license: str
    creators: list[str]
    files: list[FileObject]
    record_sets: list[RecordSet]
    keywords: list[str]

    def to_json_ld(self) -> dict:
        """Serialise the dataset as a ``cr:Dataset`` JSON-LD dictionary.

        Creators become ``Person`` nodes, files become ``cr:FileObject``
        nodes, and each record set becomes a ``cr:RecordSet`` node holding
        its ``cr:Field`` nodes. A field's ``sameAs`` is ``None`` when it has
        no semantic type.
        """
        context = {
            "@vocab": "https://schema.org/",
            "cr": "http://mlcommons.org/croissant/",
            "dcat": "http://www.w3.org/ns/dcat#",
            "odrl": "http://www.w3.org/ns/odrl/2/",
        }

        creator_nodes = []
        for creator_name in self.creators:
            creator_nodes.append({"@type": "Person", "name": creator_name})

        file_nodes = []
        for file_object in self.files:
            file_nodes.append(
                {
                    "@type": "cr:FileObject",
                    "@id": file_object.id,
                    "name": file_object.name,
                    "contentUrl": file_object.content_url,
                    "encodingFormat": file_object.encoding_format,
                }
            )

        record_set_nodes = []
        for record_set in self.record_sets:
            field_nodes = [
                {
                    "@type": "cr:Field",
                    "name": column.name,
                    "dataType": column.data_type,
                    "description": column.description,
                    "sameAs": column.semantic_type_value,
                }
                for column in record_set.fields
            ]
            record_set_nodes.append(
                {
                    "@type": "cr:RecordSet",
                    "@id": record_set.id,
                    "name": record_set.name,
                    "field": field_nodes,
                }
            )

        return {
            "@context": context,
            "@type": "cr:Dataset",
            "@id": self.id,
            "name": self.name,
            "description": self.description,
            "license": self.license,
            "creator": creator_nodes,
            "keywords": self.keywords,
            "distribution": file_nodes,
            "recordSet": record_set_nodes,
        }