@ekairos/dataset 1.22.37-beta.development.0 → 1.22.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +347 -0
- package/dist/agents.d.ts +8 -0
- package/dist/agents.js +8 -0
- package/dist/builder/agentMaterializers.d.ts +9 -0
- package/dist/builder/agentMaterializers.js +10 -0
- package/dist/builder/context.d.ts +15 -0
- package/dist/builder/context.js +251 -0
- package/dist/builder/instructions.d.ts +5 -0
- package/dist/builder/instructions.js +40 -0
- package/dist/builder/materialize.d.ts +83 -0
- package/dist/builder/materialize.js +548 -0
- package/dist/builder/materializeQuery.d.ts +12 -0
- package/dist/builder/materializeQuery.js +31 -0
- package/dist/builder/persistence.d.ts +22 -0
- package/dist/builder/persistence.js +153 -0
- package/dist/builder/rows.d.ts +7 -0
- package/dist/builder/rows.js +56 -0
- package/dist/builder/schemaInference.d.ts +3 -0
- package/dist/builder/schemaInference.js +61 -0
- package/dist/builder/types.d.ts +140 -0
- package/dist/builder/types.js +1 -0
- package/dist/clearDataset.tool.d.ts +2 -3
- package/dist/clearDataset.tool.js +13 -17
- package/dist/completeDataset.steps.d.ts +117 -0
- package/dist/completeDataset.steps.js +487 -0
- package/dist/completeDataset.tool.d.ts +132 -7
- package/dist/completeDataset.tool.js +46 -192
- package/dist/contextResources.d.ts +31 -0
- package/dist/contextResources.js +151 -0
- package/dist/contextWorkspace.d.ts +79 -0
- package/dist/contextWorkspace.js +234 -0
- package/dist/dataset/steps.d.ts +39 -15
- package/dist/dataset/steps.js +96 -39
- package/dist/dataset.d.ts +3 -67
- package/dist/dataset.js +129 -521
- package/dist/datasetFiles.d.ts +5 -1
- package/dist/datasetFiles.js +29 -27
- package/dist/domain.d.ts +1 -2
- package/dist/domain.js +1 -6
- package/dist/executeCommand.tool.d.ts +2 -30
- package/dist/executeCommand.tool.js +165 -39
- package/dist/file/file-dataset.agent.d.ts +19 -56
- package/dist/file/file-dataset.agent.js +176 -134
- package/dist/file/file-dataset.steps.d.ts +27 -0
- package/dist/file/file-dataset.steps.js +47 -0
- package/dist/file/file-dataset.types.d.ts +64 -0
- package/dist/file/file-dataset.types.js +1 -0
- package/dist/file/filepreview.d.ts +5 -35
- package/dist/file/filepreview.js +60 -107
- package/dist/file/filepreview.types.d.ts +31 -0
- package/dist/file/filepreview.types.js +1 -0
- package/dist/file/generateSchema.tool.d.ts +2 -3
- package/dist/file/generateSchema.tool.js +11 -15
- package/dist/file/index.d.ts +1 -2
- package/dist/file/index.js +1 -18
- package/dist/file/prompts.d.ts +2 -3
- package/dist/file/prompts.js +134 -27
- package/dist/file/scripts.generated.d.ts +1 -0
- package/dist/file/scripts.generated.js +11 -0
- package/dist/file/steps.d.ts +1 -2
- package/dist/file/steps.js +9 -7
- package/dist/id.d.ts +1 -0
- package/dist/id.js +10 -0
- package/dist/index.d.ts +8 -7
- package/dist/index.js +8 -23
- package/dist/materializeDataset.tool.d.ts +52 -32
- package/dist/materializeDataset.tool.js +81 -65
- package/dist/query/index.d.ts +1 -2
- package/dist/query/index.js +1 -18
- package/dist/query/queryDomain.d.ts +3 -4
- package/dist/query/queryDomain.js +3 -40
- package/dist/query/queryDomain.step.d.ts +1 -1
- package/dist/query/queryDomain.step.js +13 -13
- package/dist/sandbox/steps.d.ts +23 -15
- package/dist/sandbox/steps.js +73 -76
- package/dist/sandbox.steps.d.ts +1 -2
- package/dist/sandbox.steps.js +1 -18
- package/dist/schema.d.ts +13 -13
- package/dist/schema.js +25 -37
- package/dist/service.d.ts +8 -5
- package/dist/service.js +70 -15
- package/dist/skill.d.ts +0 -1
- package/dist/skill.js +12 -17
- package/dist/transform/filepreview.d.ts +2 -3
- package/dist/transform/filepreview.js +9 -26
- package/dist/transform/index.d.ts +2 -3
- package/dist/transform/index.js +2 -8
- package/dist/transform/prompts.d.ts +1 -34
- package/dist/transform/prompts.js +58 -43
- package/dist/transform/transform-dataset.agent.d.ts +20 -45
- package/dist/transform/transform-dataset.agent.js +146 -91
- package/dist/transform/transform-dataset.steps.d.ts +30 -0
- package/dist/transform/transform-dataset.steps.js +61 -0
- package/dist/transform/transform-dataset.types.d.ts +95 -0
- package/dist/transform/transform-dataset.types.js +1 -0
- package/dist/transform/transformDataset.d.ts +3 -3
- package/dist/transform/transformDataset.js +15 -18
- package/dist/writeDatasetRows.tool.d.ts +188 -0
- package/dist/writeDatasetRows.tool.js +258 -0
- package/package.json +35 -10
- package/dist/clearDataset.tool.d.ts.map +0 -1
- package/dist/clearDataset.tool.js.map +0 -1
- package/dist/completeDataset.tool.d.ts.map +0 -1
- package/dist/completeDataset.tool.js.map +0 -1
- package/dist/dataset/steps.d.ts.map +0 -1
- package/dist/dataset/steps.js.map +0 -1
- package/dist/dataset.d.ts.map +0 -1
- package/dist/dataset.js.map +0 -1
- package/dist/datasetFiles.d.ts.map +0 -1
- package/dist/datasetFiles.js.map +0 -1
- package/dist/domain.d.ts.map +0 -1
- package/dist/domain.js.map +0 -1
- package/dist/eventsReactRuntime.d.ts +0 -22
- package/dist/eventsReactRuntime.d.ts.map +0 -1
- package/dist/eventsReactRuntime.js +0 -29
- package/dist/eventsReactRuntime.js.map +0 -1
- package/dist/executeCommand.tool.d.ts.map +0 -1
- package/dist/executeCommand.tool.js.map +0 -1
- package/dist/file/file-dataset.agent.d.ts.map +0 -1
- package/dist/file/file-dataset.agent.js.map +0 -1
- package/dist/file/filepreview.d.ts.map +0 -1
- package/dist/file/filepreview.js.map +0 -1
- package/dist/file/generateSchema.tool.d.ts.map +0 -1
- package/dist/file/generateSchema.tool.js.map +0 -1
- package/dist/file/index.d.ts.map +0 -1
- package/dist/file/index.js.map +0 -1
- package/dist/file/prompts.d.ts.map +0 -1
- package/dist/file/prompts.js.map +0 -1
- package/dist/file/steps.d.ts.map +0 -1
- package/dist/file/steps.js.map +0 -1
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js.map +0 -1
- package/dist/materializeDataset.tool.d.ts.map +0 -1
- package/dist/materializeDataset.tool.js.map +0 -1
- package/dist/query/index.d.ts.map +0 -1
- package/dist/query/index.js.map +0 -1
- package/dist/query/queryDomain.d.ts.map +0 -1
- package/dist/query/queryDomain.js.map +0 -1
- package/dist/query/queryDomain.step.d.ts.map +0 -1
- package/dist/query/queryDomain.step.js.map +0 -1
- package/dist/sandbox/steps.d.ts.map +0 -1
- package/dist/sandbox/steps.js.map +0 -1
- package/dist/sandbox.steps.d.ts.map +0 -1
- package/dist/sandbox.steps.js.map +0 -1
- package/dist/schema.d.ts.map +0 -1
- package/dist/schema.js.map +0 -1
- package/dist/service.d.ts.map +0 -1
- package/dist/service.js.map +0 -1
- package/dist/skill.d.ts.map +0 -1
- package/dist/skill.js.map +0 -1
- package/dist/transform/filepreview.d.ts.map +0 -1
- package/dist/transform/filepreview.js.map +0 -1
- package/dist/transform/index.d.ts.map +0 -1
- package/dist/transform/index.js.map +0 -1
- package/dist/transform/prompts.d.ts.map +0 -1
- package/dist/transform/prompts.js.map +0 -1
- package/dist/transform/transform-dataset.agent.d.ts.map +0 -1
- package/dist/transform/transform-dataset.agent.js.map +0 -1
- package/dist/transform/transformDataset.d.ts.map +0 -1
- package/dist/transform/transformDataset.js.map +0 -1
package/README.md
ADDED
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
# @ekairos/dataset
|
|
2
|
+
|
|
3
|
+
Runtime-first dataset materialization for Ekairos domains.
|
|
4
|
+
|
|
5
|
+
`@ekairos/dataset` replaces the older `@ekairos/structure` flow with a domain-owned dataset API. It persists dataset metadata and output rows in InstantDB, while file parsing and transformations run through sandbox-backed reactors when work cannot be materialized directly.
|
|
6
|
+
|
|
7
|
+
## Mental Model
|
|
8
|
+
|
|
9
|
+
A dataset build has two responsibilities:
|
|
10
|
+
|
|
11
|
+
1. Read or produce source rows from one or more sources.
|
|
12
|
+
2. Persist the resulting dataset under `datasetDomain`.
|
|
13
|
+
|
|
14
|
+
The caller passes a typed `EkairosRuntime`. The runtime must include `datasetDomain`; query sources also require the runtime to include the queried source domain.
|
|
15
|
+
|
|
16
|
+
```ts
|
|
17
|
+
import { domain } from "@ekairos/domain"
|
|
18
|
+
import { EkairosRuntime } from "@ekairos/domain/runtime"
|
|
19
|
+
import { dataset, datasetDomain } from "@ekairos/dataset"
|
|
20
|
+
|
|
21
|
+
import { sourceDomain } from "./source.domain"
|
|
22
|
+
|
|
23
|
+
const appDomain = domain("app")
|
|
24
|
+
.includes(datasetDomain)
|
|
25
|
+
.includes(sourceDomain)
|
|
26
|
+
.withSchema({ entities: {}, links: {}, rooms: {} })
|
|
27
|
+
|
|
28
|
+
class AppRuntime extends EkairosRuntime<{ orgId: string }, typeof appDomain, any> {
|
|
29
|
+
protected getDomain() {
|
|
30
|
+
return appDomain
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
protected resolveDb() {
|
|
34
|
+
return db
|
|
35
|
+
}
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
const runtime = new AppRuntime({ orgId: "org_1" })
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Use `appDomain.toInstantSchema()` to provision or push the InstantDB schema. Dataset itself does not own global DB access; it uses `runtime.use(datasetDomain)` internally.
|
|
42
|
+
|
|
43
|
+
## Basic Usage
|
|
44
|
+
|
|
45
|
+
```ts
|
|
46
|
+
const result = await dataset(runtime, { datasetId: "products_v1" })
|
|
47
|
+
.from({ kind: "text", text: "sku,price\nA1,10", mimeType: "text/csv" })
|
|
48
|
+
.auto()
|
|
49
|
+
.asRows()
|
|
50
|
+
.sandbox({ sandboxId })
|
|
51
|
+
.reactor(reactor)
|
|
52
|
+
.build()
|
|
53
|
+
|
|
54
|
+
console.log(result.datasetId)
|
|
55
|
+
console.log(result.previewRows)
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
`dataset(runtime, { datasetId })` mirrors the old `structure(env, { datasetId })` style. You can also pass the id at build time:
|
|
59
|
+
|
|
60
|
+
```ts
|
|
61
|
+
await dataset(runtime)
|
|
62
|
+
.from({ kind: "dataset", datasetId: "source_dataset_v1" })
|
|
63
|
+
.schema(productSchema)
|
|
64
|
+
.sandbox({ sandboxId })
|
|
65
|
+
.reactor(reactor)
|
|
66
|
+
.build({ datasetId: "normalized_products_v1" })
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Sources
|
|
70
|
+
|
|
71
|
+
Use `.from(...)` for structure-compatible sources:
|
|
72
|
+
|
|
73
|
+
```ts
|
|
74
|
+
dataset(runtime).from(
|
|
75
|
+
{ kind: "file", fileId: "file_1", description: "Supplier CSV" },
|
|
76
|
+
{ kind: "text", text: "sku,price\nA1,10", mimeType: "text/csv", name: "inline.csv" },
|
|
77
|
+
{ kind: "dataset", datasetId: "existing_dataset_v1" },
|
|
78
|
+
)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
The builder also keeps explicit source methods:
|
|
82
|
+
|
|
83
|
+
```ts
|
|
84
|
+
dataset(runtime)
|
|
85
|
+
.fromFile({ fileId: "file_1" })
|
|
86
|
+
.fromText({ text: "raw input", name: "input.txt" })
|
|
87
|
+
.fromDataset({ datasetId: "existing_dataset_v1" })
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Query sources must use `.fromQuery(sourceDomain, ...)` because they need a second domain:
|
|
91
|
+
|
|
92
|
+
```ts
|
|
93
|
+
const snapshot = await dataset(runtime, { datasetId: "open_items_v1" })
|
|
94
|
+
.fromQuery(sourceDomain, {
|
|
95
|
+
query: {
|
|
96
|
+
source_items: {
|
|
97
|
+
$: {
|
|
98
|
+
where: { status: "open" },
|
|
99
|
+
fields: ["title", "quantity"],
|
|
100
|
+
limit: 100,
|
|
101
|
+
},
|
|
102
|
+
},
|
|
103
|
+
},
|
|
104
|
+
title: "Open items",
|
|
105
|
+
explanation: "Snapshot of open source items",
|
|
106
|
+
})
|
|
107
|
+
.build()
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
The query is typed with the same InstantDB query types used by `db.query`. Unknown entities, fields, filters, and link shapes fail at compile time.
|
|
111
|
+
|
|
112
|
+
## Runtime Compatibility
|
|
113
|
+
|
|
114
|
+
Runtime compatibility is checked by domain name and schema together, not by name alone.
|
|
115
|
+
|
|
116
|
+
```ts
|
|
117
|
+
const appDomain = domain("app")
|
|
118
|
+
.includes(datasetDomain)
|
|
119
|
+
.includes(sourceDomain)
|
|
120
|
+
.withSchema({ entities: {}, links: {}, rooms: {} })
|
|
121
|
+
|
|
122
|
+
dataset(runtime).fromQuery(sourceDomain, { query: { source_items: {} } })
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Subdomains are supported transitively. If domain `B` includes domain `A`, and the runtime root includes `B`, then `.fromQuery(A, ...)` is accepted.
|
|
126
|
+
|
|
127
|
+
A different domain with the same name but incompatible schema is rejected. A runtime that includes only `datasetDomain` can persist datasets but cannot query a source domain through `.fromQuery(...)`.
|
|
128
|
+
|
|
129
|
+
## Output Modes
|
|
130
|
+
|
|
131
|
+
Rows are the default output:
|
|
132
|
+
|
|
133
|
+
```ts
|
|
134
|
+
await dataset(runtime)
|
|
135
|
+
.from({ kind: "dataset", datasetId: "source_v1" })
|
|
136
|
+
.asRows()
|
|
137
|
+
.build({ datasetId: "rows_v1" })
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Object output is represented as a single-row dataset:
|
|
141
|
+
|
|
142
|
+
```ts
|
|
143
|
+
const result = await dataset(runtime, { datasetId: "summary_v1" })
|
|
144
|
+
.from({ kind: "dataset", datasetId: "orders_v1" })
|
|
145
|
+
.instructions("Summarize orders by currency.")
|
|
146
|
+
.schema(summarySchema)
|
|
147
|
+
.asObject()
|
|
148
|
+
.sandbox({ sandboxId })
|
|
149
|
+
.reactor(reactor)
|
|
150
|
+
.build()
|
|
151
|
+
|
|
152
|
+
console.log(result.object)
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
`asObject()` forces a one-row output contract. The persisted dataset still uses JSONL rows, and the returned result exposes the row as both `firstRow` and `object`.
|
|
156
|
+
|
|
157
|
+
## Schema Modes
|
|
158
|
+
|
|
159
|
+
Use `schema(...)` when the output contract is known:
|
|
160
|
+
|
|
161
|
+
```ts
|
|
162
|
+
const productSchema = {
|
|
163
|
+
title: "ProductRow",
|
|
164
|
+
description: "One product row",
|
|
165
|
+
schema: {
|
|
166
|
+
type: "object",
|
|
167
|
+
additionalProperties: false,
|
|
168
|
+
properties: {
|
|
169
|
+
sku: { type: "string" },
|
|
170
|
+
price: { type: "number" },
|
|
171
|
+
},
|
|
172
|
+
required: ["sku", "price"],
|
|
173
|
+
},
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
await dataset(runtime)
|
|
177
|
+
.from({ kind: "file", fileId })
|
|
178
|
+
.schema(productSchema)
|
|
179
|
+
.sandbox({ sandboxId })
|
|
180
|
+
.reactor(reactor)
|
|
181
|
+
.build({ datasetId: "products_v1" })
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
Use `auto()` or `inferSchema()` when the builder should infer a schema from the materialized rows:
|
|
185
|
+
|
|
186
|
+
```ts
|
|
187
|
+
await dataset(runtime)
|
|
188
|
+
.from({ kind: "text", text: csv, mimeType: "text/csv" })
|
|
189
|
+
.auto()
|
|
190
|
+
.sandbox({ sandboxId })
|
|
191
|
+
.reactor(reactor)
|
|
192
|
+
.build({ datasetId: "auto_products_v1" })
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
`auto()` is an alias for `inferSchema()`.
|
|
196
|
+
|
|
197
|
+
## Sandbox And Reactor Requirements
|
|
198
|
+
|
|
199
|
+
Some builds can materialize directly:
|
|
200
|
+
|
|
201
|
+
- A single query source without custom instructions can run without sandbox or reactor.
|
|
202
|
+
|
|
203
|
+
Other builds require sandbox execution and a reactor:
|
|
204
|
+
|
|
205
|
+
- File sources
|
|
206
|
+
- Text sources
|
|
207
|
+
- Existing dataset transformations
|
|
208
|
+
- Multiple sources
|
|
209
|
+
- Query sources with custom instructions
|
|
210
|
+
- Any build that needs agent-driven parsing or transformation
|
|
211
|
+
|
|
212
|
+
If these are missing, the builder throws `dataset_sandbox_required` or `dataset_reactor_required`.
|
|
213
|
+
|
|
214
|
+
## Build Result
|
|
215
|
+
|
|
216
|
+
```ts
|
|
217
|
+
type DatasetBuildResult = {
|
|
218
|
+
datasetId: string
|
|
219
|
+
dataset: any
|
|
220
|
+
previewRows: any[]
|
|
221
|
+
reader: {
|
|
222
|
+
read(cursor?: number, limit?: number): Promise<{
|
|
223
|
+
rows: any[]
|
|
224
|
+
cursor: number
|
|
225
|
+
done: boolean
|
|
226
|
+
}>
|
|
227
|
+
read(params?: { cursor?: number; limit?: number }): Promise<{
|
|
228
|
+
rows: any[]
|
|
229
|
+
cursor: number
|
|
230
|
+
done: boolean
|
|
231
|
+
}>
|
|
232
|
+
}
|
|
233
|
+
firstRow?: any | null
|
|
234
|
+
object?: any | null
|
|
235
|
+
}
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
Read more rows with the returned reader:
|
|
239
|
+
|
|
240
|
+
```ts
|
|
241
|
+
const page = await result.reader.read({ cursor: 0, limit: 100 })
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
Use `.first()` when the build must produce zero or one row:
|
|
245
|
+
|
|
246
|
+
```ts
|
|
247
|
+
const result = await dataset(runtime)
|
|
248
|
+
.fromQuery(sourceDomain, { query: { source_items: { $: { limit: 1 } } } })
|
|
249
|
+
.first()
|
|
250
|
+
.build({ datasetId: "single_item_v1" })
|
|
251
|
+
|
|
252
|
+
console.log(result.firstRow)
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
If more than one row is produced, the builder throws `dataset_first_expected_zero_or_one_row`.
|
|
256
|
+
|
|
257
|
+
## Declarative Tool
|
|
258
|
+
|
|
259
|
+
`createMaterializeDatasetTool` exposes the same materialization contract as an AI SDK tool. It is useful when a reactor or agent needs to request dataset builds declaratively.
|
|
260
|
+
|
|
261
|
+
```ts
|
|
262
|
+
import { createMaterializeDatasetTool } from "@ekairos/dataset"
|
|
263
|
+
|
|
264
|
+
const materializeDataset = createMaterializeDatasetTool({
|
|
265
|
+
runtime,
|
|
266
|
+
reactor,
|
|
267
|
+
queryDomain: sourceDomain,
|
|
268
|
+
})
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
Tool input:
|
|
272
|
+
|
|
273
|
+
```ts
|
|
274
|
+
{
|
|
275
|
+
datasetId?: string
|
|
276
|
+
sandboxId?: string
|
|
277
|
+
title?: string
|
|
278
|
+
sources: Array<
|
|
279
|
+
| { kind: "file"; fileId: string; description?: string }
|
|
280
|
+
| { kind: "text"; text: string; mimeType?: string; name?: string; description?: string }
|
|
281
|
+
| { kind: "dataset"; datasetId: string; description?: string }
|
|
282
|
+
| { kind: "query"; query: Record<string, any>; title?: string; explanation?: string }
|
|
283
|
+
>
|
|
284
|
+
instructions?: string
|
|
285
|
+
mode?: "auto" | "schema"
|
|
286
|
+
output?: "rows" | "object"
|
|
287
|
+
schema?: DatasetSchemaInput
|
|
288
|
+
first?: boolean
|
|
289
|
+
}
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
The tool returns only `{ datasetId }`.
|
|
293
|
+
|
|
294
|
+
The tool runtime must include `datasetDomain`, and its `queryDomain` must also be compatible with that same runtime.
|
|
295
|
+
|
|
296
|
+
## Replacing Structure
|
|
297
|
+
|
|
298
|
+
Old structure-style chain:
|
|
299
|
+
|
|
300
|
+
```ts
|
|
301
|
+
await structure(env, { datasetId: "summary_v1" })
|
|
302
|
+
.from({ kind: "text", text, mimeType: "text/plain", name: "input.txt" })
|
|
303
|
+
.instructions("Return a summary object.")
|
|
304
|
+
.schema(summarySchema)
|
|
305
|
+
.asObject()
|
|
306
|
+
.build()
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
Dataset replacement:
|
|
310
|
+
|
|
311
|
+
```ts
|
|
312
|
+
await dataset(runtime, { datasetId: "summary_v1" })
|
|
313
|
+
.from({ kind: "text", text, mimeType: "text/plain", name: "input.txt" })
|
|
314
|
+
.instructions("Return a summary object.")
|
|
315
|
+
.schema(summarySchema)
|
|
316
|
+
.asObject()
|
|
317
|
+
.sandbox({ sandboxId })
|
|
318
|
+
.reactor(reactor)
|
|
319
|
+
.build()
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
Key differences:
|
|
323
|
+
|
|
324
|
+
- Pass `runtime`, not `env`.
|
|
325
|
+
- The runtime must include `datasetDomain`.
|
|
326
|
+
- Query sources must be explicit: `.fromQuery(sourceDomain, { query })`.
|
|
327
|
+
- Object output is stored as a one-row dataset, not as structure context content.
|
|
328
|
+
- Sandbox and reactor are explicit when parsing or transforming is required.
|
|
329
|
+
|
|
330
|
+
## Exports
|
|
331
|
+
|
|
332
|
+
Main exports:
|
|
333
|
+
|
|
334
|
+
- `dataset`
|
|
335
|
+
- `datasetDomain`
|
|
336
|
+
- `createMaterializeDatasetTool`
|
|
337
|
+
- `materializeDatasetToolInputSchema`
|
|
338
|
+
- `DatasetBuilder`
|
|
339
|
+
- `DatasetBuildResult`
|
|
340
|
+
- `DatasetSourceInput`
|
|
341
|
+
- `DatasetSchemaInput`
|
|
342
|
+
- `DatasetOutput`
|
|
343
|
+
- `DatasetMode`
|
|
344
|
+
|
|
345
|
+
## Internal Notes
|
|
346
|
+
|
|
347
|
+
Implementation notes live in `src/README.md`. Public callers should use the root package API from `@ekairos/dataset`.
|
package/dist/agents.d.ts
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import "./builder/materialize.js";
|
|
2
|
+
export * from "./dataset.js";
|
|
3
|
+
export * from "./domain.js";
|
|
4
|
+
export * from "./materializeDataset.tool.js";
|
|
5
|
+
export * from "./schema.js";
|
|
6
|
+
export * from "./service.js";
|
|
7
|
+
export * from "./file/file-dataset.agent.js";
|
|
8
|
+
export * from "./transform/index.js";
|
package/dist/agents.js
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import "./builder/materialize.js";
|
|
2
|
+
export * from "./dataset.js";
|
|
3
|
+
export * from "./domain.js";
|
|
4
|
+
export * from "./materializeDataset.tool.js";
|
|
5
|
+
export * from "./schema.js";
|
|
6
|
+
export * from "./service.js";
|
|
7
|
+
export * from "./file/file-dataset.agent.js";
|
|
8
|
+
export * from "./transform/index.js";
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { AnyDatasetRuntime, DatasetBuilderState, InternalDatasetResource } from "./types.js";
|
|
2
|
+
/**
 * Set of agent-backed materialization callbacks that can be registered with
 * `registerDatasetAgentMaterializers` and read back with
 * `getDatasetAgentMaterializers`.
 *
 * Both callbacks resolve to a string.
 * NOTE(review): presumably the id of the materialized dataset - confirm against the implementation.
 */
export type DatasetAgentMaterializers = {
    /** Materializes one `file` or `text` resource into the dataset identified by `targetDatasetId`. */
    materializeSingleFileLikeResource<Runtime extends AnyDatasetRuntime>(
        state: DatasetBuilderState<Runtime>,
        resource: Extract<InternalDatasetResource, { kind: "file" | "text" }>,
        targetDatasetId: string
    ): Promise<string>;
    /** Materializes a dataset derived from the builder state into `targetDatasetId`. */
    materializeDerivedDataset<Runtime extends AnyDatasetRuntime>(
        state: DatasetBuilderState<Runtime>,
        targetDatasetId: string
    ): Promise<string>;
};
|
|
8
|
+
/**
 * Stores `materializers` as the module-wide agent materializer registry,
 * replacing whatever was registered before.
 */
export declare function registerDatasetAgentMaterializers(materializers: DatasetAgentMaterializers): void;
|
|
9
|
+
/**
 * Returns the currently registered agent materializers.
 *
 * @throws Error with message `dataset_agent_materializers_not_registered`
 *   when nothing has been registered yet.
 */
export declare function getDatasetAgentMaterializers(): DatasetAgentMaterializers;
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
let agentMaterializers = null;
|
|
2
|
+
export function registerDatasetAgentMaterializers(materializers) {
|
|
3
|
+
agentMaterializers = materializers;
|
|
4
|
+
}
|
|
5
|
+
export function getDatasetAgentMaterializers() {
|
|
6
|
+
if (!agentMaterializers) {
|
|
7
|
+
throw new Error("dataset_agent_materializers_not_registered");
|
|
8
|
+
}
|
|
9
|
+
return agentMaterializers;
|
|
10
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import type { AnyDatasetRuntime, InternalDatasetResource } from "./types.js";
|
|
2
|
+
/** Result of resolving a dataset's resources into an events context. */
type DatasetContextResolution = {
    /** Id of the `event_contexts` record written for the dataset. */
    contextId: string;
    /** Resources in the builder's internal representation. */
    resources: InternalDatasetResource[];
    /** The same resources as context resource records. */
    contextResources: DatasetContextResourceRecord[];
};
|
|
7
|
+
/**
 * One resource entry stored on an events context.
 *
 * The four named fields are always present; any further keys
 * (for example `fileId`, `datasetId`, `text`) are kind-specific extras.
 */
type DatasetContextResourceRecord = {
    /** Key identifying the resource within its context. */
    key: string;
    /** Resource kind, e.g. `file`, `text`, `dataset` or `query`. */
    type: string;
    /** Human-readable name. */
    name: string;
    /** Human-readable description. */
    description: string;
    /** Kind-specific extra fields. */
    [key: string]: unknown;
};
|
|
14
|
+
/**
 * Resolves the resources of a dataset build into an events context and
 * returns the context id together with both resource representations.
 *
 * A `context` resource must be the only resource passed; otherwise the call
 * rejects with `dataset_context_resource_must_be_unique` or
 * `dataset_context_resource_is_exclusive`.
 */
export declare function resolveDatasetResourceContext<Runtime extends AnyDatasetRuntime>(runtime: Runtime, datasetId: string, resources: InternalDatasetResource[]): Promise<DatasetContextResolution>;
|
|
15
|
+
export {};
|
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
import { eventsDomain } from "@ekairos/events";
|
|
2
|
+
import { createDatasetId } from "../id.js";
|
|
3
|
+
import { datasetDomain } from "../schema.js";
|
|
4
|
+
import { DatasetService } from "../service.js";
|
|
5
|
+
import { getDomainDescriptor } from "./rows.js";
|
|
6
|
+
function getContextWhere(context) {
|
|
7
|
+
return "id" in context ? { id: context.id } : { key: context.key };
|
|
8
|
+
}
|
|
9
|
+
/**
 * Resolves the database handle scoped to the events domain.
 *
 * @param runtime Runtime exposing `use(domain)`.
 * @returns The `db` property of the scope returned by `runtime.use(eventsDomain)`.
 */
async function getEventsDb(runtime) {
    const eventsScope = await runtime.use(eventsDomain);
    const eventsDb = eventsScope.db;
    return eventsDb;
}
|
|
13
|
+
/**
 * Resolves the database handle scoped to the dataset domain.
 *
 * @param runtime Runtime exposing `use(domain)`.
 * @returns The `db` property of the scope returned by `runtime.use(datasetDomain)`.
 */
async function getDatasetDb(runtime) {
    const datasetScope = await runtime.use(datasetDomain);
    const datasetDb = datasetScope.db;
    return datasetDb;
}
|
|
17
|
+
function resourceKey(index, resource) {
|
|
18
|
+
if (resource.kind === "file")
|
|
19
|
+
return `file:${index}:${resource.fileId}`;
|
|
20
|
+
if (resource.kind === "text")
|
|
21
|
+
return `text:${index}:${resource.name ?? "inline"}`;
|
|
22
|
+
if (resource.kind === "dataset")
|
|
23
|
+
return `dataset:${index}:${resource.datasetId}`;
|
|
24
|
+
if (resource.kind === "query")
|
|
25
|
+
return `query:${index}:${resource.title ?? "query"}`;
|
|
26
|
+
return `resource:${index}`;
|
|
27
|
+
}
|
|
28
|
+
function resourceName(index, resource) {
|
|
29
|
+
if (resource.kind === "file")
|
|
30
|
+
return resource.filename ?? `File ${index + 1}`;
|
|
31
|
+
if (resource.kind === "text")
|
|
32
|
+
return resource.name ?? `Text ${index + 1}`;
|
|
33
|
+
if (resource.kind === "dataset")
|
|
34
|
+
return resource.datasetId;
|
|
35
|
+
if (resource.kind === "query")
|
|
36
|
+
return resource.title ?? `Query ${index + 1}`;
|
|
37
|
+
return `Resource ${index + 1}`;
|
|
38
|
+
}
|
|
39
|
+
function resourceDescription(resource) {
|
|
40
|
+
if ("description" in resource && typeof resource.description === "string" && resource.description.trim()) {
|
|
41
|
+
return resource.description.trim();
|
|
42
|
+
}
|
|
43
|
+
if (resource.kind === "query" && typeof resource.explanation === "string" && resource.explanation.trim()) {
|
|
44
|
+
return resource.explanation.trim();
|
|
45
|
+
}
|
|
46
|
+
return `Dataset ${resource.kind} resource.`;
|
|
47
|
+
}
|
|
48
|
+
/**
 * Converts a dataset resource into a context resource record.
 *
 * Every record carries `key`, `type`, `name` and `description`; the
 * kind-specific fields are layered on top. Unrecognised kinds get only the
 * common fields.
 *
 * @param index Position of the resource in the resource list.
 * @param resource Dataset resource with a `kind` discriminator.
 * @returns The context resource record.
 */
function resourceToContextResource(index, resource) {
    const common = {
        key: resourceKey(index, resource),
        type: resource.kind,
        name: resourceName(index, resource),
        description: resourceDescription(resource),
    };
    switch (resource.kind) {
        case "file":
            return {
                ...common,
                fileId: resource.fileId,
                filename: resource.filename,
                mediaType: resource.mediaType,
            };
        case "text":
            return {
                ...common,
                text: resource.text,
                mimeType: resource.mimeType,
            };
        case "dataset":
            return {
                ...common,
                datasetId: resource.datasetId,
            };
        case "query":
            return {
                ...common,
                query: resource.query,
                title: resource.title,
                explanation: resource.explanation,
                // The domain descriptor is spread last, so its keys win on collision.
                ...getDomainDescriptor(resource.domain),
            };
        default:
            return common;
    }
}
|
|
87
|
+
/**
 * Creates or refreshes the `event_contexts` record that describes a dataset build.
 *
 * The record is keyed by `dataset:<datasetId>`. When a record with that key
 * already exists its id is reused; otherwise a new id is generated. Dataset
 * resources are enriched with preview rows before being written.
 *
 * @param params.runtime Runtime used to resolve the events and dataset databases.
 * @param params.datasetId Id of the dataset being built.
 * @param params.resources Context resource records to store on the context.
 * @returns `{ contextId }` of the written context record.
 */
async function createDatasetResourceContextStep(params) {
    "use step";
    const db = await getEventsDb(params.runtime);
    const contextKey = `dataset:${params.datasetId}`;
    const existing = await db.query({
        event_contexts: {
            $: { where: { key: contextKey }, limit: 1 },
        },
    });
    const existingContextId = existing.event_contexts?.[0]?.id;
    const isNewContext = existingContextId == null;
    const contextId = isNewContext ? createDatasetId() : existingContextId;
    const now = new Date();
    const resources = await enrichDatasetContextResources(params.runtime, params.resources);
    await db.transact([
        db.tx.event_contexts[contextId].update({
            key: contextKey,
            // Stamp createdAt only when the record is first created, so that
            // re-running a build does not overwrite the original creation time.
            ...(isNewContext ? { createdAt: now } : {}),
            updatedAt: now,
            name: `Dataset ${params.datasetId}`,
            status: "open_idle",
            content: {
                datasetId: params.datasetId,
                resourceCount: resources.length,
            },
            resources,
            description: `Dataset execution context for ${params.datasetId}.`,
            goal: "Produce the dataset output from the resources declared in this context.",
        }),
    ]);
    return {
        contextId,
    };
}
|
|
119
|
+
/**
 * Attaches preview rows to every dataset-typed context resource.
 *
 * Resources that are not of type `dataset` with a string `datasetId` pass
 * through untouched. When there are no such resources, the input array itself
 * is returned and no database is resolved. Previews are fetched one at a time,
 * in resource order.
 *
 * @param runtime Runtime used to resolve the dataset database.
 * @param resources Context resource records.
 * @returns The resources, with `previewRows`/`previewLimit` on success or
 *   `previewError` on failure added to each dataset resource.
 */
async function enrichDatasetContextResources(runtime, resources) {
    const previewLimit = 20;
    const isDatasetResource = (candidate) => candidate.type === "dataset" && typeof candidate.datasetId === "string";
    if (!resources.some(isDatasetResource)) {
        return resources;
    }
    const service = new DatasetService(await getDatasetDb(runtime));
    const enriched = [];
    for (const resource of resources) {
        if (!isDatasetResource(resource)) {
            enriched.push(resource);
            continue;
        }
        const preview = await service.previewRows(resource.datasetId, previewLimit);
        if (preview.ok) {
            enriched.push({
                ...resource,
                previewRows: preview.data,
                previewLimit: previewLimit,
            });
        }
        else {
            enriched.push({
                ...resource,
                previewError: preview.error,
            });
        }
    }
    return enriched;
}
|
|
147
|
+
function contextResourceToDatasetResource(resource) {
|
|
148
|
+
if (resource.type === "file" && typeof resource.fileId === "string" && resource.fileId.trim()) {
|
|
149
|
+
return {
|
|
150
|
+
kind: "file",
|
|
151
|
+
fileId: resource.fileId.trim(),
|
|
152
|
+
description: resource.description,
|
|
153
|
+
filename: typeof resource.filename === "string" ? resource.filename : undefined,
|
|
154
|
+
mediaType: typeof resource.mediaType === "string" ? resource.mediaType : undefined,
|
|
155
|
+
};
|
|
156
|
+
}
|
|
157
|
+
if (resource.type === "dataset" &&
|
|
158
|
+
typeof resource.datasetId === "string" &&
|
|
159
|
+
resource.datasetId.trim()) {
|
|
160
|
+
return {
|
|
161
|
+
kind: "dataset",
|
|
162
|
+
datasetId: resource.datasetId.trim(),
|
|
163
|
+
description: resource.description,
|
|
164
|
+
};
|
|
165
|
+
}
|
|
166
|
+
if (resource.type === "text" && typeof resource.text === "string") {
|
|
167
|
+
return {
|
|
168
|
+
kind: "text",
|
|
169
|
+
text: String(resource.text),
|
|
170
|
+
mimeType: typeof resource.mimeType === "string"
|
|
171
|
+
? String(resource.mimeType)
|
|
172
|
+
: "text/plain",
|
|
173
|
+
name: resource.name,
|
|
174
|
+
description: resource.description,
|
|
175
|
+
};
|
|
176
|
+
}
|
|
177
|
+
if (resource.type === "query") {
|
|
178
|
+
throw new Error("dataset_context_query_resource_requires_builder_shortcut");
|
|
179
|
+
}
|
|
180
|
+
return {
|
|
181
|
+
kind: "text",
|
|
182
|
+
text: JSON.stringify({ resource }, null, 2),
|
|
183
|
+
mimeType: "application/vnd.ekairos.context-resource+json",
|
|
184
|
+
name: `${resource.key}.context-resource.json`,
|
|
185
|
+
description: resource.description,
|
|
186
|
+
};
|
|
187
|
+
}
|
|
188
|
+
/**
 * Loads an existing events context and returns its resources in both forms.
 *
 * @param params.runtime Runtime used to resolve the events database.
 * @param params.context Context reference carrying an `id` or a `key`.
 * @returns `contextId` of the source context, `resources` converted to dataset
 *   resources, and `contextResources` copied with `sourceContextId` /
 *   `sourceResourceKey` filled in where they were missing.
 * @throws Error `dataset_context_not_found` when no context matches.
 * @throws Error `dataset_context_resources_required` when the context has no resources.
 */
async function readExistingContext(params) {
    "use step";
    const eventsDb = await getEventsDb(params.runtime);
    const lookup = {
        where: getContextWhere(params.context),
        limit: 1,
    };
    const result = await eventsDb.query({
        event_contexts: {
            $: lookup,
        },
    });
    const contextRow = result?.event_contexts?.[0];
    if (!contextRow?.id) {
        throw new Error("dataset_context_not_found");
    }
    const storedResources = Array.isArray(contextRow.resources) ? contextRow.resources : [];
    if (storedResources.length === 0) {
        throw new Error("dataset_context_resources_required");
    }
    const sourceContextId = String(contextRow.id);
    // Keep provenance already recorded on a resource; otherwise point at this context.
    const contextResources = storedResources.map((stored) => ({
        ...stored,
        sourceContextId: stored.sourceContextId ?? sourceContextId,
        sourceResourceKey: stored.sourceResourceKey ?? stored.key,
    }));
    const datasetResources = storedResources.map((stored) => contextResourceToDatasetResource(stored));
    return {
        contextId: sourceContextId,
        resources: datasetResources,
        contextResources,
    };
}
|
|
220
|
+
/**
 * Resolves the resources of a dataset build into an events context.
 *
 * Without a `context` resource, the given resources are converted to context
 * resource records and written to the dataset's context. With a single
 * `context` resource, which must be the only resource, the referenced
 * context's resources are read and copied into the dataset's context.
 *
 * @param runtime Runtime used to resolve the databases.
 * @param datasetId Id of the dataset being built.
 * @param resources Dataset resources declared on the builder.
 * @returns The dataset context id plus the resources in both representations.
 * @throws Error `dataset_context_resource_must_be_unique` when more than one context resource is given.
 * @throws Error `dataset_context_resource_is_exclusive` when a context resource is mixed with other resources.
 */
export async function resolveDatasetResourceContext(runtime, datasetId, resources) {
    const contextRefs = resources.filter((candidate) => candidate.kind === "context");
    if (contextRefs.length > 1) {
        throw new Error("dataset_context_resource_must_be_unique");
    }
    if (contextRefs.length === 0) {
        const contextResources = resources.map((resource, position) => resourceToContextResource(position, resource));
        const { contextId } = await createDatasetResourceContextStep({
            runtime,
            datasetId,
            resources: contextResources,
        });
        return {
            contextId,
            resources,
            contextResources,
        };
    }
    // Exactly one context reference from here on.
    if (resources.length > 1) {
        throw new Error("dataset_context_resource_is_exclusive");
    }
    const source = await readExistingContext({ runtime, context: contextRefs[0] });
    const { contextId } = await createDatasetResourceContextStep({
        runtime,
        datasetId,
        resources: source.contextResources,
    });
    return {
        ...source,
        contextId,
    };
}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { DatasetSchemaInput } from "./types.js";
|
|
2
|
+
/**
 * Returns one of two fixed default instruction strings for file resources.
 * NOTE(review): presumably the schema-matching wording is returned when `schema` is provided - confirm against the implementation.
 */
export declare function buildFileDefaultInstructions(schema?: DatasetSchemaInput): "Create a dataset from the resource file and ensure each output row matches the provided dataset schema exactly." | "Create a dataset representing the resource content as structured rows.";
|
|
3
|
+
/**
 * Returns one of two fixed instruction strings for representing raw content as rows.
 * NOTE(review): presumably the "raw text" wording is for `"text"` and the "raw file" wording for `"file"` - confirm against the implementation.
 */
export declare function buildRawResourceInstructions(resourceKind: "file" | "text"): "Create a dataset representing the raw text content as structured rows without applying business transformations." | "Create a dataset representing the raw file content as structured rows without applying business transformations.";
|
|
4
|
+
/**
 * Builds the instruction string for a transform from the resource count and
 * the optional user instructions and schema.
 * NOTE(review): the exact wording is not visible from this declaration.
 */
export declare function buildTransformInstructions(resourceCount: number, userInstructions?: string, schema?: DatasetSchemaInput): string;
|
|
5
|
+
/**
 * Builds the instruction string for object output from the optional user instructions.
 * NOTE(review): the exact wording is not visible from this declaration.
 */
export declare function buildObjectOutputInstructions(userInstructions?: string): string;
|