@sanity/ailf 6.0.0 → 6.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/airbyte/ai_literacy_framework.connector.yaml +276 -0
- package/config/bigquery/views/synthesis_parse_failure_rate_7d.sql +42 -0
- package/dist/_vendor/ailf-core/artifact-registry.d.ts +17 -0
- package/dist/_vendor/ailf-core/artifact-registry.js +14 -0
- package/dist/_vendor/ailf-core/ports/context.d.ts +22 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.d.ts +7 -0
- package/dist/_vendor/ailf-core/schemas/eval-config.js +8 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js +59 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/doc-attribution-spotlight.js +5 -1
- package/dist/_vendor/ailf-core/services/diagnosis/cards/failure-mode-summary.js +47 -3
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.d.ts +10 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/index.js +13 -0
- package/dist/_vendor/ailf-core/services/diagnosis/cards/low-confidence-attribution.js +17 -1
- package/dist/_vendor/ailf-core/services/diagnosis/cards/no-issues.js +1 -1
- package/dist/_vendor/ailf-core/services/diagnosis/cards/regression-vs-baseline.js +5 -1
- package/dist/_vendor/ailf-core/services/diagnosis/cards/top-recommendations.js +5 -1
- package/dist/_vendor/ailf-core/services/diagnosis/cards/weakest-area.js +5 -1
- package/dist/_vendor/ailf-core/services/diagnosis/prompt-builders.js +15 -2
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.d.ts +5 -3
- package/dist/_vendor/ailf-core/services/diagnosis/prompts/weakest-area.system.js +19 -31
- package/dist/_vendor/ailf-core/services/index.d.ts +1 -1
- package/dist/_vendor/ailf-core/services/index.js +1 -1
- package/dist/_vendor/ailf-core/types/diagnosis.d.ts +3 -0
- package/dist/_vendor/ailf-core/types/index.d.ts +7 -0
- package/dist/_vendor/ailf-core/types/repo-config.d.ts +16 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.d.ts +101 -0
- package/dist/_vendor/ailf-core/types/synthesis-telemetry.js +18 -0
- package/dist/adapters/config-sources/file-config-adapter.js +8 -6
- package/dist/adapters/llm/index.d.ts +1 -1
- package/dist/adapters/llm/index.js +1 -1
- package/dist/adapters/llm/openai-llm-client.js +7 -2
- package/dist/adapters/llm/retry.d.ts +18 -0
- package/dist/adapters/llm/retry.js +21 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.d.ts +49 -0
- package/dist/adapters/synthesis/synthesis-telemetry-schema.js +55 -0
- package/dist/adapters/task-sources/content-lake-task-source.js +10 -5
- package/dist/adapters/task-sources/repo-schemas.d.ts +7 -0
- package/dist/adapters/task-sources/repo-schemas.js +10 -0
- package/dist/artifact-capture/api-gateway-artifact-writer.d.ts +11 -1
- package/dist/artifact-capture/api-gateway-artifact-writer.js +3 -1
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.d.ts +11 -1
- package/dist/artifact-capture/batching-api-gateway-artifact-writer.js +3 -1
- package/dist/artifact-capture/gcs-artifact-writer.d.ts +11 -1
- package/dist/artifact-capture/gcs-artifact-writer.js +6 -3
- package/dist/artifact-capture/local-fs-artifact-writer.d.ts +11 -1
- package/dist/artifact-capture/local-fs-artifact-writer.js +6 -3
- package/dist/commands/interpret.d.ts +21 -1
- package/dist/commands/interpret.js +13 -4
- package/dist/commands/pipeline-action.d.ts +44 -0
- package/dist/commands/pipeline-action.js +193 -1
- package/dist/commands/run.d.ts +2 -0
- package/dist/commands/run.js +2 -0
- package/dist/composition-root.d.ts +22 -5
- package/dist/composition-root.js +78 -8
- package/dist/orchestration/pipeline-orchestrator.js +3 -0
- package/dist/orchestration/steps/gap-analysis-step.js +0 -1
- package/dist/report-store.d.ts +40 -0
- package/dist/report-store.js +88 -0
- package/package.json +1 -1
|
@@ -225,6 +225,134 @@ definitions:
|
|
|
225
225
|
schema:
|
|
226
226
|
$ref: "#/schemas/area_scores"
|
|
227
227
|
|
|
228
|
+
# ------------------------------------------------------------------
|
|
229
|
+
# Stream 3: synthesis_summary — one row per report with synthesis telemetry
|
|
230
|
+
# ------------------------------------------------------------------
|
|
231
|
+
# GROQ projection emits cost, parse-failure counts, and rate from the
|
|
232
|
+
# summary.synthesis.diagnosis path written by the Phase-6 post-run hook.
|
|
233
|
+
# Rows are gated on defined(summary.synthesis.diagnosis) so reports that
|
|
234
|
+
# predate Phase 6 produce no rows (incremental cursor still catches them
|
|
235
|
+
# on re-sync once backfilled).
|
|
236
|
+
synthesis_summary:
|
|
237
|
+
type: DeclarativeStream
|
|
238
|
+
name: synthesis_summary
|
|
239
|
+
retriever:
|
|
240
|
+
type: SimpleRetriever
|
|
241
|
+
decoder:
|
|
242
|
+
type: JsonDecoder
|
|
243
|
+
requester:
|
|
244
|
+
$ref: "#/definitions/base_requester"
|
|
245
|
+
path: /v2026-03-12/data/query/{{ config['dataset'] }}
|
|
246
|
+
http_method: GET
|
|
247
|
+
request_parameters:
|
|
248
|
+
query: >-
|
|
249
|
+
*[_type=="ailf.report" && _createdAt > "{{
|
|
250
|
+
stream_interval.start_time or '1970-01-01T00:00:00Z' }}" &&
|
|
251
|
+
_createdAt <= "{{ stream_interval.end_time }}" &&
|
|
252
|
+
defined(summary.synthesis.diagnosis)]|order(_createdAt asc){
|
|
253
|
+
"report_id": reportId,
|
|
254
|
+
"completed_at": completedAt,
|
|
255
|
+
"mode": provenance.mode,
|
|
256
|
+
"source_name": provenance.source.name,
|
|
257
|
+
"grader_model": provenance.graderModel,
|
|
258
|
+
"synthesis_cost": summary.synthesis.diagnosis.cost,
|
|
259
|
+
"parse_failure_count":
|
|
260
|
+
summary.synthesis.diagnosis.parseFailureCount,
|
|
261
|
+
"parse_failure_rate":
|
|
262
|
+
summary.synthesis.diagnosis.parseFailureRate,
|
|
263
|
+
_createdAt
|
|
264
|
+
}
|
|
265
|
+
record_selector:
|
|
266
|
+
type: RecordSelector
|
|
267
|
+
extractor:
|
|
268
|
+
type: DpathExtractor
|
|
269
|
+
field_path:
|
|
270
|
+
- result
|
|
271
|
+
primary_key:
|
|
272
|
+
- report_id
|
|
273
|
+
incremental_sync:
|
|
274
|
+
type: DatetimeBasedCursor
|
|
275
|
+
cursor_field: _createdAt
|
|
276
|
+
cursor_datetime_formats:
|
|
277
|
+
- "%Y-%m-%dT%H:%M:%S.%fZ"
|
|
278
|
+
- "%Y-%m-%dT%H:%M:%SZ"
|
|
279
|
+
datetime_format: "%Y-%m-%dT%H:%M:%SZ"
|
|
280
|
+
start_datetime:
|
|
281
|
+
type: MinMaxDatetime
|
|
282
|
+
datetime: "{{ config.get('start_date', '2026-01-01T00:00:00Z') }}"
|
|
283
|
+
datetime_format: "%Y-%m-%dT%H:%M:%SZ"
|
|
284
|
+
step: P30D
|
|
285
|
+
cursor_granularity: PT1S
|
|
286
|
+
schema_loader:
|
|
287
|
+
type: InlineSchemaLoader
|
|
288
|
+
schema:
|
|
289
|
+
$ref: "#/schemas/synthesis_summary"
|
|
290
|
+
|
|
291
|
+
# ------------------------------------------------------------------
|
|
292
|
+
# Stream 4: synthesis_per_card — one row per report with per-card array
|
|
293
|
+
# ------------------------------------------------------------------
|
|
294
|
+
# GROQ projection emits the nested perCard array. GROQ cannot explode
|
|
295
|
+
# arrays into flat rows, so the nesting is preserved — BigQuery consumers
|
|
296
|
+
# should UNNEST(JSON_QUERY_ARRAY(per_card)) to get flat rows per card.
|
|
297
|
+
# primary_key is report_id only (not compound) for the same reason.
|
|
298
|
+
synthesis_per_card:
|
|
299
|
+
type: DeclarativeStream
|
|
300
|
+
name: synthesis_per_card
|
|
301
|
+
retriever:
|
|
302
|
+
type: SimpleRetriever
|
|
303
|
+
decoder:
|
|
304
|
+
type: JsonDecoder
|
|
305
|
+
requester:
|
|
306
|
+
$ref: "#/definitions/base_requester"
|
|
307
|
+
path: /v2026-03-12/data/query/{{ config['dataset'] }}
|
|
308
|
+
http_method: GET
|
|
309
|
+
request_parameters:
|
|
310
|
+
query: >-
|
|
311
|
+
*[_type=="ailf.report" && _createdAt > "{{
|
|
312
|
+
stream_interval.start_time or '1970-01-01T00:00:00Z' }}" &&
|
|
313
|
+
_createdAt <= "{{ stream_interval.end_time }}" &&
|
|
314
|
+
defined(summary.synthesis.diagnosis.perCard)]|order(_createdAt
|
|
315
|
+
asc){
|
|
316
|
+
"report_id": reportId,
|
|
317
|
+
"completed_at": completedAt,
|
|
318
|
+
"per_card": summary.synthesis.diagnosis.perCard[]{
|
|
319
|
+
"card_type": cardType,
|
|
320
|
+
"cost": cost,
|
|
321
|
+
"parse_failed": parseFailed,
|
|
322
|
+
"latency_ms": latencyMs,
|
|
323
|
+
"token_input": tokenInput,
|
|
324
|
+
"token_output": tokenOutput,
|
|
325
|
+
"card_version": cardVersion,
|
|
326
|
+
"generated_at": generatedAt
|
|
327
|
+
},
|
|
328
|
+
_createdAt
|
|
329
|
+
}
|
|
330
|
+
record_selector:
|
|
331
|
+
type: RecordSelector
|
|
332
|
+
extractor:
|
|
333
|
+
type: DpathExtractor
|
|
334
|
+
field_path:
|
|
335
|
+
- result
|
|
336
|
+
primary_key:
|
|
337
|
+
- report_id
|
|
338
|
+
incremental_sync:
|
|
339
|
+
type: DatetimeBasedCursor
|
|
340
|
+
cursor_field: _createdAt
|
|
341
|
+
cursor_datetime_formats:
|
|
342
|
+
- "%Y-%m-%dT%H:%M:%S.%fZ"
|
|
343
|
+
- "%Y-%m-%dT%H:%M:%SZ"
|
|
344
|
+
datetime_format: "%Y-%m-%dT%H:%M:%SZ"
|
|
345
|
+
start_datetime:
|
|
346
|
+
type: MinMaxDatetime
|
|
347
|
+
datetime: "{{ config.get('start_date', '2026-01-01T00:00:00Z') }}"
|
|
348
|
+
datetime_format: "%Y-%m-%dT%H:%M:%SZ"
|
|
349
|
+
step: P30D
|
|
350
|
+
cursor_granularity: PT1S
|
|
351
|
+
schema_loader:
|
|
352
|
+
type: InlineSchemaLoader
|
|
353
|
+
schema:
|
|
354
|
+
$ref: "#/schemas/synthesis_per_card"
|
|
355
|
+
|
|
228
356
|
base_requester:
|
|
229
357
|
type: HttpRequester
|
|
230
358
|
url_base: https://{{ config['project_id'] }}.api.sanity.io
|
|
@@ -235,6 +363,8 @@ definitions:
|
|
|
235
363
|
streams:
|
|
236
364
|
- $ref: "#/definitions/streams/reports"
|
|
237
365
|
- $ref: "#/definitions/streams/area_scores"
|
|
366
|
+
- $ref: "#/definitions/streams/synthesis_summary"
|
|
367
|
+
- $ref: "#/definitions/streams/synthesis_per_card"
|
|
238
368
|
|
|
239
369
|
spec:
|
|
240
370
|
type: Spec
|
|
@@ -299,9 +429,25 @@ metadata:
|
|
|
299
429
|
primaryKeysAreUnique: true
|
|
300
430
|
primaryKeysArePresent: true
|
|
301
431
|
responsesAreSuccessful: true
|
|
432
|
+
synthesis_summary:
|
|
433
|
+
hasRecords: true
|
|
434
|
+
streamHash: null
|
|
435
|
+
hasResponse: true
|
|
436
|
+
primaryKeysAreUnique: true
|
|
437
|
+
primaryKeysArePresent: true
|
|
438
|
+
responsesAreSuccessful: true
|
|
439
|
+
synthesis_per_card:
|
|
440
|
+
hasRecords: true
|
|
441
|
+
streamHash: null
|
|
442
|
+
hasResponse: true
|
|
443
|
+
primaryKeysAreUnique: true
|
|
444
|
+
primaryKeysArePresent: true
|
|
445
|
+
responsesAreSuccessful: true
|
|
302
446
|
autoImportSchema:
|
|
303
447
|
reports: false
|
|
304
448
|
area_scores: false
|
|
449
|
+
synthesis_summary: false
|
|
450
|
+
synthesis_per_card: false
|
|
305
451
|
|
|
306
452
|
# ======================================================================
|
|
307
453
|
# Inline schemas — manually defined to match the designed BigQuery tables.
|
|
@@ -757,3 +903,133 @@ schemas:
|
|
|
757
903
|
- "null"
|
|
758
904
|
description: Sanity document creation timestamp (incremental cursor)
|
|
759
905
|
additionalProperties: true
|
|
906
|
+
|
|
907
|
+
# ------------------------------------------------------------------
|
|
908
|
+
# synthesis_summary schema — flat, one row per report with synthesis telemetry
|
|
909
|
+
# ------------------------------------------------------------------
|
|
910
|
+
synthesis_summary:
|
|
911
|
+
type: object
|
|
912
|
+
$schema: http://json-schema.org/schema#
|
|
913
|
+
required:
|
|
914
|
+
- report_id
|
|
915
|
+
properties:
|
|
916
|
+
report_id:
|
|
917
|
+
type: string
|
|
918
|
+
description: UUID v7 report identifier (primary key)
|
|
919
|
+
completed_at:
|
|
920
|
+
type:
|
|
921
|
+
- string
|
|
922
|
+
- "null"
|
|
923
|
+
description: ISO 8601 timestamp when the evaluation completed
|
|
924
|
+
mode:
|
|
925
|
+
type:
|
|
926
|
+
- string
|
|
927
|
+
- "null"
|
|
928
|
+
description: "Evaluation mode: baseline, observed, or agentic"
|
|
929
|
+
source_name:
|
|
930
|
+
type:
|
|
931
|
+
- string
|
|
932
|
+
- "null"
|
|
933
|
+
description: Documentation source name (e.g., "production")
|
|
934
|
+
grader_model:
|
|
935
|
+
type:
|
|
936
|
+
- string
|
|
937
|
+
- "null"
|
|
938
|
+
description: Model used for LLM grading (context for cost comparison)
|
|
939
|
+
synthesis_cost:
|
|
940
|
+
type:
|
|
941
|
+
- number
|
|
942
|
+
- "null"
|
|
943
|
+
description:
|
|
944
|
+
Total USD cost of the Diagnosis synthesis run (sum of all card costs)
|
|
945
|
+
parse_failure_count:
|
|
946
|
+
type:
|
|
947
|
+
- number
|
|
948
|
+
- "null"
|
|
949
|
+
description:
|
|
950
|
+
Number of cards that failed Zod schema parse during synthesis
|
|
951
|
+
parse_failure_rate:
|
|
952
|
+
type:
|
|
953
|
+
- number
|
|
954
|
+
- "null"
|
|
955
|
+
description:
|
|
956
|
+
Fraction of cards that failed parse (0–1); 0.0 = no failures
|
|
957
|
+
_createdAt:
|
|
958
|
+
type:
|
|
959
|
+
- string
|
|
960
|
+
- "null"
|
|
961
|
+
description:
|
|
962
|
+
Sanity document creation timestamp (used as incremental cursor)
|
|
963
|
+
additionalProperties: true
|
|
964
|
+
|
|
965
|
+
# ------------------------------------------------------------------
|
|
966
|
+
# synthesis_per_card schema — nested per-card array, one row per report
|
|
967
|
+
# ------------------------------------------------------------------
|
|
968
|
+
# BigQuery consumers should UNNEST(JSON_QUERY_ARRAY(per_card)) to get
|
|
969
|
+
# flat rows per (report × card). See bigquery/views/synthesis_parse_failure_rate_7d.sql
|
|
970
|
+
synthesis_per_card:
|
|
971
|
+
type: object
|
|
972
|
+
$schema: http://json-schema.org/schema#
|
|
973
|
+
required:
|
|
974
|
+
- report_id
|
|
975
|
+
properties:
|
|
976
|
+
report_id:
|
|
977
|
+
type: string
|
|
978
|
+
description: UUID v7 report identifier (primary key)
|
|
979
|
+
completed_at:
|
|
980
|
+
type:
|
|
981
|
+
- string
|
|
982
|
+
- "null"
|
|
983
|
+
description: Denormalized timestamp for partitioning
|
|
984
|
+
per_card:
|
|
985
|
+
type:
|
|
986
|
+
- array
|
|
987
|
+
- "null"
|
|
988
|
+
description: >-
|
|
989
|
+
Per-card synthesis metrics array. UNNEST in BigQuery to get one flat
|
|
990
|
+
row per card. card_type identifies the diagnosis card type (e.g.,
|
|
991
|
+
"top-recommendations").
|
|
992
|
+
items:
|
|
993
|
+
type: object
|
|
994
|
+
properties:
|
|
995
|
+
card_type:
|
|
996
|
+
type: string
|
|
997
|
+
description: Diagnosis card type identifier (≤25 chars)
|
|
998
|
+
cost:
|
|
999
|
+
type:
|
|
1000
|
+
- number
|
|
1001
|
+
- "null"
|
|
1002
|
+
description:
|
|
1003
|
+
USD cost of this card's LLM call (null for deterministic cards)
|
|
1004
|
+
parse_failed:
|
|
1005
|
+
type: boolean
|
|
1006
|
+
description: Whether the card's Zod schema parse failed
|
|
1007
|
+
latency_ms:
|
|
1008
|
+
type:
|
|
1009
|
+
- number
|
|
1010
|
+
- "null"
|
|
1011
|
+
description: LLM call latency in milliseconds
|
|
1012
|
+
token_input:
|
|
1013
|
+
type:
|
|
1014
|
+
- number
|
|
1015
|
+
- "null"
|
|
1016
|
+
description: Input tokens consumed by the LLM call
|
|
1017
|
+
token_output:
|
|
1018
|
+
type:
|
|
1019
|
+
- number
|
|
1020
|
+
- "null"
|
|
1021
|
+
description: Output tokens produced by the LLM call
|
|
1022
|
+
card_version:
|
|
1023
|
+
type: string
|
|
1024
|
+
description:
|
|
1025
|
+
Card implementation version (e.g., "area-summary@0.1.0")
|
|
1026
|
+
generated_at:
|
|
1027
|
+
type: string
|
|
1028
|
+
description: ISO 8601 UTC timestamp when this card was generated
|
|
1029
|
+
_createdAt:
|
|
1030
|
+
type:
|
|
1031
|
+
- string
|
|
1032
|
+
- "null"
|
|
1033
|
+
description:
|
|
1034
|
+
Sanity document creation timestamp (used as incremental cursor)
|
|
1035
|
+
additionalProperties: true
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
-- ailf.synthesis_parse_failure_rate_7d — per-card parse-failure rate over 7 days
|
|
2
|
+
--
|
|
3
|
+
-- Computes the Zod-schema parse-failure rate per Diagnosis card type over the
|
|
4
|
+
-- previous 7 days, sourced from the synthesis_per_card Airbyte stream. Any row
|
|
5
|
+
-- returned by this view represents a card type that breached the 2% threshold
|
|
6
|
+
-- defined in D6-18 and should trigger a manual investigation per the runbook.
|
|
7
|
+
--
|
|
8
|
+
-- Source: ailf_raw.synthesis_per_card (Airbyte stream: "synthesis_per_card")
|
|
9
|
+
-- Target: ailf.synthesis_parse_failure_rate_7d (this view)
|
|
10
|
+
--
|
|
11
|
+
-- Threshold: failure_rate > 0.02 (2%) over INTERVAL 7 DAY [D6-18]
|
|
12
|
+
-- To change the threshold, edit the HAVING clause (failure rate) and WHERE clause (7-day window) below;
|
|
13
|
+
-- these two clauses are the only edit points per D6-18 (not lifted to config).
|
|
14
|
+
--
|
|
15
|
+
-- Usage:
|
|
16
|
+
-- bq query --use_legacy_sql=false < views/synthesis_parse_failure_rate_7d.sql
|
|
17
|
+
--
|
|
18
|
+
-- @see docs/runbooks/diagnosis-parse-failure-watch.md — operator runbook
|
|
19
|
+
-- @see packages/eval/config/airbyte/ai_literacy_framework.connector.yaml — synthesis_per_card stream
|
|
20
|
+
|
|
21
|
+
CREATE OR REPLACE VIEW `data-platform-302218.ailf.synthesis_parse_failure_rate_7d` AS
|
|
22
|
+
SELECT
|
|
23
|
+
JSON_VALUE(card, '$.card_type') AS card_type,
|
|
24
|
+
COUNT(*) AS total_runs,
|
|
25
|
+
COUNTIF(SAFE_CAST(JSON_VALUE(card, '$.parse_failed') AS BOOL)) AS parse_failures,
|
|
26
|
+
ROUND(SAFE_DIVIDE(
|
|
27
|
+
COUNTIF(SAFE_CAST(JSON_VALUE(card, '$.parse_failed') AS BOOL)),
|
|
28
|
+
COUNT(*)
|
|
29
|
+
), 4) AS failure_rate
|
|
30
|
+
FROM
|
|
31
|
+
`data-platform-302218.ailf_raw.synthesis_per_card` AS r,
|
|
32
|
+
UNNEST(JSON_QUERY_ARRAY(r.per_card)) AS card
|
|
33
|
+
WHERE
|
|
34
|
+
r.completed_at IS NOT NULL
|
|
35
|
+
AND TIMESTAMP(r.completed_at) >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
|
|
36
|
+
AND JSON_VALUE(card, '$.card_type') IS NOT NULL
|
|
37
|
+
GROUP BY
|
|
38
|
+
card_type
|
|
39
|
+
HAVING
|
|
40
|
+
failure_rate > 0.02
|
|
41
|
+
ORDER BY
|
|
42
|
+
failure_rate DESC
|
|
@@ -200,6 +200,23 @@ export interface ArtifactDescriptor<TEntry = unknown, TPreview = unknown> {
|
|
|
200
200
|
* descriptors. `entryKey` is ignored on versioned bulk paths.
|
|
201
201
|
*/
|
|
202
202
|
readonly objectPath: ArtifactObjectPath;
|
|
203
|
+
/**
|
|
204
|
+
* Extract the positional args (beyond `runId`) to pass to `objectPath`
|
|
205
|
+
* when this descriptor needs more than `runId` to build its path.
|
|
206
|
+
*
|
|
207
|
+
* The default plain-bulk path is `objectPath(runId)`. The default
|
|
208
|
+
* per-entry path is `objectPath(runId, entryKey)`. Descriptors whose
|
|
209
|
+
* path needs additional axes from `association` and/or fields from the
|
|
210
|
+
* payload (e.g. the bulk-versioned-with-report-axis carve-out used by
|
|
211
|
+
* the diagnosis descriptor — see `BULK_VERSIONED_WITH_REPORT_AXIS`)
|
|
212
|
+
* set this function. The writer calls it and spreads the result:
|
|
213
|
+
* `objectPath(runId, ...extractPathArgs(association, payload))`.
|
|
214
|
+
*
|
|
215
|
+
* Returning `[undefined]` (or any `undefined` element) is fine — the
|
|
216
|
+
* underlying builder is expected to throw a meaningful error in that
|
|
217
|
+
* case, which the writer surfaces via its existing try/catch.
|
|
218
|
+
*/
|
|
219
|
+
readonly extractPathArgs?: (association: AssociationValues, payload: unknown) => readonly (string | undefined)[];
|
|
203
220
|
/**
|
|
204
221
|
* Build a filename-safe entry key from association values. Only meaningful
|
|
205
222
|
* for `layout === "per-entry"` — bulk descriptors omit it.
|
|
@@ -674,6 +674,7 @@ function buildDescriptor(input) {
|
|
|
674
674
|
versionedBy: input.versionedBy,
|
|
675
675
|
pathSafetyMarker: input.pathSafetyMarker,
|
|
676
676
|
objectPath,
|
|
677
|
+
extractPathArgs: input.extractPathArgs,
|
|
677
678
|
formatEntryKey,
|
|
678
679
|
parseEntryKey,
|
|
679
680
|
manifestPreview: input.manifestPreview,
|
|
@@ -1185,6 +1186,19 @@ export const ARTIFACT_REGISTRY = {
|
|
|
1185
1186
|
writePolicy: "post-hoc",
|
|
1186
1187
|
versionedBy: "diagnosisVersion",
|
|
1187
1188
|
objectPath: diagnosisPathBuilder(),
|
|
1189
|
+
// The diagnosis path builder takes (runId, reportId, version). The
|
|
1190
|
+
// reportId comes from association.report; the version is a compound
|
|
1191
|
+
// `diagnosisVersion|cardVersion` synthesized from the payload's
|
|
1192
|
+
// `inputs` field via `encodeDiagnosisPathVersion`. Writers spread
|
|
1193
|
+
// this beyond `runId` when calling `objectPath`.
|
|
1194
|
+
extractPathArgs: (assoc, payload) => {
|
|
1195
|
+
const reportId = typeof assoc.report === "string" ? assoc.report : undefined;
|
|
1196
|
+
const diag = payload;
|
|
1197
|
+
const dv = diag?.inputs?.diagnosisVersion;
|
|
1198
|
+
const cv = diag?.inputs?.cardVersion;
|
|
1199
|
+
const version = dv && cv ? encodeDiagnosisPathVersion(dv, cv) : undefined;
|
|
1200
|
+
return [reportId, version];
|
|
1201
|
+
},
|
|
1188
1202
|
// Defense-in-depth: this descriptor's axes (`run`, `report`) are both
|
|
1189
1203
|
// bounded, so the `assertValidArtifactDescriptor` unbounded-axis rule
|
|
1190
1204
|
// does not fire and the carve-out is never consulted at module load
|
|
@@ -237,6 +237,14 @@ export interface ResolvedConfig {
|
|
|
237
237
|
* Sourced from AILF_ARTIFACT_UPLOAD env var or `artifactUpload` in ailf.config.ts.
|
|
238
238
|
*/
|
|
239
239
|
artifactUpload?: boolean;
|
|
240
|
+
/**
|
|
241
|
+
* Post-run diagnosis summary policy (Phase 6 / DIAG-06). Carried from
|
|
242
|
+
* `.ailf/config.yaml` summary.onRun via RepoConfigSchema (CLI path) or
|
|
243
|
+
* EvalConfigSchema (file-config path). Precedence resolution is deferred to
|
|
244
|
+
* shouldRunPostSummary() at execution time — this field carries only the
|
|
245
|
+
* config-file signal.
|
|
246
|
+
*/
|
|
247
|
+
summaryOnRun?: "auto" | "always" | "never";
|
|
240
248
|
}
|
|
241
249
|
/**
|
|
242
250
|
* Application context — the complete dependency carrier.
|
|
@@ -331,6 +339,20 @@ export interface ReportStorePort {
|
|
|
331
339
|
findComparableBaseline(query: unknown): Promise<null | unknown>;
|
|
332
340
|
/** Write a report to the store */
|
|
333
341
|
write(report: unknown): Promise<unknown>;
|
|
342
|
+
/** Read a report by its ID (used by the post-run diagnosis hook). */
|
|
343
|
+
read(id: string): Promise<null | unknown>;
|
|
344
|
+
/** Patch synthesis telemetry onto a published report (Phase 6 / DIAG-06). */
|
|
345
|
+
patchSynthesis(id: string, telemetry: unknown): Promise<void>;
|
|
346
|
+
/**
|
|
347
|
+
* Patch a single artifact-manifest entry onto a published report.
|
|
348
|
+
*
|
|
349
|
+
* Used by deferred commands (e.g. `ailf interpret`) whose post-hoc writer
|
|
350
|
+
* produces a new ArtifactRef after the doc was already published. The
|
|
351
|
+
* pipeline path lifts the full manifest at publish time
|
|
352
|
+
* (publish-report-step); this is the post-hoc equivalent for one slot.
|
|
353
|
+
* Non-fatal on Sanity failure — mirrors `patchSynthesis`.
|
|
354
|
+
*/
|
|
355
|
+
patchArtifactManifest(id: string, slot: string, ref: unknown): Promise<void>;
|
|
334
356
|
}
|
|
335
357
|
/**
|
|
336
358
|
* Minimal report sink interface used by AppContext.
|
|
@@ -88,6 +88,13 @@ export declare const EvalConfigSchema: z.ZodObject<{
|
|
|
88
88
|
skipEval: z.ZodOptional<z.ZodBoolean>;
|
|
89
89
|
skipFetch: z.ZodOptional<z.ZodBoolean>;
|
|
90
90
|
source: z.ZodOptional<z.ZodString>;
|
|
91
|
+
summary: z.ZodOptional<z.ZodObject<{
|
|
92
|
+
onRun: z.ZodOptional<z.ZodEnum<{
|
|
93
|
+
never: "never";
|
|
94
|
+
always: "always";
|
|
95
|
+
auto: "auto";
|
|
96
|
+
}>>;
|
|
97
|
+
}, z.core.$strip>>;
|
|
91
98
|
tasks: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
92
99
|
urls: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
93
100
|
presets: z.ZodOptional<z.ZodArray<z.ZodString>>;
|
|
@@ -175,6 +175,14 @@ export const EvalConfigSchema = z
|
|
|
175
175
|
skipFetch: z.boolean().optional(),
|
|
176
176
|
/** Documentation source name */
|
|
177
177
|
source: z.string().optional(),
|
|
178
|
+
/**
|
|
179
|
+
* Post-run diagnosis summary policy (Phase 6 / DIAG-06). Mirrors
|
|
180
|
+
* `RepoConfigSchema`'s `summary` block for `--config <path>` parity.
|
|
181
|
+
* Precedence resolved at the CLI layer by `shouldRunPostSummary()`.
|
|
182
|
+
*/
|
|
183
|
+
summary: z
|
|
184
|
+
.object({ onRun: z.enum(["auto", "always", "never"]).optional() })
|
|
185
|
+
.optional(),
|
|
178
186
|
/** Task ID filter */
|
|
179
187
|
tasks: z.array(z.string()).optional(),
|
|
180
188
|
/** Doc source URL overrides */
|
package/dist/_vendor/ailf-core/services/diagnosis/cards/__tests__/failure-mode-summary.test.js
CHANGED
|
@@ -169,3 +169,62 @@ describe("generateFailureModeSummary — empty failure modes (Test 9)", () => {
|
|
|
169
169
|
}
|
|
170
170
|
});
|
|
171
171
|
});
|
|
172
|
+
describe("generateFailureModeSummary — cross-cutting mode resolution", () => {
|
|
173
|
+
// `missing-docs` appears in both LITERACY_FAILURE_MODES and MCP_FAILURE_MODES.
|
|
174
|
+
// Before the per-EvalMode preference table, linear scan of CANONICAL_DIMENSIONS
|
|
175
|
+
// always resolved it to task-completion, mislabelling MCP-only runs.
|
|
176
|
+
function withMode(report, mode) {
|
|
177
|
+
return {
|
|
178
|
+
...report,
|
|
179
|
+
provenance: { ...report.provenance, mode },
|
|
180
|
+
};
|
|
181
|
+
}
|
|
182
|
+
it("resolves 'missing-docs' to mcp-behavior on an mcp-server report", async () => {
|
|
183
|
+
const base = makeReport({
|
|
184
|
+
counts: { "mcp-behavior": 7 },
|
|
185
|
+
topTitles: [
|
|
186
|
+
{
|
|
187
|
+
id: "mcp-behavior::missing-docs",
|
|
188
|
+
category: "missing-docs",
|
|
189
|
+
severity: "high",
|
|
190
|
+
title: "missing-docs",
|
|
191
|
+
count: 7,
|
|
192
|
+
},
|
|
193
|
+
],
|
|
194
|
+
totalJudgments: 20,
|
|
195
|
+
classificationRate: 0.35,
|
|
196
|
+
});
|
|
197
|
+
const report = withMode(base, "mcp-server");
|
|
198
|
+
const card = await generateFailureModeSummary(report, makeCtx());
|
|
199
|
+
expect(card.status).toBe("ready");
|
|
200
|
+
if (card.status === "ready") {
|
|
201
|
+
const body = card.body;
|
|
202
|
+
expect(body.dimension).toBe("mcp-behavior");
|
|
203
|
+
expect(body.failureMode).toBe("missing-docs");
|
|
204
|
+
}
|
|
205
|
+
});
|
|
206
|
+
it("resolves 'missing-docs' to task-completion on a literacy report", async () => {
|
|
207
|
+
const base = makeReport({
|
|
208
|
+
counts: { "task-completion": 7 },
|
|
209
|
+
topTitles: [
|
|
210
|
+
{
|
|
211
|
+
id: "task-completion::missing-docs",
|
|
212
|
+
category: "missing-docs",
|
|
213
|
+
severity: "high",
|
|
214
|
+
title: "missing-docs",
|
|
215
|
+
count: 7,
|
|
216
|
+
},
|
|
217
|
+
],
|
|
218
|
+
totalJudgments: 20,
|
|
219
|
+
classificationRate: 0.35,
|
|
220
|
+
});
|
|
221
|
+
const report = withMode(base, "literacy");
|
|
222
|
+
const card = await generateFailureModeSummary(report, makeCtx());
|
|
223
|
+
expect(card.status).toBe("ready");
|
|
224
|
+
if (card.status === "ready") {
|
|
225
|
+
const body = card.body;
|
|
226
|
+
expect(body.dimension).toBe("task-completion");
|
|
227
|
+
expect(body.failureMode).toBe("missing-docs");
|
|
228
|
+
}
|
|
229
|
+
});
|
|
230
|
+
});
|
|
@@ -79,7 +79,9 @@ export const generateDocAttributionSpotlight = async (report, ctx) => {
|
|
|
79
79
|
.max(5),
|
|
80
80
|
});
|
|
81
81
|
const prompt = buildDocAttributionSpotlightPrompt(report, ctx.judgmentAttributions);
|
|
82
|
-
|
|
82
|
+
// Destructure `cost` and `model` from the LLMClient return —
|
|
83
|
+
// already provided per llm-client.ts:139-144, previously discarded.
|
|
84
|
+
const { value, usage, cost, model } = await ctx.llm.completeStructured({
|
|
83
85
|
model: CARD_MODEL,
|
|
84
86
|
prompt: `${prompt.system}\n\n${prompt.user}`,
|
|
85
87
|
schema: PerCallSchema,
|
|
@@ -99,6 +101,8 @@ export const generateDocAttributionSpotlight = async (report, ctx) => {
|
|
|
99
101
|
cardVersion: "doc-attribution-spotlight@0.1.0",
|
|
100
102
|
tokenUsage: { input: usage.promptTokens, output: usage.completionTokens },
|
|
101
103
|
generatedAt: new Date().toISOString(),
|
|
104
|
+
cost,
|
|
105
|
+
model,
|
|
102
106
|
},
|
|
103
107
|
};
|
|
104
108
|
};
|
|
@@ -38,10 +38,54 @@ export const FailureModeSummaryBodySchema = z
|
|
|
38
38
|
// Private helper — find the dimension a failure mode belongs to
|
|
39
39
|
// ---------------------------------------------------------------------------
|
|
40
40
|
/**
|
|
41
|
-
*
|
|
41
|
+
* Per-family dimension preference order.
|
|
42
|
+
*
|
|
43
|
+
* Some failure modes (e.g., `missing-docs`) appear in multiple dimension
|
|
44
|
+
* families. When we know the report's eval mode, we should resolve the failure mode
|
|
45
|
+
* to a dimension in the matching family first, falling back to the linear
|
|
46
|
+
* scan only when the eval mode's preferred family doesn't carry the failure mode.
|
|
47
|
+
*/
|
|
48
|
+
const MODE_TO_PREFERRED_DIMENSIONS = {
|
|
49
|
+
literacy: ["task-completion", "code-correctness", "doc-coverage"],
|
|
50
|
+
"mcp-server": [
|
|
51
|
+
"mcp-behavior",
|
|
52
|
+
"input-validation",
|
|
53
|
+
"output-correctness",
|
|
54
|
+
"error-handling",
|
|
55
|
+
"security",
|
|
56
|
+
],
|
|
57
|
+
"knowledge-probe": [
|
|
58
|
+
"knowledge-probe",
|
|
59
|
+
"factual-correctness",
|
|
60
|
+
"completeness",
|
|
61
|
+
"currency",
|
|
62
|
+
],
|
|
63
|
+
"agent-harness": [
|
|
64
|
+
"agent-harness",
|
|
65
|
+
"process-quality",
|
|
66
|
+
"agent-output",
|
|
67
|
+
"tool-usage",
|
|
68
|
+
],
|
|
69
|
+
custom: [],
|
|
70
|
+
};
|
|
71
|
+
/**
|
|
72
|
+
* Find a canonical dimension whose taxonomy includes `mode`. When a
|
|
73
|
+
* `preferredEvalMode` is supplied, prefer dimensions in the eval mode's
|
|
74
|
+
* family — e.g. a mode appearing in both literacy and MCP resolves to MCP on
|
|
75
|
+
* an MCP-only run. Falls back to the linear scan of CANONICAL_DIMENSIONS so
|
|
76
|
+
* cross-cutting failure modes (and reports with an unknown eval mode) still resolve.
|
|
77
|
+
*
|
|
42
78
|
* Returns `undefined` if the mode is not in any dimension's taxonomy.
|
|
43
79
|
*/
|
|
44
|
-
function findDimensionForMode(mode) {
|
|
80
|
+
function findDimensionForMode(mode, preferredEvalMode) {
|
|
81
|
+
if (preferredEvalMode) {
|
|
82
|
+
const preferred = MODE_TO_PREFERRED_DIMENSIONS[preferredEvalMode] ?? [];
|
|
83
|
+
for (const dim of preferred) {
|
|
84
|
+
if (failureModesForDimension(dim).includes(mode)) {
|
|
85
|
+
return dim;
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
45
89
|
for (const dim of CANONICAL_DIMENSIONS) {
|
|
46
90
|
if (failureModesForDimension(dim).includes(mode)) {
|
|
47
91
|
return dim;
|
|
@@ -67,7 +111,7 @@ export const generateFailureModeSummary = async (report) => {
|
|
|
67
111
|
// Find the top entry — topTitles is already sorted by count descending
|
|
68
112
|
const topEntry = slimFm.topTitles.reduce((best, entry) => (entry.count > best.count ? entry : best), slimFm.topTitles[0]);
|
|
69
113
|
const failureMode = topEntry.category;
|
|
70
|
-
const dimension = findDimensionForMode(failureMode);
|
|
114
|
+
const dimension = findDimensionForMode(failureMode, report.provenance?.mode);
|
|
71
115
|
if (!dimension) {
|
|
72
116
|
return {
|
|
73
117
|
status: "missing",
|
|
@@ -25,6 +25,16 @@ import { generateDocAttributionSpotlight } from "./doc-attribution-spotlight.js"
|
|
|
25
25
|
import { generateRegressionVsBaseline } from "./regression-vs-baseline.js";
|
|
26
26
|
import type { CardGenerator } from "../../diagnosis-runner.js";
|
|
27
27
|
import type { CardType } from "../../../types/diagnosis.js";
|
|
28
|
+
/**
|
|
29
|
+
* Canonical version of the card-registry surface — bumped whenever any card
|
|
30
|
+
* generator or body schema in this barrel changes. Used as the `cardVersion`
|
|
31
|
+
* fallback in version-resolver helpers (CLI `interpret`, API
|
|
32
|
+
* `versionsFromRecord`) so the four-version cache invalidation envelope stays
|
|
33
|
+
* in sync with the actual registry.
|
|
34
|
+
*
|
|
35
|
+
* Mirrors the pattern of `diagnosisVersion` exported from `diagnosis-runner.ts`.
|
|
36
|
+
*/
|
|
37
|
+
export declare const CARD_REGISTRY_VERSION = "0.1.0";
|
|
28
38
|
/**
|
|
29
39
|
* The canonical card-generator registry for the diagnosis engine.
|
|
30
40
|
*
|
|
@@ -24,6 +24,19 @@ import { generateLowConfidenceAttribution } from "./low-confidence-attribution.j
|
|
|
24
24
|
import { generateDocAttributionSpotlight } from "./doc-attribution-spotlight.js";
|
|
25
25
|
import { generateRegressionVsBaseline } from "./regression-vs-baseline.js";
|
|
26
26
|
// ---------------------------------------------------------------------------
|
|
27
|
+
// Card registry version (cache invalidation segment)
|
|
28
|
+
// ---------------------------------------------------------------------------
|
|
29
|
+
/**
|
|
30
|
+
* Canonical version of the card-registry surface — bumped whenever any card
|
|
31
|
+
* generator or body schema in this barrel changes. Used as the `cardVersion`
|
|
32
|
+
* fallback in version-resolver helpers (CLI `interpret`, API
|
|
33
|
+
* `versionsFromRecord`) so the four-version cache invalidation envelope stays
|
|
34
|
+
* in sync with the actual registry.
|
|
35
|
+
*
|
|
36
|
+
* Mirrors the pattern of `diagnosisVersion` exported from `diagnosis-runner.ts`.
|
|
37
|
+
*/
|
|
38
|
+
export const CARD_REGISTRY_VERSION = "0.1.0";
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
27
40
|
// DIAGNOSIS_CARD_GENERATORS — full 8-card registry literal
|
|
28
41
|
// ---------------------------------------------------------------------------
|
|
29
42
|
/**
|