open-research-protocol 0.4.9 → 0.4.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -1
- package/bin/orp-compute.mjs +341 -0
- package/bin/orp.js +58 -15
- package/cli/orp.py +242 -36
- package/docs/ORP_YOUTUBE_INSPECT.md +10 -1
- package/llms.txt +1 -1
- package/package.json +1 -1
- package/scripts/orp-kernel-benchmark.py +8 -8
- package/spec/v1/youtube-source.schema.json +49 -8
package/README.md
CHANGED
|
@@ -58,6 +58,7 @@ ORP should feel like one CLI with built-in abilities:
|
|
|
58
58
|
- `collaborate` for repository collaboration setup and workflow execution
|
|
59
59
|
- `erdos` for Erdos-specific data and workflow support
|
|
60
60
|
- `report` and `packet` for ORP artifacts
|
|
61
|
+
- `compute` for targeted compute admission, local execution, and paid approval gating
|
|
61
62
|
|
|
62
63
|
The `pack` layer still exists, but it is now an advanced/internal surface rather
|
|
63
64
|
than the main product story.
|
|
@@ -145,6 +146,8 @@ orp pack install --pack-id erdos-open-problems --json
|
|
|
145
146
|
orp pack fetch --source <git-url> --pack-id <pack-id> --install-target . --json
|
|
146
147
|
orp gate run --profile default --json
|
|
147
148
|
orp packet emit --profile default --json
|
|
149
|
+
orp compute decide --input orp.compute.json --json
|
|
150
|
+
orp compute run-local --input orp.compute.json --task orp.compute.task.json --json
|
|
148
151
|
orp report summary --json
|
|
149
152
|
```
|
|
150
153
|
|
|
@@ -153,7 +156,8 @@ These surfaces are meant to help automated systems discover ORP quickly:
|
|
|
153
156
|
- bare `orp` opens a home screen with repo/runtime status, available packs, and next commands
|
|
154
157
|
- `orp home --json` returns the same landing context in machine-readable form
|
|
155
158
|
- `orp auth ...`, `orp ideas ...`, `orp world ...`, `orp checkpoint ...`, `orp runner ...`, and `orp agent ...` expose the hosted workspace surface directly through ORP
|
|
156
|
-
- `orp
|
|
159
|
+
- `orp compute ...` exposes targeted-compute admission, local execution, and paid-approval gating through a stable ORP wrapper surface
|
|
160
|
+
- `orp youtube inspect ...` exposes public YouTube metadata plus full transcript ingestion through a stable ORP artifact shape for agent use when caption tracks are available
|
|
157
161
|
- `orp init`, `orp status`, `orp branch start`, `orp checkpoint create`, `orp backup`, `orp ready`, `orp doctor`, and `orp cleanup` expose the local-first repo governance surface directly through ORP
|
|
158
162
|
- `orp discover ...` exposes profile-based GitHub scanning as a built-in ORP ability
|
|
159
163
|
- `orp collaborate ...` exposes built-in collaboration setup and workflow execution without asking users to think in terms of separate governance packs
|
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import fs from "node:fs/promises";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import process from "node:process";
|
|
6
|
+
import {
|
|
7
|
+
buildOrpComputeGateResult,
|
|
8
|
+
buildOrpComputePacket,
|
|
9
|
+
defineComputePacket,
|
|
10
|
+
defineDecision,
|
|
11
|
+
defineImpactRead,
|
|
12
|
+
definePolicy,
|
|
13
|
+
defineResultBundle,
|
|
14
|
+
defineRung,
|
|
15
|
+
evaluateDispatch,
|
|
16
|
+
runLocalShellPacket,
|
|
17
|
+
} from "breakthroughs";
|
|
18
|
+
|
|
19
|
+
/**
 * Print the usage text for the `orp compute` wrapper to stdout.
 *
 * The text documents the two subcommands (`decide`, `run-local`), the expected
 * shape of the `--input` and `--task` JSON files, and the policy semantics.
 */
function printHelp() {
  const helpLines = [
    `ORP compute`,
    ``,
    `Usage:`,
    `  orp compute decide --input <path> [--packet-out <path>] [--json]`,
    `  orp compute run-local --input <path> --task <path> [--receipt-out <path>] [--packet-out <path>] [--json]`,
    ``,
    `Input JSON shape:`,
    `  {`,
    `    "decision": { ... },`,
    `    "rung": { ... },`,
    `    "policy": { ... },`,
    `    "packet": { ... },`,
    `    "repo": {`,
    `      "rootPath": "/abs/path",`,
    `      "git": { "branch": "main", "commit": "abc123" }`,
    `    },`,
    `    "orp": {`,
    `      "boardId": "targeted_compute",`,
    `      "problemId": "adult-vs-developmental-rgc",`,
    `      "artifactRoot": "orp/artifacts"`,
    `    }`,
    `  }`,
    ``,
    `Task JSON shape for run-local:`,
    `  {`,
    `    "command": "node",`,
    `    "args": ["-e", "console.log('hello')"],`,
    `    "cwd": "/abs/path",`,
    `    "timeoutMs": 30000,`,
    `    "env": {}`,
    `  }`,
    ``,
    `Policy semantics:`,
    `  - local admitted rungs can resolve to run_local`,
    `  - paid admitted rungs resolve to request_paid_approval unless the rung is explicitly approved in policy.paid.approvedRungs`,
    // Trailing empty entry keeps the final newline of the help text.
    ``,
  ];
  console.log(helpLines.join("\n"));
}
|
|
57
|
+
|
|
58
|
+
/**
 * Parse the argument vector of `orp compute`.
 *
 * Recognised forms:
 *   - `--json`                 sets `options.json = true`
 *   - `-h` / `--help`          sets `options.help = true`
 *   - `--some-flag value`      stored as `options.someFlag = "value"`
 *   - `--some-flag=value`      same as above, in a single token
 *   - first bare word          stored as `options.command`
 *
 * @param {string[]} argv - Arguments after `orp compute`.
 * @returns {object} Parsed options; always has a boolean `json` field.
 * @throws {Error} When a flag has no value or a second bare word is given.
 */
function parseArgs(argv) {
  const options = {
    json: false,
  };

  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];

    if (arg === "--json") {
      options.json = true;
      continue;
    }
    if (arg === "-h" || arg === "--help") {
      options.help = true;
      continue;
    }

    if (arg.startsWith("--")) {
      // Split `--flag=value` on the FIRST `=` so values may contain `=`.
      const equalsAt = arg.indexOf("=");
      const flag = equalsAt === -1 ? arg : arg.slice(0, equalsAt);
      // kebab-case flag names become camelCase option keys.
      const key = flag.slice(2).replace(/-([a-z])/g, (_, ch) => ch.toUpperCase());

      let value;
      if (equalsAt === -1) {
        value = argv[i + 1];
        if (value == null || value.startsWith("--")) {
          throw new Error(`missing value for ${flag}`);
        }
        // The value token has been consumed; skip it.
        i += 1;
      } else {
        value = arg.slice(equalsAt + 1);
        if (value === "") {
          throw new Error(`missing value for ${flag}`);
        }
      }

      options[key] = value;
      continue;
    }

    // Only one positional argument (the subcommand) is accepted.
    if (options.command) {
      throw new Error(`unexpected argument: ${arg}`);
    }
    options.command = arg;
  }

  return options;
}
|
|
94
|
+
|
|
95
|
+
/**
 * Read a UTF-8 file and parse its contents as JSON.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {Promise<any>} The parsed JSON value.
 * @throws Rejects when the file cannot be read or does not contain valid JSON.
 */
async function readJson(filePath) {
  return JSON.parse(await fs.readFile(filePath, "utf8"));
}
|
|
99
|
+
|
|
100
|
+
/**
 * Serialize `payload` as 2-space-indented JSON (with a trailing newline) and
 * write it to `filePath`, creating missing parent directories first.
 *
 * @param {string} filePath - Destination file path.
 * @param {any} payload - JSON-serializable value.
 * @returns {Promise<void>}
 */
async function writeJson(filePath, payload) {
  const parentDir = path.dirname(filePath);
  await fs.mkdir(parentDir, { recursive: true });

  const serialized = [JSON.stringify(payload, null, 2), ""].join("\n");
  await fs.writeFile(filePath, serialized, "utf8");
}
|
|
104
|
+
|
|
105
|
+
/**
 * Turn the parsed `--input` document into the context object consumed by the
 * dispatch helpers.
 *
 * Each section (`decision`, `rung`, `policy`, `packet`) is passed through the
 * matching `define*` helper imported from `breakthroughs`; the untouched input
 * is kept under `raw` so callers can read the optional `repo` / `orp` sections.
 *
 * @param {object} raw - Parsed input JSON.
 * @returns {{raw: object, decision: any, rung: any, policy: any, packet: any}}
 * @throws {Error} When `raw` is not a plain JSON object (null, primitive, or array).
 */
function buildContext(raw) {
  // `typeof [] === "object"`, so arrays must be excluded explicitly: a JSON
  // array is not the object shape the help text documents.
  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
    throw new Error("input must be a JSON object");
  }

  return {
    raw,
    decision: defineDecision(raw.decision),
    rung: defineRung(raw.rung),
    policy: definePolicy(raw.policy),
    packet: defineComputePacket(raw.packet),
  };
}
|
|
118
|
+
|
|
119
|
+
/**
 * Build a human-readable command line for a gate result, e.g.
 * `orp compute decide --input in.json`.
 *
 * Only `--input` and `--task` are echoed, and only when they are set.
 *
 * @param {string} subcommand - The compute subcommand name.
 * @param {{input?: string, task?: string}} options - Parsed CLI options.
 * @returns {string} Space-joined command label.
 */
function commandLabel(subcommand, options) {
  const echoedFlags = [
    ["--input", options.input],
    ["--task", options.task],
  ];
  const flagTokens = echoedFlags
    .filter(([, value]) => value)
    .flatMap(([flag, value]) => [flag, value]);

  return ["orp", "compute", subcommand, ...flagTokens].join(" ");
}
|
|
129
|
+
|
|
130
|
+
/**
 * Map a dispatch action to the gate status recorded for it.
 *
 * @param {string} action - Dispatch action name.
 * @returns {"fail"|"hold"|"pass"} `fail` for `hold_packet`, `hold` for
 *   `request_paid_approval`, `pass` for anything else.
 */
function gateStatusForDispatch(action) {
  switch (action) {
    case "hold_packet":
      return "fail";
    case "request_paid_approval":
      return "hold";
    default:
      return "pass";
  }
}
|
|
139
|
+
|
|
140
|
+
/**
 * Produce a one-line, human-readable summary of a dispatch result.
 *
 * @param {{action: string, rungId?: string, reason?: string}} dispatchResult
 * @returns {string} Summary sentence chosen by `dispatchResult.action`.
 */
function summarizeDispatch(dispatchResult) {
  const { action } = dispatchResult;

  switch (action) {
    case "request_paid_approval":
      return `compute packet requires explicit paid approval for rung ${dispatchResult.rungId}`;
    case "hold_packet":
      return `compute packet is being held because ${dispatchResult.reason}`;
    default:
      return `compute packet admitted with action ${action}`;
  }
}
|
|
149
|
+
|
|
150
|
+
/**
 * Implement `orp compute decide`.
 *
 * Reads the `--input` document, evaluates the dispatch decision, wraps it in a
 * gate result, optionally writes an ORP packet to `--packet-out`, and prints
 * either the full JSON payload (`--json`) or a one-line summary.
 *
 * @param {object} options - Parsed CLI options (`input`, `packetOut`, `json`).
 * @returns {Promise<number>} Exit code: 1 when the action is `hold_packet`, else 0.
 * @throws {Error} When `--input` is missing; read/parse errors propagate.
 */
async function runDecide(options) {
  if (!options.input) {
    throw new Error("compute decide requires --input <path>");
  }

  const context = buildContext(await readJson(options.input));
  const dispatchResult = evaluateDispatch(context);
  const held = dispatchResult.action === "hold_packet";
  const summary = summarizeDispatch(dispatchResult);

  const gateResult = buildOrpComputeGateResult({
    gateId: context.packet.rungId,
    command: commandLabel("decide", options),
    status: gateStatusForDispatch(dispatchResult.action),
    evidenceNote: summary,
  });

  const payload = {
    ok: !held,
    command: "compute decide",
    dispatch_result: dispatchResult,
    gate_result: gateResult,
  };

  if (options.packetOut) {
    // `repo` and `orp` are optional sections of the input document.
    const { repo, orp } = context.raw;
    const orpPacket = buildOrpComputePacket({
      repoRoot: repo?.rootPath || process.cwd(),
      repoGit: repo?.git,
      decision: context.decision,
      packet: context.packet,
      dispatchResult,
      gateResults: [gateResult],
      artifactRoot: orp?.artifactRoot,
      boardId: orp?.boardId,
      problemId: orp?.problemId,
      stateNote: summary,
    });
    await writeJson(options.packetOut, orpPacket);
    payload.orp_packet_path = path.resolve(options.packetOut);
  }

  const output = options.json
    ? JSON.stringify(payload, null, 2)
    : `${dispatchResult.action}: ${summary}`;
  console.log(output);

  return held ? 1 : 0;
}
|
|
196
|
+
|
|
197
|
+
/**
 * Implement `orp compute run-local`.
 *
 * Reads the `--input` document and the `--task` description, refuses to run
 * unless the dispatch action is `run_local`, executes the task through
 * `runLocalShellPacket`, and reports a gate result, result bundle and impact
 * read. Optionally writes the execution receipt (`--receipt-out`) and an ORP
 * packet (`--packet-out`).
 *
 * @param {object} options - Parsed CLI options (`input`, `task`, `receiptOut`,
 *   `packetOut`, `json`).
 * @returns {Promise<number>} Exit code. 1 when the packet is not locally
 *   runnable or the receipt has no exit code; otherwise the receipt's exit
 *   code, except that a non-"pass" receipt never yields 0.
 * @throws {Error} When `--input` or `--task` is missing; read/parse errors propagate.
 */
async function runLocal(options) {
  if (!options.input) {
    throw new Error("compute run-local requires --input <path>");
  }
  if (!options.task) {
    throw new Error("compute run-local requires --task <path>");
  }

  const context = buildContext(await readJson(options.input));
  const task = await readJson(options.task);
  const dispatchResult = evaluateDispatch(context);

  if (dispatchResult.action !== "run_local") {
    const message = `compute packet is not locally runnable; dispatch action is ${dispatchResult.action}`;
    if (options.json) {
      console.log(JSON.stringify({ ok: false, error: message, dispatch_result: dispatchResult }, null, 2));
    } else {
      console.error(message);
    }
    return 1;
  }

  const executionReceipt = await runLocalShellPacket({
    decision: context.decision,
    rung: context.rung,
    packet: context.packet,
    dispatchResult,
    task,
  });
  const passed = executionReceipt.status === "pass";

  const gateResult = buildOrpComputeGateResult({
    gateId: context.packet.rungId,
    command: `${executionReceipt.command} ${executionReceipt.args.join(" ")}`.trim(),
    status: passed ? "pass" : "fail",
    // A missing exit code is recorded as a generic failure code.
    exitCode: executionReceipt.exitCode == null ? 1 : executionReceipt.exitCode,
    durationMs: executionReceipt.durationMs,
    evidenceNote: `local shell execution completed with status ${executionReceipt.status}`,
  });

  const resultBundle = defineResultBundle({
    id: `${context.packet.id}-result`,
    packetId: context.packet.id,
    outputs: context.packet.requiredOutputs,
    status: executionReceipt.status,
    metrics: {
      exitCode: executionReceipt.exitCode,
      durationMs: executionReceipt.durationMs,
      timedOut: executionReceipt.timedOut,
    },
  });

  const impactRead = defineImpactRead({
    id: `${context.packet.id}-impact`,
    bundleId: resultBundle.id,
    nextAction: passed ? "review_result_bundle" : "reroute_or_debug",
    summary: passed
      ? `local compute packet ${context.packet.id} completed successfully`
      : `local compute packet ${context.packet.id} failed and needs follow-up`,
  });

  const payload = {
    ok: passed,
    command: "compute run-local",
    dispatch_result: dispatchResult,
    execution_receipt: executionReceipt,
    gate_result: gateResult,
    result_bundle: resultBundle,
    impact_read: impactRead,
  };

  if (options.receiptOut) {
    await writeJson(options.receiptOut, executionReceipt);
    payload.execution_receipt_path = path.resolve(options.receiptOut);
  }

  if (options.packetOut) {
    const orpPacket = buildOrpComputePacket({
      repoRoot: context.raw.repo?.rootPath || process.cwd(),
      repoGit: context.raw.repo?.git,
      decision: context.decision,
      packet: context.packet,
      dispatchResult,
      resultBundle,
      impactRead,
      gateResults: [gateResult],
      artifactRoot: context.raw.orp?.artifactRoot,
      boardId: context.raw.orp?.boardId,
      problemId: context.raw.orp?.problemId,
      extraPaths: options.receiptOut ? [path.resolve(options.receiptOut)] : [],
      stateNote: impactRead.summary,
    });
    await writeJson(options.packetOut, orpPacket);
    payload.orp_packet_path = path.resolve(options.packetOut);
  }

  if (options.json) {
    console.log(JSON.stringify(payload, null, 2));
  } else {
    console.log(`${executionReceipt.status}: ${impactRead.summary}`);
  }

  if (executionReceipt.exitCode == null) {
    return 1;
  }
  // Keep the process exit code consistent with `payload.ok`: a receipt that is
  // not "pass" must not surface as exit code 0 to shell callers.
  if (!passed && executionReceipt.exitCode === 0) {
    return 1;
  }
  return executionReceipt.exitCode;
}
|
|
304
|
+
|
|
305
|
+
/**
 * Entry point of the `orp compute` wrapper.
 *
 * Parses `argv`, prints help when asked (or when no subcommand is given), and
 * dispatches to the `decide` / `run-local` handlers. Errors raised by a
 * handler are reported as `{ ok: false, error }` JSON on stdout when `--json`
 * was given, otherwise as plain text on stderr.
 *
 * @param {string[]} [argv] - Arguments after `orp compute`; defaults to the
 *   process arguments.
 * @returns {Promise<number>} Process exit code (0 on success/help, 1 on error,
 *   or whatever the handler returns).
 */
export async function runComputeCli(argv = process.argv.slice(2)) {
  const describe = (error) => String(error.message || error);

  let options;
  try {
    options = parseArgs(argv);
  } catch (error) {
    // Argument errors are always plain text: `--json` may not have been parsed.
    console.error(describe(error));
    printHelp();
    return 1;
  }

  const wantsHelp = options.help || !options.command;
  if (wantsHelp) {
    printHelp();
    return 0;
  }

  try {
    switch (options.command) {
      case "decide":
        return await runDecide(options);
      case "run-local":
        return await runLocal(options);
      default:
        throw new Error(`unknown compute command: ${options.command}`);
    }
  } catch (error) {
    if (options.json) {
      console.log(JSON.stringify({ ok: false, error: describe(error) }, null, 2));
    } else {
      console.error(describe(error));
    }
    return 1;
  }
}
|
|
337
|
+
|
|
338
|
+
// Run the CLI only when this module is the process entry point.
//
// Comparing against a hand-built `file://${process.argv[1]}` string is not
// reliable: on Windows the path has drive letters and backslashes, characters
// such as spaces are percent-encoded in `import.meta.url`, and when the script
// is launched through a symlink `process.argv[1]` may be the link while
// `import.meta.url` is the resolved file. Build the URL with `pathToFileURL`
// and resolve symlinks first.
if (process.argv[1]) {
  const { pathToFileURL } = await import("node:url");

  let entryUrl = pathToFileURL(process.argv[1]).href;
  try {
    entryUrl = pathToFileURL(await fs.realpath(process.argv[1])).href;
  } catch {
    // realpath can fail (e.g. the path does not exist as given); fall back to
    // the unresolved path computed above.
  }

  if (import.meta.url === entryUrl) {
    const code = await runComputeCli(process.argv.slice(2));
    process.exit(code == null ? 0 : code);
  }
}
|
package/bin/orp.js
CHANGED
|
@@ -1,9 +1,11 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
const path = require("path");
|
|
4
|
+
const { pathToFileURL } = require("url");
|
|
4
5
|
const { spawnSync } = require("child_process");
|
|
5
6
|
|
|
6
7
|
const cliPath = path.resolve(__dirname, "..", "cli", "orp.py");
|
|
8
|
+
const computeCliUrl = pathToFileURL(path.resolve(__dirname, "orp-compute.mjs")).href;
|
|
7
9
|
const argv = process.argv.slice(2);
|
|
8
10
|
|
|
9
11
|
const candidates = [];
|
|
@@ -15,24 +17,65 @@ if (process.platform === "win32") {
|
|
|
15
17
|
}
|
|
16
18
|
candidates.push("python3", "python");
|
|
17
19
|
|
|
18
|
-
|
|
20
|
+
/**
 * Decide whether the launcher should capture the Python CLI's output so it can
 * append the wrapper-surface hint.
 *
 * @param {string[]} args - Arguments passed to `orp`.
 * @returns {boolean} True when no arguments were given, or when `-h` /
 *   `--help` appears anywhere in the argument list.
 */
function isTopLevelHelp(args) {
  if (args.length === 0) {
    return true;
  }
  return args.some((arg) => arg === "-h" || arg === "--help");
}
|
|
23
|
+
|
|
24
|
+
/**
 * Load the ES-module compute CLI and run it, then exit the process with its
 * return code (0 when it returns null/undefined).
 *
 * @param {string[]} args - Arguments after `orp compute`.
 * @returns {Promise<void>} Never resolves normally; the process exits.
 */
async function runCompute(args) {
  // Dynamic import is required because this launcher is CommonJS while the
  // compute CLI is an ES module.
  const { runComputeCli } = await import(computeCliUrl);
  const exitCode = await runComputeCli(args);
  process.exit(exitCode == null ? 0 : exitCode);
}
|
|
19
29
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
process.exit(result.status == null ? 1 : result.status);
|
|
30
|
+
/**
 * Launcher entry point.
 *
 * `orp compute ...` is handled in-process by the Node compute CLI. Everything
 * else is forwarded to the Python CLI, trying each interpreter in
 * `candidates` until one can be spawned. For help-style invocations the
 * child's output is captured so a hint about `orp compute -h` can be appended
 * after a successful run.
 *
 * @returns {Promise<void>} Exits the process in every non-throwing path.
 */
async function main() {
  const [firstArg, ...computeArgs] = argv;
  if (firstArg === "compute") {
    await runCompute(computeArgs);
    return;
  }

  const captureOutput = isTopLevelHelp(argv);
  const spawnOptions = captureOutput ? { encoding: "utf8" } : { stdio: "inherit" };
  let lastErr = null;

  for (const py of candidates) {
    // The Windows `py` launcher needs `-3` to select Python 3.
    const interpreterFlags = py === "py" ? ["-3"] : [];
    const result = spawnSync(py, [...interpreterFlags, cliPath, ...argv], spawnOptions);

    if (result.error) {
      // ENOENT just means this interpreter is not installed; remember any
      // other spawn error for the final report and try the next candidate.
      if (result.error.code !== "ENOENT") {
        lastErr = result.error;
      }
      continue;
    }

    if (captureOutput) {
      if (result.stdout) {
        process.stdout.write(result.stdout);
      }
      if (result.stderr) {
        process.stderr.write(result.stderr);
      }
      if (result.status === 0) {
        process.stdout.write("\nAdditional wrapper surface:\n  orp compute -h\n");
      }
    }
    process.exit(result.status == null ? 1 : result.status);
  }

  console.error("ORP CLI requires Python 3 on PATH.");
  console.error("Tried: " + candidates.join(", "));
  if (lastErr) {
    console.error(String(lastErr));
  }
  process.exit(1);
}
|
|
37
|
-
process.exit(1);
|
|
38
77
|
|
|
78
|
+
// Start the launcher; any unexpected rejection is printed (with its stack when
// available) and turned into exit code 1.
main().catch((error) => {
  const detail = error && error.stack ? error.stack : error;
  console.error(String(detail));
  process.exit(1);
});
|
package/cli/orp.py
CHANGED
|
@@ -115,6 +115,10 @@ DEFAULT_DISCOVER_SCAN_ROOT = "orp/discovery/github"
|
|
|
115
115
|
DEFAULT_HOSTED_BASE_URL = "https://orp.earth"
|
|
116
116
|
KERNEL_SCHEMA_VERSION = "1.0.0"
|
|
117
117
|
YOUTUBE_SOURCE_SCHEMA_VERSION = "1.0.0"
|
|
118
|
+
# Client version sent as `clientVersion` in the ANDROID player request and
# embedded in the matching User-Agent string below.
YOUTUBE_ANDROID_CLIENT_VERSION = "20.10.38"
YOUTUBE_ANDROID_USER_AGENT = "com.google.android.youtube/{version} (Linux; U; Android 14)".format(
    version=YOUTUBE_ANDROID_CLIENT_VERSION
)
|
|
118
122
|
|
|
119
123
|
|
|
120
124
|
class HostedApiError(RuntimeError):
|
|
@@ -362,6 +366,35 @@ def _http_get_json(url: str, *, headers: dict[str, str] | None = None, timeout_s
|
|
|
362
366
|
raise RuntimeError(f"Response from {url} was not a JSON object.")
|
|
363
367
|
|
|
364
368
|
|
|
369
|
+
def _http_post_json(
|
|
370
|
+
url: str,
|
|
371
|
+
payload: dict[str, Any],
|
|
372
|
+
*,
|
|
373
|
+
headers: dict[str, str] | None = None,
|
|
374
|
+
timeout_sec: int = 20,
|
|
375
|
+
) -> dict[str, Any]:
|
|
376
|
+
body = json.dumps(payload).encode("utf-8")
|
|
377
|
+
merged_headers = {"Content-Type": "application/json"}
|
|
378
|
+
if headers:
|
|
379
|
+
merged_headers.update(headers)
|
|
380
|
+
request = urlrequest.Request(url, data=body, headers=merged_headers, method="POST")
|
|
381
|
+
try:
|
|
382
|
+
with urlrequest.urlopen(request, timeout=timeout_sec) as response:
|
|
383
|
+
text = response.read().decode("utf-8", errors="replace")
|
|
384
|
+
except urlerror.HTTPError as exc:
|
|
385
|
+
body_text = exc.read().decode("utf-8", errors="replace").strip()
|
|
386
|
+
raise RuntimeError(f"HTTP {exc.code} while fetching {url}: {body_text or exc.reason}") from exc
|
|
387
|
+
except urlerror.URLError as exc:
|
|
388
|
+
raise RuntimeError(f"Could not reach {url}: {exc.reason}") from exc
|
|
389
|
+
try:
|
|
390
|
+
parsed = json.loads(text)
|
|
391
|
+
except Exception as exc:
|
|
392
|
+
raise RuntimeError(f"Response from {url} was not valid JSON.") from exc
|
|
393
|
+
if isinstance(parsed, dict):
|
|
394
|
+
return parsed
|
|
395
|
+
raise RuntimeError(f"Response from {url} was not a JSON object.")
|
|
396
|
+
|
|
397
|
+
|
|
365
398
|
def _youtube_request_headers() -> dict[str, str]:
|
|
366
399
|
return {
|
|
367
400
|
"User-Agent": (
|
|
@@ -372,6 +405,13 @@ def _youtube_request_headers() -> dict[str, str]:
|
|
|
372
405
|
}
|
|
373
406
|
|
|
374
407
|
|
|
408
|
+
def _youtube_android_request_headers() -> dict[str, str]:
    """Return the HTTP headers used for the ANDROID player request."""
    android_headers: dict[str, str] = {}
    android_headers["User-Agent"] = YOUTUBE_ANDROID_USER_AGENT
    android_headers["Accept-Language"] = "en-US,en;q=0.9"
    return android_headers
|
|
413
|
+
|
|
414
|
+
|
|
375
415
|
def _youtube_source_schema_path() -> Path:
|
|
376
416
|
return Path(__file__).resolve().parent.parent / "spec" / "v1" / "youtube-source.schema.json"
|
|
377
417
|
|
|
@@ -459,21 +499,52 @@ def _youtube_track_label(track: dict[str, Any]) -> str:
|
|
|
459
499
|
return str(track.get("languageCode", "")).strip()
|
|
460
500
|
|
|
461
501
|
|
|
462
|
-
def
|
|
463
|
-
|
|
464
|
-
|
|
502
|
+
def _youtube_track_source(track: dict[str, Any]) -> str:
|
|
503
|
+
return str(track.get("_orp_source", "") or "unknown").strip()
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def _youtube_track_inventory(tracks: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Summarize caption tracks as a de-duplicated list of descriptors.

    Each descriptor has ``language_code``, ``name``, ``kind`` (``"auto"`` when
    the track kind is ``asr``, otherwise ``"manual"``) and ``source``.
    Non-dict entries are skipped; when several tracks share the same four
    values only the first is kept, in input order.
    """
    rows_by_identity: dict[tuple[str, str, str, str], dict[str, Any]] = {}
    for track in tracks:
        if not isinstance(track, dict):
            continue
        is_auto = str(track.get("kind", "")).strip().lower() == "asr"
        row = {
            "language_code": str(track.get("languageCode", "")).strip(),
            "name": _youtube_track_label(track),
            "kind": "auto" if is_auto else "manual",
            "source": _youtube_track_source(track),
        }
        identity = (row["language_code"], row["name"], row["kind"], row["source"])
        # setdefault keeps the first row seen for an identity.
        rows_by_identity.setdefault(identity, row)
    return list(rows_by_identity.values())
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def _youtube_caption_track_sort_key(track: dict[str, Any], preferred_lang: str = "") -> tuple[int, int]:
    """Score a caption track; higher tuples are better candidates.

    The first element adds 100 for an exact match with ``preferred_lang``, 80
    for a regional variant of it (``<preferred>-...``), 20 for any language
    code starting with ``en``, 15 for tracks sourced from the Android player,
    and subtracts 5 for auto-generated (``asr``) tracks. The second element is
    ``-1`` for auto-generated tracks and ``0`` otherwise, used as a tie-break.
    """
    wanted = str(preferred_lang or "").strip().lower()
    language = str(track.get("languageCode", "")).strip().lower()
    is_auto = str(track.get("kind", "")).strip().lower() == "asr"

    score = 0
    if wanted and language == wanted:
        score += 100
    if wanted and language.startswith(wanted + "-"):
        score += 80
    if language.startswith("en"):
        score += 20
    if _youtube_track_source(track) == "android_player":
        score += 15
    if is_auto:
        score -= 5
    return (score, -1 if is_auto else 0)
|
|
466
542
|
|
|
467
|
-
def score(track: dict[str, Any]) -> tuple[int, int]:
|
|
468
|
-
code = str(track.get("languageCode", "")).strip().lower()
|
|
469
|
-
kind = str(track.get("kind", "")).strip().lower()
|
|
470
|
-
auto = 1 if kind == "asr" else 0
|
|
471
|
-
exact = 1 if preferred and code == preferred else 0
|
|
472
|
-
prefix = 1 if preferred and code.startswith(preferred + "-") else 0
|
|
473
|
-
english = 1 if code.startswith("en") else 0
|
|
474
|
-
return (exact * 100 + prefix * 80 + english * 20 - auto * 5, -auto)
|
|
475
543
|
|
|
476
|
-
|
|
544
|
+
def _pick_youtube_caption_track(tracks: list[dict[str, Any]], preferred_lang: str = "") -> dict[str, Any] | None:
    """Return the best-scoring caption track, or ``None`` when there are none.

    Tracks are compared with ``_youtube_caption_track_sort_key``; among equally
    scored tracks the earliest one in ``tracks`` wins.
    """
    if not tracks:
        return None
    # max() returns the first maximal item, matching a stable descending sort.
    return max(tracks, key=lambda candidate: _youtube_caption_track_sort_key(candidate, preferred_lang))
|
|
478
549
|
|
|
479
550
|
|
|
@@ -544,6 +615,19 @@ def _parse_youtube_transcript_xml(text: str) -> tuple[str, list[dict[str, Any]]]
|
|
|
544
615
|
"text": body,
|
|
545
616
|
}
|
|
546
617
|
)
|
|
618
|
+
if not segments:
|
|
619
|
+
for node in root.findall(".//p"):
|
|
620
|
+
body = html.unescape("".join(node.itertext() or []))
|
|
621
|
+
body = re.sub(r"\s+", " ", body).strip()
|
|
622
|
+
if not body:
|
|
623
|
+
continue
|
|
624
|
+
segments.append(
|
|
625
|
+
{
|
|
626
|
+
"start_ms": int(node.attrib.get("t", "0") or "0"),
|
|
627
|
+
"duration_ms": int(node.attrib.get("d", "0") or "0"),
|
|
628
|
+
"text": body,
|
|
629
|
+
}
|
|
630
|
+
)
|
|
547
631
|
transcript_text = "\n".join(str(row["text"]) for row in segments)
|
|
548
632
|
return transcript_text, segments
|
|
549
633
|
|
|
@@ -577,6 +661,8 @@ def _youtube_fetch_watch_state(video_id: str) -> dict[str, Any]:
|
|
|
577
661
|
.get("playerCaptionsTracklistRenderer", {})
|
|
578
662
|
.get("captionTracks", [])
|
|
579
663
|
)
|
|
664
|
+
tracks = captions if isinstance(captions, list) else []
|
|
665
|
+
normalized_tracks = [{**row, "_orp_source": "watch_page"} for row in tracks if isinstance(row, dict)]
|
|
580
666
|
return {
|
|
581
667
|
"player_response": player_response,
|
|
582
668
|
"video_details": player_response.get("videoDetails", {}) if isinstance(player_response.get("videoDetails"), dict) else {},
|
|
@@ -590,29 +676,110 @@ def _youtube_fetch_watch_state(video_id: str) -> dict[str, Any]:
|
|
|
590
676
|
if isinstance(player_response.get("playabilityStatus"), dict)
|
|
591
677
|
else {}
|
|
592
678
|
),
|
|
593
|
-
"caption_tracks":
|
|
679
|
+
"caption_tracks": normalized_tracks,
|
|
680
|
+
}
|
|
681
|
+
|
|
682
|
+
|
|
683
|
+
def _youtube_fetch_android_player_state(video_id: str) -> dict[str, Any]:
    """Fetch player metadata for ``video_id`` using the ANDROID client context.

    POSTs to the ``youtubei/v1/player`` endpoint and returns a state dict with
    the same keys as the watch-page state: ``player_response``,
    ``video_details``, ``microformat`` (always empty here),
    ``playability_status`` and ``caption_tracks``. Every caption track is
    tagged with ``_orp_source = "android_player"``.

    Raises:
        RuntimeError: Propagated from ``_http_post_json`` on transport or
            decoding failures.
    """
    payload = _http_post_json(
        "https://www.youtube.com/youtubei/v1/player?prettyPrint=false",
        {
            "context": {
                "client": {
                    "clientName": "ANDROID",
                    "clientVersion": YOUTUBE_ANDROID_CLIENT_VERSION,
                }
            },
            "videoId": video_id,
        },
        headers=_youtube_android_request_headers(),
        timeout_sec=25,
    )

    # Walk the nested caption structure defensively: if any level is present
    # but not a dict (for example JSON null), treat it as "no captions" rather
    # than raising and losing the video details gathered by the same response.
    captions_root = payload.get("captions")
    renderer = captions_root.get("playerCaptionsTracklistRenderer") if isinstance(captions_root, dict) else None
    captions = renderer.get("captionTracks") if isinstance(renderer, dict) else None
    tracks = captions if isinstance(captions, list) else []
    normalized_tracks = [{**row, "_orp_source": "android_player"} for row in tracks if isinstance(row, dict)]

    video_details = payload.get("videoDetails")
    playability_status = payload.get("playabilityStatus")
    return {
        "player_response": payload,
        "video_details": video_details if isinstance(video_details, dict) else {},
        "microformat": {},
        "playability_status": playability_status if isinstance(playability_status, dict) else {},
        "caption_tracks": normalized_tracks,
    }
|
|
595
712
|
|
|
596
713
|
|
|
714
|
+
def _youtube_ranked_caption_tracks(
    watch_tracks: list[dict[str, Any]],
    android_tracks: list[dict[str, Any]],
    preferred_lang: str = "",
) -> list[dict[str, Any]]:
    """Merge both track lists into one best-first, de-duplicated ranking.

    Android tracks are placed ahead of watch-page tracks before a stable
    descending sort by ``_youtube_caption_track_sort_key``. Tracks sharing the
    same (language code, label, lower-cased kind, source) are collapsed to the
    highest-ranked one. Non-dict entries are ignored.
    """
    candidates = [track for track in (*android_tracks, *watch_tracks) if isinstance(track, dict)]
    candidates.sort(
        key=lambda track: _youtube_caption_track_sort_key(track, preferred_lang),
        reverse=True,
    )

    first_by_identity: dict[tuple[str, str, str, str], dict[str, Any]] = {}
    for track in candidates:
        identity = (
            str(track.get("languageCode", "")).strip(),
            _youtube_track_label(track),
            str(track.get("kind", "")).strip().lower(),
            _youtube_track_source(track),
        )
        # Keep only the first (best-ranked) track per identity.
        first_by_identity.setdefault(identity, track)
    return list(first_by_identity.values())
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def _youtube_parse_transcript_response(text: str) -> tuple[str, list[dict[str, Any]], str]:
|
|
742
|
+
stripped = str(text or "").lstrip()
|
|
743
|
+
if not stripped:
|
|
744
|
+
return ("", [], "empty")
|
|
745
|
+
if stripped.startswith("{"):
|
|
746
|
+
try:
|
|
747
|
+
payload = json.loads(text)
|
|
748
|
+
except Exception:
|
|
749
|
+
payload = None
|
|
750
|
+
if isinstance(payload, dict):
|
|
751
|
+
transcript_text, segments = _parse_youtube_transcript_json3(payload)
|
|
752
|
+
if transcript_text:
|
|
753
|
+
return (transcript_text, segments, "json3")
|
|
754
|
+
transcript_text, segments = _parse_youtube_transcript_xml(text)
|
|
755
|
+
if transcript_text:
|
|
756
|
+
return (transcript_text, segments, "xml")
|
|
757
|
+
return ("", [], "unparsed")
|
|
758
|
+
|
|
759
|
+
|
|
597
760
|
def _youtube_fetch_transcript_from_track(track: dict[str, Any]) -> tuple[str, list[dict[str, Any]], str]:
|
|
598
761
|
base_url = str(track.get("baseUrl", "")).strip()
|
|
599
762
|
if not base_url:
|
|
600
763
|
return ("", [], "missing_track_url")
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
764
|
+
source = _youtube_track_source(track) or "unknown"
|
|
765
|
+
candidate_urls = [
|
|
766
|
+
("base", base_url),
|
|
767
|
+
("json3", _youtube_add_query_param(base_url, "fmt", "json3")),
|
|
768
|
+
("srv3", _youtube_add_query_param(base_url, "fmt", "srv3")),
|
|
769
|
+
]
|
|
770
|
+
seen_urls: set[str] = set()
|
|
771
|
+
for mode, candidate_url in candidate_urls:
|
|
772
|
+
if candidate_url in seen_urls:
|
|
773
|
+
continue
|
|
774
|
+
seen_urls.add(candidate_url)
|
|
775
|
+
try:
|
|
776
|
+
response_text = _http_get_text(candidate_url, headers=_youtube_request_headers(), timeout_sec=25)
|
|
777
|
+
except Exception:
|
|
778
|
+
continue
|
|
779
|
+
transcript_text, segments, parsed_mode = _youtube_parse_transcript_response(response_text)
|
|
612
780
|
if transcript_text:
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
pass
|
|
781
|
+
final_mode = parsed_mode if mode == "base" else f"{mode}_{parsed_mode}"
|
|
782
|
+
return transcript_text, segments, f"{source}_{final_mode}"
|
|
616
783
|
return ("", [], "unavailable")
|
|
617
784
|
|
|
618
785
|
|
|
@@ -647,28 +814,61 @@ def _youtube_inspect_payload(raw_url: str, preferred_lang: str = "") -> dict[str
|
|
|
647
814
|
watch_state = _youtube_fetch_watch_state(video_id)
|
|
648
815
|
except Exception as exc:
|
|
649
816
|
warnings.append(str(exc))
|
|
817
|
+
android_state: dict[str, Any] = {}
|
|
818
|
+
try:
|
|
819
|
+
android_state = _youtube_fetch_android_player_state(video_id)
|
|
820
|
+
except Exception as exc:
|
|
821
|
+
warnings.append(str(exc))
|
|
650
822
|
|
|
651
|
-
|
|
823
|
+
watch_video_details = watch_state.get("video_details", {}) if isinstance(watch_state.get("video_details"), dict) else {}
|
|
824
|
+
android_video_details = (
|
|
825
|
+
android_state.get("video_details", {}) if isinstance(android_state.get("video_details"), dict) else {}
|
|
826
|
+
)
|
|
827
|
+
video_details = watch_video_details or android_video_details
|
|
652
828
|
microformat = watch_state.get("microformat", {}) if isinstance(watch_state.get("microformat"), dict) else {}
|
|
653
829
|
playability = watch_state.get("playability_status", {}) if isinstance(watch_state.get("playability_status"), dict) else {}
|
|
654
|
-
|
|
655
|
-
|
|
830
|
+
if not playability:
|
|
831
|
+
playability = android_state.get("playability_status", {}) if isinstance(android_state.get("playability_status"), dict) else {}
|
|
832
|
+
watch_tracks = [row for row in watch_state.get("caption_tracks", []) if isinstance(row, dict)]
|
|
833
|
+
android_tracks = [row for row in android_state.get("caption_tracks", []) if isinstance(row, dict)]
|
|
834
|
+
tracks = _youtube_ranked_caption_tracks(watch_tracks, android_tracks, preferred_lang)
|
|
835
|
+
available_tracks = _youtube_track_inventory(tracks)
|
|
656
836
|
transcript_text = ""
|
|
657
837
|
transcript_segments: list[dict[str, Any]] = []
|
|
658
838
|
transcript_fetch_mode = "none"
|
|
659
839
|
transcript_available = False
|
|
660
840
|
transcript_language = ""
|
|
661
841
|
transcript_track_name = ""
|
|
842
|
+
transcript_track_source = ""
|
|
662
843
|
transcript_kind = "none"
|
|
844
|
+
transcript_sources_tried: list[str] = []
|
|
845
|
+
chosen_track: dict[str, Any] | None = None
|
|
846
|
+
for candidate in tracks:
|
|
847
|
+
transcript_sources_tried.append(
|
|
848
|
+
":".join(
|
|
849
|
+
part
|
|
850
|
+
for part in [
|
|
851
|
+
_youtube_track_source(candidate),
|
|
852
|
+
str(candidate.get("languageCode", "")).strip(),
|
|
853
|
+
_youtube_track_label(candidate),
|
|
854
|
+
]
|
|
855
|
+
if part
|
|
856
|
+
)
|
|
857
|
+
)
|
|
858
|
+
transcript_text, transcript_segments, transcript_fetch_mode = _youtube_fetch_transcript_from_track(candidate)
|
|
859
|
+
if transcript_text.strip():
|
|
860
|
+
transcript_available = True
|
|
861
|
+
chosen_track = candidate
|
|
862
|
+
break
|
|
663
863
|
if chosen_track is not None:
|
|
664
|
-
transcript_text, transcript_segments, transcript_fetch_mode = _youtube_fetch_transcript_from_track(chosen_track)
|
|
665
|
-
transcript_available = bool(transcript_text.strip())
|
|
666
864
|
transcript_language = str(chosen_track.get("languageCode", "")).strip()
|
|
667
865
|
transcript_track_name = _youtube_track_label(chosen_track)
|
|
866
|
+
transcript_track_source = _youtube_track_source(chosen_track)
|
|
668
867
|
transcript_kind = "auto" if str(chosen_track.get("kind", "")).strip().lower() == "asr" else "manual"
|
|
868
|
+
if tracks:
|
|
669
869
|
if not transcript_available:
|
|
670
870
|
warnings.append("A caption track was found, but transcript text could not be fetched.")
|
|
671
|
-
elif watch_state:
|
|
871
|
+
elif watch_state or android_state:
|
|
672
872
|
warnings.append("No caption tracks were available for this video.")
|
|
673
873
|
|
|
674
874
|
title = str(video_details.get("title") or oembed.get("title") or "").strip()
|
|
@@ -698,13 +898,17 @@ def _youtube_inspect_payload(raw_url: str, preferred_lang: str = "") -> dict[str
|
|
|
698
898
|
"duration_seconds": duration_seconds or None,
|
|
699
899
|
"published_at": published_at,
|
|
700
900
|
"playability_status": str(playability.get("status", "")).strip(),
|
|
901
|
+
"transcript_track_count": len(available_tracks),
|
|
902
|
+
"available_transcript_tracks": available_tracks,
|
|
701
903
|
"transcript_available": transcript_available,
|
|
702
904
|
"transcript_language": transcript_language,
|
|
703
905
|
"transcript_track_name": transcript_track_name,
|
|
906
|
+
"transcript_track_source": transcript_track_source,
|
|
704
907
|
"transcript_kind": transcript_kind,
|
|
705
908
|
"transcript_fetch_mode": transcript_fetch_mode,
|
|
706
909
|
"transcript_text": transcript_text,
|
|
707
910
|
"transcript_segments": transcript_segments,
|
|
911
|
+
"transcript_sources_tried": transcript_sources_tried,
|
|
708
912
|
"warnings": _unique_strings(warnings),
|
|
709
913
|
}
|
|
710
914
|
payload["text_bundle"] = _youtube_text_bundle(payload)
|
|
@@ -754,8 +958,10 @@ def cmd_youtube_inspect(args: argparse.Namespace) -> int:
|
|
|
754
958
|
("video.title", str(payload.get("title", "")).strip()),
|
|
755
959
|
("video.author", str(payload.get("author_name", "")).strip()),
|
|
756
960
|
("video.duration_seconds", payload.get("duration_seconds") or ""),
|
|
961
|
+
("transcript.track_count", payload.get("transcript_track_count") or 0),
|
|
757
962
|
("transcript.available", str(bool(payload.get("transcript_available", False))).lower()),
|
|
758
963
|
("transcript.language", str(payload.get("transcript_language", "")).strip()),
|
|
964
|
+
("transcript.track_source", str(payload.get("transcript_track_source", "")).strip()),
|
|
759
965
|
("transcript.kind", str(payload.get("transcript_kind", "")).strip()),
|
|
760
966
|
("saved", str(bool(out_path is not None)).lower()),
|
|
761
967
|
("path", _path_for_state(out_path, repo_root) if out_path is not None else ""),
|
|
@@ -5774,7 +5980,7 @@ def _about_payload() -> dict[str, Any]:
|
|
|
5774
5980
|
"Default CLI output is human-readable; listed commands with json_output=true also support --json.",
|
|
5775
5981
|
"Reasoning-kernel artifacts shape promotable repository truth for tasks, decisions, hypotheses, experiments, checkpoints, policies, and results.",
|
|
5776
5982
|
"Kernel evolution in ORP should stay explicit: observe real usage, propose changes, and migrate artifacts through versioned CLI surfaces rather than silent agent mutation.",
|
|
5777
|
-
"YouTube inspection is a built-in ORP ability exposed through `orp youtube inspect`, returning public metadata
|
|
5983
|
+
"YouTube inspection is a built-in ORP ability exposed through `orp youtube inspect`, returning public metadata plus full transcript text and segments whenever public caption tracks are available.",
|
|
5778
5984
|
"Discovery profiles in ORP are portable search-intent files managed directly by ORP.",
|
|
5779
5985
|
"Collaboration is a built-in ORP ability exposed through `orp collaborate ...`.",
|
|
5780
5986
|
"Project/session linking is a built-in ORP ability exposed through `orp link ...` and stored machine-locally under `.git/orp/link/`.",
|
|
@@ -5885,7 +6091,7 @@ def _home_payload(repo_root: Path, config_arg: str) -> dict[str, Any]:
|
|
|
5885
6091
|
"command": "orp whoami --json",
|
|
5886
6092
|
},
|
|
5887
6093
|
{
|
|
5888
|
-
"label": "Inspect a YouTube video and public transcript
|
|
6094
|
+
"label": "Inspect a YouTube video and ingest full public transcript context",
|
|
5889
6095
|
"command": "orp youtube inspect https://www.youtube.com/watch?v=<video_id> --json",
|
|
5890
6096
|
},
|
|
5891
6097
|
{
|
|
@@ -12715,7 +12921,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
12715
12921
|
|
|
12716
12922
|
s_youtube_inspect = youtube_sub.add_parser(
|
|
12717
12923
|
"inspect",
|
|
12718
|
-
help="Inspect a YouTube video and fetch public metadata plus transcript text when
|
|
12924
|
+
help="Inspect a YouTube video and fetch public metadata plus full transcript text and segments when caption tracks are available",
|
|
12719
12925
|
)
|
|
12720
12926
|
s_youtube_inspect.add_argument("url", help="YouTube watch/share URL or 11-character video id")
|
|
12721
12927
|
s_youtube_inspect.add_argument(
|
|
@@ -6,7 +6,7 @@ YouTube videos.
|
|
|
6
6
|
It gives agents and users a stable way to turn a YouTube link into:
|
|
7
7
|
|
|
8
8
|
- normalized video metadata,
|
|
9
|
-
- public
|
|
9
|
+
- full public transcript text when caption tracks are available,
|
|
10
10
|
- segment-level timing rows,
|
|
11
11
|
- and one agent-friendly `text_bundle` field that can be handed directly into
|
|
12
12
|
summarization, extraction, comparison, or kernel-shaped artifact creation.
|
|
@@ -61,13 +61,17 @@ The command returns:
|
|
|
61
61
|
- `published_at`
|
|
62
62
|
- `playability_status`
|
|
63
63
|
- transcript fields:
|
|
64
|
+
- `transcript_track_count`
|
|
65
|
+
- `available_transcript_tracks`
|
|
64
66
|
- `transcript_available`
|
|
65
67
|
- `transcript_language`
|
|
66
68
|
- `transcript_track_name`
|
|
69
|
+
- `transcript_track_source`
|
|
67
70
|
- `transcript_kind`
|
|
68
71
|
- `transcript_fetch_mode`
|
|
69
72
|
- `transcript_text`
|
|
70
73
|
- `transcript_segments`
|
|
74
|
+
- `transcript_sources_tried`
|
|
71
75
|
- agent-ready bundle:
|
|
72
76
|
- `text_bundle`
|
|
73
77
|
- capture notes:
|
|
@@ -89,6 +93,11 @@ discipline while staying outside the evidence boundary by default.
|
|
|
89
93
|
`orp youtube inspect` returns public source context. It does **not** make the
|
|
90
94
|
result canonical evidence by itself.
|
|
91
95
|
|
|
96
|
+
When public caption tracks exist, ORP now attempts full transcript ingestion
|
|
97
|
+
across multiple retrieval strategies and records which track/source succeeded.
|
|
98
|
+
If a video has no accessible caption tracks, ORP reports that honestly instead
|
|
99
|
+
of silently fabricating a transcript.
|
|
100
|
+
|
|
92
101
|
If a video matters for repo truth, the agent should still:
|
|
93
102
|
|
|
94
103
|
1. inspect the video,
|
package/llms.txt
CHANGED
|
@@ -13,7 +13,7 @@ ORP (Open Research Protocol) is a docs-first, local-first, agent-friendly protoc
|
|
|
13
13
|
## Fast Machine Discovery
|
|
14
14
|
|
|
15
15
|
- Run `orp about --json` for machine-readable tool metadata, artifact paths, schemas, supported commands, and bundled packs.
|
|
16
|
-
- Run `orp youtube inspect <youtube-url> --json` to normalize a public YouTube video into ORP's source artifact shape, including transcript text when public
|
|
16
|
+
- Run `orp youtube inspect <youtube-url> --json` to normalize a public YouTube video into ORP's source artifact shape, including full transcript text and timing segments when public caption tracks are available.
|
|
17
17
|
- Run `orp erdos sync --json` for machine-readable Erdos catalog sync results.
|
|
18
18
|
- Run `orp pack list --json` for machine-readable bundled pack inventory.
|
|
19
19
|
- Core runtime commands also support `--json`:
|
package/package.json
CHANGED
|
@@ -201,9 +201,9 @@ def _benchmark_init_starter(iterations: int) -> dict[str, Any]:
|
|
|
201
201
|
run_records.append(gate_payload["run_record"])
|
|
202
202
|
|
|
203
203
|
targets = {
|
|
204
|
-
"init_mean_lt_ms":
|
|
205
|
-
"validate_mean_lt_ms":
|
|
206
|
-
"gate_mean_lt_ms":
|
|
204
|
+
"init_mean_lt_ms": 375.0,
|
|
205
|
+
"validate_mean_lt_ms": 210.0,
|
|
206
|
+
"gate_mean_lt_ms": 350.0,
|
|
207
207
|
}
|
|
208
208
|
observed = {
|
|
209
209
|
"init": _stats(init_times),
|
|
@@ -264,8 +264,8 @@ def _benchmark_artifact_roundtrip() -> dict[str, Any]:
|
|
|
264
264
|
"validate": _stats(validate_times),
|
|
265
265
|
}
|
|
266
266
|
targets = {
|
|
267
|
-
"scaffold_mean_lt_ms":
|
|
268
|
-
"validate_mean_lt_ms":
|
|
267
|
+
"scaffold_mean_lt_ms": 210.0,
|
|
268
|
+
"validate_mean_lt_ms": 210.0,
|
|
269
269
|
}
|
|
270
270
|
return {
|
|
271
271
|
"artifact_classes_total": len(rows),
|
|
@@ -490,7 +490,7 @@ def _benchmark_cross_domain_corpus() -> dict[str, Any]:
|
|
|
490
490
|
targets = {
|
|
491
491
|
"domains_min": 5,
|
|
492
492
|
"fixtures_min": 7,
|
|
493
|
-
"validate_mean_lt_ms":
|
|
493
|
+
"validate_mean_lt_ms": 210.0,
|
|
494
494
|
}
|
|
495
495
|
return {
|
|
496
496
|
"fixtures_total": len(rows),
|
|
@@ -549,7 +549,7 @@ def _benchmark_requirement_enforcement() -> dict[str, Any]:
|
|
|
549
549
|
observed = {"validate": _stats(validate_times)}
|
|
550
550
|
targets = {
|
|
551
551
|
"all_cases_detected": sum(len(fields) for fields in requirements.values()),
|
|
552
|
-
"validate_mean_lt_ms":
|
|
552
|
+
"validate_mean_lt_ms": 210.0,
|
|
553
553
|
}
|
|
554
554
|
return {
|
|
555
555
|
"cases_total": len(rows),
|
|
@@ -717,7 +717,7 @@ def _benchmark_mutation_stress() -> dict[str, Any]:
|
|
|
717
717
|
observed = {"validate": _stats(validate_times)}
|
|
718
718
|
targets = {
|
|
719
719
|
"cases_total": len(cases),
|
|
720
|
-
"validate_mean_lt_ms":
|
|
720
|
+
"validate_mean_lt_ms": 210.0,
|
|
721
721
|
}
|
|
722
722
|
return {
|
|
723
723
|
"cases_total": len(rows),
|
|
@@ -20,13 +20,17 @@
|
|
|
20
20
|
"duration_seconds",
|
|
21
21
|
"published_at",
|
|
22
22
|
"playability_status",
|
|
23
|
+
"transcript_track_count",
|
|
24
|
+
"available_transcript_tracks",
|
|
23
25
|
"transcript_available",
|
|
24
26
|
"transcript_language",
|
|
25
27
|
"transcript_track_name",
|
|
28
|
+
"transcript_track_source",
|
|
26
29
|
"transcript_kind",
|
|
27
30
|
"transcript_fetch_mode",
|
|
28
31
|
"transcript_text",
|
|
29
32
|
"transcript_segments",
|
|
33
|
+
"transcript_sources_tried",
|
|
30
34
|
"warnings",
|
|
31
35
|
"text_bundle"
|
|
32
36
|
],
|
|
@@ -83,6 +87,41 @@
|
|
|
83
87
|
"playability_status": {
|
|
84
88
|
"type": "string"
|
|
85
89
|
},
|
|
90
|
+
"transcript_track_count": {
|
|
91
|
+
"type": "integer",
|
|
92
|
+
"minimum": 0
|
|
93
|
+
},
|
|
94
|
+
"available_transcript_tracks": {
|
|
95
|
+
"type": "array",
|
|
96
|
+
"items": {
|
|
97
|
+
"type": "object",
|
|
98
|
+
"additionalProperties": false,
|
|
99
|
+
"required": [
|
|
100
|
+
"language_code",
|
|
101
|
+
"name",
|
|
102
|
+
"kind",
|
|
103
|
+
"source"
|
|
104
|
+
],
|
|
105
|
+
"properties": {
|
|
106
|
+
"language_code": {
|
|
107
|
+
"type": "string"
|
|
108
|
+
},
|
|
109
|
+
"name": {
|
|
110
|
+
"type": "string"
|
|
111
|
+
},
|
|
112
|
+
"kind": {
|
|
113
|
+
"type": "string",
|
|
114
|
+
"enum": [
|
|
115
|
+
"manual",
|
|
116
|
+
"auto"
|
|
117
|
+
]
|
|
118
|
+
},
|
|
119
|
+
"source": {
|
|
120
|
+
"type": "string"
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
},
|
|
86
125
|
"transcript_available": {
|
|
87
126
|
"type": "boolean"
|
|
88
127
|
},
|
|
@@ -92,6 +131,9 @@
|
|
|
92
131
|
"transcript_track_name": {
|
|
93
132
|
"type": "string"
|
|
94
133
|
},
|
|
134
|
+
"transcript_track_source": {
|
|
135
|
+
"type": "string"
|
|
136
|
+
},
|
|
95
137
|
"transcript_kind": {
|
|
96
138
|
"type": "string",
|
|
97
139
|
"enum": [
|
|
@@ -101,14 +143,7 @@
|
|
|
101
143
|
]
|
|
102
144
|
},
|
|
103
145
|
"transcript_fetch_mode": {
|
|
104
|
-
"type": "string"
|
|
105
|
-
"enum": [
|
|
106
|
-
"json3",
|
|
107
|
-
"xml",
|
|
108
|
-
"unavailable",
|
|
109
|
-
"none",
|
|
110
|
-
"missing_track_url"
|
|
111
|
-
]
|
|
146
|
+
"type": "string"
|
|
112
147
|
},
|
|
113
148
|
"transcript_text": {
|
|
114
149
|
"type": "string"
|
|
@@ -138,6 +173,12 @@
|
|
|
138
173
|
}
|
|
139
174
|
}
|
|
140
175
|
},
|
|
176
|
+
"transcript_sources_tried": {
|
|
177
|
+
"type": "array",
|
|
178
|
+
"items": {
|
|
179
|
+
"type": "string"
|
|
180
|
+
}
|
|
181
|
+
},
|
|
141
182
|
"warnings": {
|
|
142
183
|
"type": "array",
|
|
143
184
|
"items": {
|