@humanjs/mcp 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2,7 +2,7 @@
2
2
  import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
3
3
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
4
4
  import { homedir } from 'os';
5
- import { join, extname, basename, dirname } from 'path';
5
+ import { join, basename, dirname } from 'path';
6
6
  import { spawn } from 'child_process';
7
7
  import { readFileSync } from 'fs';
8
8
  import { createRequire } from 'module';
@@ -20,6 +20,7 @@ function readEnv() {
20
20
  speed: parseSpeed(process.env.HUMANJS_SPEED),
21
21
  headless: parseBool(process.env.HUMANJS_HEADLESS, false),
22
22
  outputDir: process.env.HUMANJS_OUTPUT_DIR ?? process.cwd(),
23
+ uploadDir: process.env.HUMANJS_UPLOAD_DIR ?? process.cwd(),
23
24
  viewport: parseViewport(process.env.HUMANJS_VIEWPORT),
24
25
  autoInstall: parseBool(process.env.HUMANJS_AUTO_INSTALL, true),
25
26
  browser: resolveBrowserConfig(),
@@ -189,7 +190,10 @@ var SessionManager = class {
189
190
  stop = resolve;
190
191
  });
191
192
  const video = options.video ?? true;
192
- const done = session.human.record({ video, quality: options.quality ?? "high" }, () => signal);
193
+ const done = session.human.record(
194
+ { name: options.name, video, quality: options.quality ?? "high" },
195
+ () => signal
196
+ );
193
197
  session.recording = {
194
198
  name: options.name ?? "recording",
195
199
  startedAt: Date.now(),
@@ -586,6 +590,24 @@ function resolveOutputPath(outputDir, filename) {
586
590
  }
587
591
  return join(outputDir, base);
588
592
  }
593
/**
 * Resolves an upload filename to a path directly inside `uploadDir`.
 *
 * Only a plain file name is accepted: anything containing a path component
 * (a subdirectory, "../", an absolute path), an empty string, or a bare
 * "." / ".." segment is rejected, so the returned path is always an entry
 * of `uploadDir` itself.
 *
 * @param {string} uploadDir - Directory that uploads are read from.
 * @param {string} filename - Caller-supplied file name.
 * @returns {string} `uploadDir` joined with the validated name.
 * @throws {Error} If `filename` is not a plain, non-empty file name.
 */
function resolveUploadPath(uploadDir, filename) {
  const base = basename(filename);
  // basename() returns "." and ".." unchanged, so they would pass the
  // equality check below, yet join() resolves them to uploadDir itself and
  // to its parent directory respectively.
  const isDotSegment = base === "." || base === "..";
  if (base !== filename || base.length === 0 || isDotSegment) {
    throw new Error(
      `upload filename must be a plain name with no path components, got "${filename}". Files are read from HUMANJS_UPLOAD_DIR \u2014 place the file there (or point HUMANJS_UPLOAD_DIR at its folder) and pass just the name.`
    );
  }
  return join(uploadDir, base);
}
602
/**
 * Maps an output filename to the recording export format implied by its
 * suffix. Matching is case-insensitive.
 *
 * @param {string} filename - Requested output file name.
 * @returns {"video"|"gif"|"timeline"|"playwright"|"humanjs"|null} The format
 *   name, or null when no known suffix matches.
 */
function resolveRecordingFormat(filename) {
  const name = filename.toLowerCase();
  // Order matters: the ".spec.ts" / ".test.ts" row must be tried before the
  // generic ".ts" row, otherwise test files would be classified as "humanjs".
  const suffixTable = [
    [[".mp4", ".webm"], "video"],
    [[".gif"], "gif"],
    [[".json"], "timeline"],
    [[".spec.ts", ".test.ts"], "playwright"],
    [[".ts"], "humanjs"]
  ];
  for (const [suffixes, format] of suffixTable) {
    if (suffixes.some((suffix) => name.endsWith(suffix))) {
      return format;
    }
  }
  return null;
}
589
611
 
590
612
  // src/tools/inspection.ts
591
613
  var sessionArg = z.string().optional().describe("Session ID to act on. Omit to use the default session.");
@@ -631,6 +653,22 @@ function registerInspectionTools(server, ctx) {
631
653
  return { content: [{ type: "text", text }] };
632
654
  }
633
655
  );
656
+ server.registerTool(
657
+ "human_outline",
658
+ {
659
+ title: "Page outline (accessibility tree)",
660
+ description: 'Returns a compact accessibility-tree outline of the page (or a region) \u2014 every interactive element and landmark by its ARIA role + accessible name, as YAML (e.g. `- button "Sign in"`, `- textbox "Email"`). The most token-efficient way to see what is actionable and pick a selector: the names map directly to getByRole / accessible-name selectors. Prefer this over human_get_html for "what can I click or fill"; use human_screenshot when you need the visual layout.',
661
+ inputSchema: {
662
+ selector: z.string().optional().describe("Optional region selector to scope the outline. Omit for the whole page."),
663
+ session: sessionArg
664
+ }
665
+ },
666
+ async ({ selector, session }) => {
667
+ const { human } = await ctx.sessions.get(session);
668
+ const text = await human.outline(selector);
669
+ return { content: [{ type: "text", text }] };
670
+ }
671
+ );
634
672
  server.registerTool(
635
673
  "human_get_text",
636
674
  {
@@ -709,7 +747,7 @@ function resolveTarget(input) {
709
747
  var sessionArg2 = z.string().optional().describe(
710
748
  "Session ID to act on. Omit to use the default session (created lazily on first call). Use human_create_session for parallel browsers."
711
749
  );
712
- function registerPrimitiveTools(server, { sessions }) {
750
+ function registerPrimitiveTools(server, { sessions, env }) {
713
751
  server.registerTool(
714
752
  "human_goto",
715
753
  {
@@ -756,6 +794,22 @@ function registerPrimitiveTools(server, { sessions }) {
756
794
  };
757
795
  }
758
796
  );
797
+ server.registerTool(
798
+ "human_doubleClick",
799
+ {
800
+ title: "Double-click (humanized)",
801
+ description: "Double-clicks the target \u2014 same humanized motion as human_click, but two presses within the OS double-click window. Use for things that open/activate on double-click (list rows, file items, editable cells). Target is a selector OR x/y coordinates.",
802
+ inputSchema: { ...targetFields, session: sessionArg2 }
803
+ },
804
+ async ({ selector, x, y, session }) => {
805
+ const { human } = await sessions.get(session);
806
+ const target = resolveTarget({ selector, x, y });
807
+ await human.doubleClick(target);
808
+ return {
809
+ content: [{ type: "text", text: `double-clicked ${describeTarget(selector, x, y)}` }]
810
+ };
811
+ }
812
+ );
759
813
  server.registerTool(
760
814
  "human_hover",
761
815
  {
@@ -850,6 +904,94 @@ function registerPrimitiveTools(server, { sessions }) {
850
904
  return { content: [{ type: "text", text: `pasted ${value.length} chars into ${selector}` }] };
851
905
  }
852
906
  );
907
+ server.registerTool(
908
+ "human_clear",
909
+ {
910
+ title: "Clear a field (humanized)",
911
+ description: "Clears a text field (input/textarea/contenteditable) with a real keyboard gesture \u2014 click to focus, select-all, then delete \u2014 firing the input events the page expects. Use before human_type when you need to replace an existing value rather than append to it.",
912
+ inputSchema: {
913
+ selector: z.string().describe("Selector of the field to clear."),
914
+ session: sessionArg2
915
+ }
916
+ },
917
+ async ({ selector, session }) => {
918
+ const { human } = await sessions.get(session);
919
+ await human.clear(selector);
920
+ return { content: [{ type: "text", text: `cleared ${selector}` }] };
921
+ }
922
+ );
923
+ server.registerTool(
924
+ "human_check",
925
+ {
926
+ title: "Check a box (humanized)",
927
+ description: "Ticks a checkbox or radio \u2014 moves the cursor to it and clicks, but only if it is not already checked (a real user does not re-click a ticked box). Verifies the resulting state. Pass the checkbox/radio input itself (or a [role=checkbox]) \u2014 not a wrapping <label> \u2014 so the current state can be read and the click stays idempotent.",
928
+ inputSchema: {
929
+ selector: z.string().describe("Selector of the checkbox/radio input."),
930
+ session: sessionArg2
931
+ }
932
+ },
933
+ async ({ selector, session }) => {
934
+ const { human } = await sessions.get(session);
935
+ await human.check(selector);
936
+ return { content: [{ type: "text", text: `checked ${selector}` }] };
937
+ }
938
+ );
939
+ server.registerTool(
940
+ "human_uncheck",
941
+ {
942
+ title: "Uncheck a box (humanized)",
943
+ description: "Unticks a checkbox \u2014 humanized click only if currently checked. Radios cannot be unchecked by clicking (select a different option instead). Pass the checkbox input itself (or a [role=checkbox]) \u2014 not a wrapping <label> \u2014 so its state can be read and the click stays idempotent.",
944
+ inputSchema: {
945
+ selector: z.string().describe("Selector of the checkbox input."),
946
+ session: sessionArg2
947
+ }
948
+ },
949
+ async ({ selector, session }) => {
950
+ const { human } = await sessions.get(session);
951
+ await human.uncheck(selector);
952
+ return { content: [{ type: "text", text: `unchecked ${selector}` }] };
953
+ }
954
+ );
955
+ server.registerTool(
956
+ "human_selectOption",
957
+ {
958
+ title: "Select dropdown option (humanized)",
959
+ description: "Chooses option(s) in a native <select> \u2014 moves the cursor to the dropdown, then sets the value (native selects open an OS menu automation can't drive, so the value is set programmatically, firing change/input). For custom DOM dropdowns, use human_click on the rendered options instead. Match by value(s); pass one string or an array for multi-selects.",
960
+ inputSchema: {
961
+ selector: z.string().describe("Selector of the <select> element."),
962
+ values: z.union([z.string(), z.array(z.string())]).describe("Option value, or array of values for a multi-select."),
963
+ session: sessionArg2
964
+ }
965
+ },
966
+ async ({ selector, values, session }) => {
967
+ const { human } = await sessions.get(session);
968
+ const selected = await human.selectOption(selector, values);
969
+ return {
970
+ content: [{ type: "text", text: `selected ${selected.join(", ")} in ${selector}` }]
971
+ };
972
+ }
973
+ );
974
+ server.registerTool(
975
+ "human_upload",
976
+ {
977
+ title: "Upload file(s) (humanized)",
978
+ description: `Attaches file(s) to a file input \u2014 moves the cursor to the control, then sets the files (never opens the OS dialog, which would hang). For safety, files are read by basename from HUMANJS_UPLOAD_DIR (default: the server working dir) \u2014 subdirectories, "../", and absolute paths are rejected, so the agent can't read and exfiltrate arbitrary local files. Pass the <input type="file"> selector and the filename(s).`,
979
+ inputSchema: {
980
+ selector: z.string().describe("Selector of the file input."),
981
+ files: z.union([z.string(), z.array(z.string())]).describe("Filename(s) inside HUMANJS_UPLOAD_DIR \u2014 a basename only, no path components."),
982
+ session: sessionArg2
983
+ }
984
+ },
985
+ async ({ selector, files, session }) => {
986
+ const { human } = await sessions.get(session);
987
+ const names = Array.isArray(files) ? files : [files];
988
+ const paths = names.map((name) => resolveUploadPath(env.uploadDir, name));
989
+ await human.upload(selector, paths);
990
+ return {
991
+ content: [{ type: "text", text: `uploaded ${paths.length} file(s) to ${selector}` }]
992
+ };
993
+ }
994
+ );
853
995
  server.registerTool(
854
996
  "human_press",
855
997
  {
@@ -952,32 +1094,32 @@ function registerRecordingTools(server, { sessions, env }) {
952
1094
  "human_stop_recording",
953
1095
  {
954
1096
  title: "Stop recording and save",
955
- description: `Stops the active recording and writes it to one or more files in HUMANJS_OUTPUT_DIR. Each filename's extension picks its format: .mp4/.webm = video, .gif = animated gif, .json = action timeline. Pass several to export the same recording multiple ways, e.g. ["demo.mp4", "demo.json"] for video + timeline. Path components are rejected for safety.`,
1097
+ description: `Stops the active recording and writes it to one or more files in HUMANJS_OUTPUT_DIR. Each filename's extension picks its format: .mp4/.webm = video, .gif = animated gif, .json = action timeline, .ts = runnable HumanJS script, .spec.ts/.test.ts = @playwright/test spec (humanized, with derived assertions). Pass several to export the same recording multiple ways, e.g. ["demo.mp4", "checkout.spec.ts"] for a video plus a ready-to-commit test. Path components are rejected for safety.`,
956
1098
  inputSchema: {
957
1099
  filenames: z.array(z.string()).min(1).describe(
958
- 'One or more output filenames. The recording is saved to each, format chosen by extension. e.g. ["demo.mp4"] or ["demo.mp4", "demo.gif", "demo.json"].'
1100
+ 'One or more output filenames. The recording is saved to each, format chosen by extension. e.g. ["demo.mp4"], ["checkout.spec.ts"], or ["demo.mp4", "demo.json", "demo.ts"].'
959
1101
  ),
960
1102
  session: z.string().optional().describe("Session ID. Omit for the default session.")
961
1103
  }
962
1104
  },
963
1105
  async ({ filenames, session }) => {
964
- const targets = filenames.map((filename) => ({
965
- path: resolveOutputPath(env.outputDir, filename),
966
- ext: extname(filename).toLowerCase()
967
- }));
968
- for (const { ext } of targets) {
969
- if (ext !== ".mp4" && ext !== ".webm" && ext !== ".gif" && ext !== ".json") {
1106
+ const targets = filenames.map((filename) => {
1107
+ const format = resolveRecordingFormat(filename);
1108
+ if (format === null) {
970
1109
  throw new Error(
971
- `Unsupported output extension "${ext}". Use .mp4, .webm, .gif, or .json.`
1110
+ `Unsupported output extension for "${filename}". Use .mp4/.webm (video), .gif, .json (timeline), .ts (HumanJS script), or .spec.ts/.test.ts (Playwright test).`
972
1111
  );
973
1112
  }
974
- }
1113
+ return { path: resolveOutputPath(env.outputDir, filename), format };
1114
+ });
975
1115
  const recording = await sessions.stopRecording(session);
976
1116
  try {
977
1117
  const saved = [];
978
- for (const { path, ext } of targets) {
979
- if (ext === ".gif") saved.push(await recording.toGif(path));
980
- else if (ext === ".json") saved.push(await recording.toTimeline(path));
1118
+ for (const { path, format } of targets) {
1119
+ if (format === "gif") saved.push(await recording.toGif(path));
1120
+ else if (format === "timeline") saved.push(await recording.toTimeline(path));
1121
+ else if (format === "humanjs") saved.push(await recording.toHumanJS(path));
1122
+ else if (format === "playwright") saved.push(await recording.toPlaywright(path));
981
1123
  else saved.push(await recording.toVideo(path));
982
1124
  }
983
1125
  return { content: [{ type: "text", text: `saved recording to:
@@ -1062,6 +1204,10 @@ Recording a flow (the natural-looking way):
1062
1204
  1. EXPLORE FIRST (un-recorded). Navigate the flow once to discover correct, unambiguous selectors (human_screenshot / human_get_html / human_get_attribute). Do this by default whenever the selectors aren't already known \u2014 no need for the user to ask. Skip it only if the selectors are already known or the user tells you not to explore.
1063
1205
  2. THEN RECORD ONE CLEAN RUN AS A SINGLE BATCH: human_start_recording + every action + human_stop_recording, all emitted in one turn. Keep selector-guessing and fumbles out of the take.
1064
1206
 
1207
+ Export as a test: human_stop_recording picks format by extension. A .spec.ts (or .test.ts) filename writes a ready-to-commit @playwright/test with derived assertions; a .ts writes a standalone HumanJS script; .mp4/.webm/.gif/.json are video/timeline. So "record this flow and save it as a test" = run the clean pass, then stop into e.g. "checkout.spec.ts".
1208
+
1209
+ Captured input + passwords: typed/pasted text IS recorded into the timeline and code exports, so generated scripts/tests are runnable \u2014 EXCEPT password fields, which are always masked (emitted as an empty string with a "fill in" comment). This is intentional, not a bug; don't work around it by hand-editing the secret back in. If the user explicitly wants the flow to log in, edit the exported file to read the credential from an env var (e.g. process.env.APP_PASSWORD) and tell them to set it \u2014 never hardcode a real password into a file that may be committed.
1210
+
1065
1211
  Dynamic UI: prefer specific selectors (role, aria-label) over text \u2014 the same visible text often matches several cards before a filter, or the wrong one after. If a click reports multiple matches, narrow the selector.
1066
1212
 
1067
1213
  Browser state: by default each run is a fresh, signed-out browser. If a flow needs a login, tell the user to enable persistence (human_enable_persistence or HUMANJS_PERSIST) or CDP attach \u2014 see human_browser_info.`;