@midscene/core 0.28.2-beta-20250910072710.0 → 0.28.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +17 -27
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +31 -43
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +3 -4
- package/dist/es/agent/utils.mjs.map +1 -1
- package/dist/es/ai-model/common.mjs +26 -16
- package/dist/es/ai-model/common.mjs.map +1 -1
- package/dist/es/ai-model/index.mjs +3 -3
- package/dist/es/ai-model/inspect.mjs +34 -26
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +15 -14
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/common.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/playwright-generator.mjs +12 -6
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/util.mjs +3 -3
- package/dist/es/ai-model/prompt/util.mjs.map +1 -1
- package/dist/es/ai-model/prompt/yaml-generator.mjs +12 -6
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/index.mjs +23 -28
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/ai-model/ui-tars-planning.mjs +10 -10
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
- package/dist/es/insight/index.mjs +26 -20
- package/dist/es/insight/index.mjs.map +1 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +5 -4
- package/dist/es/utils.mjs.map +1 -1
- package/dist/lib/agent/agent.js +16 -26
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/tasks.js +31 -43
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/utils.js +3 -4
- package/dist/lib/agent/utils.js.map +1 -1
- package/dist/lib/ai-model/common.js +31 -18
- package/dist/lib/ai-model/common.js.map +1 -1
- package/dist/lib/ai-model/index.js +15 -12
- package/dist/lib/ai-model/inspect.js +32 -24
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +14 -13
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/common.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/playwright-generator.js +11 -5
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
- package/dist/lib/ai-model/prompt/util.js +3 -3
- package/dist/lib/ai-model/prompt/util.js.map +1 -1
- package/dist/lib/ai-model/prompt/yaml-generator.js +11 -5
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
- package/dist/lib/ai-model/service-caller/index.js +31 -36
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/ai-model/ui-tars-planning.js +8 -8
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
- package/dist/lib/insight/index.js +23 -17
- package/dist/lib/insight/index.js.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +4 -3
- package/dist/lib/utils.js.map +1 -1
- package/dist/types/agent/agent.d.ts +0 -2
- package/dist/types/agent/tasks.d.ts +7 -8
- package/dist/types/agent/utils.d.ts +1 -3
- package/dist/types/ai-model/common.d.ts +11 -7
- package/dist/types/ai-model/index.d.ts +2 -2
- package/dist/types/ai-model/inspect.d.ts +6 -7
- package/dist/types/ai-model/llm-planning.d.ts +2 -2
- package/dist/types/ai-model/prompt/common.d.ts +2 -2
- package/dist/types/ai-model/prompt/llm-locator.d.ts +2 -2
- package/dist/types/ai-model/prompt/llm-planning.d.ts +3 -3
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +2 -2
- package/dist/types/ai-model/prompt/playwright-generator.d.ts +2 -3
- package/dist/types/ai-model/prompt/util.d.ts +2 -3
- package/dist/types/ai-model/prompt/yaml-generator.d.ts +2 -3
- package/dist/types/ai-model/service-caller/index.d.ts +5 -5
- package/dist/types/ai-model/ui-tars-planning.d.ts +3 -3
- package/dist/types/device/index.d.ts +2 -2
- package/dist/types/insight/index.d.ts +7 -12
- package/dist/types/types.d.ts +9 -0
- package/dist/types/utils.d.ts +1 -2
- package/package.json +3 -3
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/yaml-generator.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/yaml-generator.ts"],"sourcesContent":["import type {\n StreamingAIResponse,\n StreamingCodeGenerationOptions,\n} from '@/types';\nimport { YAML_EXAMPLE_CODE } from '@midscene/shared/constants';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport {\n AIActionType,\n type ChatCompletionMessageParam,\n callAI,\n} from '../index';\n\n// Common interfaces for test generation (shared between YAML and Playwright)\nexport interface EventCounts {\n navigation: number;\n click: number;\n input: number;\n scroll: number;\n total: number;\n}\n\nexport interface InputDescription {\n description: string;\n value: string;\n}\n\nexport interface ProcessedEvent {\n type: string;\n timestamp: number;\n url?: string;\n title?: string;\n elementDescription?: string;\n value?: string;\n pageInfo?: any;\n elementRect?: any;\n}\n\nexport interface EventSummary {\n testName: string;\n startUrl: string;\n eventCounts: EventCounts;\n urls: string[];\n clickDescriptions: string[];\n inputDescriptions: InputDescription[];\n events: ProcessedEvent[];\n}\n\n// Common ChromeRecordedEvent interface\nexport interface ChromeRecordedEvent {\n type: string;\n timestamp: number;\n url?: string;\n title?: string;\n elementDescription?: string;\n value?: string;\n pageInfo?: any;\n elementRect?: any;\n screenshotBefore?: string;\n screenshotAfter?: string;\n screenshotWithBox?: string;\n}\n\nexport interface YamlGenerationOptions {\n testName?: string;\n includeTimestamps?: boolean;\n maxScreenshots?: number;\n description?: string;\n}\n\nexport interface FilteredEvents {\n navigationEvents: ChromeRecordedEvent[];\n clickEvents: ChromeRecordedEvent[];\n inputEvents: ChromeRecordedEvent[];\n scrollEvents: ChromeRecordedEvent[];\n}\n\n// Common utility functions (shared between YAML and Playwright generators)\n\n/**\n * Get screenshots from events for LLM context\n */\nexport const getScreenshotsForLLM = (\n events: ChromeRecordedEvent[],\n maxScreenshots = 1,\n): string[] => {\n // Find events with screenshots, prioritizing navigation and click events\n const eventsWithScreenshots = events.filter(\n (event) =>\n event.screenshotBefore ||\n event.screenshotAfter ||\n event.screenshotWithBox,\n );\n\n // Sort them by priority (navigation first, then clicks, then others)\n const sortedEvents = [...eventsWithScreenshots].sort((a, b) => {\n if (a.type === 'navigation' && b.type !== 'navigation') return -1;\n if (a.type !== 'navigation' && b.type === 'navigation') return 1;\n if (a.type === 'click' && b.type !== 'click') return -1;\n if (a.type !== 'click' && b.type === 'click') return 1;\n return 0;\n });\n\n // Extract up to maxScreenshots screenshots\n const screenshots: string[] = [];\n for (const event of sortedEvents) {\n // Prefer the most informative screenshot\n const screenshot =\n event.screenshotWithBox ||\n event.screenshotAfter ||\n event.screenshotBefore;\n if (screenshot && !screenshots.includes(screenshot)) {\n screenshots.push(screenshot);\n if (screenshots.length >= maxScreenshots) break;\n }\n }\n\n return screenshots;\n};\n\n/**\n * Filter events by type for easier processing\n */\nexport const filterEventsByType = (\n events: ChromeRecordedEvent[],\n): FilteredEvents => {\n return {\n navigationEvents: events.filter((event) => event.type === 'navigation'),\n clickEvents: events.filter((event) => event.type === 'click'),\n inputEvents: events.filter((event) => event.type === 'input'),\n scrollEvents: events.filter((event) => event.type === 'scroll'),\n };\n};\n\n/**\n * Create event counts summary\n */\nexport const createEventCounts = (\n filteredEvents: FilteredEvents,\n totalEvents: number,\n): EventCounts => {\n return {\n navigation: filteredEvents.navigationEvents.length,\n click: filteredEvents.clickEvents.length,\n input: filteredEvents.inputEvents.length,\n scroll: filteredEvents.scrollEvents.length,\n total: totalEvents,\n };\n};\n\n/**\n * Extract input descriptions from input events\n */\nexport const extractInputDescriptions = (\n inputEvents: ChromeRecordedEvent[],\n): InputDescription[] => {\n return inputEvents\n .map((event) => ({\n description: event.elementDescription || '',\n value: event.value || '',\n }))\n .filter((item) => item.description && item.value);\n};\n\n/**\n * Process events for LLM consumption\n */\nexport const processEventsForLLM = (\n events: ChromeRecordedEvent[],\n): ProcessedEvent[] => {\n return events.map((event) => ({\n type: event.type,\n timestamp: event.timestamp,\n url: event.url,\n title: event.title,\n elementDescription: event.elementDescription,\n value: event.value,\n pageInfo: event.pageInfo,\n elementRect: event.elementRect,\n }));\n};\n\n/**\n * Prepare comprehensive event summary for LLM\n */\nexport const prepareEventSummary = (\n events: ChromeRecordedEvent[],\n options: { testName?: string; maxScreenshots?: number } = {},\n): EventSummary => {\n const filteredEvents = filterEventsByType(events);\n const eventCounts = createEventCounts(filteredEvents, events.length);\n\n // Extract useful information from events\n const startUrl =\n filteredEvents.navigationEvents.length > 0\n ? filteredEvents.navigationEvents[0].url || ''\n : '';\n\n const clickDescriptions = filteredEvents.clickEvents\n .map((event) => event.elementDescription)\n .filter((desc): desc is string => Boolean(desc))\n .slice(0, 10);\n\n const inputDescriptions = extractInputDescriptions(\n filteredEvents.inputEvents,\n ).slice(0, 10);\n\n const urls = filteredEvents.navigationEvents\n .map((e) => e.url)\n .filter((url): url is string => Boolean(url))\n .slice(0, 5);\n\n const processedEvents = processEventsForLLM(events);\n\n return {\n testName: options.testName || 'Automated test from recorded events',\n startUrl,\n eventCounts,\n urls,\n clickDescriptions,\n inputDescriptions,\n events: processedEvents,\n };\n};\n\n/**\n * Create message content for LLM with optional screenshots\n */\nexport const createMessageContent = (\n promptText: string,\n screenshots: string[] = [],\n includeScreenshots = true,\n) => {\n const messageContent: any[] = [\n {\n type: 'text',\n text: promptText,\n },\n ];\n\n // Add screenshots if available and requested\n if (includeScreenshots && screenshots.length > 0) {\n messageContent.unshift({\n type: 'text',\n text: 'Here are screenshots from the recording session to help you understand the context:',\n });\n\n screenshots.forEach((screenshot) => {\n messageContent.push({\n type: 'image_url',\n image_url: {\n url: screenshot,\n },\n });\n });\n }\n\n return messageContent;\n};\n\n/**\n * Validate events before processing\n */\nexport const validateEvents = (events: ChromeRecordedEvent[]): void => {\n if (!events.length) {\n throw new Error('No events provided for test generation');\n }\n};\n\n// YAML-specific generation functions\n\n/**\n * Generates YAML test configuration from recorded events using AI\n */\nexport const generateYamlTest = async (\n events: ChromeRecordedEvent[],\n options: YamlGenerationOptions,\n modelConfig: IModelConfig,\n): Promise<string> => {\n try {\n // Validate input\n validateEvents(events);\n\n // Prepare event summary using shared utilities\n const summary = prepareEventSummary(events, {\n testName: options.testName,\n maxScreenshots: options.maxScreenshots || 3,\n });\n\n // Add YAML-specific options to summary\n const yamlSummary = {\n ...summary,\n includeTimestamps: options.includeTimestamps || false,\n };\n\n // Get screenshots for visual context\n const screenshots = getScreenshotsForLLM(\n events,\n options.maxScreenshots || 3,\n );\n\n // Use LLM to generate the YAML test configuration\n const prompt: ChatCompletionMessageParam[] = [\n {\n role: 'system',\n content: `You are an expert in Midscene.js YAML test generation. Generate clean, accurate YAML following these rules: ${YAML_EXAMPLE_CODE}`,\n },\n {\n role: 'user',\n content: `Generate YAML test for Midscene.js automation from recorded browser events.\n\nEvent Summary:\n${JSON.stringify(yamlSummary, null, 2)}\n\nConvert events:\n- navigation → target.url\n- click → aiTap with element description\n- input → aiInput with value and locate\n- scroll → aiScroll with appropriate direction\n- Add aiAssert for important state changes\n\nRespond with YAML only, no explanations.`,\n },\n ];\n\n // Add screenshots if available and requested\n if (screenshots.length > 0) {\n prompt.push({\n role: 'user',\n content:\n 'Here are screenshots from the recording session to help you understand the context:',\n });\n\n prompt.push({\n role: 'user',\n content: screenshots.map((screenshot) => ({\n type: 'image_url',\n image_url: {\n url: screenshot,\n },\n })),\n });\n }\n\n const response = await callAI(\n prompt,\n AIActionType.EXTRACT_DATA,\n modelConfig,\n );\n\n if (response?.content && typeof response.content === 'string') {\n return response.content;\n }\n\n throw new Error('Failed to generate YAML test configuration');\n } catch (error) {\n throw new Error(`Failed to generate YAML test: ${error}`);\n }\n};\n\n/**\n * Generates YAML test configuration from recorded events using AI with streaming support\n */\nexport const generateYamlTestStream = async (\n events: ChromeRecordedEvent[],\n options: YamlGenerationOptions & StreamingCodeGenerationOptions,\n modelConfig: IModelConfig,\n): Promise<StreamingAIResponse> => {\n try {\n // Validate input\n validateEvents(events);\n\n // Prepare event summary using shared utilities\n const summary = prepareEventSummary(events, {\n testName: options.testName,\n maxScreenshots: options.maxScreenshots || 3,\n });\n\n // Add YAML-specific options to summary\n const yamlSummary = {\n ...summary,\n includeTimestamps: options.includeTimestamps || false,\n };\n\n // Get screenshots for visual context\n const screenshots = getScreenshotsForLLM(\n events,\n options.maxScreenshots || 3,\n );\n\n // Use LLM to generate the YAML test configuration\n const prompt: ChatCompletionMessageParam[] = [\n {\n role: 'system',\n content: `You are an expert in Midscene.js YAML test generation. Generate clean, accurate YAML following these rules: ${YAML_EXAMPLE_CODE}`,\n },\n {\n role: 'user',\n content: `Generate YAML test for Midscene.js automation from recorded browser events.\n\nEvent Summary:\n${JSON.stringify(yamlSummary, null, 2)}\n\nConvert events:\n- navigation → target.url\n- click → aiTap with element description\n- input → aiInput with value and locate\n- scroll → aiScroll with appropriate direction\n- Add aiAssert for important state changes\n\nRespond with YAML only, no explanations.`,\n },\n ];\n\n // Add screenshots if available and requested\n if (screenshots.length > 0) {\n prompt.push({\n role: 'user',\n content:\n 'Here are screenshots from the recording session to help you understand the context:',\n });\n\n prompt.push({\n role: 'user',\n content: screenshots.map((screenshot) => ({\n type: 'image_url',\n image_url: {\n url: screenshot,\n },\n })),\n });\n }\n\n if (options.stream && options.onChunk) {\n // Use streaming\n return await callAI(prompt, AIActionType.EXTRACT_DATA, modelConfig, {\n stream: true,\n onChunk: options.onChunk,\n });\n } else {\n // Fallback to non-streaming\n const response = await callAI(\n prompt,\n AIActionType.EXTRACT_DATA,\n modelConfig,\n );\n\n if (response?.content && typeof response.content === 'string') {\n return {\n content: response.content,\n usage: response.usage,\n isStreamed: false,\n };\n }\n\n throw new Error('Failed to generate YAML test configuration');\n }\n } catch (error) {\n throw new Error(`Failed to generate YAML test: ${error}`);\n }\n};\n"],"names":["getScreenshotsForLLM","events","maxScreenshots","eventsWithScreenshots","event","sortedEvents","a","b","screenshots","screenshot","filterEventsByType","createEventCounts","filteredEvents","totalEvents","extractInputDescriptions","inputEvents","item","processEventsForLLM","prepareEventSummary","options","eventCounts","startUrl","clickDescriptions","desc","Boolean","inputDescriptions","urls","e","url","processedEvents","createMessageContent","promptText","includeScreenshots","messageContent","validateEvents","Error","generateYamlTest","modelConfig","summary","yamlSummary","prompt","YAML_EXAMPLE_CODE","JSON","response","callAI","AIActionType","error","generateYamlTestStream"],"mappings":";;AAiFO,MAAMA,uBAAuB,CAClCC,QACAC,iBAAiB,CAAC;IAGlB,MAAMC,wBAAwBF,OAAO,MAAM,CACzC,CAACG,QACCA,MAAM,gBAAgB,IACtBA,MAAM,eAAe,IACrBA,MAAM,iBAAiB;IAI3B,MAAMC,eAAe;WAAIF;KAAsB,CAAC,IAAI,CAAC,CAACG,GAAGC;QACvD,IAAID,AAAW,iBAAXA,EAAE,IAAI,IAAqBC,AAAW,iBAAXA,EAAE,IAAI,EAAmB,OAAO;QAC/D,IAAID,AAAW,iBAAXA,EAAE,IAAI,IAAqBC,AAAW,iBAAXA,EAAE,IAAI,EAAmB,OAAO;QAC/D,IAAID,AAAW,YAAXA,EAAE,IAAI,IAAgBC,AAAW,YAAXA,EAAE,IAAI,EAAc,OAAO;QACrD,IAAID,AAAW,YAAXA,EAAE,IAAI,IAAgBC,AAAW,YAAXA,EAAE,IAAI,EAAc,OAAO;QACrD,OAAO;IACT;IAGA,MAAMC,cAAwB,EAAE;IAChC,KAAK,MAAMJ,SAASC,aAAc;QAEhC,MAAMI,aACJL,MAAM,iBAAiB,IACvBA,MAAM,eAAe,IACrBA,MAAM,gBAAgB;QACxB,IAAIK,cAAc,CAACD,YAAY,QAAQ,CAACC,aAAa;YACnDD,YAAY,IAAI,CAACC;YACjB,IAAID,YAAY,MAAM,IAAIN,gBAAgB;QAC5C;IACF;IAEA,OAAOM;AACT;AAKO,MAAME,qBAAqB,CAChCT,SAEO;QACL,kBAAkBA,OAAO,MAAM,CAAC,CAACG,QAAUA,AAAe,iBAAfA,MAAM,IAAI;QACrD,aAAaH,OAAO,MAAM,CAAC,CAACG,QAAUA,AAAe,YAAfA,MAAM,IAAI;QAChD,aAAaH,OAAO,MAAM,CAAC,CAACG,QAAUA,AAAe,YAAfA,MAAM,IAAI;QAChD,cAAcH,OAAO,MAAM,CAAC,CAACG,QAAUA,AAAe,aAAfA,MAAM,IAAI;IACnD;AAMK,MAAMO,oBAAoB,CAC/BC,gBACAC,cAEO;QACL,YAAYD,eAAe,gBAAgB,CAAC,MAAM;QAClD,OAAOA,eAAe,WAAW,CAAC,MAAM;QACxC,OAAOA,eAAe,WAAW,CAAC,MAAM;QACxC,QAAQA,eAAe,YAAY,CAAC,MAAM;QAC1C,OAAOC;IACT;AAMK,MAAMC,2BAA2B,CACtCC,cAEOA,YACJ,GAAG,CAAC,CAACX,QAAW;YACf,aAAaA,MAAM,kBAAkB,IAAI;YACzC,OAAOA,MAAM,KAAK,IAAI;QACxB,IACC,MAAM,CAAC,CAACY,OAASA,KAAK,WAAW,IAAIA,KAAK,KAAK;AAM7C,MAAMC,sBAAsB,CACjChB,SAEOA,OAAO,GAAG,CAAC,CAACG,QAAW;YAC5B,MAAMA,MAAM,IAAI;YAChB,WAAWA,MAAM,SAAS;YAC1B,KAAKA,MAAM,GAAG;YACd,OAAOA,MAAM,KAAK;YAClB,oBAAoBA,MAAM,kBAAkB;YAC5C,OAAOA,MAAM,KAAK;YAClB,UAAUA,MAAM,QAAQ;YACxB,aAAaA,MAAM,WAAW;QAChC;AAMK,MAAMc,sBAAsB,CACjCjB,QACAkB,UAA0D,CAAC,CAAC;IAE5D,MAAMP,iBAAiBF,mBAAmBT;IAC1C,MAAMmB,cAAcT,kBAAkBC,gBAAgBX,OAAO,MAAM;IAGnE,MAAMoB,WACJT,eAAe,gBAAgB,CAAC,MAAM,GAAG,IACrCA,eAAe,gBAAgB,CAAC,EAAE,CAAC,GAAG,IAAI,KAC1C;IAEN,MAAMU,oBAAoBV,eAAe,WAAW,CACjD,GAAG,CAAC,CAACR,QAAUA,MAAM,kBAAkB,EACvC,MAAM,CAAC,CAACmB,OAAyBC,QAAQD,OACzC,KAAK,CAAC,GAAG;IAEZ,MAAME,oBAAoBX,yBACxBF,eAAe,WAAW,EAC1B,KAAK,CAAC,GAAG;IAEX,MAAMc,OAAOd,eAAe,gBAAgB,CACzC,GAAG,CAAC,CAACe,IAAMA,EAAE,GAAG,EAChB,MAAM,CAAC,CAACC,MAAuBJ,QAAQI,MACvC,KAAK,CAAC,GAAG;IAEZ,MAAMC,kBAAkBZ,oBAAoBhB;IAE5C,OAAO;QACL,UAAUkB,QAAQ,QAAQ,IAAI;QAC9BE;QACAD;QACAM;QACAJ;QACAG;QACA,QAAQI;IACV;AACF;AAKO,MAAMC,uBAAuB,CAClCC,YACAvB,cAAwB,EAAE,EAC1BwB,qBAAqB,IAAI;IAEzB,MAAMC,iBAAwB;QAC5B;YACE,MAAM;YACN,MAAMF;QACR;KACD;IAGD,IAAIC,sBAAsBxB,YAAY,MAAM,GAAG,GAAG;QAChDyB,eAAe,OAAO,CAAC;YACrB,MAAM;YACN,MAAM;QACR;QAEAzB,YAAY,OAAO,CAAC,CAACC;YACnBwB,eAAe,IAAI,CAAC;gBAClB,MAAM;gBACN,WAAW;oBACT,KAAKxB;gBACP;YACF;QACF;IACF;IAEA,OAAOwB;AACT;AAKO,MAAMC,iBAAiB,CAACjC;IAC7B,IAAI,CAACA,OAAO,MAAM,EAChB,MAAM,IAAIkC,MAAM;AAEpB;AAOO,MAAMC,mBAAmB,OAC9BnC,QACAkB,SACAkB;IAEA,IAAI;QAEFH,eAAejC;QAGf,MAAMqC,UAAUpB,oBAAoBjB,QAAQ;YAC1C,UAAUkB,QAAQ,QAAQ;YAC1B,gBAAgBA,QAAQ,cAAc,IAAI;QAC5C;QAGA,MAAMoB,cAAc;YAClB,GAAGD,OAAO;YACV,mBAAmBnB,QAAQ,iBAAiB,IAAI;QAClD;QAGA,MAAMX,cAAcR,qBAClBC,QACAkB,QAAQ,cAAc,IAAI;QAI5B,MAAMqB,SAAuC;YAC3C;gBACE,MAAM;gBACN,SAAS,CAAC,4GAA4G,EAAEC,mBAAmB;YAC7I;YACA;gBACE,MAAM;gBACN,SAAS,CAAC;;;AAGlB,EAAEC,KAAK,SAAS,CAACH,aAAa,MAAM,GAAG;;;;;;;;;wCASC,CAAC;YACnC;SACD;QAGD,IAAI/B,YAAY,MAAM,GAAG,GAAG;YAC1BgC,OAAO,IAAI,CAAC;gBACV,MAAM;gBACN,SACE;YACJ;YAEAA,OAAO,IAAI,CAAC;gBACV,MAAM;gBACN,SAAShC,YAAY,GAAG,CAAC,CAACC,aAAgB;wBACxC,MAAM;wBACN,WAAW;4BACT,KAAKA;wBACP;oBACF;YACF;QACF;QAEA,MAAMkC,WAAW,MAAMC,OACrBJ,QACAK,aAAa,YAAY,EACzBR;QAGF,IAAIM,AAAAA,CAAAA,QAAAA,WAAAA,KAAAA,IAAAA,SAAU,OAAO,AAAD,KAAK,AAA4B,YAA5B,OAAOA,SAAS,OAAO,EAC9C,OAAOA,SAAS,OAAO;QAGzB,MAAM,IAAIR,MAAM;IAClB,EAAE,OAAOW,OAAO;QACd,MAAM,IAAIX,MAAM,CAAC,8BAA8B,EAAEW,OAAO;IAC1D;AACF;AAKO,MAAMC,yBAAyB,OACpC9C,QACAkB,SACAkB;IAEA,IAAI;QAEFH,eAAejC;QAGf,MAAMqC,UAAUpB,oBAAoBjB,QAAQ;YAC1C,UAAUkB,QAAQ,QAAQ;YAC1B,gBAAgBA,QAAQ,cAAc,IAAI;QAC5C;QAGA,MAAMoB,cAAc;YAClB,GAAGD,OAAO;YACV,mBAAmBnB,QAAQ,iBAAiB,IAAI;QAClD;QAGA,MAAMX,cAAcR,qBAClBC,QACAkB,QAAQ,cAAc,IAAI;QAI5B,MAAMqB,SAAuC;YAC3C;gBACE,MAAM;gBACN,SAAS,CAAC,4GAA4G,EAAEC,mBAAmB;YAC7I;YACA;gBACE,MAAM;gBACN,SAAS,CAAC;;;AAGlB,EAAEC,KAAK,SAAS,CAACH,aAAa,MAAM,GAAG;;;;;;;;;wCASC,CAAC;YACnC;SACD;QAGD,IAAI/B,YAAY,MAAM,GAAG,GAAG;YAC1BgC,OAAO,IAAI,CAAC;gBACV,MAAM;gBACN,SACE;YACJ;YAEAA,OAAO,IAAI,CAAC;gBACV,MAAM;gBACN,SAAShC,YAAY,GAAG,CAAC,CAACC,aAAgB;wBACxC,MAAM;wBACN,WAAW;4BACT,KAAKA;wBACP;oBACF;YACF;QACF;QAEA,IAAIU,QAAQ,MAAM,IAAIA,QAAQ,OAAO,EAEnC,OAAO,MAAMyB,OAAOJ,QAAQK,aAAa,YAAY,EAAER,aAAa;YAClE,QAAQ;YACR,SAASlB,QAAQ,OAAO;QAC1B;QACK;YAEL,MAAMwB,WAAW,MAAMC,OACrBJ,QACAK,aAAa,YAAY,EACzBR;YAGF,IAAIM,AAAAA,CAAAA,QAAAA,WAAAA,KAAAA,IAAAA,SAAU,OAAO,AAAD,KAAK,AAA4B,YAA5B,OAAOA,SAAS,OAAO,EAC9C,OAAO;gBACL,SAASA,SAAS,OAAO;gBACzB,OAAOA,SAAS,KAAK;gBACrB,YAAY;YACd;YAGF,MAAM,IAAIR,MAAM;QAClB;IACF,EAAE,OAAOW,OAAO;QACd,MAAM,IAAIX,MAAM,CAAC,8BAA8B,EAAEW,OAAO;IAC1D;AACF"}
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/yaml-generator.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/yaml-generator.ts"],"sourcesContent":["import type {\n StreamingAIResponse,\n StreamingCodeGenerationOptions,\n} from '@/types';\nimport { YAML_EXAMPLE_CODE } from '@midscene/shared/constants';\nimport {\n AIActionType,\n type ChatCompletionMessageParam,\n callAi,\n} from '../index';\n\n// Common interfaces for test generation (shared between YAML and Playwright)\nexport interface EventCounts {\n navigation: number;\n click: number;\n input: number;\n scroll: number;\n total: number;\n}\n\nexport interface InputDescription {\n description: string;\n value: string;\n}\n\nexport interface ProcessedEvent {\n type: string;\n timestamp: number;\n url?: string;\n title?: string;\n elementDescription?: string;\n value?: string;\n pageInfo?: any;\n elementRect?: any;\n}\n\nexport interface EventSummary {\n testName: string;\n startUrl: string;\n eventCounts: EventCounts;\n urls: string[];\n clickDescriptions: string[];\n inputDescriptions: InputDescription[];\n events: ProcessedEvent[];\n}\n\n// Common ChromeRecordedEvent interface\nexport interface ChromeRecordedEvent {\n type: string;\n timestamp: number;\n url?: string;\n title?: string;\n elementDescription?: string;\n value?: string;\n pageInfo?: any;\n elementRect?: any;\n screenshotBefore?: string;\n screenshotAfter?: string;\n screenshotWithBox?: string;\n}\n\nexport interface YamlGenerationOptions {\n testName?: string;\n includeTimestamps?: boolean;\n maxScreenshots?: number;\n description?: string;\n}\n\nexport interface FilteredEvents {\n navigationEvents: ChromeRecordedEvent[];\n clickEvents: ChromeRecordedEvent[];\n inputEvents: ChromeRecordedEvent[];\n scrollEvents: ChromeRecordedEvent[];\n}\n\n// Common utility functions (shared between YAML and Playwright generators)\n\n/**\n * Get screenshots from events for LLM context\n */\nexport const getScreenshotsForLLM = (\n events: ChromeRecordedEvent[],\n maxScreenshots = 1,\n): string[] => {\n // Find events with screenshots, prioritizing navigation and click events\n const eventsWithScreenshots = events.filter(\n (event) =>\n event.screenshotBefore ||\n event.screenshotAfter ||\n event.screenshotWithBox,\n );\n\n // Sort them by priority (navigation first, then clicks, then others)\n const sortedEvents = [...eventsWithScreenshots].sort((a, b) => {\n if (a.type === 'navigation' && b.type !== 'navigation') return -1;\n if (a.type !== 'navigation' && b.type === 'navigation') return 1;\n if (a.type === 'click' && b.type !== 'click') return -1;\n if (a.type !== 'click' && b.type === 'click') return 1;\n return 0;\n });\n\n // Extract up to maxScreenshots screenshots\n const screenshots: string[] = [];\n for (const event of sortedEvents) {\n // Prefer the most informative screenshot\n const screenshot =\n event.screenshotWithBox ||\n event.screenshotAfter ||\n event.screenshotBefore;\n if (screenshot && !screenshots.includes(screenshot)) {\n screenshots.push(screenshot);\n if (screenshots.length >= maxScreenshots) break;\n }\n }\n\n return screenshots;\n};\n\n/**\n * Filter events by type for easier processing\n */\nexport const filterEventsByType = (\n events: ChromeRecordedEvent[],\n): FilteredEvents => {\n return {\n navigationEvents: events.filter((event) => event.type === 'navigation'),\n clickEvents: events.filter((event) => event.type === 'click'),\n inputEvents: events.filter((event) => event.type === 'input'),\n scrollEvents: events.filter((event) => event.type === 'scroll'),\n };\n};\n\n/**\n * Create event counts summary\n */\nexport const createEventCounts = (\n filteredEvents: FilteredEvents,\n totalEvents: number,\n): EventCounts => {\n return {\n navigation: filteredEvents.navigationEvents.length,\n click: filteredEvents.clickEvents.length,\n input: filteredEvents.inputEvents.length,\n scroll: filteredEvents.scrollEvents.length,\n total: totalEvents,\n };\n};\n\n/**\n * Extract input descriptions from input events\n */\nexport const extractInputDescriptions = (\n inputEvents: ChromeRecordedEvent[],\n): InputDescription[] => {\n return inputEvents\n .map((event) => ({\n description: event.elementDescription || '',\n value: event.value || '',\n }))\n .filter((item) => item.description && item.value);\n};\n\n/**\n * Process events for LLM consumption\n */\nexport const processEventsForLLM = (\n events: ChromeRecordedEvent[],\n): ProcessedEvent[] => {\n return events.map((event) => ({\n type: event.type,\n timestamp: event.timestamp,\n url: event.url,\n title: event.title,\n elementDescription: event.elementDescription,\n value: event.value,\n pageInfo: event.pageInfo,\n elementRect: event.elementRect,\n }));\n};\n\n/**\n * Prepare comprehensive event summary for LLM\n */\nexport const prepareEventSummary = (\n events: ChromeRecordedEvent[],\n options: { testName?: string; maxScreenshots?: number } = {},\n): EventSummary => {\n const filteredEvents = filterEventsByType(events);\n const eventCounts = createEventCounts(filteredEvents, events.length);\n\n // Extract useful information from events\n const startUrl =\n filteredEvents.navigationEvents.length > 0\n ? filteredEvents.navigationEvents[0].url || ''\n : '';\n\n const clickDescriptions = filteredEvents.clickEvents\n .map((event) => event.elementDescription)\n .filter((desc): desc is string => Boolean(desc))\n .slice(0, 10);\n\n const inputDescriptions = extractInputDescriptions(\n filteredEvents.inputEvents,\n ).slice(0, 10);\n\n const urls = filteredEvents.navigationEvents\n .map((e) => e.url)\n .filter((url): url is string => Boolean(url))\n .slice(0, 5);\n\n const processedEvents = processEventsForLLM(events);\n\n return {\n testName: options.testName || 'Automated test from recorded events',\n startUrl,\n eventCounts,\n urls,\n clickDescriptions,\n inputDescriptions,\n events: processedEvents,\n };\n};\n\n/**\n * Create message content for LLM with optional screenshots\n */\nexport const createMessageContent = (\n promptText: string,\n screenshots: string[] = [],\n includeScreenshots = true,\n) => {\n const messageContent: any[] = [\n {\n type: 'text',\n text: promptText,\n },\n ];\n\n // Add screenshots if available and requested\n if (includeScreenshots && screenshots.length > 0) {\n messageContent.unshift({\n type: 'text',\n text: 'Here are screenshots from the recording session to help you understand the context:',\n });\n\n screenshots.forEach((screenshot) => {\n messageContent.push({\n type: 'image_url',\n image_url: {\n url: screenshot,\n },\n });\n });\n }\n\n return messageContent;\n};\n\n/**\n * Validate events before processing\n */\nexport const validateEvents = (events: ChromeRecordedEvent[]): void => {\n if (!events.length) {\n throw new Error('No events provided for test generation');\n }\n};\n\n// YAML-specific generation functions\n\n/**\n * Generates YAML test configuration from recorded events using AI\n */\nexport const generateYamlTest = async (\n events: ChromeRecordedEvent[],\n options: YamlGenerationOptions = {},\n): Promise<string> => {\n try {\n // Validate input\n validateEvents(events);\n\n // Prepare event summary using shared utilities\n const summary = prepareEventSummary(events, {\n testName: options.testName,\n maxScreenshots: options.maxScreenshots || 3,\n });\n\n // Add YAML-specific options to summary\n const yamlSummary = {\n ...summary,\n includeTimestamps: options.includeTimestamps || false,\n };\n\n // Get screenshots for visual context\n const screenshots = getScreenshotsForLLM(\n events,\n options.maxScreenshots || 3,\n );\n\n // Use LLM to generate the YAML test configuration\n const prompt: ChatCompletionMessageParam[] = [\n {\n role: 'system',\n content: `You are an expert in Midscene.js YAML test generation. Generate clean, accurate YAML following these rules: ${YAML_EXAMPLE_CODE}`,\n },\n {\n role: 'user',\n content: `Generate YAML test for Midscene.js automation from recorded browser events.\n\nEvent Summary:\n${JSON.stringify(yamlSummary, null, 2)}\n\nConvert events:\n- navigation → target.url\n- click → aiTap with element description\n- input → aiInput with value and locate\n- scroll → aiScroll with appropriate direction\n- Add aiAssert for important state changes\n\nRespond with YAML only, no explanations.`,\n },\n ];\n\n // Add screenshots if available and requested\n if (screenshots.length > 0) {\n prompt.push({\n role: 'user',\n content:\n 'Here are screenshots from the recording session to help you understand the context:',\n });\n\n prompt.push({\n role: 'user',\n content: screenshots.map((screenshot) => ({\n type: 'image_url',\n image_url: {\n url: screenshot,\n },\n })),\n });\n }\n\n const response = await callAi(prompt, AIActionType.EXTRACT_DATA, {\n intent: 'default',\n });\n\n if (response?.content && typeof response.content === 'string') {\n return response.content;\n }\n\n throw new Error('Failed to generate YAML test configuration');\n } catch (error) {\n throw new Error(`Failed to generate YAML test: ${error}`);\n }\n};\n\n/**\n * Generates YAML test configuration from recorded events using AI with streaming support\n */\nexport const generateYamlTestStream = async (\n events: ChromeRecordedEvent[],\n options: YamlGenerationOptions & StreamingCodeGenerationOptions = {},\n): Promise<StreamingAIResponse> => {\n try {\n // Validate input\n validateEvents(events);\n\n // Prepare event summary using shared utilities\n const summary = prepareEventSummary(events, {\n testName: options.testName,\n maxScreenshots: options.maxScreenshots || 3,\n });\n\n // Add YAML-specific options to summary\n const yamlSummary = {\n ...summary,\n includeTimestamps: options.includeTimestamps || false,\n };\n\n // Get screenshots for visual context\n const screenshots = getScreenshotsForLLM(\n events,\n options.maxScreenshots || 3,\n );\n\n // Use LLM to generate the YAML test configuration\n const prompt: ChatCompletionMessageParam[] = [\n {\n role: 'system',\n content: `You are an expert in Midscene.js YAML test generation. Generate clean, accurate YAML following these rules: ${YAML_EXAMPLE_CODE}`,\n },\n {\n role: 'user',\n content: `Generate YAML test for Midscene.js automation from recorded browser events.\n\nEvent Summary:\n${JSON.stringify(yamlSummary, null, 2)}\n\nConvert events:\n- navigation → target.url\n- click → aiTap with element description\n- input → aiInput with value and locate\n- scroll → aiScroll with appropriate direction\n- Add aiAssert for important state changes\n\nRespond with YAML only, no explanations.`,\n },\n ];\n\n // Add screenshots if available and requested\n if (screenshots.length > 0) {\n prompt.push({\n role: 'user',\n content:\n 'Here are screenshots from the recording session to help you understand the context:',\n });\n\n prompt.push({\n role: 'user',\n content: screenshots.map((screenshot) => ({\n type: 'image_url',\n image_url: {\n url: screenshot,\n },\n })),\n });\n }\n\n if (options.stream && options.onChunk) {\n // Use streaming\n return await callAi(\n prompt,\n AIActionType.EXTRACT_DATA,\n {\n intent: 'default',\n },\n {\n stream: true,\n onChunk: options.onChunk,\n },\n );\n } else {\n // Fallback to non-streaming\n const response = await callAi(prompt, AIActionType.EXTRACT_DATA, {\n intent: 'default',\n });\n\n if (response?.content && typeof response.content === 'string') {\n return {\n content: response.content,\n usage: response.usage,\n isStreamed: false,\n };\n }\n\n throw new Error('Failed to generate YAML test configuration');\n }\n } catch (error) {\n throw new Error(`Failed to generate YAML test: ${error}`);\n }\n};\n"],"names":["getScreenshotsForLLM","events","maxScreenshots","eventsWithScreenshots","event","sortedEvents","a","b","screenshots","screenshot","filterEventsByType","createEventCounts","filteredEvents","totalEvents","extractInputDescriptions","inputEvents","item","processEventsForLLM","prepareEventSummary","options","eventCounts","startUrl","clickDescriptions","desc","Boolean","inputDescriptions","urls","e","url","processedEvents","createMessageContent","promptText","includeScreenshots","messageContent","validateEvents","Error","generateYamlTest","summary","yamlSummary","prompt","YAML_EXAMPLE_CODE","JSON","response","callAi","AIActionType","error","generateYamlTestStream"],"mappings":";;AAgFO,MAAMA,uBAAuB,CAClCC,QACAC,iBAAiB,CAAC;IAGlB,MAAMC,wBAAwBF,OAAO,MAAM,CACzC,CAACG,QACCA,MAAM,gBAAgB,IACtBA,MAAM,eAAe,IACrBA,MAAM,iBAAiB;IAI3B,MAAMC,eAAe;WAAIF;KAAsB,CAAC,IAAI,CAAC,CAACG,GAAGC;QACvD,IAAID,AAAW,iBAAXA,EAAE,IAAI,IAAqBC,AAAW,iBAAXA,EAAE,IAAI,EAAmB,OAAO;QAC/D,IAAID,AAAW,iBAAXA,EAAE,IAAI,IAAqBC,AAAW,iBAAXA,EAAE,IAAI,EAAmB,OAAO;QAC/D,IAAID,AAAW,YAAXA,EAAE,IAAI,IAAgBC,AAAW,YAAXA,EAAE,IAAI,EAAc,OAAO;QACrD,IAAID,AAAW,YAAXA,EAAE,IAAI,IAAgBC,AAAW,YAAXA,EAAE,IAAI,EAAc,OAAO;QACrD,OAAO;IACT;IAGA,MAAMC,cAAwB,EAAE;IAChC,KAAK,MAAMJ,SAASC,aAAc;QAEhC,MAAMI,aACJL,MAAM,iBAAiB,IACvBA,MAAM,eAAe,IACrBA,MAAM,gBAAgB;QACxB,IAAIK,cAAc,CAACD,YAAY,QAAQ,CAACC,aAAa;YACnDD,YAAY,IAAI,CAACC;YACjB,IAAID,YAAY,MAAM,IAAIN,gBAAgB;QAC5C;IACF;IAEA,OAAOM;AACT;AAKO,MAAME,qBAAqB,CAChCT,SAEO;QACL,kBAAkBA,OAAO,MAAM,CAAC,CAACG,QAAUA,AAAe,iBAAfA,MAAM,IAAI;QACrD,aAAaH,OAAO,MAAM,CAAC,CAACG,QAAUA,AAAe,YAAfA,MAAM,IAAI;QAChD,aAAaH,OAAO,MAAM,CAAC,CAACG,QAAUA,AAAe,YAAfA,MAAM,IAAI;QAChD,cAAcH,OAAO,MAAM,CAAC,CAACG,QAAUA,AAAe,aAAfA,MAAM,IAAI;IACnD;AAMK,MAAMO,oBAAoB,CAC/BC,gBACAC,cAEO;QACL,YAAYD,eAAe,gBAAgB,CAAC,MAAM;QAClD,OAAOA,eAAe,WAAW,CAAC,MAAM;QACxC,OAAOA,eAAe,WAAW,CAAC,MAAM;QACxC,QAAQA,eAAe,YAAY,CAAC,MAAM;QAC1C,OAAOC;IACT;AAMK,MAAMC,2BAA2B,CACtCC,cAEOA,YACJ,GAAG,CAAC,CAACX,QAAW;YACf,aAAaA,MAAM,kBAAkB,IAAI;YACzC,OAAOA,MAAM,KAAK,IAAI;QACxB,IACC,MAAM,CAAC,CAACY,OAASA,KAAK,WAAW,IAAIA,KAAK,KAAK;AAM7C,MAAMC,sBAAsB,CACjChB,SAEOA,OAAO,GAAG,CAAC,CAACG,QAAW;YAC5B,MAAMA,MAAM,IAAI;YAChB,WAAWA,MAAM,SAAS;YAC1B,KAAKA,MAAM,GAAG;YACd,OAAOA,MAAM,KAAK;YAClB,oBAAoBA,MAAM,kBAAkB;YAC5C,OAAOA,MAAM,KAAK;YAClB,UAAUA,MAAM,QAAQ;YACxB,aAAaA,MAAM,WAAW;QAChC;AAMK,MAAMc,sBAAsB,CACjCjB,QACAkB,UAA0D,CAAC,CAAC;IAE5D,MAAMP,iBAAiBF,mBAAmBT;IAC1C,MAAMmB,cAAcT,kBAAkBC,gBAAgBX,OAAO,MAAM;IAGnE,MAAMoB,WACJT,eAAe,gBAAgB,CAAC,MAAM,GAAG,IACrCA,eAAe,gBAAgB,CAAC,EAAE,CAAC,GAAG,IAAI,KAC1C;IAEN,MAAMU,oBAAoBV,eAAe,WAAW,CACjD,GAAG,CAAC,CAACR,QAAUA,MAAM,kBAAkB,EACvC,MAAM,CAAC,CAACmB,OAAyBC,QAAQD,OACzC,KAAK,CAAC,GAAG;IAEZ,MAAME,oBAAoBX,yBACxBF,eAAe,WAAW,EAC1B,KAAK,CAAC,GAAG;IAEX,MAAMc,OAAOd,eAAe,gBAAgB,CACzC,GAAG,CAAC,CAACe,IAAMA,EAAE,GAAG,EAChB,MAAM,CAAC,CAACC,MAAuBJ,QAAQI,MACvC,KAAK,CAAC,GAAG;IAEZ,MAAMC,kBAAkBZ,oBAAoBhB;IAE5C,OAAO;QACL,UAAUkB,QAAQ,QAAQ,IAAI;QAC9BE;QACAD;QACAM;QACAJ;QACAG;QACA,QAAQI;IACV;AACF;AAKO,MAAMC,uBAAuB,CAClCC,YACAvB,cAAwB,EAAE,EAC1BwB,qBAAqB,IAAI;IAEzB,MAAMC,iBAAwB;QAC5B;YACE,MAAM;YACN,MAAMF;QACR;KACD;IAGD,IAAIC,sBAAsBxB,YAAY,MAAM,GAAG,GAAG;QAChDyB,eAAe,OAAO,CAAC;YACrB,MAAM;YACN,MAAM;QACR;QAEAzB,YAAY,OAAO,CAAC,CAACC;YACnBwB,eAAe,IAAI,CAAC;gBAClB,MAAM;gBACN,WAAW;oBACT,KAAKxB;gBACP;YACF;QACF;IACF;IAEA,OAAOwB;AACT;AAKO,MAAMC,iBAAiB,CAACjC;IAC7B,IAAI,CAACA,OAAO,MAAM,EAChB,MAAM,IAAIkC,MAAM;AAEpB;AAOO,MAAMC,mBAAmB,OAC9BnC,QACAkB,UAAiC,CAAC,CAAC;IAEnC,IAAI;QAEFe,eAAejC;QAGf,MAAMoC,UAAUnB,oBAAoBjB,QAAQ;YAC1C,UAAUkB,QAAQ,QAAQ;YAC1B,gBAAgBA,QAAQ,cAAc,IAAI;QAC5C;QAGA,MAAMmB,cAAc;YAClB,GAAGD,OAAO;YACV,mBAAmBlB,QAAQ,iBAAiB,IAAI;QAClD;QAGA,MAAMX,cAAcR,qBAClBC,QACAkB,QAAQ,cAAc,IAAI;QAI5B,MAAMoB,SAAuC;YAC3C;gBACE,MAAM;gBACN,SAAS,CAAC,4GAA4G,EAAEC,mBAAmB;YAC7I;YACA;gBACE,MAAM;gBACN,SAAS,CAAC;;;AAGlB,EAAEC,KAAK,SAAS,CAACH,aAAa,MAAM,GAAG;;;;;;;;;wCASC,CAAC;YACnC;SACD;QAGD,IAAI9B,YAAY,MAAM,GAAG,GAAG;YAC1B+B,OAAO,IAAI,CAAC;gBACV,MAAM;gBACN,SACE;YACJ;YAEAA,OAAO,IAAI,CAAC;gBACV,MAAM;gBACN,SAAS/B,YAAY,GAAG,CAAC,CAACC,aAAgB;wBACxC,MAAM;wBACN,WAAW;4BACT,KAAKA;wBACP;oBACF;YACF;QACF;QAEA,MAAMiC,WAAW,MAAMC,OAAOJ,QAAQK,aAAa,YAAY,EAAE;YAC/D,QAAQ;QACV;QAEA,IAAIF,AAAAA,CAAAA,QAAAA,WAAAA,KAAAA,IAAAA,SAAU,OAAO,AAAD,KAAK,AAA4B,YAA5B,OAAOA,SAAS,OAAO,EAC9C,OAAOA,SAAS,OAAO;QAGzB,MAAM,IAAIP,MAAM;IAClB,EAAE,OAAOU,OAAO;QACd,MAAM,IAAIV,MAAM,CAAC,8BAA8B,EAAEU,OAAO;IAC1D;AACF;AAKO,MAAMC,yBAAyB,OACpC7C,QACAkB,UAAkE,CAAC,CAAC;IAEpE,IAAI;QAEFe,eAAejC;QAGf,MAAMoC,UAAUnB,oBAAoBjB,QAAQ;YAC1C,UAAUkB,QAAQ,QAAQ;YAC1B,gBAAgBA,QAAQ,cAAc,IAAI;QAC5C;QAGA,MAAMmB,cAAc;YAClB,GAAGD,OAAO;YACV,mBAAmBlB,QAAQ,iBAAiB,IAAI;QAClD;QAGA,MAAMX,cAAcR,qBAClBC,QACAkB,QAAQ,cAAc,IAAI;QAI5B,MAAMoB,SAAuC;YAC3C;gBACE,MAAM;gBACN,SAAS,CAAC,4GAA4G,EAAEC,mBAAmB;YAC7I;YACA;gBACE,MAAM;gBACN,SAAS,CAAC;;;AAGlB,EAAEC,KAAK,SAAS,CAACH,aAAa,MAAM,GAAG;;;;;;;;;wCASC,CAAC;YACnC;SACD;QAGD,IAAI9B,YAAY,MAAM,GAAG,GAAG;YAC1B+B,OAAO,IAAI,CAAC;gBACV,MAAM;gBACN,SACE;YACJ;YAEAA,OAAO,IAAI,CAAC;gBACV,MAAM;gBACN,SAAS/B,YAAY,GAAG,CAAC,CAACC,aAAgB;wBACxC,MAAM;wBACN,WAAW;4BACT,KAAKA;wBACP;oBACF;YACF;QACF;QAEA,IAAIU,QAAQ,MAAM,IAAIA,QAAQ,OAAO,EAEnC,OAAO,MAAMwB,OACXJ,QACAK,aAAa,YAAY,EACzB;YACE,QAAQ;QACV,GACA;YACE,QAAQ;YACR,SAASzB,QAAQ,OAAO;QAC1B;QAEG;YAEL,MAAMuB,WAAW,MAAMC,OAAOJ,QAAQK,aAAa,YAAY,EAAE;gBAC/D,QAAQ;YACV;YAEA,IAAIF,AAAAA,CAAAA,QAAAA,WAAAA,KAAAA,IAAAA,SAAU,OAAO,AAAD,KAAK,AAA4B,YAA5B,OAAOA,SAAS,OAAO,EAC9C,OAAO;gBACL,SAASA,SAAS,OAAO;gBACzB,OAAOA,SAAS,KAAK;gBACrB,YAAY;YACd;YAGF,MAAM,IAAIP,MAAM;QAClB;IACF,EAAE,OAAOU,OAAO;QACd,MAAM,IAAIV,MAAM,CAAC,8BAA8B,EAAEU,OAAO;IAC1D;AACF"}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { AIResponseFormat } from "../../types.mjs";
|
|
2
2
|
import { Anthropic } from "@anthropic-ai/sdk";
|
|
3
3
|
import { DefaultAzureCredential, getBearerTokenProvider } from "@azure/identity";
|
|
4
|
-
import { MIDSCENE_API_TYPE, MIDSCENE_LANGSMITH_DEBUG, OPENAI_MAX_TOKENS, globalConfigManager } from "@midscene/shared/env";
|
|
4
|
+
import { MIDSCENE_API_TYPE, MIDSCENE_LANGSMITH_DEBUG, OPENAI_MAX_TOKENS, globalConfigManager, uiTarsModelVersion, vlLocateMode } from "@midscene/shared/env";
|
|
5
5
|
import { parseBase64 } from "@midscene/shared/img";
|
|
6
6
|
import { getDebug } from "@midscene/shared/logger";
|
|
7
7
|
import { assert, ifInBrowser } from "@midscene/shared/utils";
|
|
@@ -13,8 +13,8 @@ import { AIActionType } from "../common.mjs";
|
|
|
13
13
|
import { assertSchema } from "../prompt/assertion.mjs";
|
|
14
14
|
import { locatorSchema } from "../prompt/llm-locator.mjs";
|
|
15
15
|
import { planSchema } from "../prompt/llm-planning.mjs";
|
|
16
|
-
async function createChatClient({ AIActionTypeValue,
|
|
17
|
-
const { socksProxy, httpProxy, modelName, openaiBaseURL, openaiApiKey, openaiExtraConfig, openaiUseAzureDeprecated, useAzureOpenai, azureOpenaiScope, azureOpenaiKey, azureOpenaiEndpoint, azureOpenaiApiVersion, azureOpenaiDeployment, azureExtraConfig, useAnthropicSdk, anthropicApiKey, modelDescription
|
|
16
|
+
async function createChatClient({ AIActionTypeValue, modelPreferences }) {
|
|
17
|
+
const { socksProxy, httpProxy, modelName, openaiBaseURL, openaiApiKey, openaiExtraConfig, openaiUseAzureDeprecated, useAzureOpenai, azureOpenaiScope, azureOpenaiKey, azureOpenaiEndpoint, azureOpenaiApiVersion, azureOpenaiDeployment, azureExtraConfig, useAnthropicSdk, anthropicApiKey, modelDescription } = globalConfigManager.getModelConfigByIntent(modelPreferences.intent);
|
|
18
18
|
let openai;
|
|
19
19
|
let proxyAgent;
|
|
20
20
|
const debugProxy = getDebug('ai:call:proxy');
|
|
@@ -76,9 +76,7 @@ async function createChatClient({ AIActionTypeValue, modelConfig }) {
|
|
|
76
76
|
completion: openai.chat.completions,
|
|
77
77
|
style: 'openai',
|
|
78
78
|
modelName,
|
|
79
|
-
modelDescription
|
|
80
|
-
uiTarsVersion,
|
|
81
|
-
vlMode
|
|
79
|
+
modelDescription
|
|
82
80
|
};
|
|
83
81
|
if (useAnthropicSdk) openai = new Anthropic({
|
|
84
82
|
apiKey: anthropicApiKey,
|
|
@@ -89,16 +87,14 @@ async function createChatClient({ AIActionTypeValue, modelConfig }) {
|
|
|
89
87
|
completion: openai.messages,
|
|
90
88
|
style: 'anthropic',
|
|
91
89
|
modelName,
|
|
92
|
-
modelDescription
|
|
93
|
-
uiTarsVersion,
|
|
94
|
-
vlMode
|
|
90
|
+
modelDescription
|
|
95
91
|
};
|
|
96
92
|
throw new Error('Openai SDK or Anthropic SDK is not initialized');
|
|
97
93
|
}
|
|
98
|
-
async function
|
|
99
|
-
const { completion, style, modelName, modelDescription
|
|
94
|
+
async function call(messages, AIActionTypeValue, modelPreferences, options) {
|
|
95
|
+
const { completion, style, modelName, modelDescription } = await createChatClient({
|
|
100
96
|
AIActionTypeValue,
|
|
101
|
-
|
|
97
|
+
modelPreferences
|
|
102
98
|
});
|
|
103
99
|
const responseFormat = getResponseFormat(modelName, AIActionTypeValue);
|
|
104
100
|
const maxTokens = globalConfigManager.getEnvConfigValue(OPENAI_MAX_TOKENS);
|
|
@@ -112,10 +108,10 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
|
|
|
112
108
|
let usage;
|
|
113
109
|
let timeCost;
|
|
114
110
|
const commonConfig = {
|
|
115
|
-
temperature: 'vlm-ui-tars' ===
|
|
111
|
+
temperature: 'vlm-ui-tars' === vlLocateMode(modelPreferences) ? 0.0 : 0.1,
|
|
116
112
|
stream: !!isStreaming,
|
|
117
113
|
max_tokens: 'number' == typeof maxTokens ? maxTokens : Number.parseInt(maxTokens || '2048', 10),
|
|
118
|
-
...'qwen-vl' ===
|
|
114
|
+
...'qwen-vl' === vlLocateMode(modelPreferences) ? {
|
|
119
115
|
vl_high_resolution_images: true
|
|
120
116
|
} : {}
|
|
121
117
|
};
|
|
@@ -169,7 +165,7 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
|
|
|
169
165
|
time_cost: timeCost ?? 0,
|
|
170
166
|
model_name: modelName,
|
|
171
167
|
model_description: modelDescription,
|
|
172
|
-
intent:
|
|
168
|
+
intent: modelPreferences.intent
|
|
173
169
|
}
|
|
174
170
|
};
|
|
175
171
|
options.onChunk(finalChunk);
|
|
@@ -177,7 +173,7 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
|
|
|
177
173
|
}
|
|
178
174
|
}
|
|
179
175
|
content = accumulated;
|
|
180
|
-
debugProfileStats(`streaming model, ${modelName}, mode, ${
|
|
176
|
+
debugProfileStats(`streaming model, ${modelName}, mode, ${vlLocateMode(modelPreferences) || 'default'}, cost-ms, ${timeCost}`);
|
|
181
177
|
} else {
|
|
182
178
|
var _result_usage, _result_usage1, _result_usage2;
|
|
183
179
|
const result = await completion.create({
|
|
@@ -187,7 +183,7 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
|
|
|
187
183
|
...commonConfig
|
|
188
184
|
});
|
|
189
185
|
timeCost = Date.now() - startTime;
|
|
190
|
-
debugProfileStats(`model, ${modelName}, mode, ${
|
|
186
|
+
debugProfileStats(`model, ${modelName}, mode, ${vlLocateMode(modelPreferences) || 'default'}, ui-tars-version, ${uiTarsModelVersion(modelPreferences)}, prompt-tokens, ${(null == (_result_usage = result.usage) ? void 0 : _result_usage.prompt_tokens) || ''}, completion-tokens, ${(null == (_result_usage1 = result.usage) ? void 0 : _result_usage1.completion_tokens) || ''}, total-tokens, ${(null == (_result_usage2 = result.usage) ? void 0 : _result_usage2.total_tokens) || ''}, cost-ms, ${timeCost}, requestId, ${result._request_id || ''}`);
|
|
191
187
|
debugProfileDetail(`model usage detail: ${JSON.stringify(result.usage)}`);
|
|
192
188
|
assert(result.choices, `invalid response from LLM service: ${JSON.stringify(result)}`);
|
|
193
189
|
content = result.choices[0].message.content;
|
|
@@ -252,7 +248,7 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
|
|
|
252
248
|
time_cost: timeCost ?? 0,
|
|
253
249
|
model_name: modelName,
|
|
254
250
|
model_description: modelDescription,
|
|
255
|
-
intent:
|
|
251
|
+
intent: modelPreferences.intent
|
|
256
252
|
} : void 0
|
|
257
253
|
};
|
|
258
254
|
options.onChunk(finalChunk);
|
|
@@ -294,7 +290,7 @@ async function callAI(messages, AIActionTypeValue, modelConfig, options) {
|
|
|
294
290
|
time_cost: timeCost ?? 0,
|
|
295
291
|
model_name: modelName,
|
|
296
292
|
model_description: modelDescription,
|
|
297
|
-
intent:
|
|
293
|
+
intent: modelPreferences.intent
|
|
298
294
|
} : void 0,
|
|
299
295
|
isStreamed: !!isStreaming
|
|
300
296
|
};
|
|
@@ -330,18 +326,17 @@ const getResponseFormat = (modelName, AIActionTypeValue)=>{
|
|
|
330
326
|
};
|
|
331
327
|
return responseFormat;
|
|
332
328
|
};
|
|
333
|
-
async function
|
|
334
|
-
const response = await
|
|
329
|
+
async function callToGetJSONObject(messages, AIActionTypeValue, modelPreferences) {
|
|
330
|
+
const response = await call(messages, AIActionTypeValue, modelPreferences);
|
|
335
331
|
assert(response, 'empty response');
|
|
336
|
-
const
|
|
337
|
-
const jsonContent = safeParseJson(response.content, vlMode);
|
|
332
|
+
const jsonContent = safeParseJson(response.content, modelPreferences);
|
|
338
333
|
return {
|
|
339
334
|
content: jsonContent,
|
|
340
335
|
usage: response.usage
|
|
341
336
|
};
|
|
342
337
|
}
|
|
343
|
-
async function
|
|
344
|
-
const { content, usage } = await
|
|
338
|
+
async function callAiFnWithStringResponse(msgs, AIActionTypeValue, modelPreferences) {
|
|
339
|
+
const { content, usage } = await call(msgs, AIActionTypeValue, modelPreferences);
|
|
345
340
|
return {
|
|
346
341
|
content,
|
|
347
342
|
usage
|
|
@@ -362,7 +357,7 @@ function preprocessDoubaoBboxJson(input) {
|
|
|
362
357
|
if (input.includes('bbox')) while(/\d+\s+\d+/.test(input))input = input.replace(/(\d+)\s+(\d+)/g, '$1,$2');
|
|
363
358
|
return input;
|
|
364
359
|
}
|
|
365
|
-
function safeParseJson(input,
|
|
360
|
+
function safeParseJson(input, modelPreferences) {
|
|
366
361
|
const cleanJsonString = extractJSONFromCodeBlock(input);
|
|
367
362
|
if (null == cleanJsonString ? void 0 : cleanJsonString.match(/\((\d+),(\d+)\)/)) {
|
|
368
363
|
var _cleanJsonString_match;
|
|
@@ -374,12 +369,12 @@ function safeParseJson(input, vlMode) {
|
|
|
374
369
|
try {
|
|
375
370
|
return JSON.parse(jsonrepair(cleanJsonString));
|
|
376
371
|
} catch (e) {}
|
|
377
|
-
if ('doubao-vision' ===
|
|
372
|
+
if ('doubao-vision' === vlLocateMode(modelPreferences) || 'vlm-ui-tars' === vlLocateMode(modelPreferences)) {
|
|
378
373
|
const jsonString = preprocessDoubaoBboxJson(cleanJsonString);
|
|
379
374
|
return JSON.parse(jsonrepair(jsonString));
|
|
380
375
|
}
|
|
381
376
|
throw Error(`failed to parse json response: ${input}`);
|
|
382
377
|
}
|
|
383
|
-
export {
|
|
378
|
+
export { call, callAiFnWithStringResponse, callToGetJSONObject, extractJSONFromCodeBlock, getResponseFormat, preprocessDoubaoBboxJson, safeParseJson };
|
|
384
379
|
|
|
385
380
|
//# sourceMappingURL=index.mjs.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/service-caller/index.mjs","sources":["webpack://@midscene/core/./src/ai-model/service-caller/index.ts"],"sourcesContent":["import { AIResponseFormat, type AIUsageInfo } from '@/types';\nimport type { CodeGenerationChunk, StreamingCallback } from '@/types';\nimport { Anthropic } from '@anthropic-ai/sdk';\nimport {\n DefaultAzureCredential,\n getBearerTokenProvider,\n} from '@azure/identity';\nimport {\n type IModelConfig,\n MIDSCENE_API_TYPE,\n MIDSCENE_LANGSMITH_DEBUG,\n OPENAI_MAX_TOKENS,\n type TVlModeTypes,\n type UITarsModelVersion,\n globalConfigManager,\n} from '@midscene/shared/env';\n\nimport { parseBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport { ifInBrowser } from '@midscene/shared/utils';\nimport { HttpsProxyAgent } from 'https-proxy-agent';\nimport { jsonrepair } from 'jsonrepair';\nimport OpenAI, { AzureOpenAI } from 'openai';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport type { Stream } from 'openai/streaming';\nimport { SocksProxyAgent } from 'socks-proxy-agent';\nimport { AIActionType, type AIArgs } from '../common';\nimport { assertSchema } from '../prompt/assertion';\nimport { locatorSchema } from '../prompt/llm-locator';\nimport { planSchema } from '../prompt/llm-planning';\n\nasync function createChatClient({\n AIActionTypeValue,\n modelConfig,\n}: {\n AIActionTypeValue: AIActionType;\n modelConfig: IModelConfig;\n}): Promise<{\n completion: OpenAI.Chat.Completions;\n style: 'openai' | 'anthropic';\n modelName: string;\n modelDescription: string;\n uiTarsVersion?: UITarsModelVersion;\n vlMode: TVlModeTypes | undefined;\n}> {\n const {\n socksProxy,\n httpProxy,\n modelName,\n openaiBaseURL,\n openaiApiKey,\n openaiExtraConfig,\n openaiUseAzureDeprecated,\n useAzureOpenai,\n azureOpenaiScope,\n azureOpenaiKey,\n azureOpenaiEndpoint,\n azureOpenaiApiVersion,\n azureOpenaiDeployment,\n azureExtraConfig,\n useAnthropicSdk,\n anthropicApiKey,\n modelDescription,\n uiTarsModelVersion: uiTarsVersion,\n vlMode,\n } = modelConfig;\n\n let openai: OpenAI | AzureOpenAI | undefined;\n\n let proxyAgent = undefined;\n const debugProxy = getDebug('ai:call:proxy');\n if (httpProxy) {\n debugProxy('using http proxy', httpProxy);\n proxyAgent = new HttpsProxyAgent(httpProxy);\n } else if (socksProxy) {\n debugProxy('using socks proxy', socksProxy);\n proxyAgent = new SocksProxyAgent(socksProxy);\n }\n\n if (openaiUseAzureDeprecated) {\n // this is deprecated\n openai = new AzureOpenAI({\n baseURL: openaiBaseURL,\n apiKey: openaiApiKey,\n httpAgent: proxyAgent,\n ...openaiExtraConfig,\n dangerouslyAllowBrowser: true,\n }) as OpenAI;\n } else if (useAzureOpenai) {\n // https://learn.microsoft.com/en-us/azure/ai-services/openai/chatgpt-quickstart?tabs=bash%2Cjavascript-key%2Ctypescript-keyless%2Cpython&pivots=programming-language-javascript#rest-api\n // keyless authentication\n let tokenProvider: any = undefined;\n if (azureOpenaiScope) {\n assert(\n !ifInBrowser,\n 'Azure OpenAI is not supported in browser with Midscene.',\n );\n const credential = new DefaultAzureCredential();\n\n tokenProvider = getBearerTokenProvider(credential, azureOpenaiScope);\n\n openai = new AzureOpenAI({\n azureADTokenProvider: tokenProvider,\n endpoint: azureOpenaiEndpoint,\n apiVersion: azureOpenaiApiVersion,\n deployment: azureOpenaiDeployment,\n ...openaiExtraConfig,\n ...azureExtraConfig,\n });\n } else {\n // endpoint, apiKey, apiVersion, deployment\n openai = new AzureOpenAI({\n apiKey: azureOpenaiKey,\n endpoint: azureOpenaiEndpoint,\n apiVersion: azureOpenaiApiVersion,\n deployment: azureOpenaiDeployment,\n dangerouslyAllowBrowser: true,\n ...openaiExtraConfig,\n ...azureExtraConfig,\n });\n }\n } else if (!useAnthropicSdk) {\n openai = new OpenAI({\n baseURL: openaiBaseURL,\n apiKey: openaiApiKey,\n httpAgent: proxyAgent,\n ...openaiExtraConfig,\n defaultHeaders: {\n ...(openaiExtraConfig?.defaultHeaders || {}),\n [MIDSCENE_API_TYPE]: AIActionTypeValue.toString(),\n },\n dangerouslyAllowBrowser: true,\n });\n }\n\n if (\n openai &&\n globalConfigManager.getEnvConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)\n ) {\n if (ifInBrowser) {\n throw new Error('langsmith is not supported in browser');\n }\n console.log('DEBUGGING MODE: langsmith wrapper enabled');\n const { wrapOpenAI } = await import('langsmith/wrappers');\n openai = wrapOpenAI(openai);\n }\n\n if (typeof openai !== 'undefined') {\n return {\n completion: openai.chat.completions,\n style: 'openai',\n modelName,\n modelDescription,\n uiTarsVersion,\n vlMode,\n };\n }\n\n // Anthropic\n if (useAnthropicSdk) {\n openai = new Anthropic({\n apiKey: anthropicApiKey,\n httpAgent: proxyAgent,\n dangerouslyAllowBrowser: true,\n }) as any;\n }\n\n if (typeof openai !== 'undefined' && (openai as any).messages) {\n return {\n completion: (openai as any).messages,\n style: 'anthropic',\n modelName,\n modelDescription,\n uiTarsVersion,\n vlMode,\n };\n }\n\n throw new Error('Openai SDK or Anthropic SDK is not initialized');\n}\n\nexport async function callAI(\n messages: ChatCompletionMessageParam[],\n AIActionTypeValue: AIActionType,\n modelConfig: IModelConfig,\n options?: {\n stream?: boolean;\n onChunk?: StreamingCallback;\n },\n): Promise<{ content: string; usage?: AIUsageInfo; isStreamed: boolean }> {\n const {\n completion,\n style,\n modelName,\n modelDescription,\n uiTarsVersion,\n vlMode,\n } = await createChatClient({\n AIActionTypeValue,\n modelConfig,\n });\n\n const responseFormat = getResponseFormat(modelName, AIActionTypeValue);\n\n const maxTokens = globalConfigManager.getEnvConfigValue(OPENAI_MAX_TOKENS);\n const debugCall = getDebug('ai:call');\n const debugProfileStats = getDebug('ai:profile:stats');\n const debugProfileDetail = getDebug('ai:profile:detail');\n\n const startTime = Date.now();\n\n const isStreaming = options?.stream && options?.onChunk;\n let content: string | undefined;\n let accumulated = '';\n let usage: OpenAI.CompletionUsage | undefined;\n let timeCost: number | undefined;\n\n const commonConfig = {\n temperature: vlMode === 'vlm-ui-tars' ? 0.0 : 0.1,\n stream: !!isStreaming,\n max_tokens:\n typeof maxTokens === 'number'\n ? maxTokens\n : Number.parseInt(maxTokens || '2048', 10),\n ...(vlMode === 'qwen-vl' // qwen specific config\n ? {\n vl_high_resolution_images: true,\n }\n : {}),\n };\n\n try {\n if (style === 'openai') {\n debugCall(\n `sending ${isStreaming ? 'streaming ' : ''}request to ${modelName}`,\n );\n\n if (isStreaming) {\n const stream = (await completion.create(\n {\n model: modelName,\n messages,\n response_format: responseFormat,\n ...commonConfig,\n },\n {\n stream: true,\n },\n )) as Stream<OpenAI.Chat.Completions.ChatCompletionChunk> & {\n _request_id?: string | null;\n };\n\n for await (const chunk of stream) {\n const content = chunk.choices?.[0]?.delta?.content || '';\n const reasoning_content =\n (chunk.choices?.[0]?.delta as any)?.reasoning_content || '';\n\n // Check for usage info in any chunk (OpenAI provides usage in separate chunks)\n if (chunk.usage) {\n usage = chunk.usage;\n }\n\n if (content || reasoning_content) {\n accumulated += content;\n const chunkData: CodeGenerationChunk = {\n content,\n reasoning_content,\n accumulated,\n isComplete: false,\n usage: undefined,\n };\n options.onChunk!(chunkData);\n }\n\n // Check if stream is complete\n if (chunk.choices?.[0]?.finish_reason) {\n timeCost = Date.now() - startTime;\n\n // If usage is not available from the stream, provide a basic usage info\n if (!usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor(accumulated.length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n };\n }\n\n // Send final chunk\n const finalChunk: CodeGenerationChunk = {\n content: '',\n accumulated,\n reasoning_content: '',\n isComplete: true,\n usage: {\n prompt_tokens: usage.prompt_tokens ?? 0,\n completion_tokens: usage.completion_tokens ?? 0,\n total_tokens: usage.total_tokens ?? 0,\n time_cost: timeCost ?? 0,\n model_name: modelName,\n model_description: modelDescription,\n intent: modelConfig.intent,\n },\n };\n options.onChunk!(finalChunk);\n break;\n }\n }\n content = accumulated;\n debugProfileStats(\n `streaming model, ${modelName}, mode, ${vlMode || 'default'}, cost-ms, ${timeCost}`,\n );\n } else {\n const result = await completion.create({\n model: modelName,\n messages,\n response_format: responseFormat,\n ...commonConfig,\n } as any);\n timeCost = Date.now() - startTime;\n\n debugProfileStats(\n `model, ${modelName}, mode, ${vlMode || 'default'}, ui-tars-version, ${uiTarsVersion}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${timeCost}, requestId, ${result._request_id || ''}`,\n );\n\n debugProfileDetail(\n `model usage detail: ${JSON.stringify(result.usage)}`,\n );\n\n assert(\n result.choices,\n `invalid response from LLM service: ${JSON.stringify(result)}`,\n );\n content = result.choices[0].message.content!;\n usage = result.usage;\n }\n\n debugCall(`response: ${content}`);\n assert(content, 'empty content');\n } else if (style === 'anthropic') {\n const convertImageContent = (content: any) => {\n if (content.type === 'image_url') {\n const imgBase64 = content.image_url.url;\n assert(imgBase64, 'image_url is required');\n const { mimeType, body } = parseBase64(content.image_url.url);\n return {\n source: {\n type: 'base64',\n media_type: mimeType,\n data: body,\n },\n type: 'image',\n };\n }\n return content;\n };\n\n if (isStreaming) {\n const stream = (await completion.create({\n model: modelName,\n system: 'You are a versatile professional in software UI automation',\n messages: messages.map((m) => ({\n role: 'user',\n content: Array.isArray(m.content)\n ? (m.content as any).map(convertImageContent)\n : m.content,\n })),\n response_format: responseFormat,\n ...commonConfig,\n } as any)) as any;\n\n for await (const chunk of stream) {\n const content = chunk.delta?.text || '';\n if (content) {\n accumulated += content;\n const chunkData: CodeGenerationChunk = {\n content,\n accumulated,\n reasoning_content: '',\n isComplete: false,\n usage: undefined,\n };\n options.onChunk!(chunkData);\n }\n\n // Check if stream is complete\n if (chunk.type === 'message_stop') {\n timeCost = Date.now() - startTime;\n const anthropicUsage = chunk.usage;\n\n // Send final chunk\n const finalChunk: CodeGenerationChunk = {\n content: '',\n accumulated,\n reasoning_content: '',\n isComplete: true,\n usage: anthropicUsage\n ? {\n prompt_tokens: anthropicUsage.input_tokens ?? 0,\n completion_tokens: anthropicUsage.output_tokens ?? 0,\n total_tokens:\n (anthropicUsage.input_tokens ?? 0) +\n (anthropicUsage.output_tokens ?? 0),\n time_cost: timeCost ?? 0,\n model_name: modelName,\n model_description: modelDescription,\n intent: modelConfig.intent,\n }\n : undefined,\n };\n options.onChunk!(finalChunk);\n break;\n }\n }\n content = accumulated;\n } else {\n const result = await completion.create({\n model: modelName,\n system: 'You are a versatile professional in software UI automation',\n messages: messages.map((m) => ({\n role: 'user',\n content: Array.isArray(m.content)\n ? (m.content as any).map(convertImageContent)\n : m.content,\n })),\n response_format: responseFormat,\n ...commonConfig,\n } as any);\n timeCost = Date.now() - startTime;\n content = (result as any).content[0].text as string;\n usage = result.usage;\n }\n\n assert(content, 'empty content');\n }\n // Ensure we always have usage info for streaming responses\n if (isStreaming && !usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor((content || '').length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n };\n }\n\n return {\n content: content || '',\n usage: usage\n ? {\n prompt_tokens: usage.prompt_tokens ?? 0,\n completion_tokens: usage.completion_tokens ?? 0,\n total_tokens: usage.total_tokens ?? 0,\n time_cost: timeCost ?? 0,\n model_name: modelName,\n model_description: modelDescription,\n intent: modelConfig.intent,\n }\n : undefined,\n isStreamed: !!isStreaming,\n };\n } catch (e: any) {\n console.error(' call AI error', e);\n const newError = new Error(\n `failed to call ${isStreaming ? 'streaming ' : ''}AI model service: ${e.message}. Trouble shooting: https://midscenejs.com/model-provider.html`,\n {\n cause: e,\n },\n );\n throw newError;\n }\n}\n\nexport const getResponseFormat = (\n modelName: string,\n AIActionTypeValue: AIActionType,\n):\n | OpenAI.ChatCompletionCreateParams['response_format']\n | OpenAI.ResponseFormatJSONObject => {\n let responseFormat:\n | OpenAI.ChatCompletionCreateParams['response_format']\n | OpenAI.ResponseFormatJSONObject\n | undefined;\n\n if (modelName.includes('gpt-4')) {\n switch (AIActionTypeValue) {\n case AIActionType.ASSERT:\n responseFormat = assertSchema;\n break;\n case AIActionType.INSPECT_ELEMENT:\n responseFormat = locatorSchema;\n break;\n case AIActionType.PLAN:\n responseFormat = planSchema;\n break;\n case AIActionType.EXTRACT_DATA:\n case AIActionType.DESCRIBE_ELEMENT:\n responseFormat = { type: AIResponseFormat.JSON };\n break;\n }\n }\n\n // gpt-4o-2024-05-13 only supports json_object response format\n if (modelName === 'gpt-4o-2024-05-13') {\n responseFormat = { type: AIResponseFormat.JSON };\n }\n\n return responseFormat;\n};\n\nexport async function callAIWithObjectResponse<T>(\n messages: ChatCompletionMessageParam[],\n AIActionTypeValue: AIActionType,\n modelConfig: IModelConfig,\n): Promise<{ content: T; usage?: AIUsageInfo }> {\n const response = await callAI(messages, AIActionTypeValue, modelConfig);\n assert(response, 'empty response');\n const vlMode = modelConfig.vlMode;\n const jsonContent = safeParseJson(response.content, vlMode);\n return { content: jsonContent, usage: response.usage };\n}\n\nexport async function callAIWithStringResponse(\n msgs: AIArgs,\n AIActionTypeValue: AIActionType,\n modelConfig: IModelConfig,\n): Promise<{ content: string; usage?: AIUsageInfo }> {\n const { content, usage } = await callAI(msgs, AIActionTypeValue, modelConfig);\n return { content, usage };\n}\n\nexport function extractJSONFromCodeBlock(response: string) {\n try {\n // First, try to match a JSON object directly in the response\n const jsonMatch = response.match(/^\\s*(\\{[\\s\\S]*\\})\\s*$/);\n if (jsonMatch) {\n return jsonMatch[1];\n }\n\n // If no direct JSON object is found, try to extract JSON from a code block\n const codeBlockMatch = response.match(\n /```(?:json)?\\s*(\\{[\\s\\S]*?\\})\\s*```/,\n );\n if (codeBlockMatch) {\n return codeBlockMatch[1];\n }\n\n // If no code block is found, try to find a JSON-like structure in the text\n const jsonLikeMatch = response.match(/\\{[\\s\\S]*\\}/);\n if (jsonLikeMatch) {\n return jsonLikeMatch[0];\n }\n } catch {}\n // If no JSON-like structure is found, return the original response\n return response;\n}\n\nexport function preprocessDoubaoBboxJson(input: string) {\n if (input.includes('bbox')) {\n // when its values like 940 445 969 490, replace all /\\d+\\s+\\d+/g with /$1,$2/g\n while (/\\d+\\s+\\d+/.test(input)) {\n input = input.replace(/(\\d+)\\s+(\\d+)/g, '$1,$2');\n }\n }\n return input;\n}\n\nexport function safeParseJson(input: string, vlMode: TVlModeTypes | undefined) {\n const cleanJsonString = extractJSONFromCodeBlock(input);\n // match the point\n if (cleanJsonString?.match(/\\((\\d+),(\\d+)\\)/)) {\n return cleanJsonString\n .match(/\\((\\d+),(\\d+)\\)/)\n ?.slice(1)\n .map(Number);\n }\n try {\n return JSON.parse(cleanJsonString);\n } catch {}\n try {\n return JSON.parse(jsonrepair(cleanJsonString));\n } catch (e) {}\n\n if (vlMode === 'doubao-vision' || vlMode === 'vlm-ui-tars') {\n const jsonString = preprocessDoubaoBboxJson(cleanJsonString);\n return JSON.parse(jsonrepair(jsonString));\n }\n throw Error(`failed to parse json response: ${input}`);\n}\n"],"names":["createChatClient","AIActionTypeValue","modelConfig","socksProxy","httpProxy","modelName","openaiBaseURL","openaiApiKey","openaiExtraConfig","openaiUseAzureDeprecated","useAzureOpenai","azureOpenaiScope","azureOpenaiKey","azureOpenaiEndpoint","azureOpenaiApiVersion","azureOpenaiDeployment","azureExtraConfig","useAnthropicSdk","anthropicApiKey","modelDescription","uiTarsVersion","vlMode","openai","proxyAgent","debugProxy","getDebug","HttpsProxyAgent","SocksProxyAgent","AzureOpenAI","tokenProvider","assert","ifInBrowser","credential","DefaultAzureCredential","getBearerTokenProvider","OpenAI","MIDSCENE_API_TYPE","globalConfigManager","MIDSCENE_LANGSMITH_DEBUG","Error","console","wrapOpenAI","Anthropic","callAI","messages","options","completion","style","responseFormat","getResponseFormat","maxTokens","OPENAI_MAX_TOKENS","debugCall","debugProfileStats","debugProfileDetail","startTime","Date","isStreaming","content","accumulated","usage","timeCost","commonConfig","Number","stream","chunk","_chunk_choices__delta","_chunk_choices__delta1","_chunk_choices_2","reasoning_content","chunkData","undefined","estimatedTokens","Math","finalChunk","_result_usage","_result_usage1","_result_usage2","result","JSON","convertImageContent","imgBase64","mimeType","body","parseBase64","m","Array","_chunk_delta","anthropicUsage","e","newError","AIActionType","assertSchema","locatorSchema","planSchema","AIResponseFormat","callAIWithObjectResponse","response","jsonContent","safeParseJson","callAIWithStringResponse","msgs","extractJSONFromCodeBlock","jsonMatch","codeBlockMatch","jsonLikeMatch","preprocessDoubaoBboxJson","input","cleanJsonString","_cleanJsonString_match","jsonrepair","jsonString"],"mappings":";;;;;;;;;;;;;;;AAgCA,eAAeA,iBAAiB,EAC9BC,iBAAiB,EACjBC,WAAW,EAIZ;IAQC,MAAM,EACJC,UAAU,EACVC,SAAS,EACTC,SAAS,EACTC,aAAa,EACbC,YAAY,EACZC,iBAAiB,EACjBC,wBAAwB,EACxBC,cAAc,EACdC,gBAAgB,EAChBC,cAAc,EACdC,mBAAmB,EACnBC,qBAAqB,EACrBC,qBAAqB,EACrBC,gBAAgB,EAChBC,eAAe,EACfC,eAAe,EACfC,gBAAgB,EAChB,oBAAoBC,aAAa,EACjCC,MAAM,EACP,GAAGnB;IAEJ,IAAIoB;IAEJ,IAAIC;IACJ,MAAMC,aAAaC,SAAS;IAC5B,IAAIrB,WAAW;QACboB,WAAW,oBAAoBpB;QAC/BmB,aAAa,IAAIG,gBAAgBtB;IACnC,OAAO,IAAID,YAAY;QACrBqB,WAAW,qBAAqBrB;QAChCoB,aAAa,IAAII,gBAAgBxB;IACnC;IAEA,IAAIM,0BAEFa,SAAS,IAAIM,YAAY;QACvB,SAAStB;QACT,QAAQC;QACR,WAAWgB;QACX,GAAGf,iBAAiB;QACpB,yBAAyB;IAC3B;SACK,IAAIE,gBAAgB;QAGzB,IAAImB;QACJ,IAAIlB,kBAAkB;YACpBmB,OACE,CAACC,aACD;YAEF,MAAMC,aAAa,IAAIC;YAEvBJ,gBAAgBK,uBAAuBF,YAAYrB;YAEnDW,SAAS,IAAIM,YAAY;gBACvB,sBAAsBC;gBACtB,UAAUhB;gBACV,YAAYC;gBACZ,YAAYC;gBACZ,GAAGP,iBAAiB;gBACpB,GAAGQ,gBAAgB;YACrB;QACF,OAEEM,SAAS,IAAIM,YAAY;YACvB,QAAQhB;YACR,UAAUC;YACV,YAAYC;YACZ,YAAYC;YACZ,yBAAyB;YACzB,GAAGP,iBAAiB;YACpB,GAAGQ,gBAAgB;QACrB;IAEJ,OAAO,IAAI,CAACC,iBACVK,SAAS,IAAIa,SAAO;QAClB,SAAS7B;QACT,QAAQC;QACR,WAAWgB;QACX,GAAGf,iBAAiB;QACpB,gBAAgB;YACd,GAAIA,AAAAA,CAAAA,QAAAA,oBAAAA,KAAAA,IAAAA,kBAAmB,cAAc,AAAD,KAAK,CAAC,CAAC;YAC3C,CAAC4B,kBAAkB,EAAEnC,kBAAkB,QAAQ;QACjD;QACA,yBAAyB;IAC3B;IAGF,IACEqB,UACAe,oBAAoB,qBAAqB,CAACC,2BAC1C;QACA,IAAIP,aACF,MAAM,IAAIQ,MAAM;QAElBC,QAAQ,GAAG,CAAC;QACZ,MAAM,EAAEC,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC;QACpCnB,SAASmB,WAAWnB;IACtB;IAEA,IAAI,AAAkB,WAAXA,QACT,OAAO;QACL,YAAYA,OAAO,IAAI,CAAC,WAAW;QACnC,OAAO;QACPjB;QACAc;QACAC;QACAC;IACF;IAIF,IAAIJ,iBACFK,SAAS,IAAIoB,UAAU;QACrB,QAAQxB;QACR,WAAWK;QACX,yBAAyB;IAC3B;IAGF,IAAI,AAAkB,WAAXD,UAA2BA,OAAe,QAAQ,EAC3D,OAAO;QACL,YAAaA,OAAe,QAAQ;QACpC,OAAO;QACPjB;QACAc;QACAC;QACAC;IACF;IAGF,MAAM,IAAIkB,MAAM;AAClB;AAEO,eAAeI,OACpBC,QAAsC,EACtC3C,iBAA+B,EAC/BC,WAAyB,EACzB2C,OAGC;IAED,MAAM,EACJC,UAAU,EACVC,KAAK,EACL1C,SAAS,EACTc,gBAAgB,EAChBC,aAAa,EACbC,MAAM,EACP,GAAG,MAAMrB,iBAAiB;QACzBC;QACAC;IACF;IAEA,MAAM8C,iBAAiBC,kBAAkB5C,WAAWJ;IAEpD,MAAMiD,YAAYb,oBAAoB,iBAAiB,CAACc;IACxD,MAAMC,YAAY3B,SAAS;IAC3B,MAAM4B,oBAAoB5B,SAAS;IACnC,MAAM6B,qBAAqB7B,SAAS;IAEpC,MAAM8B,YAAYC,KAAK,GAAG;IAE1B,MAAMC,cAAcZ,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,MAAM,AAAD,KAAKA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,OAAO,AAAD;IACtD,IAAIa;IACJ,IAAIC,cAAc;IAClB,IAAIC;IACJ,IAAIC;IAEJ,MAAMC,eAAe;QACnB,aAAazC,AAAW,kBAAXA,SAA2B,MAAM;QAC9C,QAAQ,CAAC,CAACoC;QACV,YACE,AAAqB,YAArB,OAAOP,YACHA,YACAa,OAAO,QAAQ,CAACb,aAAa,QAAQ;QAC3C,GAAI7B,AAAW,cAAXA,SACA;YACE,2BAA2B;QAC7B,IACA,CAAC,CAAC;IACR;IAEA,IAAI;QACF,IAAI0B,AAAU,aAAVA,OAAoB;YACtBK,UACE,CAAC,QAAQ,EAAEK,cAAc,eAAe,GAAG,WAAW,EAAEpD,WAAW;YAGrE,IAAIoD,aAAa;gBACf,MAAMO,SAAU,MAAMlB,WAAW,MAAM,CACrC;oBACE,OAAOzC;oBACPuC;oBACA,iBAAiBI;oBACjB,GAAGc,YAAY;gBACjB,GACA;oBACE,QAAQ;gBACV;gBAKF,WAAW,MAAMG,SAASD,OAAQ;wBAChBE,uBAAAA,iBAAAA,gBAEbC,wBAAAA,kBAAAA,iBAoBCC,kBAAAA;oBAtBJ,MAAMV,UAAUQ,AAAAA,SAAAA,CAAAA,iBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,kBAAAA,cAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,QAAAA,CAAAA,wBAAAA,gBAAoB,KAAK,AAAD,IAAxBA,KAAAA,IAAAA,sBAA2B,OAAO,AAAD,KAAK;oBACtD,MAAMG,oBACJ,AAAC,SAAAF,CAAAA,kBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,mBAAAA,eAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,QAAAA,CAAAA,yBAAAA,iBAAoB,KAAK,AAAD,IAAxBA,KAAAA,IAAAA,uBAAmC,iBAAiB,AAAD,KAAK;oBAG3D,IAAIF,MAAM,KAAK,EACbL,QAAQK,MAAM,KAAK;oBAGrB,IAAIP,WAAWW,mBAAmB;wBAChCV,eAAeD;wBACf,MAAMY,YAAiC;4BACrCZ;4BACAW;4BACAV;4BACA,YAAY;4BACZ,OAAOY;wBACT;wBACA1B,QAAQ,OAAO,CAAEyB;oBACnB;oBAGA,IAAI,QAAAF,CAAAA,kBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,mBAAAA,eAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,iBAAoB,aAAa,EAAE;wBACrCP,WAAWL,KAAK,GAAG,KAAKD;wBAGxB,IAAI,CAACK,OAAO;4BAEV,MAAMY,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAACd,YAAY,MAAM,GAAG;4BAElCC,QAAQ;gCACN,eAAeY;gCACf,mBAAmBA;gCACnB,cAAcA,AAAkB,IAAlBA;4BAChB;wBACF;wBAGA,MAAME,aAAkC;4BACtC,SAAS;4BACTf;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAO;gCACL,eAAeC,MAAM,aAAa,IAAI;gCACtC,mBAAmBA,MAAM,iBAAiB,IAAI;gCAC9C,cAAcA,MAAM,YAAY,IAAI;gCACpC,WAAWC,YAAY;gCACvB,YAAYxD;gCACZ,mBAAmBc;gCACnB,QAAQjB,YAAY,MAAM;4BAC5B;wBACF;wBACA2C,QAAQ,OAAO,CAAE6B;wBACjB;oBACF;gBACF;gBACAhB,UAAUC;gBACVN,kBACE,CAAC,iBAAiB,EAAEhD,UAAU,QAAQ,EAAEgB,UAAU,UAAU,WAAW,EAAEwC,UAAU;YAEvF,OAAO;oBAUqGc,eAAyDC,gBAAwDC;gBAT3N,MAAMC,SAAS,MAAMhC,WAAW,MAAM,CAAC;oBACrC,OAAOzC;oBACPuC;oBACA,iBAAiBI;oBACjB,GAAGc,YAAY;gBACjB;gBACAD,WAAWL,KAAK,GAAG,KAAKD;gBAExBF,kBACE,CAAC,OAAO,EAAEhD,UAAU,QAAQ,EAAEgB,UAAU,UAAU,mBAAmB,EAAED,cAAc,iBAAiB,EAAEuD,AAAAA,SAAAA,CAAAA,gBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,cAAc,aAAa,AAAD,KAAK,GAAG,qBAAqB,EAAEC,AAAAA,SAAAA,CAAAA,iBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,eAAc,iBAAiB,AAAD,KAAK,GAAG,gBAAgB,EAAEC,AAAAA,SAAAA,CAAAA,iBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,eAAc,YAAY,AAAD,KAAK,GAAG,WAAW,EAAEhB,SAAS,aAAa,EAAEiB,OAAO,WAAW,IAAI,IAAI;gBAG3TxB,mBACE,CAAC,oBAAoB,EAAEyB,KAAK,SAAS,CAACD,OAAO,KAAK,GAAG;gBAGvDhD,OACEgD,OAAO,OAAO,EACd,CAAC,mCAAmC,EAAEC,KAAK,SAAS,CAACD,SAAS;gBAEhEpB,UAAUoB,OAAO,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,OAAO;gBAC3ClB,QAAQkB,OAAO,KAAK;YACtB;YAEA1B,UAAU,CAAC,UAAU,EAAEM,SAAS;YAChC5B,OAAO4B,SAAS;QAClB,OAAO,IAAIX,AAAU,gBAAVA,OAAuB;YAChC,MAAMiC,sBAAsB,CAACtB;gBAC3B,IAAIA,AAAiB,gBAAjBA,QAAQ,IAAI,EAAkB;oBAChC,MAAMuB,YAAYvB,QAAQ,SAAS,CAAC,GAAG;oBACvC5B,OAAOmD,WAAW;oBAClB,MAAM,EAAEC,QAAQ,EAAEC,IAAI,EAAE,GAAGC,YAAY1B,QAAQ,SAAS,CAAC,GAAG;oBAC5D,OAAO;wBACL,QAAQ;4BACN,MAAM;4BACN,YAAYwB;4BACZ,MAAMC;wBACR;wBACA,MAAM;oBACR;gBACF;gBACA,OAAOzB;YACT;YAEA,IAAID,aAAa;gBACf,MAAMO,SAAU,MAAMlB,WAAW,MAAM,CAAC;oBACtC,OAAOzC;oBACP,QAAQ;oBACR,UAAUuC,SAAS,GAAG,CAAC,CAACyC,IAAO;4BAC7B,MAAM;4BACN,SAASC,MAAM,OAAO,CAACD,EAAE,OAAO,IAC3BA,EAAE,OAAO,CAAS,GAAG,CAACL,uBACvBK,EAAE,OAAO;wBACf;oBACA,iBAAiBrC;oBACjB,GAAGc,YAAY;gBACjB;gBAEA,WAAW,MAAMG,SAASD,OAAQ;wBAChBuB;oBAAhB,MAAM7B,UAAU6B,AAAAA,SAAAA,CAAAA,eAAAA,MAAM,KAAK,AAAD,IAAVA,KAAAA,IAAAA,aAAa,IAAI,AAAD,KAAK;oBACrC,IAAI7B,SAAS;wBACXC,eAAeD;wBACf,MAAMY,YAAiC;4BACrCZ;4BACAC;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAOY;wBACT;wBACA1B,QAAQ,OAAO,CAAEyB;oBACnB;oBAGA,IAAIL,AAAe,mBAAfA,MAAM,IAAI,EAAqB;wBACjCJ,WAAWL,KAAK,GAAG,KAAKD;wBACxB,MAAMiC,iBAAiBvB,MAAM,KAAK;wBAGlC,MAAMS,aAAkC;4BACtC,SAAS;4BACTf;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAO6B,iBACH;gCACE,eAAeA,eAAe,YAAY,IAAI;gCAC9C,mBAAmBA,eAAe,aAAa,IAAI;gCACnD,cACGA,AAAAA,CAAAA,eAAe,YAAY,IAAI,KAC/BA,CAAAA,eAAe,aAAa,IAAI;gCACnC,WAAW3B,YAAY;gCACvB,YAAYxD;gCACZ,mBAAmBc;gCACnB,QAAQjB,YAAY,MAAM;4BAC5B,IACAqE;wBACN;wBACA1B,QAAQ,OAAO,CAAE6B;wBACjB;oBACF;gBACF;gBACAhB,UAAUC;YACZ,OAAO;gBACL,MAAMmB,SAAS,MAAMhC,WAAW,MAAM,CAAC;oBACrC,OAAOzC;oBACP,QAAQ;oBACR,UAAUuC,SAAS,GAAG,CAAC,CAACyC,IAAO;4BAC7B,MAAM;4BACN,SAASC,MAAM,OAAO,CAACD,EAAE,OAAO,IAC3BA,EAAE,OAAO,CAAS,GAAG,CAACL,uBACvBK,EAAE,OAAO;wBACf;oBACA,iBAAiBrC;oBACjB,GAAGc,YAAY;gBACjB;gBACAD,WAAWL,KAAK,GAAG,KAAKD;gBACxBG,UAAWoB,OAAe,OAAO,CAAC,EAAE,CAAC,IAAI;gBACzClB,QAAQkB,OAAO,KAAK;YACtB;YAEAhD,OAAO4B,SAAS;QAClB;QAEA,IAAID,eAAe,CAACG,OAAO;YAEzB,MAAMY,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAAEf,AAAAA,CAAAA,WAAW,EAAC,EAAG,MAAM,GAAG;YAEtCE,QAAQ;gBACN,eAAeY;gBACf,mBAAmBA;gBACnB,cAAcA,AAAkB,IAAlBA;YAChB;QACF;QAEA,OAAO;YACL,SAASd,WAAW;YACpB,OAAOE,QACH;gBACE,eAAeA,MAAM,aAAa,IAAI;gBACtC,mBAAmBA,MAAM,iBAAiB,IAAI;gBAC9C,cAAcA,MAAM,YAAY,IAAI;gBACpC,WAAWC,YAAY;gBACvB,YAAYxD;gBACZ,mBAAmBc;gBACnB,QAAQjB,YAAY,MAAM;YAC5B,IACAqE;YACJ,YAAY,CAAC,CAACd;QAChB;IACF,EAAE,OAAOgC,GAAQ;QACfjD,QAAQ,KAAK,CAAC,kBAAkBiD;QAChC,MAAMC,WAAW,IAAInD,MACnB,CAAC,eAAe,EAAEkB,cAAc,eAAe,GAAG,kBAAkB,EAAEgC,EAAE,OAAO,CAAC,8DAA8D,CAAC,EAC/I;YACE,OAAOA;QACT;QAEF,MAAMC;IACR;AACF;AAEO,MAAMzC,oBAAoB,CAC/B5C,WACAJ;IAIA,IAAI+C;IAKJ,IAAI3C,UAAU,QAAQ,CAAC,UACrB,OAAQJ;QACN,KAAK0F,aAAa,MAAM;YACtB3C,iBAAiB4C;YACjB;QACF,KAAKD,aAAa,eAAe;YAC/B3C,iBAAiB6C;YACjB;QACF,KAAKF,aAAa,IAAI;YACpB3C,iBAAiB8C;YACjB;QACF,KAAKH,aAAa,YAAY;QAC9B,KAAKA,aAAa,gBAAgB;YAChC3C,iBAAiB;gBAAE,MAAM+C,iBAAiB,IAAI;YAAC;YAC/C;IACJ;IAIF,IAAI1F,AAAc,wBAAdA,WACF2C,iBAAiB;QAAE,MAAM+C,iBAAiB,IAAI;IAAC;IAGjD,OAAO/C;AACT;AAEO,eAAegD,yBACpBpD,QAAsC,EACtC3C,iBAA+B,EAC/BC,WAAyB;IAEzB,MAAM+F,WAAW,MAAMtD,OAAOC,UAAU3C,mBAAmBC;IAC3D4B,OAAOmE,UAAU;IACjB,MAAM5E,SAASnB,YAAY,MAAM;IACjC,MAAMgG,cAAcC,cAAcF,SAAS,OAAO,EAAE5E;IACpD,OAAO;QAAE,SAAS6E;QAAa,OAAOD,SAAS,KAAK;IAAC;AACvD;AAEO,eAAeG,yBACpBC,IAAY,EACZpG,iBAA+B,EAC/BC,WAAyB;IAEzB,MAAM,EAAEwD,OAAO,EAAEE,KAAK,EAAE,GAAG,MAAMjB,OAAO0D,MAAMpG,mBAAmBC;IACjE,OAAO;QAAEwD;QAASE;IAAM;AAC1B;AAEO,SAAS0C,yBAAyBL,QAAgB;IACvD,IAAI;QAEF,MAAMM,YAAYN,SAAS,KAAK,CAAC;QACjC,IAAIM,WACF,OAAOA,SAAS,CAAC,EAAE;QAIrB,MAAMC,iBAAiBP,SAAS,KAAK,CACnC;QAEF,IAAIO,gBACF,OAAOA,cAAc,CAAC,EAAE;QAI1B,MAAMC,gBAAgBR,SAAS,KAAK,CAAC;QACrC,IAAIQ,eACF,OAAOA,aAAa,CAAC,EAAE;IAE3B,EAAE,OAAM,CAAC;IAET,OAAOR;AACT;AAEO,SAASS,yBAAyBC,KAAa;IACpD,IAAIA,MAAM,QAAQ,CAAC,SAEjB,MAAO,YAAY,IAAI,CAACA,OACtBA,QAAQA,MAAM,OAAO,CAAC,kBAAkB;IAG5C,OAAOA;AACT;AAEO,SAASR,cAAcQ,KAAa,EAAEtF,MAAgC;IAC3E,MAAMuF,kBAAkBN,yBAAyBK;IAEjD,IAAIC,QAAAA,kBAAAA,KAAAA,IAAAA,gBAAiB,KAAK,CAAC,oBAAoB;YACtCC;QAAP,OAAO,QAAAA,CAAAA,yBAAAA,gBACJ,KAAK,CAAC,kBAAiB,IADnBA,KAAAA,IAAAA,uBAEH,KAAK,CAAC,GACP,GAAG,CAAC9C;IACT;IACA,IAAI;QACF,OAAOgB,KAAK,KAAK,CAAC6B;IACpB,EAAE,OAAM,CAAC;IACT,IAAI;QACF,OAAO7B,KAAK,KAAK,CAAC+B,WAAWF;IAC/B,EAAE,OAAOnB,GAAG,CAAC;IAEb,IAAIpE,AAAW,oBAAXA,UAA8BA,AAAW,kBAAXA,QAA0B;QAC1D,MAAM0F,aAAaL,yBAAyBE;QAC5C,OAAO7B,KAAK,KAAK,CAAC+B,WAAWC;IAC/B;IACA,MAAMxE,MAAM,CAAC,+BAA+B,EAAEoE,OAAO;AACvD"}
|
|
1
|
+
{"version":3,"file":"ai-model/service-caller/index.mjs","sources":["webpack://@midscene/core/./src/ai-model/service-caller/index.ts"],"sourcesContent":["import { AIResponseFormat, type AIUsageInfo } from '@/types';\nimport type { CodeGenerationChunk, StreamingCallback } from '@/types';\nimport { Anthropic } from '@anthropic-ai/sdk';\nimport {\n DefaultAzureCredential,\n getBearerTokenProvider,\n} from '@azure/identity';\nimport {\n type IModelPreferences,\n MIDSCENE_API_TYPE,\n MIDSCENE_LANGSMITH_DEBUG,\n OPENAI_MAX_TOKENS,\n globalConfigManager,\n uiTarsModelVersion,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { parseBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport { ifInBrowser } from '@midscene/shared/utils';\nimport { HttpsProxyAgent } from 'https-proxy-agent';\nimport { jsonrepair } from 'jsonrepair';\nimport OpenAI, { AzureOpenAI } from 'openai';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport type { Stream } from 'openai/streaming';\nimport { SocksProxyAgent } from 'socks-proxy-agent';\nimport { AIActionType, type AIArgs } from '../common';\nimport { assertSchema } from '../prompt/assertion';\nimport { locatorSchema } from '../prompt/llm-locator';\nimport { planSchema } from '../prompt/llm-planning';\n\nasync function createChatClient({\n AIActionTypeValue,\n modelPreferences,\n}: {\n AIActionTypeValue: AIActionType;\n modelPreferences: IModelPreferences;\n}): Promise<{\n completion: OpenAI.Chat.Completions;\n style: 'openai' | 'anthropic';\n modelName: string;\n modelDescription: string;\n}> {\n const {\n socksProxy,\n httpProxy,\n modelName,\n openaiBaseURL,\n openaiApiKey,\n openaiExtraConfig,\n openaiUseAzureDeprecated,\n useAzureOpenai,\n azureOpenaiScope,\n azureOpenaiKey,\n azureOpenaiEndpoint,\n azureOpenaiApiVersion,\n azureOpenaiDeployment,\n azureExtraConfig,\n useAnthropicSdk,\n anthropicApiKey,\n modelDescription,\n } = globalConfigManager.getModelConfigByIntent(modelPreferences.intent);\n\n let openai: OpenAI | AzureOpenAI | undefined;\n\n let proxyAgent = undefined;\n const debugProxy = getDebug('ai:call:proxy');\n if (httpProxy) {\n debugProxy('using http proxy', httpProxy);\n proxyAgent = new HttpsProxyAgent(httpProxy);\n } else if (socksProxy) {\n debugProxy('using socks proxy', socksProxy);\n proxyAgent = new SocksProxyAgent(socksProxy);\n }\n\n if (openaiUseAzureDeprecated) {\n // this is deprecated\n openai = new AzureOpenAI({\n baseURL: openaiBaseURL,\n apiKey: openaiApiKey,\n httpAgent: proxyAgent,\n ...openaiExtraConfig,\n dangerouslyAllowBrowser: true,\n }) as OpenAI;\n } else if (useAzureOpenai) {\n // https://learn.microsoft.com/en-us/azure/ai-services/openai/chatgpt-quickstart?tabs=bash%2Cjavascript-key%2Ctypescript-keyless%2Cpython&pivots=programming-language-javascript#rest-api\n // keyless authentication\n let tokenProvider: any = undefined;\n if (azureOpenaiScope) {\n assert(\n !ifInBrowser,\n 'Azure OpenAI is not supported in browser with Midscene.',\n );\n const credential = new DefaultAzureCredential();\n\n tokenProvider = getBearerTokenProvider(credential, azureOpenaiScope);\n\n openai = new AzureOpenAI({\n azureADTokenProvider: tokenProvider,\n endpoint: azureOpenaiEndpoint,\n apiVersion: azureOpenaiApiVersion,\n deployment: azureOpenaiDeployment,\n ...openaiExtraConfig,\n ...azureExtraConfig,\n });\n } else {\n // endpoint, apiKey, apiVersion, deployment\n openai = new AzureOpenAI({\n apiKey: azureOpenaiKey,\n endpoint: azureOpenaiEndpoint,\n apiVersion: azureOpenaiApiVersion,\n deployment: azureOpenaiDeployment,\n dangerouslyAllowBrowser: true,\n ...openaiExtraConfig,\n ...azureExtraConfig,\n });\n }\n } else if (!useAnthropicSdk) {\n openai = new OpenAI({\n baseURL: openaiBaseURL,\n apiKey: openaiApiKey,\n httpAgent: proxyAgent,\n ...openaiExtraConfig,\n defaultHeaders: {\n ...(openaiExtraConfig?.defaultHeaders || {}),\n [MIDSCENE_API_TYPE]: AIActionTypeValue.toString(),\n },\n dangerouslyAllowBrowser: true,\n });\n }\n\n if (\n openai &&\n globalConfigManager.getEnvConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)\n ) {\n if (ifInBrowser) {\n throw new Error('langsmith is not supported in browser');\n }\n console.log('DEBUGGING MODE: langsmith wrapper enabled');\n const { wrapOpenAI } = await import('langsmith/wrappers');\n openai = wrapOpenAI(openai);\n }\n\n if (typeof openai !== 'undefined') {\n return {\n completion: openai.chat.completions,\n style: 'openai',\n modelName,\n modelDescription,\n };\n }\n\n // Anthropic\n if (useAnthropicSdk) {\n openai = new Anthropic({\n apiKey: anthropicApiKey,\n httpAgent: proxyAgent,\n dangerouslyAllowBrowser: true,\n }) as any;\n }\n\n if (typeof openai !== 'undefined' && (openai as any).messages) {\n return {\n completion: (openai as any).messages,\n style: 'anthropic',\n modelName,\n modelDescription,\n };\n }\n\n throw new Error('Openai SDK or Anthropic SDK is not initialized');\n}\n\nexport async function call(\n messages: ChatCompletionMessageParam[],\n AIActionTypeValue: AIActionType,\n modelPreferences: IModelPreferences,\n options?: {\n stream?: boolean;\n onChunk?: StreamingCallback;\n },\n): Promise<{ content: string; usage?: AIUsageInfo; isStreamed: boolean }> {\n const { completion, style, modelName, modelDescription } =\n await createChatClient({\n AIActionTypeValue,\n modelPreferences,\n });\n\n const responseFormat = getResponseFormat(modelName, AIActionTypeValue);\n\n const maxTokens = globalConfigManager.getEnvConfigValue(OPENAI_MAX_TOKENS);\n const debugCall = getDebug('ai:call');\n const debugProfileStats = getDebug('ai:profile:stats');\n const debugProfileDetail = getDebug('ai:profile:detail');\n\n const startTime = Date.now();\n\n const isStreaming = options?.stream && options?.onChunk;\n let content: string | undefined;\n let accumulated = '';\n let usage: OpenAI.CompletionUsage | undefined;\n let timeCost: number | undefined;\n\n const commonConfig = {\n temperature: vlLocateMode(modelPreferences) === 'vlm-ui-tars' ? 0.0 : 0.1,\n stream: !!isStreaming,\n max_tokens:\n typeof maxTokens === 'number'\n ? maxTokens\n : Number.parseInt(maxTokens || '2048', 10),\n ...(vlLocateMode(modelPreferences) === 'qwen-vl' // qwen specific config\n ? {\n vl_high_resolution_images: true,\n }\n : {}),\n };\n\n try {\n if (style === 'openai') {\n debugCall(\n `sending ${isStreaming ? 'streaming ' : ''}request to ${modelName}`,\n );\n\n if (isStreaming) {\n const stream = (await completion.create(\n {\n model: modelName,\n messages,\n response_format: responseFormat,\n ...commonConfig,\n },\n {\n stream: true,\n },\n )) as Stream<OpenAI.Chat.Completions.ChatCompletionChunk> & {\n _request_id?: string | null;\n };\n\n for await (const chunk of stream) {\n const content = chunk.choices?.[0]?.delta?.content || '';\n const reasoning_content =\n (chunk.choices?.[0]?.delta as any)?.reasoning_content || '';\n\n // Check for usage info in any chunk (OpenAI provides usage in separate chunks)\n if (chunk.usage) {\n usage = chunk.usage;\n }\n\n if (content || reasoning_content) {\n accumulated += content;\n const chunkData: CodeGenerationChunk = {\n content,\n reasoning_content,\n accumulated,\n isComplete: false,\n usage: undefined,\n };\n options.onChunk!(chunkData);\n }\n\n // Check if stream is complete\n if (chunk.choices?.[0]?.finish_reason) {\n timeCost = Date.now() - startTime;\n\n // If usage is not available from the stream, provide a basic usage info\n if (!usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor(accumulated.length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n };\n }\n\n // Send final chunk\n const finalChunk: CodeGenerationChunk = {\n content: '',\n accumulated,\n reasoning_content: '',\n isComplete: true,\n usage: {\n prompt_tokens: usage.prompt_tokens ?? 0,\n completion_tokens: usage.completion_tokens ?? 0,\n total_tokens: usage.total_tokens ?? 0,\n time_cost: timeCost ?? 0,\n model_name: modelName,\n model_description: modelDescription,\n intent: modelPreferences.intent,\n },\n };\n options.onChunk!(finalChunk);\n break;\n }\n }\n content = accumulated;\n debugProfileStats(\n `streaming model, ${modelName}, mode, ${vlLocateMode(modelPreferences) || 'default'}, cost-ms, ${timeCost}`,\n );\n } else {\n const result = await completion.create({\n model: modelName,\n messages,\n response_format: responseFormat,\n ...commonConfig,\n } as any);\n timeCost = Date.now() - startTime;\n\n debugProfileStats(\n `model, ${modelName}, mode, ${vlLocateMode(modelPreferences) || 'default'}, ui-tars-version, ${uiTarsModelVersion(modelPreferences)}, prompt-tokens, ${result.usage?.prompt_tokens || ''}, completion-tokens, ${result.usage?.completion_tokens || ''}, total-tokens, ${result.usage?.total_tokens || ''}, cost-ms, ${timeCost}, requestId, ${result._request_id || ''}`,\n );\n\n debugProfileDetail(\n `model usage detail: ${JSON.stringify(result.usage)}`,\n );\n\n assert(\n result.choices,\n `invalid response from LLM service: ${JSON.stringify(result)}`,\n );\n content = result.choices[0].message.content!;\n usage = result.usage;\n }\n\n debugCall(`response: ${content}`);\n assert(content, 'empty content');\n } else if (style === 'anthropic') {\n const convertImageContent = (content: any) => {\n if (content.type === 'image_url') {\n const imgBase64 = content.image_url.url;\n assert(imgBase64, 'image_url is required');\n const { mimeType, body } = parseBase64(content.image_url.url);\n return {\n source: {\n type: 'base64',\n media_type: mimeType,\n data: body,\n },\n type: 'image',\n };\n }\n return content;\n };\n\n if (isStreaming) {\n const stream = (await completion.create({\n model: modelName,\n system: 'You are a versatile professional in software UI automation',\n messages: messages.map((m) => ({\n role: 'user',\n content: Array.isArray(m.content)\n ? (m.content as any).map(convertImageContent)\n : m.content,\n })),\n response_format: responseFormat,\n ...commonConfig,\n } as any)) as any;\n\n for await (const chunk of stream) {\n const content = chunk.delta?.text || '';\n if (content) {\n accumulated += content;\n const chunkData: CodeGenerationChunk = {\n content,\n accumulated,\n reasoning_content: '',\n isComplete: false,\n usage: undefined,\n };\n options.onChunk!(chunkData);\n }\n\n // Check if stream is complete\n if (chunk.type === 'message_stop') {\n timeCost = Date.now() - startTime;\n const anthropicUsage = chunk.usage;\n\n // Send final chunk\n const finalChunk: CodeGenerationChunk = {\n content: '',\n accumulated,\n reasoning_content: '',\n isComplete: true,\n usage: anthropicUsage\n ? {\n prompt_tokens: anthropicUsage.input_tokens ?? 0,\n completion_tokens: anthropicUsage.output_tokens ?? 0,\n total_tokens:\n (anthropicUsage.input_tokens ?? 0) +\n (anthropicUsage.output_tokens ?? 0),\n time_cost: timeCost ?? 0,\n model_name: modelName,\n model_description: modelDescription,\n intent: modelPreferences.intent,\n }\n : undefined,\n };\n options.onChunk!(finalChunk);\n break;\n }\n }\n content = accumulated;\n } else {\n const result = await completion.create({\n model: modelName,\n system: 'You are a versatile professional in software UI automation',\n messages: messages.map((m) => ({\n role: 'user',\n content: Array.isArray(m.content)\n ? (m.content as any).map(convertImageContent)\n : m.content,\n })),\n response_format: responseFormat,\n ...commonConfig,\n } as any);\n timeCost = Date.now() - startTime;\n content = (result as any).content[0].text as string;\n usage = result.usage;\n }\n\n assert(content, 'empty content');\n }\n // Ensure we always have usage info for streaming responses\n if (isStreaming && !usage) {\n // Estimate token counts based on content length (rough approximation)\n const estimatedTokens = Math.max(\n 1,\n Math.floor((content || '').length / 4),\n );\n usage = {\n prompt_tokens: estimatedTokens,\n completion_tokens: estimatedTokens,\n total_tokens: estimatedTokens * 2,\n };\n }\n\n return {\n content: content || '',\n usage: usage\n ? {\n prompt_tokens: usage.prompt_tokens ?? 0,\n completion_tokens: usage.completion_tokens ?? 0,\n total_tokens: usage.total_tokens ?? 0,\n time_cost: timeCost ?? 0,\n model_name: modelName,\n model_description: modelDescription,\n intent: modelPreferences.intent,\n }\n : undefined,\n isStreamed: !!isStreaming,\n };\n } catch (e: any) {\n console.error(' call AI error', e);\n const newError = new Error(\n `failed to call ${isStreaming ? 'streaming ' : ''}AI model service: ${e.message}. Trouble shooting: https://midscenejs.com/model-provider.html`,\n {\n cause: e,\n },\n );\n throw newError;\n }\n}\n\nexport const getResponseFormat = (\n modelName: string,\n AIActionTypeValue: AIActionType,\n):\n | OpenAI.ChatCompletionCreateParams['response_format']\n | OpenAI.ResponseFormatJSONObject => {\n let responseFormat:\n | OpenAI.ChatCompletionCreateParams['response_format']\n | OpenAI.ResponseFormatJSONObject\n | undefined;\n\n if (modelName.includes('gpt-4')) {\n switch (AIActionTypeValue) {\n case AIActionType.ASSERT:\n responseFormat = assertSchema;\n break;\n case AIActionType.INSPECT_ELEMENT:\n responseFormat = locatorSchema;\n break;\n case AIActionType.PLAN:\n responseFormat = planSchema;\n break;\n case AIActionType.EXTRACT_DATA:\n case AIActionType.DESCRIBE_ELEMENT:\n responseFormat = { type: AIResponseFormat.JSON };\n break;\n }\n }\n\n // gpt-4o-2024-05-13 only supports json_object response format\n if (modelName === 'gpt-4o-2024-05-13') {\n responseFormat = { type: AIResponseFormat.JSON };\n }\n\n return responseFormat;\n};\n\nexport async function callToGetJSONObject<T>(\n messages: ChatCompletionMessageParam[],\n AIActionTypeValue: AIActionType,\n modelPreferences: IModelPreferences,\n): Promise<{ content: T; usage?: AIUsageInfo }> {\n const response = await call(messages, AIActionTypeValue, modelPreferences);\n assert(response, 'empty response');\n const jsonContent = safeParseJson(response.content, modelPreferences);\n return { content: jsonContent, usage: response.usage };\n}\n\nexport async function callAiFnWithStringResponse<T>(\n msgs: AIArgs,\n AIActionTypeValue: AIActionType,\n modelPreferences: IModelPreferences,\n): Promise<{ content: string; usage?: AIUsageInfo }> {\n const { content, usage } = await call(\n msgs,\n AIActionTypeValue,\n modelPreferences,\n );\n return { content, usage };\n}\n\nexport function extractJSONFromCodeBlock(response: string) {\n try {\n // First, try to match a JSON object directly in the response\n const jsonMatch = response.match(/^\\s*(\\{[\\s\\S]*\\})\\s*$/);\n if (jsonMatch) {\n return jsonMatch[1];\n }\n\n // If no direct JSON object is found, try to extract JSON from a code block\n const codeBlockMatch = response.match(\n /```(?:json)?\\s*(\\{[\\s\\S]*?\\})\\s*```/,\n );\n if (codeBlockMatch) {\n return codeBlockMatch[1];\n }\n\n // If no code block is found, try to find a JSON-like structure in the text\n const jsonLikeMatch = response.match(/\\{[\\s\\S]*\\}/);\n if (jsonLikeMatch) {\n return jsonLikeMatch[0];\n }\n } catch {}\n // If no JSON-like structure is found, return the original response\n return response;\n}\n\nexport function preprocessDoubaoBboxJson(input: string) {\n if (input.includes('bbox')) {\n // when its values like 940 445 969 490, replace all /\\d+\\s+\\d+/g with /$1,$2/g\n while (/\\d+\\s+\\d+/.test(input)) {\n input = input.replace(/(\\d+)\\s+(\\d+)/g, '$1,$2');\n }\n }\n return input;\n}\n\nexport function safeParseJson(\n input: string,\n modelPreferences: IModelPreferences,\n) {\n const cleanJsonString = extractJSONFromCodeBlock(input);\n // match the point\n if (cleanJsonString?.match(/\\((\\d+),(\\d+)\\)/)) {\n return cleanJsonString\n .match(/\\((\\d+),(\\d+)\\)/)\n ?.slice(1)\n .map(Number);\n }\n try {\n return JSON.parse(cleanJsonString);\n } catch {}\n try {\n return JSON.parse(jsonrepair(cleanJsonString));\n } catch (e) {}\n\n if (\n vlLocateMode(modelPreferences) === 'doubao-vision' ||\n vlLocateMode(modelPreferences) === 'vlm-ui-tars'\n ) {\n const jsonString = preprocessDoubaoBboxJson(cleanJsonString);\n return JSON.parse(jsonrepair(jsonString));\n }\n throw Error(`failed to parse json response: ${input}`);\n}\n"],"names":["createChatClient","AIActionTypeValue","modelPreferences","socksProxy","httpProxy","modelName","openaiBaseURL","openaiApiKey","openaiExtraConfig","openaiUseAzureDeprecated","useAzureOpenai","azureOpenaiScope","azureOpenaiKey","azureOpenaiEndpoint","azureOpenaiApiVersion","azureOpenaiDeployment","azureExtraConfig","useAnthropicSdk","anthropicApiKey","modelDescription","globalConfigManager","openai","proxyAgent","debugProxy","getDebug","HttpsProxyAgent","SocksProxyAgent","AzureOpenAI","tokenProvider","assert","ifInBrowser","credential","DefaultAzureCredential","getBearerTokenProvider","OpenAI","MIDSCENE_API_TYPE","MIDSCENE_LANGSMITH_DEBUG","Error","console","wrapOpenAI","Anthropic","call","messages","options","completion","style","responseFormat","getResponseFormat","maxTokens","OPENAI_MAX_TOKENS","debugCall","debugProfileStats","debugProfileDetail","startTime","Date","isStreaming","content","accumulated","usage","timeCost","commonConfig","vlLocateMode","Number","stream","chunk","_chunk_choices__delta","_chunk_choices__delta1","_chunk_choices_2","reasoning_content","chunkData","undefined","estimatedTokens","Math","finalChunk","_result_usage","_result_usage1","_result_usage2","result","uiTarsModelVersion","JSON","convertImageContent","imgBase64","mimeType","body","parseBase64","m","Array","_chunk_delta","anthropicUsage","e","newError","AIActionType","assertSchema","locatorSchema","planSchema","AIResponseFormat","callToGetJSONObject","response","jsonContent","safeParseJson","callAiFnWithStringResponse","msgs","extractJSONFromCodeBlock","jsonMatch","codeBlockMatch","jsonLikeMatch","preprocessDoubaoBboxJson","input","cleanJsonString","_cleanJsonString_match","jsonrepair","jsonString"],"mappings":";;;;;;;;;;;;;;;AA+BA,eAAeA,iBAAiB,EAC9BC,iBAAiB,EACjBC,gBAAgB,EAIjB;IAMC,MAAM,EACJC,UAAU,EACVC,SAAS,EACTC,SAAS,EACTC,aAAa,EACbC,YAAY,EACZC,iBAAiB,EACjBC,wBAAwB,EACxBC,cAAc,EACdC,gBAAgB,EAChBC,cAAc,EACdC,mBAAmB,EACnBC,qBAAqB,EACrBC,qBAAqB,EACrBC,gBAAgB,EAChBC,eAAe,EACfC,eAAe,EACfC,gBAAgB,EACjB,GAAGC,oBAAoB,sBAAsB,CAAClB,iBAAiB,MAAM;IAEtE,IAAImB;IAEJ,IAAIC;IACJ,MAAMC,aAAaC,SAAS;IAC5B,IAAIpB,WAAW;QACbmB,WAAW,oBAAoBnB;QAC/BkB,aAAa,IAAIG,gBAAgBrB;IACnC,OAAO,IAAID,YAAY;QACrBoB,WAAW,qBAAqBpB;QAChCmB,aAAa,IAAII,gBAAgBvB;IACnC;IAEA,IAAIM,0BAEFY,SAAS,IAAIM,YAAY;QACvB,SAASrB;QACT,QAAQC;QACR,WAAWe;QACX,GAAGd,iBAAiB;QACpB,yBAAyB;IAC3B;SACK,IAAIE,gBAAgB;QAGzB,IAAIkB;QACJ,IAAIjB,kBAAkB;YACpBkB,OACE,CAACC,aACD;YAEF,MAAMC,aAAa,IAAIC;YAEvBJ,gBAAgBK,uBAAuBF,YAAYpB;YAEnDU,SAAS,IAAIM,YAAY;gBACvB,sBAAsBC;gBACtB,UAAUf;gBACV,YAAYC;gBACZ,YAAYC;gBACZ,GAAGP,iBAAiB;gBACpB,GAAGQ,gBAAgB;YACrB;QACF,OAEEK,SAAS,IAAIM,YAAY;YACvB,QAAQf;YACR,UAAUC;YACV,YAAYC;YACZ,YAAYC;YACZ,yBAAyB;YACzB,GAAGP,iBAAiB;YACpB,GAAGQ,gBAAgB;QACrB;IAEJ,OAAO,IAAI,CAACC,iBACVI,SAAS,IAAIa,SAAO;QAClB,SAAS5B;QACT,QAAQC;QACR,WAAWe;QACX,GAAGd,iBAAiB;QACpB,gBAAgB;YACd,GAAIA,AAAAA,CAAAA,QAAAA,oBAAAA,KAAAA,IAAAA,kBAAmB,cAAc,AAAD,KAAK,CAAC,CAAC;YAC3C,CAAC2B,kBAAkB,EAAElC,kBAAkB,QAAQ;QACjD;QACA,yBAAyB;IAC3B;IAGF,IACEoB,UACAD,oBAAoB,qBAAqB,CAACgB,2BAC1C;QACA,IAAIN,aACF,MAAM,IAAIO,MAAM;QAElBC,QAAQ,GAAG,CAAC;QACZ,MAAM,EAAEC,UAAU,EAAE,GAAG,MAAM,MAAM,CAAC;QACpClB,SAASkB,WAAWlB;IACtB;IAEA,IAAI,AAAkB,WAAXA,QACT,OAAO;QACL,YAAYA,OAAO,IAAI,CAAC,WAAW;QACnC,OAAO;QACPhB;QACAc;IACF;IAIF,IAAIF,iBACFI,SAAS,IAAImB,UAAU;QACrB,QAAQtB;QACR,WAAWI;QACX,yBAAyB;IAC3B;IAGF,IAAI,AAAkB,WAAXD,UAA2BA,OAAe,QAAQ,EAC3D,OAAO;QACL,YAAaA,OAAe,QAAQ;QACpC,OAAO;QACPhB;QACAc;IACF;IAGF,MAAM,IAAIkB,MAAM;AAClB;AAEO,eAAeI,KACpBC,QAAsC,EACtCzC,iBAA+B,EAC/BC,gBAAmC,EACnCyC,OAGC;IAED,MAAM,EAAEC,UAAU,EAAEC,KAAK,EAAExC,SAAS,EAAEc,gBAAgB,EAAE,GACtD,MAAMnB,iBAAiB;QACrBC;QACAC;IACF;IAEF,MAAM4C,iBAAiBC,kBAAkB1C,WAAWJ;IAEpD,MAAM+C,YAAY5B,oBAAoB,iBAAiB,CAAC6B;IACxD,MAAMC,YAAY1B,SAAS;IAC3B,MAAM2B,oBAAoB3B,SAAS;IACnC,MAAM4B,qBAAqB5B,SAAS;IAEpC,MAAM6B,YAAYC,KAAK,GAAG;IAE1B,MAAMC,cAAcZ,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,MAAM,AAAD,KAAKA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,OAAO,AAAD;IACtD,IAAIa;IACJ,IAAIC,cAAc;IAClB,IAAIC;IACJ,IAAIC;IAEJ,MAAMC,eAAe;QACnB,aAAaC,AAAmC,kBAAnCA,aAAa3D,oBAAsC,MAAM;QACtE,QAAQ,CAAC,CAACqD;QACV,YACE,AAAqB,YAArB,OAAOP,YACHA,YACAc,OAAO,QAAQ,CAACd,aAAa,QAAQ;QAC3C,GAAIa,AAAmC,cAAnCA,aAAa3D,oBACb;YACE,2BAA2B;QAC7B,IACA,CAAC,CAAC;IACR;IAEA,IAAI;QACF,IAAI2C,AAAU,aAAVA,OAAoB;YACtBK,UACE,CAAC,QAAQ,EAAEK,cAAc,eAAe,GAAG,WAAW,EAAElD,WAAW;YAGrE,IAAIkD,aAAa;gBACf,MAAMQ,SAAU,MAAMnB,WAAW,MAAM,CACrC;oBACE,OAAOvC;oBACPqC;oBACA,iBAAiBI;oBACjB,GAAGc,YAAY;gBACjB,GACA;oBACE,QAAQ;gBACV;gBAKF,WAAW,MAAMI,SAASD,OAAQ;wBAChBE,uBAAAA,iBAAAA,gBAEbC,wBAAAA,kBAAAA,iBAoBCC,kBAAAA;oBAtBJ,MAAMX,UAAUS,AAAAA,SAAAA,CAAAA,iBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,kBAAAA,cAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,QAAAA,CAAAA,wBAAAA,gBAAoB,KAAK,AAAD,IAAxBA,KAAAA,IAAAA,sBAA2B,OAAO,AAAD,KAAK;oBACtD,MAAMG,oBACJ,AAAC,SAAAF,CAAAA,kBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,mBAAAA,eAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,QAAAA,CAAAA,yBAAAA,iBAAoB,KAAK,AAAD,IAAxBA,KAAAA,IAAAA,uBAAmC,iBAAiB,AAAD,KAAK;oBAG3D,IAAIF,MAAM,KAAK,EACbN,QAAQM,MAAM,KAAK;oBAGrB,IAAIR,WAAWY,mBAAmB;wBAChCX,eAAeD;wBACf,MAAMa,YAAiC;4BACrCb;4BACAY;4BACAX;4BACA,YAAY;4BACZ,OAAOa;wBACT;wBACA3B,QAAQ,OAAO,CAAE0B;oBACnB;oBAGA,IAAI,QAAAF,CAAAA,kBAAAA,MAAM,OAAO,AAAD,IAAZA,KAAAA,IAAAA,QAAAA,CAAAA,mBAAAA,eAAe,CAAC,EAAE,AAAD,IAAjBA,KAAAA,IAAAA,iBAAoB,aAAa,EAAE;wBACrCR,WAAWL,KAAK,GAAG,KAAKD;wBAGxB,IAAI,CAACK,OAAO;4BAEV,MAAMa,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAACf,YAAY,MAAM,GAAG;4BAElCC,QAAQ;gCACN,eAAea;gCACf,mBAAmBA;gCACnB,cAAcA,AAAkB,IAAlBA;4BAChB;wBACF;wBAGA,MAAME,aAAkC;4BACtC,SAAS;4BACThB;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAO;gCACL,eAAeC,MAAM,aAAa,IAAI;gCACtC,mBAAmBA,MAAM,iBAAiB,IAAI;gCAC9C,cAAcA,MAAM,YAAY,IAAI;gCACpC,WAAWC,YAAY;gCACvB,YAAYtD;gCACZ,mBAAmBc;gCACnB,QAAQjB,iBAAiB,MAAM;4BACjC;wBACF;wBACAyC,QAAQ,OAAO,CAAE8B;wBACjB;oBACF;gBACF;gBACAjB,UAAUC;gBACVN,kBACE,CAAC,iBAAiB,EAAE9C,UAAU,QAAQ,EAAEwD,aAAa3D,qBAAqB,UAAU,WAAW,EAAEyD,UAAU;YAE/G,OAAO;oBAUoJe,eAAyDC,gBAAwDC;gBAT1Q,MAAMC,SAAS,MAAMjC,WAAW,MAAM,CAAC;oBACrC,OAAOvC;oBACPqC;oBACA,iBAAiBI;oBACjB,GAAGc,YAAY;gBACjB;gBACAD,WAAWL,KAAK,GAAG,KAAKD;gBAExBF,kBACE,CAAC,OAAO,EAAE9C,UAAU,QAAQ,EAAEwD,aAAa3D,qBAAqB,UAAU,mBAAmB,EAAE4E,mBAAmB5E,kBAAkB,iBAAiB,EAAEwE,AAAAA,SAAAA,CAAAA,gBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,cAAc,aAAa,AAAD,KAAK,GAAG,qBAAqB,EAAEC,AAAAA,SAAAA,CAAAA,iBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,eAAc,iBAAiB,AAAD,KAAK,GAAG,gBAAgB,EAAEC,AAAAA,SAAAA,CAAAA,iBAAAA,OAAO,KAAK,AAAD,IAAXA,KAAAA,IAAAA,eAAc,YAAY,AAAD,KAAK,GAAG,WAAW,EAAEjB,SAAS,aAAa,EAAEkB,OAAO,WAAW,IAAI,IAAI;gBAG1WzB,mBACE,CAAC,oBAAoB,EAAE2B,KAAK,SAAS,CAACF,OAAO,KAAK,GAAG;gBAGvDhD,OACEgD,OAAO,OAAO,EACd,CAAC,mCAAmC,EAAEE,KAAK,SAAS,CAACF,SAAS;gBAEhErB,UAAUqB,OAAO,OAAO,CAAC,EAAE,CAAC,OAAO,CAAC,OAAO;gBAC3CnB,QAAQmB,OAAO,KAAK;YACtB;YAEA3B,UAAU,CAAC,UAAU,EAAEM,SAAS;YAChC3B,OAAO2B,SAAS;QAClB,OAAO,IAAIX,AAAU,gBAAVA,OAAuB;YAChC,MAAMmC,sBAAsB,CAACxB;gBAC3B,IAAIA,AAAiB,gBAAjBA,QAAQ,IAAI,EAAkB;oBAChC,MAAMyB,YAAYzB,QAAQ,SAAS,CAAC,GAAG;oBACvC3B,OAAOoD,WAAW;oBAClB,MAAM,EAAEC,QAAQ,EAAEC,IAAI,EAAE,GAAGC,YAAY5B,QAAQ,SAAS,CAAC,GAAG;oBAC5D,OAAO;wBACL,QAAQ;4BACN,MAAM;4BACN,YAAY0B;4BACZ,MAAMC;wBACR;wBACA,MAAM;oBACR;gBACF;gBACA,OAAO3B;YACT;YAEA,IAAID,aAAa;gBACf,MAAMQ,SAAU,MAAMnB,WAAW,MAAM,CAAC;oBACtC,OAAOvC;oBACP,QAAQ;oBACR,UAAUqC,SAAS,GAAG,CAAC,CAAC2C,IAAO;4BAC7B,MAAM;4BACN,SAASC,MAAM,OAAO,CAACD,EAAE,OAAO,IAC3BA,EAAE,OAAO,CAAS,GAAG,CAACL,uBACvBK,EAAE,OAAO;wBACf;oBACA,iBAAiBvC;oBACjB,GAAGc,YAAY;gBACjB;gBAEA,WAAW,MAAMI,SAASD,OAAQ;wBAChBwB;oBAAhB,MAAM/B,UAAU+B,AAAAA,SAAAA,CAAAA,eAAAA,MAAM,KAAK,AAAD,IAAVA,KAAAA,IAAAA,aAAa,IAAI,AAAD,KAAK;oBACrC,IAAI/B,SAAS;wBACXC,eAAeD;wBACf,MAAMa,YAAiC;4BACrCb;4BACAC;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAOa;wBACT;wBACA3B,QAAQ,OAAO,CAAE0B;oBACnB;oBAGA,IAAIL,AAAe,mBAAfA,MAAM,IAAI,EAAqB;wBACjCL,WAAWL,KAAK,GAAG,KAAKD;wBACxB,MAAMmC,iBAAiBxB,MAAM,KAAK;wBAGlC,MAAMS,aAAkC;4BACtC,SAAS;4BACThB;4BACA,mBAAmB;4BACnB,YAAY;4BACZ,OAAO+B,iBACH;gCACE,eAAeA,eAAe,YAAY,IAAI;gCAC9C,mBAAmBA,eAAe,aAAa,IAAI;gCACnD,cACGA,AAAAA,CAAAA,eAAe,YAAY,IAAI,KAC/BA,CAAAA,eAAe,aAAa,IAAI;gCACnC,WAAW7B,YAAY;gCACvB,YAAYtD;gCACZ,mBAAmBc;gCACnB,QAAQjB,iBAAiB,MAAM;4BACjC,IACAoE;wBACN;wBACA3B,QAAQ,OAAO,CAAE8B;wBACjB;oBACF;gBACF;gBACAjB,UAAUC;YACZ,OAAO;gBACL,MAAMoB,SAAS,MAAMjC,WAAW,MAAM,CAAC;oBACrC,OAAOvC;oBACP,QAAQ;oBACR,UAAUqC,SAAS,GAAG,CAAC,CAAC2C,IAAO;4BAC7B,MAAM;4BACN,SAASC,MAAM,OAAO,CAACD,EAAE,OAAO,IAC3BA,EAAE,OAAO,CAAS,GAAG,CAACL,uBACvBK,EAAE,OAAO;wBACf;oBACA,iBAAiBvC;oBACjB,GAAGc,YAAY;gBACjB;gBACAD,WAAWL,KAAK,GAAG,KAAKD;gBACxBG,UAAWqB,OAAe,OAAO,CAAC,EAAE,CAAC,IAAI;gBACzCnB,QAAQmB,OAAO,KAAK;YACtB;YAEAhD,OAAO2B,SAAS;QAClB;QAEA,IAAID,eAAe,CAACG,OAAO;YAEzB,MAAMa,kBAAkBC,KAAK,GAAG,CAC9B,GACAA,KAAK,KAAK,CAAEhB,AAAAA,CAAAA,WAAW,EAAC,EAAG,MAAM,GAAG;YAEtCE,QAAQ;gBACN,eAAea;gBACf,mBAAmBA;gBACnB,cAAcA,AAAkB,IAAlBA;YAChB;QACF;QAEA,OAAO;YACL,SAASf,WAAW;YACpB,OAAOE,QACH;gBACE,eAAeA,MAAM,aAAa,IAAI;gBACtC,mBAAmBA,MAAM,iBAAiB,IAAI;gBAC9C,cAAcA,MAAM,YAAY,IAAI;gBACpC,WAAWC,YAAY;gBACvB,YAAYtD;gBACZ,mBAAmBc;gBACnB,QAAQjB,iBAAiB,MAAM;YACjC,IACAoE;YACJ,YAAY,CAAC,CAACf;QAChB;IACF,EAAE,OAAOkC,GAAQ;QACfnD,QAAQ,KAAK,CAAC,kBAAkBmD;QAChC,MAAMC,WAAW,IAAIrD,MACnB,CAAC,eAAe,EAAEkB,cAAc,eAAe,GAAG,kBAAkB,EAAEkC,EAAE,OAAO,CAAC,8DAA8D,CAAC,EAC/I;YACE,OAAOA;QACT;QAEF,MAAMC;IACR;AACF;AAEO,MAAM3C,oBAAoB,CAC/B1C,WACAJ;IAIA,IAAI6C;IAKJ,IAAIzC,UAAU,QAAQ,CAAC,UACrB,OAAQJ;QACN,KAAK0F,aAAa,MAAM;YACtB7C,iBAAiB8C;YACjB;QACF,KAAKD,aAAa,eAAe;YAC/B7C,iBAAiB+C;YACjB;QACF,KAAKF,aAAa,IAAI;YACpB7C,iBAAiBgD;YACjB;QACF,KAAKH,aAAa,YAAY;QAC9B,KAAKA,aAAa,gBAAgB;YAChC7C,iBAAiB;gBAAE,MAAMiD,iBAAiB,IAAI;YAAC;YAC/C;IACJ;IAIF,IAAI1F,AAAc,wBAAdA,WACFyC,iBAAiB;QAAE,MAAMiD,iBAAiB,IAAI;IAAC;IAGjD,OAAOjD;AACT;AAEO,eAAekD,oBACpBtD,QAAsC,EACtCzC,iBAA+B,EAC/BC,gBAAmC;IAEnC,MAAM+F,WAAW,MAAMxD,KAAKC,UAAUzC,mBAAmBC;IACzD2B,OAAOoE,UAAU;IACjB,MAAMC,cAAcC,cAAcF,SAAS,OAAO,EAAE/F;IACpD,OAAO;QAAE,SAASgG;QAAa,OAAOD,SAAS,KAAK;IAAC;AACvD;AAEO,eAAeG,2BACpBC,IAAY,EACZpG,iBAA+B,EAC/BC,gBAAmC;IAEnC,MAAM,EAAEsD,OAAO,EAAEE,KAAK,EAAE,GAAG,MAAMjB,KAC/B4D,MACApG,mBACAC;IAEF,OAAO;QAAEsD;QAASE;IAAM;AAC1B;AAEO,SAAS4C,yBAAyBL,QAAgB;IACvD,IAAI;QAEF,MAAMM,YAAYN,SAAS,KAAK,CAAC;QACjC,IAAIM,WACF,OAAOA,SAAS,CAAC,EAAE;QAIrB,MAAMC,iBAAiBP,SAAS,KAAK,CACnC;QAEF,IAAIO,gBACF,OAAOA,cAAc,CAAC,EAAE;QAI1B,MAAMC,gBAAgBR,SAAS,KAAK,CAAC;QACrC,IAAIQ,eACF,OAAOA,aAAa,CAAC,EAAE;IAE3B,EAAE,OAAM,CAAC;IAET,OAAOR;AACT;AAEO,SAASS,yBAAyBC,KAAa;IACpD,IAAIA,MAAM,QAAQ,CAAC,SAEjB,MAAO,YAAY,IAAI,CAACA,OACtBA,QAAQA,MAAM,OAAO,CAAC,kBAAkB;IAG5C,OAAOA;AACT;AAEO,SAASR,cACdQ,KAAa,EACbzG,gBAAmC;IAEnC,MAAM0G,kBAAkBN,yBAAyBK;IAEjD,IAAIC,QAAAA,kBAAAA,KAAAA,IAAAA,gBAAiB,KAAK,CAAC,oBAAoB;YACtCC;QAAP,OAAO,QAAAA,CAAAA,yBAAAA,gBACJ,KAAK,CAAC,kBAAiB,IADnBA,KAAAA,IAAAA,uBAEH,KAAK,CAAC,GACP,GAAG,CAAC/C;IACT;IACA,IAAI;QACF,OAAOiB,KAAK,KAAK,CAAC6B;IACpB,EAAE,OAAM,CAAC;IACT,IAAI;QACF,OAAO7B,KAAK,KAAK,CAAC+B,WAAWF;IAC/B,EAAE,OAAOnB,GAAG,CAAC;IAEb,IACE5B,AAAmC,oBAAnCA,aAAa3D,qBACb2D,AAAmC,kBAAnCA,aAAa3D,mBACb;QACA,MAAM6G,aAAaL,yBAAyBE;QAC5C,OAAO7B,KAAK,KAAK,CAAC+B,WAAWC;IAC/B;IACA,MAAM1E,MAAM,CAAC,+BAA+B,EAAEsE,OAAO;AACvD"}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { UITarsModelVersion } from "@midscene/shared/env";
|
|
1
|
+
import { UITarsModelVersion, uiTarsModelVersion, vlLocateMode } from "@midscene/shared/env";
|
|
2
2
|
import { resizeImgBase64 } from "@midscene/shared/img";
|
|
3
3
|
import { getDebug } from "@midscene/shared/logger";
|
|
4
4
|
import { transformHotkeyInput } from "@midscene/shared/us-keyboard-layout";
|
|
@@ -6,7 +6,7 @@ import { assert } from "@midscene/shared/utils";
|
|
|
6
6
|
import { actionParser } from "@ui-tars/action-parser";
|
|
7
7
|
import { AIActionType } from "./common.mjs";
|
|
8
8
|
import { getSummary, getUiTarsPlanningPrompt } from "./prompt/ui-tars-planning.mjs";
|
|
9
|
-
import {
|
|
9
|
+
import { call } from "./service-caller/index.mjs";
|
|
10
10
|
const debug = getDebug('ui-tars-planning');
|
|
11
11
|
const bboxSize = 10;
|
|
12
12
|
const pointToBbox = (point, width, height)=>[
|
|
@@ -16,17 +16,17 @@ const pointToBbox = (point, width, height)=>[
|
|
|
16
16
|
Math.round(Math.min(point.y + bboxSize / 2, height))
|
|
17
17
|
];
|
|
18
18
|
async function vlmPlanning(options) {
|
|
19
|
-
const { conversationHistory, userInstruction, size,
|
|
20
|
-
const { uiTarsModelVersion } = modelConfig;
|
|
19
|
+
const { conversationHistory, userInstruction, size, modelPreferences } = options;
|
|
21
20
|
const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;
|
|
22
|
-
const res = await
|
|
21
|
+
const res = await call([
|
|
23
22
|
{
|
|
24
23
|
role: 'user',
|
|
25
24
|
content: systemPrompt
|
|
26
25
|
},
|
|
27
26
|
...conversationHistory
|
|
28
|
-
], AIActionType.INSPECT_ELEMENT,
|
|
27
|
+
], AIActionType.INSPECT_ELEMENT, modelPreferences);
|
|
29
28
|
const convertedText = convertBboxToCoordinates(res.content);
|
|
29
|
+
const modelVer = uiTarsModelVersion(modelPreferences);
|
|
30
30
|
const { parsed } = actionParser({
|
|
31
31
|
prediction: convertedText,
|
|
32
32
|
factor: [
|
|
@@ -37,9 +37,9 @@ async function vlmPlanning(options) {
|
|
|
37
37
|
width: size.width,
|
|
38
38
|
height: size.height
|
|
39
39
|
},
|
|
40
|
-
modelVer:
|
|
40
|
+
modelVer: modelVer || void 0
|
|
41
41
|
});
|
|
42
|
-
debug('ui-tars modelVer',
|
|
42
|
+
debug('ui-tars modelVer', modelVer, ', parsed', JSON.stringify(parsed));
|
|
43
43
|
const transformActions = [];
|
|
44
44
|
parsed.forEach((action)=>{
|
|
45
45
|
if ('click' === action.action_type) {
|
|
@@ -155,8 +155,8 @@ function getPoint(startBox, size) {
|
|
|
155
155
|
y * size.height
|
|
156
156
|
];
|
|
157
157
|
}
|
|
158
|
-
async function resizeImageForUiTars(imageBase64, size,
|
|
159
|
-
if (
|
|
158
|
+
async function resizeImageForUiTars(imageBase64, size, modelPreferences) {
|
|
159
|
+
if ('vlm-ui-tars' === vlLocateMode(modelPreferences) && uiTarsModelVersion(modelPreferences) === UITarsModelVersion.V1_5) {
|
|
160
160
|
debug('ui-tars-v1.5, will check image size', size);
|
|
161
161
|
const currentPixels = size.width * size.height;
|
|
162
162
|
const maxPixels = 12845056;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/ui-tars-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/ui-tars-planning.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n MidsceneYamlFlowItem,\n PlanningAction,\n Size,\n} from '@/types';\nimport { type IModelConfig, UITarsModelVersion } from '@midscene/shared/env';\nimport { resizeImgBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { transformHotkeyInput } from '@midscene/shared/us-keyboard-layout';\nimport { assert } from '@midscene/shared/utils';\nimport { actionParser } from '@ui-tars/action-parser';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { AIActionType } from './common';\nimport { getSummary, getUiTarsPlanningPrompt } from './prompt/ui-tars-planning';\nimport { callAI } from './service-caller/index';\ntype ActionType =\n | 'click'\n | 'drag'\n | 'type'\n | 'hotkey'\n | 'finished'\n | 'scroll'\n | 'wait';\n\nconst debug = getDebug('ui-tars-planning');\nconst bboxSize = 10;\nconst pointToBbox = (\n point: { x: number; y: number },\n width: number,\n height: number,\n): [number, number, number, number] => {\n return [\n Math.round(Math.max(point.x - bboxSize / 2, 0)),\n Math.round(Math.max(point.y - bboxSize / 2, 0)),\n Math.round(Math.min(point.x + bboxSize / 2, width)),\n Math.round(Math.min(point.y + bboxSize / 2, height)),\n ];\n};\n\nexport async function vlmPlanning(options: {\n userInstruction: string;\n conversationHistory: ChatCompletionMessageParam[];\n size: { width: number; height: number };\n modelConfig: IModelConfig;\n}): Promise<{\n actions: PlanningAction<any>[];\n actionsFromModel: ReturnType<typeof actionParser>['parsed'];\n action_summary: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n usage?: AIUsageInfo;\n rawResponse?: string;\n}> {\n const { conversationHistory, userInstruction, size, modelConfig } = options;\n const { uiTarsModelVersion } = modelConfig;\n const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;\n\n const res = await callAI(\n [\n {\n role: 'user',\n content: systemPrompt,\n },\n ...conversationHistory,\n ],\n AIActionType.INSPECT_ELEMENT,\n modelConfig,\n );\n const convertedText = convertBboxToCoordinates(res.content);\n\n const { parsed } = actionParser({\n prediction: convertedText,\n factor: [1000, 1000],\n screenContext: {\n width: size.width,\n height: size.height,\n },\n modelVer: uiTarsModelVersion,\n });\n\n debug(\n 'ui-tars modelVer',\n uiTarsModelVersion,\n ', parsed',\n JSON.stringify(parsed),\n );\n\n const transformActions: PlanningAction[] = [];\n parsed.forEach((action) => {\n if (action.action_type === 'click') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'Tap',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n });\n } else if (action.action_type === 'drag') {\n assert(action.action_inputs.start_box, 'start_box is required');\n assert(action.action_inputs.end_box, 'end_box is required');\n const startPoint = getPoint(action.action_inputs.start_box, size);\n const endPoint = getPoint(action.action_inputs.end_box, size);\n transformActions.push({\n type: 'DragAndDrop',\n param: {\n from: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: startPoint[0], y: startPoint[1] },\n size.width,\n size.height,\n ),\n },\n to: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: endPoint[0], y: endPoint[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'type') {\n transformActions.push({\n type: 'Input',\n param: {\n value: action.action_inputs.content,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'scroll') {\n transformActions.push({\n type: 'Scroll',\n param: {\n direction: action.action_inputs.direction,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'finished') {\n transformActions.push({\n type: 'Finished',\n param: {},\n thought: action.thought || '',\n });\n } else if (action.action_type === 'hotkey') {\n if (!action.action_inputs.key) {\n console.warn(\n 'No key found in action: hotkey. Will not perform action.',\n );\n } else {\n const keys = transformHotkeyInput(action.action_inputs.key);\n\n transformActions.push({\n type: 'KeyboardPress',\n param: {\n keyName: keys,\n },\n thought: action.thought || '',\n });\n }\n } else if (action.action_type === 'wait') {\n transformActions.push({\n type: 'Sleep',\n param: {\n timeMs: 1000,\n },\n thought: action.thought || '',\n });\n }\n });\n\n if (transformActions.length === 0) {\n throw new Error(`No actions found, response: ${res.content}`, {\n cause: {\n prediction: res.content,\n parsed,\n },\n });\n }\n\n debug('transformActions', JSON.stringify(transformActions, null, 2));\n\n return {\n actions: transformActions,\n actionsFromModel: parsed,\n action_summary: getSummary(res.content),\n usage: res.usage,\n rawResponse: JSON.stringify(res.content, undefined, 2),\n };\n}\n\n/**\n * Converts bounding box notation to coordinate points\n * @param text - The text containing bbox tags to be converted\n * @returns The text with bbox tags replaced by coordinate points\n */\nfunction convertBboxToCoordinates(text: string): string {\n // Match the four numbers after <bbox>\n const pattern = /<bbox>(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)<\\/bbox>/g;\n\n function replaceMatch(\n match: string,\n x1: string,\n y1: string,\n x2: string,\n y2: string,\n ): string {\n // Convert strings to numbers and calculate center point\n const x1Num = Number.parseInt(x1, 10);\n const y1Num = Number.parseInt(y1, 10);\n const x2Num = Number.parseInt(x2, 10);\n const y2Num = Number.parseInt(y2, 10);\n\n // Use Math.floor to truncate and calculate center point\n const x = Math.floor((x1Num + x2Num) / 2);\n const y = Math.floor((y1Num + y2Num) / 2);\n\n // Return formatted coordinate string\n return `(${x},${y})`;\n }\n\n // Remove [EOS] and replace <bbox> coordinates\n const cleanedText = text.replace(/\\[EOS\\]/g, '');\n return cleanedText.replace(pattern, replaceMatch).trim();\n}\n\nfunction getPoint(startBox: string, size: { width: number; height: number }) {\n const [x, y] = JSON.parse(startBox);\n return [x * size.width, y * size.height];\n}\n\ninterface BaseAction {\n action_type: ActionType;\n action_inputs: Record<string, any>;\n reflection: string | null;\n thought: string | null;\n}\n\ninterface ClickAction extends BaseAction {\n action_type: 'click';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface DragAction extends BaseAction {\n action_type: 'drag';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n end_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface WaitAction extends BaseAction {\n action_type: 'wait';\n action_inputs: {\n time: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface TypeAction extends BaseAction {\n action_type: 'type';\n action_inputs: {\n content: string;\n };\n}\n\ninterface HotkeyAction extends BaseAction {\n action_type: 'hotkey';\n action_inputs: {\n key: string;\n };\n}\n\ninterface ScrollAction extends BaseAction {\n action_type: 'scroll';\n action_inputs: {\n direction: 'up' | 'down';\n };\n}\n\ninterface FinishedAction extends BaseAction {\n action_type: 'finished';\n action_inputs: Record<string, never>;\n}\n\nexport type Action =\n | ClickAction\n | DragAction\n | TypeAction\n | HotkeyAction\n | ScrollAction\n | FinishedAction\n | WaitAction;\n\nexport async function resizeImageForUiTars(\n imageBase64: string,\n size: Size,\n uiTarsVersion: UITarsModelVersion | undefined,\n) {\n if (uiTarsVersion === UITarsModelVersion.V1_5) {\n debug('ui-tars-v1.5, will check image size', size);\n const currentPixels = size.width * size.height;\n const maxPixels = 16384 * 28 * 28; //\n if (currentPixels > maxPixels) {\n const resizeFactor = Math.sqrt(maxPixels / currentPixels);\n const newWidth = Math.floor(size.width * resizeFactor);\n const newHeight = Math.floor(size.height * resizeFactor);\n debug(\n 'resize image for ui-tars, new width: %s, new height: %s',\n newWidth,\n newHeight,\n );\n const resizedImage = await resizeImgBase64(imageBase64, {\n width: newWidth,\n height: newHeight,\n });\n return resizedImage;\n }\n }\n return imageBase64;\n}\n"],"names":["debug","getDebug","bboxSize","pointToBbox","point","width","height","Math","vlmPlanning","options","conversationHistory","userInstruction","size","modelConfig","uiTarsModelVersion","systemPrompt","getUiTarsPlanningPrompt","res","callAI","AIActionType","convertedText","convertBboxToCoordinates","parsed","actionParser","JSON","transformActions","action","assert","getPoint","startPoint","endPoint","keys","transformHotkeyInput","console","Error","getSummary","undefined","text","pattern","replaceMatch","match","x1","y1","x2","y2","x1Num","Number","y1Num","x2Num","y2Num","x","y","cleanedText","startBox","resizeImageForUiTars","imageBase64","uiTarsVersion","UITarsModelVersion","currentPixels","maxPixels","resizeFactor","newWidth","newHeight","resizedImage","resizeImgBase64"],"mappings":";;;;;;;;;AAyBA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,WAAW;AACjB,MAAMC,cAAc,CAClBC,OACAC,OACAC,SAEO;QACLC,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGG;QAC5CE,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGI;KAC7C;AAGI,eAAeE,YAAYC,OAKjC;IAQC,MAAM,EAAEC,mBAAmB,EAAEC,eAAe,EAAEC,IAAI,EAAEC,WAAW,EAAE,GAAGJ;IACpE,MAAM,EAAEK,kBAAkB,EAAE,GAAGD;IAC/B,MAAME,eAAeC,4BAA4BL;IAEjD,MAAMM,MAAM,MAAMC,OAChB;QACE;YACE,MAAM;YACN,SAASH;QACX;WACGL;KACJ,EACDS,aAAa,eAAe,EAC5BN;IAEF,MAAMO,gBAAgBC,yBAAyBJ,IAAI,OAAO;IAE1D,MAAM,EAAEK,MAAM,EAAE,GAAGC,aAAa;QAC9B,YAAYH;QACZ,QAAQ;YAAC;YAAM;SAAK;QACpB,eAAe;YACb,OAAOR,KAAK,KAAK;YACjB,QAAQA,KAAK,MAAM;QACrB;QACA,UAAUE;IACZ;IAEAd,MACE,oBACAc,oBACA,YACAU,KAAK,SAAS,CAACF;IAGjB,MAAMG,mBAAqC,EAAE;IAC7CH,OAAO,OAAO,CAAC,CAACI;QACd,IAAIA,AAAuB,YAAvBA,OAAO,WAAW,EAAc;YAClCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAMtB,QAAQwB,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEd;YACvDa,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQC,OAAO,OAAO,IAAI;wBAC1B,MAAMvB,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BQ,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;YACF;QACF,OAAO,IAAIc,AAAuB,WAAvBA,OAAO,WAAW,EAAa;YACxCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvCC,OAAOD,OAAO,aAAa,CAAC,OAAO,EAAE;YACrC,MAAMG,aAAaD,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEd;YAC5D,MAAMkB,WAAWF,SAASF,OAAO,aAAa,CAAC,OAAO,EAAEd;YACxDa,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,MAAM;wBACJ,QAAQC,OAAO,OAAO,IAAI;wBAC1B,MAAMvB,YACJ;4BAAE,GAAG0B,UAAU,CAAC,EAAE;4BAAE,GAAGA,UAAU,CAAC,EAAE;wBAAC,GACrCjB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;oBACA,IAAI;wBACF,QAAQc,OAAO,OAAO,IAAI;wBAC1B,MAAMvB,YACJ;4BAAE,GAAG2B,QAAQ,CAAC,EAAE;4BAAE,GAAGA,QAAQ,CAAC,EAAE;wBAAC,GACjClB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASc,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,OAAOC,OAAO,aAAa,CAAC,OAAO;YACrC;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,WAAWC,OAAO,aAAa,CAAC,SAAS;YAC3C;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,eAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3B,IAAKA,OAAO,aAAa,CAAC,GAAG,EAItB;YACL,MAAMK,OAAOC,qBAAqBN,OAAO,aAAa,CAAC,GAAG;YAE1DD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,SAASM;gBACX;gBACA,SAASL,OAAO,OAAO,IAAI;YAC7B;QACF,OAbEO,QAAQ,IAAI,CACV;aAaC,IAAIP,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,QAAQ;YACV;YACA,SAASC,OAAO,OAAO,IAAI;QAC7B;IAEJ;IAEA,IAAID,AAA4B,MAA5BA,iBAAiB,MAAM,EACzB,MAAM,IAAIS,MAAM,CAAC,4BAA4B,EAAEjB,IAAI,OAAO,EAAE,EAAE;QAC5D,OAAO;YACL,YAAYA,IAAI,OAAO;YACvBK;QACF;IACF;IAGFtB,MAAM,oBAAoBwB,KAAK,SAAS,CAACC,kBAAkB,MAAM;IAEjE,OAAO;QACL,SAASA;QACT,kBAAkBH;QAClB,gBAAgBa,WAAWlB,IAAI,OAAO;QACtC,OAAOA,IAAI,KAAK;QAChB,aAAaO,KAAK,SAAS,CAACP,IAAI,OAAO,EAAEmB,QAAW;IACtD;AACF;AAOA,SAASf,yBAAyBgB,IAAY;IAE5C,MAAMC,UAAU;IAEhB,SAASC,aACPC,KAAa,EACbC,EAAU,EACVC,EAAU,EACVC,EAAU,EACVC,EAAU;QAGV,MAAMC,QAAQC,OAAO,QAAQ,CAACL,IAAI;QAClC,MAAMM,QAAQD,OAAO,QAAQ,CAACJ,IAAI;QAClC,MAAMM,QAAQF,OAAO,QAAQ,CAACH,IAAI;QAClC,MAAMM,QAAQH,OAAO,QAAQ,CAACF,IAAI;QAGlC,MAAMM,IAAI3C,KAAK,KAAK,CAAEsC,AAAAA,CAAAA,QAAQG,KAAI,IAAK;QACvC,MAAMG,IAAI5C,KAAK,KAAK,CAAEwC,AAAAA,CAAAA,QAAQE,KAAI,IAAK;QAGvC,OAAO,CAAC,CAAC,EAAEC,EAAE,CAAC,EAAEC,EAAE,CAAC,CAAC;IACtB;IAGA,MAAMC,cAAcf,KAAK,OAAO,CAAC,YAAY;IAC7C,OAAOe,YAAY,OAAO,CAACd,SAASC,cAAc,IAAI;AACxD;AAEA,SAASX,SAASyB,QAAgB,EAAEzC,IAAuC;IACzE,MAAM,CAACsC,GAAGC,EAAE,GAAG3B,KAAK,KAAK,CAAC6B;IAC1B,OAAO;QAACH,IAAItC,KAAK,KAAK;QAAEuC,IAAIvC,KAAK,MAAM;KAAC;AAC1C;AAkEO,eAAe0C,qBACpBC,WAAmB,EACnB3C,IAAU,EACV4C,aAA6C;IAE7C,IAAIA,kBAAkBC,mBAAmB,IAAI,EAAE;QAC7CzD,MAAM,uCAAuCY;QAC7C,MAAM8C,gBAAgB9C,KAAK,KAAK,GAAGA,KAAK,MAAM;QAC9C,MAAM+C,YAAY;QAClB,IAAID,gBAAgBC,WAAW;YAC7B,MAAMC,eAAerD,KAAK,IAAI,CAACoD,YAAYD;YAC3C,MAAMG,WAAWtD,KAAK,KAAK,CAACK,KAAK,KAAK,GAAGgD;YACzC,MAAME,YAAYvD,KAAK,KAAK,CAACK,KAAK,MAAM,GAAGgD;YAC3C5D,MACE,2DACA6D,UACAC;YAEF,MAAMC,eAAe,MAAMC,gBAAgBT,aAAa;gBACtD,OAAOM;gBACP,QAAQC;YACV;YACA,OAAOC;QACT;IACF;IACA,OAAOR;AACT"}
|
|
1
|
+
{"version":3,"file":"ai-model/ui-tars-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/ui-tars-planning.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n MidsceneYamlFlowItem,\n PlanningAction,\n Size,\n} from '@/types';\nimport {\n type IModelPreferences,\n UITarsModelVersion,\n uiTarsModelVersion,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { resizeImgBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { transformHotkeyInput } from '@midscene/shared/us-keyboard-layout';\nimport { assert } from '@midscene/shared/utils';\nimport { actionParser } from '@ui-tars/action-parser';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { AIActionType } from './common';\nimport { getSummary, getUiTarsPlanningPrompt } from './prompt/ui-tars-planning';\nimport { call } from './service-caller/index';\ntype ActionType =\n | 'click'\n | 'drag'\n | 'type'\n | 'hotkey'\n | 'finished'\n | 'scroll'\n | 'wait';\n\nconst debug = getDebug('ui-tars-planning');\nconst bboxSize = 10;\nconst pointToBbox = (\n point: { x: number; y: number },\n width: number,\n height: number,\n): [number, number, number, number] => {\n return [\n Math.round(Math.max(point.x - bboxSize / 2, 0)),\n Math.round(Math.max(point.y - bboxSize / 2, 0)),\n Math.round(Math.min(point.x + bboxSize / 2, width)),\n Math.round(Math.min(point.y + bboxSize / 2, height)),\n ];\n};\n\nexport async function vlmPlanning(options: {\n userInstruction: string;\n conversationHistory: ChatCompletionMessageParam[];\n size: { width: number; height: number };\n modelPreferences: IModelPreferences;\n}): Promise<{\n actions: PlanningAction<any>[];\n actionsFromModel: ReturnType<typeof actionParser>['parsed'];\n action_summary: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n usage?: AIUsageInfo;\n rawResponse?: string;\n}> {\n const { conversationHistory, userInstruction, size, modelPreferences } =\n options;\n const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;\n\n const res = await call(\n [\n {\n role: 'user',\n content: systemPrompt,\n },\n ...conversationHistory,\n ],\n AIActionType.INSPECT_ELEMENT,\n modelPreferences,\n );\n const convertedText = convertBboxToCoordinates(res.content);\n\n const modelVer = uiTarsModelVersion(modelPreferences);\n\n const { parsed } = actionParser({\n prediction: convertedText,\n factor: [1000, 1000],\n screenContext: {\n width: size.width,\n height: size.height,\n },\n modelVer: modelVer || undefined,\n });\n\n debug('ui-tars modelVer', modelVer, ', parsed', JSON.stringify(parsed));\n\n const transformActions: PlanningAction[] = [];\n parsed.forEach((action) => {\n if (action.action_type === 'click') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'Tap',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n });\n } else if (action.action_type === 'drag') {\n assert(action.action_inputs.start_box, 'start_box is required');\n assert(action.action_inputs.end_box, 'end_box is required');\n const startPoint = getPoint(action.action_inputs.start_box, size);\n const endPoint = getPoint(action.action_inputs.end_box, size);\n transformActions.push({\n type: 'DragAndDrop',\n param: {\n from: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: startPoint[0], y: startPoint[1] },\n size.width,\n size.height,\n ),\n },\n to: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: endPoint[0], y: endPoint[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'type') {\n transformActions.push({\n type: 'Input',\n param: {\n value: action.action_inputs.content,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'scroll') {\n transformActions.push({\n type: 'Scroll',\n param: {\n direction: action.action_inputs.direction,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'finished') {\n transformActions.push({\n type: 'Finished',\n param: {},\n thought: action.thought || '',\n });\n } else if (action.action_type === 'hotkey') {\n if (!action.action_inputs.key) {\n console.warn(\n 'No key found in action: hotkey. Will not perform action.',\n );\n } else {\n const keys = transformHotkeyInput(action.action_inputs.key);\n\n transformActions.push({\n type: 'KeyboardPress',\n param: {\n keyName: keys,\n },\n thought: action.thought || '',\n });\n }\n } else if (action.action_type === 'wait') {\n transformActions.push({\n type: 'Sleep',\n param: {\n timeMs: 1000,\n },\n thought: action.thought || '',\n });\n }\n });\n\n if (transformActions.length === 0) {\n throw new Error(`No actions found, response: ${res.content}`, {\n cause: {\n prediction: res.content,\n parsed,\n },\n });\n }\n\n debug('transformActions', JSON.stringify(transformActions, null, 2));\n\n return {\n actions: transformActions,\n actionsFromModel: parsed,\n action_summary: getSummary(res.content),\n usage: res.usage,\n rawResponse: JSON.stringify(res.content, undefined, 2),\n };\n}\n\n/**\n * Converts bounding box notation to coordinate points\n * @param text - The text containing bbox tags to be converted\n * @returns The text with bbox tags replaced by coordinate points\n */\nfunction convertBboxToCoordinates(text: string): string {\n // Match the four numbers after <bbox>\n const pattern = /<bbox>(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)<\\/bbox>/g;\n\n function replaceMatch(\n match: string,\n x1: string,\n y1: string,\n x2: string,\n y2: string,\n ): string {\n // Convert strings to numbers and calculate center point\n const x1Num = Number.parseInt(x1, 10);\n const y1Num = Number.parseInt(y1, 10);\n const x2Num = Number.parseInt(x2, 10);\n const y2Num = Number.parseInt(y2, 10);\n\n // Use Math.floor to truncate and calculate center point\n const x = Math.floor((x1Num + x2Num) / 2);\n const y = Math.floor((y1Num + y2Num) / 2);\n\n // Return formatted coordinate string\n return `(${x},${y})`;\n }\n\n // Remove [EOS] and replace <bbox> coordinates\n const cleanedText = text.replace(/\\[EOS\\]/g, '');\n return cleanedText.replace(pattern, replaceMatch).trim();\n}\n\nfunction getPoint(startBox: string, size: { width: number; height: number }) {\n const [x, y] = JSON.parse(startBox);\n return [x * size.width, y * size.height];\n}\n\ninterface BaseAction {\n action_type: ActionType;\n action_inputs: Record<string, any>;\n reflection: string | null;\n thought: string | null;\n}\n\ninterface ClickAction extends BaseAction {\n action_type: 'click';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface DragAction extends BaseAction {\n action_type: 'drag';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n end_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface WaitAction extends BaseAction {\n action_type: 'wait';\n action_inputs: {\n time: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface TypeAction extends BaseAction {\n action_type: 'type';\n action_inputs: {\n content: string;\n };\n}\n\ninterface HotkeyAction extends BaseAction {\n action_type: 'hotkey';\n action_inputs: {\n key: string;\n };\n}\n\ninterface ScrollAction extends BaseAction {\n action_type: 'scroll';\n action_inputs: {\n direction: 'up' | 'down';\n };\n}\n\ninterface FinishedAction extends BaseAction {\n action_type: 'finished';\n action_inputs: Record<string, never>;\n}\n\nexport type Action =\n | ClickAction\n | DragAction\n | TypeAction\n | HotkeyAction\n | ScrollAction\n | FinishedAction\n | WaitAction;\n\nexport async function resizeImageForUiTars(\n imageBase64: string,\n size: Size,\n modelPreferences: IModelPreferences,\n) {\n if (\n vlLocateMode(modelPreferences) === 'vlm-ui-tars' &&\n uiTarsModelVersion(modelPreferences) === UITarsModelVersion.V1_5\n ) {\n debug('ui-tars-v1.5, will check image size', size);\n const currentPixels = size.width * size.height;\n const maxPixels = 16384 * 28 * 28; //\n if (currentPixels > maxPixels) {\n const resizeFactor = Math.sqrt(maxPixels / currentPixels);\n const newWidth = Math.floor(size.width * resizeFactor);\n const newHeight = Math.floor(size.height * resizeFactor);\n debug(\n 'resize image for ui-tars, new width: %s, new height: %s',\n newWidth,\n newHeight,\n );\n const resizedImage = await resizeImgBase64(imageBase64, {\n width: newWidth,\n height: newHeight,\n });\n return resizedImage;\n }\n }\n return imageBase64;\n}\n"],"names":["debug","getDebug","bboxSize","pointToBbox","point","width","height","Math","vlmPlanning","options","conversationHistory","userInstruction","size","modelPreferences","systemPrompt","getUiTarsPlanningPrompt","res","call","AIActionType","convertedText","convertBboxToCoordinates","modelVer","uiTarsModelVersion","parsed","actionParser","undefined","JSON","transformActions","action","assert","getPoint","startPoint","endPoint","keys","transformHotkeyInput","console","Error","getSummary","text","pattern","replaceMatch","match","x1","y1","x2","y2","x1Num","Number","y1Num","x2Num","y2Num","x","y","cleanedText","startBox","resizeImageForUiTars","imageBase64","vlLocateMode","UITarsModelVersion","currentPixels","maxPixels","resizeFactor","newWidth","newHeight","resizedImage","resizeImgBase64"],"mappings":";;;;;;;;;AA8BA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,WAAW;AACjB,MAAMC,cAAc,CAClBC,OACAC,OACAC,SAEO;QACLC,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGG;QAC5CE,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGI;KAC7C;AAGI,eAAeE,YAAYC,OAKjC;IAQC,MAAM,EAAEC,mBAAmB,EAAEC,eAAe,EAAEC,IAAI,EAAEC,gBAAgB,EAAE,GACpEJ;IACF,MAAMK,eAAeC,4BAA4BJ;IAEjD,MAAMK,MAAM,MAAMC,KAChB;QACE;YACE,MAAM;YACN,SAASH;QACX;WACGJ;KACJ,EACDQ,aAAa,eAAe,EAC5BL;IAEF,MAAMM,gBAAgBC,yBAAyBJ,IAAI,OAAO;IAE1D,MAAMK,WAAWC,mBAAmBT;IAEpC,MAAM,EAAEU,MAAM,EAAE,GAAGC,aAAa;QAC9B,YAAYL;QACZ,QAAQ;YAAC;YAAM;SAAK;QACpB,eAAe;YACb,OAAOP,KAAK,KAAK;YACjB,QAAQA,KAAK,MAAM;QACrB;QACA,UAAUS,YAAYI;IACxB;IAEAzB,MAAM,oBAAoBqB,UAAU,YAAYK,KAAK,SAAS,CAACH;IAE/D,MAAMI,mBAAqC,EAAE;IAC7CJ,OAAO,OAAO,CAAC,CAACK;QACd,IAAIA,AAAuB,YAAvBA,OAAO,WAAW,EAAc;YAClCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAMxB,QAAQ0B,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEhB;YACvDe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQC,OAAO,OAAO,IAAI;wBAC1B,MAAMzB,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BQ,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;YACF;QACF,OAAO,IAAIgB,AAAuB,WAAvBA,OAAO,WAAW,EAAa;YACxCC,OAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvCC,OAAOD,OAAO,aAAa,CAAC,OAAO,EAAE;YACrC,MAAMG,aAAaD,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEhB;YAC5D,MAAMoB,WAAWF,SAASF,OAAO,aAAa,CAAC,OAAO,EAAEhB;YACxDe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,MAAM;wBACJ,QAAQC,OAAO,OAAO,IAAI;wBAC1B,MAAMzB,YACJ;4BAAE,GAAG4B,UAAU,CAAC,EAAE;4BAAE,GAAGA,UAAU,CAAC,EAAE;wBAAC,GACrCnB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;oBACA,IAAI;wBACF,QAAQgB,OAAO,OAAO,IAAI;wBAC1B,MAAMzB,YACJ;4BAAE,GAAG6B,QAAQ,CAAC,EAAE;4BAAE,GAAGA,QAAQ,CAAC,EAAE;wBAAC,GACjCpB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASgB,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,OAAOC,OAAO,aAAa,CAAC,OAAO;YACrC;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,WAAWC,OAAO,aAAa,CAAC,SAAS;YAC3C;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,eAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3B,IAAKA,OAAO,aAAa,CAAC,GAAG,EAItB;YACL,MAAMK,OAAOC,qBAAqBN,OAAO,aAAa,CAAC,GAAG;YAE1DD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,SAASM;gBACX;gBACA,SAASL,OAAO,OAAO,IAAI;YAC7B;QACF,OAbEO,QAAQ,IAAI,CACV;aAaC,IAAIP,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,QAAQ;YACV;YACA,SAASC,OAAO,OAAO,IAAI;QAC7B;IAEJ;IAEA,IAAID,AAA4B,MAA5BA,iBAAiB,MAAM,EACzB,MAAM,IAAIS,MAAM,CAAC,4BAA4B,EAAEpB,IAAI,OAAO,EAAE,EAAE;QAC5D,OAAO;YACL,YAAYA,IAAI,OAAO;YACvBO;QACF;IACF;IAGFvB,MAAM,oBAAoB0B,KAAK,SAAS,CAACC,kBAAkB,MAAM;IAEjE,OAAO;QACL,SAASA;QACT,kBAAkBJ;QAClB,gBAAgBc,WAAWrB,IAAI,OAAO;QACtC,OAAOA,IAAI,KAAK;QAChB,aAAaU,KAAK,SAAS,CAACV,IAAI,OAAO,EAAES,QAAW;IACtD;AACF;AAOA,SAASL,yBAAyBkB,IAAY;IAE5C,MAAMC,UAAU;IAEhB,SAASC,aACPC,KAAa,EACbC,EAAU,EACVC,EAAU,EACVC,EAAU,EACVC,EAAU;QAGV,MAAMC,QAAQC,OAAO,QAAQ,CAACL,IAAI;QAClC,MAAMM,QAAQD,OAAO,QAAQ,CAACJ,IAAI;QAClC,MAAMM,QAAQF,OAAO,QAAQ,CAACH,IAAI;QAClC,MAAMM,QAAQH,OAAO,QAAQ,CAACF,IAAI;QAGlC,MAAMM,IAAI5C,KAAK,KAAK,CAAEuC,AAAAA,CAAAA,QAAQG,KAAI,IAAK;QACvC,MAAMG,IAAI7C,KAAK,KAAK,CAAEyC,AAAAA,CAAAA,QAAQE,KAAI,IAAK;QAGvC,OAAO,CAAC,CAAC,EAAEC,EAAE,CAAC,EAAEC,EAAE,CAAC,CAAC;IACtB;IAGA,MAAMC,cAAcf,KAAK,OAAO,CAAC,YAAY;IAC7C,OAAOe,YAAY,OAAO,CAACd,SAASC,cAAc,IAAI;AACxD;AAEA,SAASV,SAASwB,QAAgB,EAAE1C,IAAuC;IACzE,MAAM,CAACuC,GAAGC,EAAE,GAAG1B,KAAK,KAAK,CAAC4B;IAC1B,OAAO;QAACH,IAAIvC,KAAK,KAAK;QAAEwC,IAAIxC,KAAK,MAAM;KAAC;AAC1C;AAkEO,eAAe2C,qBACpBC,WAAmB,EACnB5C,IAAU,EACVC,gBAAmC;IAEnC,IACE4C,AAAmC,kBAAnCA,aAAa5C,qBACbS,mBAAmBT,sBAAsB6C,mBAAmB,IAAI,EAChE;QACA1D,MAAM,uCAAuCY;QAC7C,MAAM+C,gBAAgB/C,KAAK,KAAK,GAAGA,KAAK,MAAM;QAC9C,MAAMgD,YAAY;QAClB,IAAID,gBAAgBC,WAAW;YAC7B,MAAMC,eAAetD,KAAK,IAAI,CAACqD,YAAYD;YAC3C,MAAMG,WAAWvD,KAAK,KAAK,CAACK,KAAK,KAAK,GAAGiD;YACzC,MAAME,YAAYxD,KAAK,KAAK,CAACK,KAAK,MAAM,GAAGiD;YAC3C7D,MACE,2DACA8D,UACAC;YAEF,MAAMC,eAAe,MAAMC,gBAAgBT,aAAa;gBACtD,OAAOM;gBACP,QAAQC;YACV;YACA,OAAOC;QACT;IACF;IACA,OAAOR;AACT"}
|