@jarvis-agent/core 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.esm.js CHANGED
@@ -39669,6 +39669,24 @@ Output:
39669
39669
  "changed": true,
39670
39670
  "changeInfo": "New message received in the group chat. The message content is: 'Hello, how are you?'"
39671
39671
  }`;
39672
// System prompt for the text-only watch check: it is sent as the "system" message by
// is_condition_met. It tells the model to answer with a single JSON object whose
// "changed" flag is true when the described condition is met on the page, optionally
// accompanied by a human-readable "changeInfo" string, and false otherwise.
+ const watch_text_system_prompt = `You are a page content analyzer. Given a page content and a condition description, determine if the condition is currently met on the page.
39673
+ Return ONLY a JSON object, no other text.
39674
+ - "changed": true means the condition IS met (e.g. the target button exists, the status has changed to the expected value)
39675
+ - "changed": false means the condition is NOT yet met
39676
+
39677
+ ## Example
39678
+ Condition: Monitor for a "Retry" button appearing on the page
39679
+ ### Condition not met
39680
+ Output:
39681
+ {
39682
+ "changed": false
39683
+ }
39684
+ ### Condition met
39685
+ Output:
39686
+ {
39687
+ "changed": true,
39688
+ "changeInfo": "The 'Retry' button is present on the page at index 127-128"
39689
+ }`;
39672
39690
  class WatchTriggerTool {
39673
39691
  constructor() {
39674
39692
  this.name = TOOL_NAME$a;
@@ -39733,12 +39751,34 @@ class WatchTriggerTool {
39733
39751
  ],
39734
39752
  };
39735
39753
  }
39754
+ const rlm = new RetryLanguageModel(agentContext.context.config.llms, agentContext.agent.Llms, agentContext.context.config.globalConfig?.streamFirstTimeout, agentContext.context.config.globalConfig?.streamTokenTimeout, agentContext);
39755
+ const useVision = this.isVisionModel(rlm);
39756
+ // Initial condition check (text-based, works with all models)
39757
+ const pageContent = await this.get_page_content(agentContext);
39758
+ const initialCheck = await this.is_condition_met(rlm, pageContent, task_description, agentContext);
39759
+ if (initialCheck.changed) {
39760
+ return {
39761
+ content: [
39762
+ {
39763
+ type: "text",
39764
+ text: initialCheck.changeInfo || "Condition already met on page.",
39765
+ },
39766
+ ],
39767
+ };
39768
+ }
39769
+ // Enter monitoring loop
39736
39770
  await this.init_eko_observer(agentContext);
39737
- const image1 = await this.get_screenshot(agentContext);
39738
39771
  const start = new Date().getTime();
39739
39772
  const timeout = (args.timeout || 5) * 60000;
39740
39773
  const frequency = Math.max(500, (args.frequency || 1) * 1000);
39741
- const rlm = new RetryLanguageModel(agentContext.context.config.llms, agentContext.agent.Llms, agentContext.context.config.globalConfig?.streamFirstTimeout, agentContext.context.config.globalConfig?.streamTokenTimeout, agentContext);
39774
+ let image1;
39775
+ let content1;
39776
+ if (useVision) {
39777
+ image1 = await this.get_screenshot(agentContext);
39778
+ }
39779
+ else {
39780
+ content1 = pageContent;
39781
+ }
39742
39782
  while (new Date().getTime() - start < timeout) {
39743
39783
  await agentContext.context.checkAborted();
39744
39784
  await new Promise((resolve) => setTimeout(resolve, frequency));
@@ -39747,17 +39787,32 @@ class WatchTriggerTool {
39747
39787
  continue;
39748
39788
  }
39749
39789
  await this.init_eko_observer(agentContext);
39750
- const image2 = await this.get_screenshot(agentContext);
39751
- const changeResult = await this.is_dom_change(agentContext, rlm, image1, image2, task_description);
39752
- if (changeResult.changed) {
39753
- return {
39754
- content: [
39755
- {
39756
- type: "text",
39757
- text: changeResult.changeInfo || "DOM change detected.",
39758
- },
39759
- ],
39760
- };
39790
+ if (useVision) {
39791
+ // Vision model: compare screenshots
39792
+ const image2 = await this.get_screenshot(agentContext);
39793
+ const changeResult = await this.is_dom_change(agentContext, rlm, image1, image2, task_description);
39794
+ if (changeResult.changed) {
39795
+ return {
39796
+ content: [
39797
+ { type: "text", text: changeResult.changeInfo || "DOM change detected." },
39798
+ ],
39799
+ };
39800
+ }
39801
+ }
39802
+ else {
39803
+ // Text model: compare page content
39804
+ const content2 = await this.get_page_content(agentContext);
39805
+ if (content2 === content1)
39806
+ continue;
39807
+ const changeResult = await this.is_condition_met(rlm, content2, task_description, agentContext);
39808
+ content1 = content2;
39809
+ if (changeResult.changed) {
39810
+ return {
39811
+ content: [
39812
+ { type: "text", text: changeResult.changeInfo || "Condition met." },
39813
+ ],
39814
+ };
39815
+ }
39761
39816
  }
39762
39817
  }
39763
39818
  return {
@@ -39769,6 +39824,65 @@ class WatchTriggerTool {
39769
39824
  ],
39770
39825
  };
39771
39826
  }
39827
+ /** Check if the primary LLM supports vision */
39828
+ isVisionModel(rlm) {
39829
+ const names = rlm.Names;
39830
+ const llms = rlm.Llms;
39831
+ if (!names || names.length === 0)
39832
+ return false;
39833
+ const config = llms[names[0]];
39834
+ if (!config)
39835
+ return false;
39836
+ const provider = String(config.provider || "").toLowerCase();
39837
+ const model = String(config.model || "").toLowerCase();
39838
+ if (provider === "deepseek" || model.includes("deepseek"))
39839
+ return false;
39840
+ if (provider === "anthropic")
39841
+ return true;
39842
+ if (provider === "google")
39843
+ return true;
39844
+ if (model.includes("gpt-4o") || model.includes("gpt-4-vision") || model.includes("gpt-4-turbo"))
39845
+ return true;
39846
+ if (model.includes("claude") || model.includes("gemini"))
39847
+ return true;
39848
+ return false;
39849
+ }
39850
+ /** Get page text content via extract_page_content */
39851
+ async get_page_content(agentContext) {
39852
+ const extract = agentContext.agent["extract_page_content"];
39853
+ if (!extract)
39854
+ return "";
39855
+ const result = await extract.call(agentContext.agent, agentContext);
39856
+ return result?.page_content || "";
39857
+ }
39858
+ /** Check if condition is met using text-based LLM analysis */
39859
+ async is_condition_met(rlm, pageContent, task_description, agentContext) {
39860
+ try {
39861
+ const request = {
39862
+ messages: [
39863
+ { role: "system", content: watch_text_system_prompt },
39864
+ {
39865
+ role: "user",
39866
+ content: [
39867
+ {
39868
+ type: "text",
39869
+ text: `Condition: ${task_description}\n\nPage content:\n${pageContent.slice(0, 30000)}`,
39870
+ },
39871
+ ],
39872
+ },
39873
+ ],
39874
+ abortSignal: agentContext.context.controller.signal,
39875
+ };
39876
+ const result = await rlm.call(request);
39877
+ let resultText = result.text || "{}";
39878
+ resultText = resultText.substring(resultText.indexOf("{"), resultText.lastIndexOf("}") + 1);
39879
+ return JSON.parse(resultText);
39880
+ }
39881
+ catch (error) {
39882
+ Log.error("Error in is_condition_met:", error);
39883
+ }
39884
+ return { changed: false };
39885
+ }
39772
39886
  async get_screenshot(agentContext) {
39773
39887
  const screenshot = agentContext.agent["screenshot"];
39774
39888
  const imageResult = (await screenshot.call(agentContext.agent, agentContext));
@@ -40178,7 +40292,7 @@ class ActivateSkillTool {
40178
40292
  required: ["name"],
40179
40293
  };
40180
40294
  }
40181
- async execute(args) {
40295
+ async execute(args, ..._rest) {
40182
40296
  const name = args.name;
40183
40297
  if (!global.skillService) {
40184
40298
  return {
@@ -40475,11 +40589,11 @@ monitor changes in webpage DOM elements, when executing to the watch node, requi
40475
40589
  <if ${TOOL_NAME$6}Tool>
40476
40590
  * SKILLS
40477
40591
  You can use the \`${TOOL_NAME$6}\` tool to load domain-specific skill instructions when they would help complete the current task.
40592
+ </if>
40478
40593
  <if skills>
40479
40594
  Available skills:
40480
40595
  {{skills}}
40481
40596
  </if>
40482
- </if>
40483
40597
 
40484
40598
  <if mainTask>
40485
40599
  Main task: {{mainTask}}