ibm-watsonx-orchestrate-evaluation-framework 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of ibm-watsonx-orchestrate-evaluation-framework has been flagged as potentially problematic.

Files changed (46)
  1. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/METADATA +322 -0
  2. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/RECORD +46 -0
  3. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/WHEEL +5 -0
  4. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/licenses/LICENSE +22 -0
  5. ibm_watsonx_orchestrate_evaluation_framework-1.0.0.dist-info/top_level.txt +1 -0
  6. wxo_agentic_evaluation/__init__.py +0 -0
  7. wxo_agentic_evaluation/analytics/tools/analyzer.py +405 -0
  8. wxo_agentic_evaluation/analytics/tools/main.py +163 -0
  9. wxo_agentic_evaluation/analytics/tools/types.py +130 -0
  10. wxo_agentic_evaluation/analytics/tools/ux.py +428 -0
  11. wxo_agentic_evaluation/analyze_run.py +123 -0
  12. wxo_agentic_evaluation/annotate.py +40 -0
  13. wxo_agentic_evaluation/arg_configs.py +78 -0
  14. wxo_agentic_evaluation/batch_annotate.py +181 -0
  15. wxo_agentic_evaluation/data_annotator.py +253 -0
  16. wxo_agentic_evaluation/evaluation_package.py +518 -0
  17. wxo_agentic_evaluation/external_agent/external_validate.py +69 -0
  18. wxo_agentic_evaluation/external_agent/types.py +65 -0
  19. wxo_agentic_evaluation/inference_backend.py +601 -0
  20. wxo_agentic_evaluation/llm_matching.py +39 -0
  21. wxo_agentic_evaluation/llm_rag_eval.py +47 -0
  22. wxo_agentic_evaluation/llm_user.py +38 -0
  23. wxo_agentic_evaluation/main.py +231 -0
  24. wxo_agentic_evaluation/metrics/__init__.py +0 -0
  25. wxo_agentic_evaluation/metrics/llm_as_judge.py +46 -0
  26. wxo_agentic_evaluation/metrics/metrics.py +101 -0
  27. wxo_agentic_evaluation/prompt/__init__.py +0 -0
  28. wxo_agentic_evaluation/prompt/answer_relevancy_prompt.jinja2 +120 -0
  29. wxo_agentic_evaluation/prompt/batch_testcase_prompt.jinja2 +51 -0
  30. wxo_agentic_evaluation/prompt/examples/__init__.py +0 -0
  31. wxo_agentic_evaluation/prompt/examples/data_simple.json +93 -0
  32. wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2 +59 -0
  33. wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2 +75 -0
  34. wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2 +20 -0
  35. wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2 +22 -0
  36. wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2 +114 -0
  37. wxo_agentic_evaluation/prompt/template_render.py +90 -0
  38. wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2 +11 -0
  39. wxo_agentic_evaluation/prompt/tool_planner.jinja2 +40 -0
  40. wxo_agentic_evaluation/record_chat.py +165 -0
  41. wxo_agentic_evaluation/service_instance.py +179 -0
  42. wxo_agentic_evaluation/tool_planner.py +228 -0
  43. wxo_agentic_evaluation/type.py +176 -0
  44. wxo_agentic_evaluation/utils/__init__.py +6 -0
  45. wxo_agentic_evaluation/utils/utils.py +233 -0
  46. wxo_agentic_evaluation/watsonx_provider.py +175 -0
wxo_agentic_evaluation/prompt/faithfulness_prompt.jinja2
@@ -0,0 +1,59 @@
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+ You are an evaluation agent judging how faithful a claim is to the source material.
+ You are only to use the source material when judging a claim. Do not use your own knowledge when evaluating a claim. STICK TO THE SUPPORTING EVIDENCE.
+
+ ## Key Evaluation Principles:
+ - When evaluating the input against the source material, list the evidence from the source material that supports the claim.
+ - Claims don't have to be worded exactly as presented in the source material; however, the claim should convey the same general meaning from the source material. BUT, crucially, claims should not overstate or hyperbolize.
+ - A claim can be partially true. Keep this in mind when evaluating the claim.
+ - You can ignore different formatting for things like dates (mm/dd/yy vs mm-dd-yy vs mmddyyy as just some examples), places (Florida vs. Fl., NYC vs. New York City), numbers ($210,000 vs 210000.0 vs $21,0000.0)
+ - Keep your reasoning brief and to the point. Don't be wordy.
+
+ ## Scoring:
+ Once you have evaluated the claim against the source material, determine a score from 0 - 1 on how well the claim is supported by the evidence.
+ A higher score indicates the claim is well supported by the evidence. A lower score indicates that the claim is less supported.
+ You need to provide a reason for your score. Your reasoning should illustrate why you gave the score you did. Ask yourself these questions:
+ - Is there a lot of evidence for the claim?
+ - If a claim is contradictory to the source material, what pieces of evidence contradict the claim?
+ - If a claim is partially supported by the evidence, what parts are supported by the evidence? what parts are not supported?
+
+ ## Output:
+ Respond in a JSON format with the following fields:
+ - evidence: this is a list that contains the evidence from the source material.
+ - faithfulness_score: this field contains the score you gave
+ - reason: this field contains your justification on why you gave the claim the score you did
+
+ This is an example of a valid JSON output
+
+ {
+     "evidence": [],
+     "reason": "place holder text",
+     "faithfulness_score": 0.5
+
+ }
+
+
+ DO NOT PROVIDE ADDITIONAL COMMENTARY, EXPLANATIONS, OR OUTPUTS other than what is explicitly required above.
+
+ <|eot_id|>
+
+ ---
+
+ <|start_header_id|>user<|end_header_id|>
+
+
+ Now evaluate the following claim against the source material.
+
+ Claim:
+ {{ claim }}
+
+ Supporting Evidence:
+ {{ supporting_evidence }}
+
+ Answer:
+
+ <|eot_id|>
+
+
+ <|start_header_id|>assistant<|end_header_id|>
+
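This template is rendered by FaithfulnessTemplateRenderer from template_render.py (further down in this diff), which maps its retrieval_context argument onto the {{ supporting_evidence }} variable. Below is a minimal sketch of how the rendered prompt and the judge's JSON reply might be wired together; the call_llm stub and the example strings are assumptions, while the renderer, template path, and output fields come from the package itself.

import json
import os

import wxo_agentic_evaluation.prompt as prompt_pkg
from wxo_agentic_evaluation.prompt.template_render import FaithfulnessTemplateRenderer

def call_llm(prompt: str) -> str:
    # Stand-in for whatever LLM client the framework actually uses.
    return '{"evidence": ["effective start date of your latest dental plan is January 1, 2022"], "reason": "placeholder", "faithfulness_score": 1.0}'

template_path = os.path.join(os.path.dirname(prompt_pkg.__file__), "faithfulness_prompt.jinja2")
renderer = FaithfulnessTemplateRenderer(template_path)
prompt = renderer.render(
    claim="The dental plan started on January 1, 2022.",
    retrieval_context="The effective start date of your latest dental plan is January 1, 2022.",
)
verdict = json.loads(call_llm(prompt))  # expected fields: evidence, faithfulness_score, reason
print(verdict["faithfulness_score"], verdict["reason"])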
wxo_agentic_evaluation/prompt/keyword_matching_prompt.jinja2
@@ -0,0 +1,75 @@
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+ You are an evaluation agent. Your task is to check if ALL keywords appear in the given response.
+
+ - If ALL keywords are present (exactly or as close variants), respond only with: True
+ - If ANY keyword is missing, respond only with: False
+ - DO NOT explain or list anything. Only return: True or False
+ - Only evaluate the keywords and response given to you. Do not generate additional examples.
+ <|eot_id|>
+
+ <|start_header_id|>user<|end_header_id|>
+ Evaluate the following examples:
+
+ ### Example 1
+ Keywords:
+ successfully
+ email
+ updated
+
+ Response:
+ Your email has been successfully updated.
+
+ Answer:
+ True
+
+ ### Example 2
+ Keywords:
+ job code
+ Engineering
+ 50000076
+
+ Response:
+ The job code in the system for Engineering is 50000074.
+
+ Answer:
+ False
+
+ ### Example 3
+ Keywords:
+ 2024-12-25
+ 2025-02-15
+ 2025-03-20
+ 2025-01-10
+
+ Response:
+ Team members will be off on the following dates — jsmith: 20251225, 20250215; alee: 20250320, 20250110.
+
+ Answer:
+ True
+
+ ### Example 4
+ Keywords:
+ EUR
+ 85000.0
+
+ Response:
+ Your annual compensation is now set to €8,5000.0, paid in EUR.
+
+ Answer:
+ True
+
+ ---
+
+ ### Now, evaluate the following:
+
+ Keywords:
+ {{ keywords_text }}
+
+ Response:
+ {{ response_text }}
+
+ Answer:
+
+ <|eot_id|>
+
+ <|start_header_id|>assistant<|end_header_id|>
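The keywords are passed to this template as a single newline-separated string and the judge is expected to answer with the bare token True or False. A minimal sketch of driving it through KeywordMatchingTemplateRenderer, assuming any text-in/text-out LLM call (stubbed here) and a strict comparison against the literal "True":

import os

import wxo_agentic_evaluation.prompt as prompt_pkg
from wxo_agentic_evaluation.prompt.template_render import KeywordMatchingTemplateRenderer

call_llm = lambda prompt: "True"  # stand-in for the real LLM call

template_path = os.path.join(os.path.dirname(prompt_pkg.__file__), "keyword_matching_prompt.jinja2")
renderer = KeywordMatchingTemplateRenderer(template_path)
prompt = renderer.render(
    keywords_text="\n".join(["successfully", "email", "updated"]),
    response_text="Your email has been successfully updated.",
)
keywords_matched = call_llm(prompt).strip() == "True"  # anything else counts as a miss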
wxo_agentic_evaluation/prompt/keywords_generation_prompt.jinja2
@@ -0,0 +1,20 @@
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+ Given a text response, generate keywords that capture the main information in the response.
+ Capture all specific information in the response.
+ Only return a list starting with [ and ending with ].
+ No extra commentary<|eot_id|>
+
+ Here are some examples:
+ <|start_header_id|>user<|end_header_id|>The effective start date of your latest dental plan is January 1, 2022.<|eot_id|>
+ <|start_header_id|>assistant<|end_header_id|>["dental plan", "January 1, 2022"]<|eot_id|>
+ <|start_header_id|>user<|end_header_id|>Your current compensation details are as follows:\n\n* Currency: NZD\n* Yearly base salary: $102,000.00<|eot_id|>
+ <|start_header_id|>assistant<|end_header_id|>["compensation", "NZD", "$102,000.00"]<|eot_id|>
+ <|start_header_id|>user<|end_header_id|>Your payslip details from January 1, 2018, to January 15, 2018, are as follows:\n\n* Start Date: January 1, 2018\n* End Date: January 15, 2018\n* Currency: USD\n* Wages:\n\t+ Gross: $2466.66\n\t+ Net Pay: $1712.75\n\t+ Taxes: $556.58\n\t+ Other (None): -$197.33\n\t+ Salary: $2466.66<|eot_id|>
+ <|start_header_id|>assistant<|end_header_id|>["January 1, 2018", "January 15, 2018", "USD", "$2466.66", "$1712.75", "$556.58","-$197.33"]<|eot_id|>
+ <|start_header_id|>user<|end_header_id|>I successfully updated your address.<|eot_id|>
+ <|start_header_id|>assistant<|end_header_id|>[]<|eot_id|>
+ <|start_header_id|>user<|end_header_id|>Here are the details for the user 108727:\n| field | value\n|-------------|------------|\n| first name | Merton |\n| last name | Wells |\n| nationality | USA |\n| gender | M |\n| country | USA |<|eot_id|>
+ <|start_header_id|>assistant<|end_header_id|>["Merton", "Wells", "USA", "M"]<|eot_id|>
+
+ <|start_header_id|>user<|end_header_id|>{{response}}<|eot_id|>
+ <|start_header_id|>assistant<|end_header_id|>
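Since the prompt asks the model to return nothing but a bracketed list, the reply can be parsed as a Python literal. A minimal sketch using LlamaKeywordsGenerationTemplateRenderer; the call_llm stub and its canned reply are assumptions:

import ast
import os

import wxo_agentic_evaluation.prompt as prompt_pkg
from wxo_agentic_evaluation.prompt.template_render import LlamaKeywordsGenerationTemplateRenderer

call_llm = lambda prompt: '["dental plan", "January 1, 2022"]'  # stand-in for the real LLM call

template_path = os.path.join(os.path.dirname(prompt_pkg.__file__), "keywords_generation_prompt.jinja2")
renderer = LlamaKeywordsGenerationTemplateRenderer(template_path)
prompt = renderer.render(response="The effective start date of your latest dental plan is January 1, 2022.")
keywords = ast.literal_eval(call_llm(prompt))  # the prompt asks for a bare [...] list, so a literal parse suffices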
wxo_agentic_evaluation/prompt/llama_user_prompt.jinja2
@@ -0,0 +1,22 @@
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+ The user ALWAYS responds with "END" once it has the information listed in user story. If prompted a question of is it correct, the user will respond with YES if it is correct else mention No and what is to corrected/ added.
+
+ This is the user story:
+ {{user_story}}
+
+ {% if user_response_style -%}
+ This is the user response style:
+ {% for instruction in user_response_style -%}
+ - {{instruction}}
+ {% endfor -%}
+ {% endif -%}
+
+ <|eot_id|>
+
+ {% for message in conversation_history -%}
+ <|start_header_id|>{{message.role}}<|end_header_id|>
+ {{message.content}}<|eot_id|>
+
+ {% endfor -%}
+ <|eot_id|><|start_header_id|>user<|end_header_id|>
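This template drives the simulated user: the user story, an optional response style, and the running conversation are folded into one Llama-style prompt. A minimal rendering sketch via LlamaUserTemplateRenderer; the story, style, and history values here are illustrative, and plain dicts are used for the history even though the framework itself passes its own Message objects (Jinja resolves message.role on either):

import os

import wxo_agentic_evaluation.prompt as prompt_pkg
from wxo_agentic_evaluation.prompt.template_render import LlamaUserTemplateRenderer

template_path = os.path.join(os.path.dirname(prompt_pkg.__file__), "llama_user_prompt.jinja2")
renderer = LlamaUserTemplateRenderer(template_path)
prompt = renderer.render(
    user_story="Your username is nwaters. You want to check your time-off schedule for 2025.",
    user_response_style=["Answer in one short sentence."],
    conversation_history=[
        {"role": "assistant", "content": "What is your username?"},
        {"role": "user", "content": "nwaters"},
    ],
)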
wxo_agentic_evaluation/prompt/semantic_matching_prompt.jinja2
@@ -0,0 +1,114 @@
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+ You are an evaluation agent specializing in semantic similarity assessment. Your task is to determine whether two texts express the same factual information and intentions, even when presented differently.
+
+ Key evaluation principles:
+ 1. Focus on whether the core information and outcome is the same
+ 2. Different phrasings that convey the same result should be considered equivalent
+ 3. When specific values (IDs, dates, amounts, names) appear in both texts, they must match exactly
+ 4. Ignore formatting differences in dates (2022-01-01 vs. 1/1/2022 vs 20220101), numbers ($210,000 vs 210000.0 vs $21,0000.0), and IDs
+ 5. Different levels of detail are acceptable if they don't contradict each other and the primary information remains intact
+ 6. Reference IDs that are clearly system-generated (like request IDs, confirmation numbers, UUIDs) may vary and should be ignored
+
+ Respond ONLY with:
+ - True: if the texts convey the same essential information and outcomes
+ - False: if they communicate different factual information or contradict each other
+
+ DO NOT provide explanations or commentary - only respond with "True" or "False"
+ <|eot_id|>
+
+ <|start_header_id|>user<|end_header_id|>
+ Evaluate the following examples:
+
+ ### Example 1
+ Expected:
+ Your email has been successfully updated.
+
+ Actual:
+ You have successfully updated your email.
+
+ Answer:
+ True
+
+ ### Example 2
+ Expected:
+ Ontario is a province in Canada.
+
+ Actual:
+ Ontario is a province.
+
+ Answer:
+ False
+
+ ### Example 3
+ Expected:
+ No payslips found for user with ID 12345.
+
+ Actual:
+ You don't have any payslips.
+
+ Answer:
+ True
+
+ ### Example 4
+ Expected:
+ Your time off request from 2024-11-01 to 2024-11-01 for TRAVEL has been successfully submitted. The request ID is c705878eb6584e9b910b8db3907a31da.
+
+ Actual:
+ Your time off request for TRAVEL on 2024-11-01 has been submitted. The confirmation ID is d805979fc7595f0a021b9ec4018b42eb.
+
+ Answer:
+ True
+
+ ### Example 5
+ Expected:
+ Your compensation details are as follows:
+ * Currency: USD
+ * Yearly base salary: 210000.0
+
+ Actual:
+ Your compensation is $210,000 USD per annum.
+
+ Answer:
+ True
+
+ ### Example 6
+ Expected:
+ Your visa details are as follows:
+ - Country: 44
+ - Document Number: DF112345DD
+ - Expiration Date: 2022-09-01
+
+ Actual:
+ Your visa details are as follows:
+ - Country: 46
+ - Document Number: DF112345DD
+ - Expiration Date: 2022-09-01
+
+ Answer:
+ False
+
+ ### Example 7
+ Expected:
+ I successfully updated your personal information.
+
+ Actual:
+ I have successfully updated your preferred name to M Wells and starting date is 7/05/2025.
+
+ Answer:
+ True
+
+ ---
+
+ ### Now, evaluate the following:
+
+ Expected:
+ {{ expected_text }}
+
+ Actual:
+ {{ actual_text }}
+
+ Answer:
+
+ <|eot_id|>
+
+ <|start_header_id|>assistant<|end_header_id|>
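Like the keyword template, this one expects a bare True or False verdict, which can be folded straight into a boolean match flag. A minimal sketch using SemanticMatchingTemplateRenderer; the stubbed LLM call and the example texts are assumptions:

import os

import wxo_agentic_evaluation.prompt as prompt_pkg
from wxo_agentic_evaluation.prompt.template_render import SemanticMatchingTemplateRenderer

call_llm = lambda prompt: "True"  # stand-in for the real LLM call

template_path = os.path.join(os.path.dirname(prompt_pkg.__file__), "semantic_matching_prompt.jinja2")
renderer = SemanticMatchingTemplateRenderer(template_path)
prompt = renderer.render(
    expected_text="Your email has been successfully updated.",
    actual_text="You have successfully updated your email.",
)
is_semantic_match = call_llm(prompt).strip() == "True"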
wxo_agentic_evaluation/prompt/template_render.py
@@ -0,0 +1,90 @@
+ import jinja2
+ from typing import List
+
+
+ class JinjaTemplateRenderer:
+     def __init__(self, template_path: str):
+         self._template_env = jinja2.Environment(
+             loader=jinja2.BaseLoader(), undefined=jinja2.StrictUndefined
+         )
+         # TODO: make use of config
+         self._template_env.policies["json.dumps_kwargs"] = {"sort_keys": False}
+         with open(template_path, "r") as file:
+             template_str = file.read()
+         self.template_str = template_str
+         self.template = self._template_env.from_string(template_str)
+
+     def render(self, **kwargs):
+         return self.template.render(**kwargs)
+
+
+ class LlamaUserTemplateRenderer(JinjaTemplateRenderer):
+     def render(
+         self, user_story: str, user_response_style: List, conversation_history: List
+     ) -> str:
+         return super().render(
+             user_story=user_story,
+             user_response_style=user_response_style,
+             conversation_history=conversation_history,
+         )
+
+
+ class KeywordMatchingTemplateRenderer(JinjaTemplateRenderer):
+     def render(self, keywords_text: str, response_text: str) -> str:
+         return super().render(keywords_text=keywords_text, response_text=response_text)
+
+
+ class SemanticMatchingTemplateRenderer(JinjaTemplateRenderer):
+     def render(self, expected_text: str, actual_text: str) -> str:
+         return super().render(expected_text=expected_text, actual_text=actual_text)
+
+
+ class LlamaKeywordsGenerationTemplateRenderer(JinjaTemplateRenderer):
+     def render(self, response: str) -> str:
+         return super().render(response=response)
+
+
+ class FaithfulnessTemplateRenderer(JinjaTemplateRenderer):
+     def render(self, claim, retrieval_context):
+         return super().render(claim=claim, supporting_evidence=retrieval_context)
+
+
+ class AnswerRelevancyTemplateRenderer(JinjaTemplateRenderer):
+     def render(self, question, context, answer):
+         return super().render(question=question, context=context, answer=answer)
+
+
+ class ToolPlannerTemplateRenderer(JinjaTemplateRenderer):
+     def render(self, user_story: str, agent_name: str, available_tools: str) -> str:
+         return super().render(
+             user_story=user_story,
+             agent_name=agent_name,
+             available_tools=available_tools,
+         )
+
+ class ToolChainAgentTemplateRenderer(JinjaTemplateRenderer):
+     def render(self, tool_call_history: List, available_tools: str) -> str:
+         return super().render(
+             tool_call_history=tool_call_history,
+             available_tools=available_tools,
+         )
+
+
+ class BatchTestCaseGeneratorTemplateRenderer(JinjaTemplateRenderer):
+     def render(
+         self,
+         agent_name: str,
+         tool_blocks: str,
+         tool_inputs_str: str,
+         story: str,
+         num_variants: int,
+         example_str: str,
+     ) -> str:
+         return super().render(
+             agent_name=agent_name,
+             tool_blocks=tool_blocks,
+             tool_inputs_str=tool_inputs_str,
+             story=story,
+             num_variants=num_variants,
+             example_str=example_str,
+         )
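The base class loads a template file into an Environment configured with jinja2.StrictUndefined, so any template variable that is not supplied raises an error instead of rendering as an empty string. A small self-contained sketch of that behavior, using a throwaway template file since the renderer takes a path (the temp file and its contents are purely illustrative):

import tempfile

import jinja2
from wxo_agentic_evaluation.prompt.template_render import JinjaTemplateRenderer

# Write a throwaway template to disk, since the renderer takes a file path.
with tempfile.NamedTemporaryFile("w", suffix=".jinja2", delete=False) as f:
    f.write("Hello {{ name }}")
    path = f.name

renderer = JinjaTemplateRenderer(path)
print(renderer.render(name="world"))   # -> "Hello world"

try:
    renderer.render()                  # StrictUndefined: missing variables fail loudly
except jinja2.exceptions.UndefinedError as err:
    print("missing variable:", err)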
wxo_agentic_evaluation/prompt/tool_chain_agent.jinja2
@@ -0,0 +1,11 @@
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+ You are trying to make tool calls
+
+ {{ available_tools }}
+
+ <|eot_id|>
+ {% for message in tool_call_history -%}
+ <|start_header_id|>assistant<|end_header_id|>
+ {{message}}<|eot_id|>
+ {% endfor -%}
+ <|start_header_id|>assistant<|end_header_id|>
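This template simply replays each prior tool call as an assistant turn before asking for the next one. A minimal rendering sketch via ToolChainAgentTemplateRenderer; the shape of the history entries and the available_tools string are assumptions, since the template only interpolates them as text:

import os

import wxo_agentic_evaluation.prompt as prompt_pkg
from wxo_agentic_evaluation.prompt.template_render import ToolChainAgentTemplateRenderer

template_path = os.path.join(os.path.dirname(prompt_pkg.__file__), "tool_chain_agent.jinja2")
renderer = ToolChainAgentTemplateRenderer(template_path)
prompt = renderer.render(
    # History entries are rendered verbatim, so plain strings work for illustration.
    tool_call_history=['{"tool_name": "fetch_assignment_id", "inputs": {"username": "nwaters"}}'],
    available_tools="fetch_assignment_id(username), retrieve_timeoff_schedule(assignment_id, start_date, end_date)",
)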
wxo_agentic_evaluation/prompt/tool_planner.jinja2
@@ -0,0 +1,40 @@
+ <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+ You are a tool-planning assistant for an AI system.
+
+ Your job is to extract and sequence tool calls based on user stories and available tools.
+
+ Rules:
+ - Use only the tools listed below.
+ - Use only input values explicitly stated or clearly implied in the story.
+ - If a tool depends on a prior output, refer to it as "$<key>" (e.g., "$fetch_assignment_id").
+ - Do not use index notation like [0], [1], etc. in any tool inputs. Use the full list as-is when multiple values are expected.
+ - Output ONLY one valid JSON array.
+ - DO NOT include extra text or wrap the output. Just return the JSON list.
+
+ Available Tools:
+ {{ available_tools }}
+
+ Example:
+ Story: "Your username is nwaters. You want to find out your time-off schedule from: 2025-01-01 to: 2025-12-31."
+
+ [
+     {
+         "tool_name": "fetch_assignment_id",
+         "inputs": {
+             "username": "nwaters"
+         }
+     },
+     {
+         "tool_name": "retrieve_timeoff_schedule",
+         "inputs": {
+             "assignment_id": "$fetch_assignment_id",
+             "start_date": "2025-01-01",
+             "end_date": "2025-12-31"
+         }
+     }
+ ]
+ <|eot_id|>
+ <|start_header_id|>user<|end_header_id|>
+ Story: "{{ user_story }}"
+ <|eot_id|>
+ <|start_header_id|>assistant<|end_header_id|>
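The planner prompt asks the model for one JSON array of tool calls, with "$<key>" placeholders marking inputs that come from earlier steps. A minimal sketch of rendering the prompt, parsing the plan, and substituting those placeholders; the call_llm stub, agent_name value, available_tools string, and the substitution loop are all assumptions (the framework's own resolution logic lives in wxo_agentic_evaluation/tool_planner.py):

import json
import os

import wxo_agentic_evaluation.prompt as prompt_pkg
from wxo_agentic_evaluation.prompt.template_render import ToolPlannerTemplateRenderer

call_llm = lambda prompt: '[{"tool_name": "fetch_assignment_id", "inputs": {"username": "nwaters"}}]'  # stand-in

template_path = os.path.join(os.path.dirname(prompt_pkg.__file__), "tool_planner.jinja2")
renderer = ToolPlannerTemplateRenderer(template_path)
prompt = renderer.render(
    user_story="Your username is nwaters. You want to find out your time-off schedule from: 2025-01-01 to: 2025-12-31.",
    agent_name="hr_agent",  # assumed value; the template shown above does not interpolate it directly
    available_tools="fetch_assignment_id(username), retrieve_timeoff_schedule(assignment_id, start_date, end_date)",
)
plan = json.loads(call_llm(prompt))

# One possible way to resolve "$<key>" references while walking the plan.
outputs = {}
for step in plan:
    inputs = {}
    for key, value in step["inputs"].items():
        if isinstance(value, str) and value.startswith("$"):
            value = outputs[value[1:]]                  # substitute an earlier step's output
        inputs[key] = value
    outputs[step["tool_name"]] = f"<result of {step['tool_name']}>"  # placeholder execution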
wxo_agentic_evaluation/record_chat.py
@@ -0,0 +1,165 @@
+ from wxo_agentic_evaluation.type import Message
+ from wxo_agentic_evaluation.arg_configs import (
+     ChatRecordingConfig,
+     KeywordsGenerationConfig,
+ )
+ from wxo_agentic_evaluation.inference_backend import (
+     WXOClient,
+     WXOInferenceBackend,
+     get_wxo_client,
+ )
+ from wxo_agentic_evaluation.data_annotator import DataAnnotator
+ from wxo_agentic_evaluation.utils.utils import is_saas_url
+ from wxo_agentic_evaluation.service_instance import tenant_setup
+
+ import json
+ import os
+ import rich
+ from datetime import datetime
+ import time
+ from typing import List
+ from jsonargparse import CLI
+ import warnings
+
+ warnings.filterwarnings("ignore", category=DeprecationWarning)
+ warnings.filterwarnings("ignore", category=FutureWarning)
+
+
+ def get_all_runs(wxo_client: WXOClient):
+     limit = 20  # Maximum allowed limit per request
+     offset = 0
+     all_runs = []
+
+     if is_saas_url(wxo_client.service_url):
+         path = "v1//orchestrate/runs"
+     else:
+         path = "/orchestrate/runs"
+
+     initial_response = wxo_client.get(
+         path, {"limit": limit, "offset": 0}
+     ).json()
+     total_runs = initial_response["total"]
+     all_runs.extend(initial_response["data"])
+
+     while len(all_runs) < total_runs:
+         offset += limit
+         response = wxo_client.get(
+             path, {"limit": limit, "offset": offset}
+         ).json()
+         all_runs.extend(response["data"])
+
+     # Sort runs by completed_at in descending order (most recent first)
+     # Put runs with no completion time at the end
+     all_runs.sort(
+         key=lambda x: (
+             datetime.strptime(x["completed_at"], "%Y-%m-%dT%H:%M:%S.%fZ")
+             if x.get("completed_at")
+             else datetime.min
+         ),
+         reverse=True,
+     )
+
+     return all_runs
+
+
+ def pull_messages_from_thread_id(thread_id: str, wxo_client: WXOClient):
+     inference_backend = WXOInferenceBackend(wxo_client=wxo_client)
+     messages = inference_backend.get_messages(thread_id)
+     return messages
+
+
+ def annotate_messages(
+     messages: List[Message], keywords_generation_config: KeywordsGenerationConfig
+ ):
+     annotator = DataAnnotator(
+         messages=messages, keywords_generation_config=keywords_generation_config
+     )
+     return annotator.generate()
+
+
+ def record_chats(config: ChatRecordingConfig):
+     """Record chats in background mode"""
+     start_time = datetime.utcnow()
+     processed_threads = set()
+
+     rich.print(
+         f"[green]INFO:[/green] Starting chat recording at {start_time}. Press Ctrl+C to stop."
+     )
+     if config.token is None:
+         token = tenant_setup(config.service_url, config.tenant_name)
+     else:
+         token = config.token
+     wxo_client = get_wxo_client(config.service_url, token)
+     try:
+         while True:
+             all_runs = get_all_runs(wxo_client)
+             seen_threads = set()
+
+             # Process only new runs that started after our recording began
+             for run in all_runs:
+                 thread_id = run.get("thread_id")
+                 if thread_id in seen_threads:
+                     continue
+                 seen_threads.add(thread_id)
+                 started_at = run.get("started_at")
+
+                 if not thread_id or not started_at:
+                     continue
+
+                 try:
+                     started_time = datetime.strptime(
+                         started_at, "%Y-%m-%dT%H:%M:%S.%fZ"
+                     )
+                     if started_time > start_time:
+                         if thread_id not in processed_threads:
+                             os.makedirs(config.output_dir, exist_ok=True)
+                             rich.print(
+                                 f"\n[green]INFO:[/green] New recording started at {started_at}"
+                             )
+                             rich.print(
+                                 f"[green]INFO:[/green] Messages saved to: {os.path.join(config.output_dir, f'{thread_id}_messages.json')}"
+                             )
+                             rich.print(
+                                 f"[green]INFO:[/green] Annotations saved to: {os.path.join(config.output_dir, f'{thread_id}_annotated_data.json')}"
+                             )
+                             processed_threads.add(thread_id)
+
+                             try:
+                                 messages = pull_messages_from_thread_id(
+                                     thread_id, wxo_client
+                                 )
+                                 annotated_data = annotate_messages(
+                                     messages, config.keywords_generation_config
+                                 )
+
+                                 messages_filename = os.path.join(
+                                     config.output_dir, f"{thread_id}_messages.json"
+                                 )
+                                 annotation_filename = os.path.join(
+                                     config.output_dir, f"{thread_id}_annotated_data.json"
+                                 )
+
+                                 with open(messages_filename, "w") as f:
+                                     json.dump(
+                                         [msg.model_dump() for msg in messages], f, indent=4
+                                     )
+
+                                 with open(annotation_filename, "w") as f:
+                                     json.dump(annotated_data, f, indent=4)
+                             except Exception as e:
+                                 rich.print(
+                                     f"[red]ERROR:[/red] Failed to process thread {thread_id}: {str(e)}"
+                                 )
+                 except (ValueError, TypeError) as e:
+                     rich.print(
+                         f"[yellow]WARNING:[/yellow] Invalid timestamp format for thread {thread_id}: {str(e)}"
+                     )
+
+             time.sleep(2)  # Poll every 2 seconds
+
+     except KeyboardInterrupt:
+         rich.print("\n[yellow]Recording stopped by user[/yellow]")
+
+
+ if __name__ == "__main__":
+     record_chats(CLI(ChatRecordingConfig, as_positional=False))
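record_chats polls the orchestrate runs endpoint every two seconds, keeps only runs whose started_at is newer than the recorder's start time, and writes <thread_id>_messages.json plus <thread_id>_annotated_data.json into output_dir. A minimal sketch of launching it programmatically rather than through the jsonargparse CLI; only the field names used above come from the module, while the values and the assumption that KeywordsGenerationConfig constructs with defaults are illustrative:

from wxo_agentic_evaluation.arg_configs import ChatRecordingConfig, KeywordsGenerationConfig
from wxo_agentic_evaluation.record_chat import record_chats

# Values are placeholders; only the field names (service_url, tenant_name, token,
# output_dir, keywords_generation_config) are taken from the code above.
config = ChatRecordingConfig(
    service_url="http://localhost:4321",
    tenant_name="local",
    token=None,                      # None makes record_chats call tenant_setup() for a token
    output_dir="./recorded_chats",
    keywords_generation_config=KeywordsGenerationConfig(),  # assumed to build with defaults
)
record_chats(config)                 # runs until interrupted with Ctrl+C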