dhisana 0.0.1.dev243__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. dhisana/__init__.py +1 -0
  2. dhisana/cli/__init__.py +1 -0
  3. dhisana/cli/cli.py +20 -0
  4. dhisana/cli/datasets.py +27 -0
  5. dhisana/cli/models.py +26 -0
  6. dhisana/cli/predictions.py +20 -0
  7. dhisana/schemas/__init__.py +1 -0
  8. dhisana/schemas/common.py +399 -0
  9. dhisana/schemas/sales.py +965 -0
  10. dhisana/ui/__init__.py +1 -0
  11. dhisana/ui/components.py +472 -0
  12. dhisana/utils/__init__.py +1 -0
  13. dhisana/utils/add_mapping.py +352 -0
  14. dhisana/utils/agent_tools.py +51 -0
  15. dhisana/utils/apollo_tools.py +1597 -0
  16. dhisana/utils/assistant_tool_tag.py +4 -0
  17. dhisana/utils/built_with_api_tools.py +282 -0
  18. dhisana/utils/cache_output_tools.py +98 -0
  19. dhisana/utils/cache_output_tools_local.py +78 -0
  20. dhisana/utils/check_email_validity_tools.py +717 -0
  21. dhisana/utils/check_for_intent_signal.py +107 -0
  22. dhisana/utils/check_linkedin_url_validity.py +209 -0
  23. dhisana/utils/clay_tools.py +43 -0
  24. dhisana/utils/clean_properties.py +135 -0
  25. dhisana/utils/company_utils.py +60 -0
  26. dhisana/utils/compose_salesnav_query.py +259 -0
  27. dhisana/utils/compose_search_query.py +759 -0
  28. dhisana/utils/compose_three_step_workflow.py +234 -0
  29. dhisana/utils/composite_tools.py +137 -0
  30. dhisana/utils/dataframe_tools.py +237 -0
  31. dhisana/utils/domain_parser.py +45 -0
  32. dhisana/utils/email_body_utils.py +72 -0
  33. dhisana/utils/email_parse_helpers.py +132 -0
  34. dhisana/utils/email_provider.py +375 -0
  35. dhisana/utils/enrich_lead_information.py +933 -0
  36. dhisana/utils/extract_email_content_for_llm.py +101 -0
  37. dhisana/utils/fetch_openai_config.py +129 -0
  38. dhisana/utils/field_validators.py +426 -0
  39. dhisana/utils/g2_tools.py +104 -0
  40. dhisana/utils/generate_content.py +41 -0
  41. dhisana/utils/generate_custom_message.py +271 -0
  42. dhisana/utils/generate_email.py +278 -0
  43. dhisana/utils/generate_email_response.py +465 -0
  44. dhisana/utils/generate_flow.py +102 -0
  45. dhisana/utils/generate_leads_salesnav.py +303 -0
  46. dhisana/utils/generate_linkedin_connect_message.py +224 -0
  47. dhisana/utils/generate_linkedin_response_message.py +317 -0
  48. dhisana/utils/generate_structured_output_internal.py +462 -0
  49. dhisana/utils/google_custom_search.py +267 -0
  50. dhisana/utils/google_oauth_tools.py +727 -0
  51. dhisana/utils/google_workspace_tools.py +1294 -0
  52. dhisana/utils/hubspot_clearbit.py +96 -0
  53. dhisana/utils/hubspot_crm_tools.py +2440 -0
  54. dhisana/utils/instantly_tools.py +149 -0
  55. dhisana/utils/linkedin_crawler.py +168 -0
  56. dhisana/utils/lusha_tools.py +333 -0
  57. dhisana/utils/mailgun_tools.py +156 -0
  58. dhisana/utils/mailreach_tools.py +123 -0
  59. dhisana/utils/microsoft365_tools.py +455 -0
  60. dhisana/utils/openai_assistant_and_file_utils.py +267 -0
  61. dhisana/utils/openai_helpers.py +977 -0
  62. dhisana/utils/openapi_spec_to_tools.py +45 -0
  63. dhisana/utils/openapi_tool/__init__.py +1 -0
  64. dhisana/utils/openapi_tool/api_models.py +633 -0
  65. dhisana/utils/openapi_tool/convert_openai_spec_to_tool.py +271 -0
  66. dhisana/utils/openapi_tool/openapi_tool.py +319 -0
  67. dhisana/utils/parse_linkedin_messages_txt.py +100 -0
  68. dhisana/utils/profile.py +37 -0
  69. dhisana/utils/proxy_curl_tools.py +1226 -0
  70. dhisana/utils/proxycurl_search_leads.py +426 -0
  71. dhisana/utils/python_function_to_tools.py +83 -0
  72. dhisana/utils/research_lead.py +176 -0
  73. dhisana/utils/sales_navigator_crawler.py +1103 -0
  74. dhisana/utils/salesforce_crm_tools.py +477 -0
  75. dhisana/utils/search_router.py +131 -0
  76. dhisana/utils/search_router_jobs.py +51 -0
  77. dhisana/utils/sendgrid_tools.py +162 -0
  78. dhisana/utils/serarch_router_local_business.py +75 -0
  79. dhisana/utils/serpapi_additional_tools.py +290 -0
  80. dhisana/utils/serpapi_google_jobs.py +117 -0
  81. dhisana/utils/serpapi_google_search.py +188 -0
  82. dhisana/utils/serpapi_local_business_search.py +129 -0
  83. dhisana/utils/serpapi_search_tools.py +852 -0
  84. dhisana/utils/serperdev_google_jobs.py +125 -0
  85. dhisana/utils/serperdev_local_business.py +154 -0
  86. dhisana/utils/serperdev_search.py +233 -0
  87. dhisana/utils/smtp_email_tools.py +582 -0
  88. dhisana/utils/test_connect.py +2087 -0
  89. dhisana/utils/trasform_json.py +173 -0
  90. dhisana/utils/web_download_parse_tools.py +189 -0
  91. dhisana/utils/workflow_code_model.py +5 -0
  92. dhisana/utils/zoominfo_tools.py +357 -0
  93. dhisana/workflow/__init__.py +1 -0
  94. dhisana/workflow/agent.py +18 -0
  95. dhisana/workflow/flow.py +44 -0
  96. dhisana/workflow/task.py +43 -0
  97. dhisana/workflow/test.py +90 -0
  98. dhisana-0.0.1.dev243.dist-info/METADATA +43 -0
  99. dhisana-0.0.1.dev243.dist-info/RECORD +102 -0
  100. dhisana-0.0.1.dev243.dist-info/WHEEL +5 -0
  101. dhisana-0.0.1.dev243.dist-info/entry_points.txt +2 -0
  102. dhisana-0.0.1.dev243.dist-info/top_level.txt +1 -0
@@ -0,0 +1,234 @@
+ # import json
+ # import logging
+ # from typing import Any, Dict, List, Optional, Tuple
+
+ # from pydantic import BaseModel
+
+ # from dhisana.utils.generate_structured_output_internal import get_structured_output_internal
+ # from dhisana.utils.workflow_code_model import WorkflowPythonCode
+
+ # # Example imports: adapt paths to your actual modules
+ # from dhisana.utils.generate_leads import generate_leads
+ # from dhisana.utils.qualify_leads import qualify_leads
+ # from dhisana.utils.compose_cadence import generate_campaign_cadence_workflow_and_execute
+
+ # # Initialize logger
+ # logging.basicConfig(level=logging.INFO)
+ # logger = logging.getLogger(__name__)
+
+
+ # class ThreeStepWorkflow(BaseModel):
+ #     """
+ #     Pydantic model representing the three-step workflow generated by the LLM.
+ #     """
+ #     step_1_fetch_the_list: str
+ #     step_2_qualify_the_leads: str
+ #     step_3_execute_campaign_cadence: str
+
+
+ # async def generate_three_step_workflow(
+ #     english_description: str,
+ #     tool_config: Optional[List[Dict[str, Any]]] = None,
+ # ) -> Tuple[Dict[str, Any], str]:
+ #     """
+ #     Given an input description, split it into three steps:
+ #     1) Step 1: Describe how to fetch the list of leads with Sales Navigator.
+ #     2) Step 2: Describe how to qualify the leads (in plain English).
+ #     3) Step 3: Describe how to execute the LinkedIn/email campaign cadence.
+
+ #     Returns:
+ #         A tuple of:
+ #         - A dictionary matching the ThreeStepWorkflow model:
+ #           {
+ #               "step_1_fetch_the_list": ...,
+ #               "step_2_qualify_the_leads": ...,
+ #               "step_3_execute_campaign_cadence": ...
+ #           }
+ #         - A string representing status, "SUCCESS" or "ERROR".
+ #     """
+ #     # A robust user prompt describing the required JSON structure
+ #     user_prompt = f"""
+ #     You are a helpful assistant. Please analyze the user's English description and produce a JSON object with
+ #     three fields: step_1_fetch_the_list, step_2_qualify_the_leads, step_3_execute_campaign_cadence.
+
+ #     The user wants a 3-step workflow:
+ #     1) Step 1: Describe how to fetch the list of leads with LinkedIn Sales Navigator. If the user has provided a Sales Navigator URL, include it in the instructions.
+ #     2) Step 2: Describe how to qualify the leads. Extract how the user has specified lead qualification and set it here.
+ #     3) Step 3: Describe how to engage the leads and execute the campaign cadence. Summarize how the user wants to engage leads on LinkedIn, email, or both (cadence rules, templates, etc.).
+
+ #     Output MUST be valid JSON, for example:
+ #     {{
+ #         "step_1_fetch_the_list": "How to fetch leads from Sales Navigator. Include the Sales Navigator URL if provided.",
+ #         "step_2_qualify_the_leads": "Lead qualification instructions...",
+ #         "step_3_execute_campaign_cadence": "Instructions on how to engage the leads..."
+ #     }}
+
+ #     Double-check that you produce valid JSON with exactly these three keys.
+
+ #     -- The user provided description is below --
+ #     {english_description}
+ #     """
+
+ #     # Use get_structured_output_internal to parse LLM output into ThreeStepWorkflow
+ #     response_obj, status = await get_structured_output_internal(
+ #         user_prompt,
+ #         ThreeStepWorkflow,
+ #         tool_config=tool_config
+ #     )
+
+ #     if status != "SUCCESS":
+ #         # Return a simple error response
+ #         return {"error": "Failed to generate three-step workflow."}, "ERROR"
+
+ #     # Convert the Pydantic model to a dictionary for easy usage
+ #     return response_obj.model_dump(), status
+
+
+ # async def generate_three_step_workflow_execute(
+ #     user_query: str,
+ #     input_leads_list: List[Dict[str, Any]],
+ #     tool_config: Optional[List[Dict[str, Any]]] = None
+ # ) -> str:
+ #     """
+ #     1. Generates a three-step workflow from the user query.
+ #     2. Executes:
+ #        - Step 1: Fetch leads from either Sales Navigator or a provided input list.
+ #        - Step 2: Qualify leads (Smart List).
+ #        - Step 3: Execute a LinkedIn/email cadence (Campaign Cadence).
+ #     3. Returns a JSON string describing success or error, including final leads if relevant.
+ #     """
+
+ #     try:
+ #         # Generate the 3-step instructions
+ #         three_step_response, status = await generate_three_step_workflow(
+ #             user_query,
+ #             tool_config=tool_config
+ #         )
+ #         if status != "SUCCESS":
+ #             return json.dumps({
+ #                 "status": "ERROR",
+ #                 "error": three_step_response.get("error", "Failed to generate three-step workflow.")
+ #             })
+
+ #         # Extract the step instructions
+ #         step_1_instructions = three_step_response.get("step_1_fetch_the_list", "")
+ #         step_2_instructions = three_step_response.get("step_2_qualify_the_leads", "")
+ #         step_3_instructions = three_step_response.get("step_3_execute_campaign_cadence", "")
+
+ #         # ==============================
+ #         # Step 1: Fetch leads
+ #         # ==============================
+ #         leads_list: List[Dict[str, Any]] = []
+ #         step_1_status = "SUCCESS"
+ #         if step_1_instructions == "use_input_file":
+ #             # We rely on the leads passed in as input
+ #             leads_list = input_leads_list
+ #             logger.info("Using leads from input file (already provided).")
+ #         else:
+ #             # We interpret step_1_instructions as a user query for Sales Navigator
+ #             logger.info("Generating leads from Sales Navigator instructions: %s", step_1_instructions)
+ #             result_str = await generate_leads(
+ #                 user_query=step_1_instructions,
+ #                 tool_config=tool_config
+ #             )
+ #             # Parse the JSON result to extract leads
+ #             try:
+ #                 result_json = json.loads(result_str)
+ #             except json.JSONDecodeError as e:
+ #                 logger.exception("Failed to parse Step 1 JSON output.")
+ #                 return json.dumps({
+ #                     "status": "ERROR",
+ #                     "error": f"Failed to parse Step 1 output as JSON: {e}"
+ #                 })
+
+ #             step_1_status = result_json.get("status", "ERROR")
+ #             if step_1_status != "SUCCESS":
+ #                 # Return early if step 1 fails
+ #                 return json.dumps({
+ #                     "status": "ERROR",
+ #                     "step": "step_1_fetch_the_list",
+ #                     "error": result_json.get("error", "Failed to fetch leads in step 1.")
+ #                 })
+
+ #             leads_list = result_json.get("leads", [])
+ #             logger.info("Step 1 completed with %d leads", len(leads_list))
+
+ #         # ==============================
+ #         # Step 2: Qualify leads
+ #         # ==============================
+ #         step_2_status = "SUCCESS"
+ #         if step_2_instructions:
+ #             logger.info("Executing smart list creation with instructions: %s", step_2_instructions)
+ #             try:
+ #                 # qualify_leads typically returns JSON:
+ #                 # { "status": "SUCCESS", "qualified_leads": [...] }
+ #                 result_str = await qualify_leads(
+ #                     user_query=step_2_instructions,
+ #                     input_leads_list=leads_list,
+ #                     tool_config=tool_config
+ #                 )
+ #                 result_json = json.loads(result_str)
+ #                 step_2_status = result_json.get("status", "ERROR")
+ #                 if step_2_status != "SUCCESS":
+ #                     return json.dumps({
+ #                         "status": step_2_status,
+ #                         "step": "step_2_qualify_the_leads",
+ #                         "error": result_json.get("error", "Failed to qualify leads in step 2."),
+ #                         "leads_before_qualification": len(leads_list)
+ #                     })
+ #                 leads_list = result_json.get("qualified_leads", [])
+ #                 logger.info("Step 2 completed with %d leads after qualification", len(leads_list))
+ #             except Exception as exc:
+ #                 logger.exception("Exception during Step 2 (smart list).")
+ #                 return json.dumps({
+ #                     "status": "ERROR",
+ #                     "step": "step_2_qualify_the_leads",
+ #                     "error": str(exc)
+ #                 })
+
+ #         # ==============================
+ #         # Step 3: Execute campaign cadence
+ #         # ==============================
+ #         step_3_status = "SUCCESS"
+ #         if step_3_instructions:
+ #             logger.info("Executing campaign cadence with instructions: %s", step_3_instructions)
+ #             try:
+ #                 # generate_campaign_cadence_workflow_and_execute might return JSON:
+ #                 # { "status": "SUCCESS", "result": "Campaign done" } or similar
+ #                 result_str = await generate_campaign_cadence_workflow_and_execute(
+ #                     instructions=step_3_instructions,
+ #                     input_leads=leads_list,
+ #                     tool_config=tool_config
+ #                 )
+ #                 result_json = json.loads(result_str)
+ #                 step_3_status = result_json.get("status", "ERROR")
+ #                 if step_3_status != "SUCCESS":
+ #                     return json.dumps({
+ #                         "status": step_3_status,
+ #                         "step": "step_3_execute_campaign_cadence",
+ #                         "error": result_json.get("error", "Failed to run campaign in step 3.")
+ #                     })
+ #                 logger.info("Step 3 completed successfully.")
+ #             except Exception as exc:
+ #                 logger.exception("Exception during Step 3 (campaign cadence).")
+ #                 return json.dumps({
+ #                     "status": "ERROR",
+ #                     "step": "step_3_execute_campaign_cadence",
+ #                     "error": str(exc)
+ #                 })
+
+ #         # If all steps succeed, return final leads
+ #         return json.dumps({
+ #             "status": "SUCCESS",
+ #             "steps": {
+ #                 "step_1_status": step_1_status,
+ #                 "step_2_status": step_2_status,
+ #                 "step_3_status": step_3_status
+ #             },
+ #             "final_leads_count": len(leads_list),
+ #             "leads_list": leads_list
+ #         })
+
+ #     except Exception as e:
+ #         logger.exception("Exception in generate_three_step_workflow_execute.")
+ #         return json.dumps({"status": "ERROR", "error": str(e)})
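Note: this module (compose_three_step_workflow.py) ships fully commented out in this release. For orientation only, here is a minimal, hypothetical driver showing how the entry point would be invoked if the module and the example imports it names were re-enabled; the query text and empty input list are illustrative, not part of the package.

    import asyncio
    import json

    # Hypothetical usage: assumes the commented-out module above is re-enabled
    # and that generate_leads / qualify_leads /
    # generate_campaign_cadence_workflow_and_execute exist at the example paths.
    from dhisana.utils.compose_three_step_workflow import generate_three_step_workflow_execute

    async def main() -> None:
        result_str = await generate_three_step_workflow_execute(
            user_query=(
                "Fetch leads from my Sales Navigator saved search, qualify anyone "
                "with a VP or Director title, then run a 3-touch email cadence."
            ),
            input_leads_list=[],  # only used when step 1 resolves to "use_input_file"
            tool_config=None,
        )
        print(json.dumps(json.loads(result_str), indent=2))

    asyncio.run(main())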
@@ -0,0 +1,137 @@
+ import json
+ from pydantic import BaseModel, Field
+ from dhisana.utils.assistant_tool_tag import assistant_tool
+ from dhisana.utils.built_with_api_tools import (
+     get_company_info_from_builtwith,
+     get_company_info_from_builtwith_by_name,
+ )
+ from dhisana.utils.dataframe_tools import get_structured_output
+ from dhisana.utils.google_custom_search import search_google_custom
+
+
+ class QualifyCompanyBasedOnTechUsage(BaseModel):
+     company_name: str = Field(..., description="Name of the company")
+     company_domain: str = Field(..., description="Domain of the company")
+     built_with_technology_to_check: str = Field(
+         ..., description="Which technology we are checking for usage in the company."
+     )
+     is_built_with_technology: bool = Field(
+         ..., description="True if the input technology is used by the company, based on the input data."
+     )
+     reasoning_on_built_with: str = Field(
+         ..., description="Summary of the technology found at the company and why is_built_with_technology is set to True or False."
+     )
+
+
+ def get_technologies(data, keyword):
+     """
+     Check if the keyword is found in the data JSON string.
+
+     Args:
+         data (dict): The data returned by the BuiltWith API.
+         keyword (str): The keyword to search for.
+
+     Returns:
+         bool: True if the keyword is found, False otherwise.
+     """
+     data_str = json.dumps(data).lower()
+     keyword_lower = keyword.lower()
+     return keyword_lower in data_str
+
+ @assistant_tool
+ async def find_tech_usage_in_company(
+     company_domain: str,
+     company_name: str,
+     technology_to_look_for: str,
+     company_information: str
+ ):
+     """
+     Determine if a company is using a specific technology.
+
+     Args:
+         company_domain (str): The domain name of the company's website.
+         company_name (str): The name of the company.
+         technology_to_look_for (str): The technology to look for.
+         company_information (str): Additional company information.
+
+     Returns:
+         str: A JSON string containing the structured output.
+     """
+     if not company_name:
+         return json.dumps({
+             "company_name": company_name,
+             "is_built_with_technology": False,
+             "reasoning_on_built_with": "Company name is missing."
+         })
+
+     if not company_domain:
+         company_data_buildwith = await get_company_info_from_builtwith_by_name(company_name)
+         company_domain = company_data_buildwith.get('Lookup', '')
+     else:
+         company_data_buildwith = await get_company_info_from_builtwith(company_domain)
+
+     # Search for job postings on the company's website mentioning the technology
+     search_google_results = ""
+     if company_domain:
+         company_domain_search = f"site:{company_domain} \"{company_name}\" \"{technology_to_look_for}\""
+         search_google_results = await search_google_custom(company_domain_search, 2)
+
+     # Search LinkedIn for people at the company with skills in the technology
+     linked_in_search = f"site:linkedin.com/in \"{company_name}\" \"{technology_to_look_for}\" intitle:\"{company_name}\" -intitle:\"followers\" -intitle:\"connections\" -intitle:\"profiles\" -inurl:\"dir/+\""
+     people_with_skills_results = await search_google_custom(linked_in_search, 2)
+
+     # Search LinkedIn for posts mentioning the company and technology
+     linked_in_posts_search = f"site:linkedin.com/posts \"{company_name}\" \"{technology_to_look_for}\" intitle:\"{company_name}\" -intitle:\"members\" -intitle:\"connections\""
+     linkedin_posts_search = await search_google_custom(linked_in_posts_search, 4)
+
+     # Search Twitter/X for posts mentioning the company and technology
+     twitter_posts_search_query = f'site:x.com "{company_name}" "{technology_to_look_for}" -intitle:"status"'
+     twitter_posts_search_results = await search_google_custom(twitter_posts_search_query, 4)
+
+     # General search results
+     general_search_results_query = f"\"{company_name}\" \"{technology_to_look_for}\""
+     general_search_results = await search_google_custom(general_search_results_query, 4)
+
+     # Check whether BuiltWith reports the technology for the company
+     tech_found_in_builtwith = False
+     if company_domain:
+         tech_found_in_builtwith = get_technologies(company_data_buildwith, technology_to_look_for)
+
+     # Prepare the prompt for structured output
+     prompt = f"""
+     Mark the company as qualified in is_built_with_technology if the company {company_name} is using the technology {technology_to_look_for}.
+     DO NOT make up information.
+     Give reasoning for why is_built_with_technology is set, based on one of these signals:
+     1. There is a job posting on the company website for that technology.
+     2. People with that skill at the given company were found via the LinkedIn Google search.
+     3. BuiltWith shows the company uses the input technology.
+     4. LinkedIn posts search results show a strong indication of the company using the technology.
+     5. Twitter/X posts search results show a strong indication of the company using the technology.
+     6. General search results show a strong indication of the company using the technology.
+
+     Input Company Name: {company_name}
+     Technology to look for: {technology_to_look_for}
+     Google search results on company website for technology:
+     {search_google_results}
+
+     Google search on LinkedIn for people with skills:
+     {people_with_skills_results}
+
+     LinkedIn posts search for the company:
+     {linkedin_posts_search}
+
+     Twitter/X posts search for the company:
+     {twitter_posts_search_results}
+
+     General search results:
+     {general_search_results}
+
+     Input Company Details To Lookup:
+     {company_information}
+
+     BuiltWith shows technology used: {tech_found_in_builtwith}
+     """
+
+     # Get structured output based on the prompt
+     output, _ = await get_structured_output(prompt, QualifyCompanyBasedOnTechUsage)
+     return json.dumps(output.dict())
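Per the file list, this hunk corresponds to dhisana/utils/composite_tools.py. A minimal usage sketch, assuming BuiltWith and Google Custom Search credentials are already configured for the package; the company details below are illustrative only.

    import asyncio
    import json

    from dhisana.utils.composite_tools import find_tech_usage_in_company

    async def main() -> None:
        # Hypothetical inputs; the tool fans out to BuiltWith plus several
        # Google/LinkedIn/X searches, then asks the LLM for a structured verdict.
        result_str = await find_tech_usage_in_company(
            company_domain="example.com",
            company_name="Example Corp",
            technology_to_look_for="Snowflake",
            company_information="B2B analytics vendor, ~500 employees",
        )
        verdict = json.loads(result_str)
        print(verdict["is_built_with_technology"], verdict["reasoning_on_built_with"])

    asyncio.run(main())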
@@ -0,0 +1,237 @@
+ # Tools to manipulate dataframes: convert a natural-language query into a pandas query and process dataframes.
+
+ import logging
+ import os
+ import json
+ import csv
+ import time
+ import hashlib
+ import glob
+ from typing import List, Optional
+ from fastapi import HTTPException
+ import pandas as pd
+ from pydantic import BaseModel
+ from openai import LengthFinishReasonError, AsyncOpenAI, OpenAIError
+
+ from dhisana.utils.assistant_tool_tag import assistant_tool
+ from dhisana.utils.cache_output_tools import cache_output, retrieve_output
+
+ class FileItem:
+     def __init__(self, file_path: str):
+         self.file_path = file_path
+
+ class FileList:
+     def __init__(self, files: List[FileItem]):
+         self.files = files
+
+ class PandasQuery(BaseModel):
+     pandas_query: str
+
+
+ @assistant_tool
+ async def get_structured_output(message: str, response_type, model: str = "gpt-5.1-chat"):
+     """
+     Asynchronously retrieves structured output from the OpenAI API based on the input message.
+
+     :param message: The input message to be processed by the OpenAI API.
+     :param response_type: The expected format of the response (e.g., a Pydantic model).
+     :param model: The model to be used for processing the input message. Defaults to "gpt-5.1-chat".
+     :return: A tuple containing the parsed response and a status string ('SUCCESS' or 'FAIL').
+     """
+     try:
+         # Use the class name instead of serializing the class
+         response_type_str = response_type.__name__
+
+         # Create unique hashes for message and response_type
+         message_hash = hashlib.md5(message.encode('utf-8')).hexdigest()
+         response_type_hash = hashlib.md5(response_type_str.encode('utf-8')).hexdigest()
+
+         # Generate the cache key and short-circuit on a cache hit
+         cache_key = f"{message_hash}:{response_type_hash}"
+         cached_response = retrieve_output("get_structured_output", cache_key)
+         if cached_response is not None:
+             parsed_cached_response = response_type.parse_raw(cached_response)
+             return parsed_cached_response, 'SUCCESS'
+
+         client = AsyncOpenAI()
+         completion = await client.beta.chat.completions.parse(
+             model=model,
+             messages=[
+                 {"role": "system", "content": "Extract structured content from input. Output is in JSON format."},
+                 {"role": "user", "content": message},
+             ],
+             response_format=response_type,
+             temperature=0.0
+         )
+
+         response = completion.choices[0].message
+         if response.parsed:
+             # Cache the successful response
+             cache_output("get_structured_output", cache_key, response.parsed.json())
+             return response.parsed, 'SUCCESS'
+         elif response.refusal:
+             logging.warning("ERROR: Refusal response: %s", response.refusal)
+             return response.refusal, 'FAIL'
+         # Neither a parsed result nor an explicit refusal came back
+         return None, 'FAIL'
+
+     except LengthFinishReasonError as e:
+         logging.error(f"Too many tokens: {e}")
+         raise HTTPException(status_code=502, detail="The request exceeded the maximum token limit.")
+     except OpenAIError as e:
+         logging.error(f"OpenAI API error: {e}")
+         raise HTTPException(status_code=502, detail="Error communicating with the OpenAI API.")
+     except Exception as e:
+         logging.error(f"Unexpected error: {e}")
+         return {"error": str(e)}, 'FAIL'
+
+ @assistant_tool
+ async def query_dataframes(user_query: str, input_files: Optional[List[str]], output_file_path: Optional[str] = None) -> str:
+     """
+     Query multiple dataframes based on a user query and write the resulting dataframe to the output file path.
+
+     Args:
+         user_query (str): User query in natural language.
+         input_files (List[str]): List of paths to CSV files to be loaded into dataframes.
+         output_file_path (Optional[str]): Path to the output file where the resulting dataframe will be saved.
+             If not specified, a unique file path is generated in '/tmp/run_interim_outputs/'.
+
+     Returns:
+         str: A JSON string representing the FileList containing the path to the output file if created,
+             or an error message if an error occurred.
+     """
+     max_retries = 3
+     if not input_files or not user_query:
+         return json.dumps({"files": []})
+
+     if not output_file_path:
+         output_folder = '/tmp/run_interim_outputs/'
+         os.makedirs(output_folder, exist_ok=True)
+         unique_number = int(time.time() * 1000)
+         output_file_name = f'query_dataframe_{unique_number}.csv'
+         output_file_path = os.path.join(output_folder, output_file_name)
+     else:
+         output_folder = os.path.dirname(output_file_path)
+         if output_folder:
+             os.makedirs(output_folder, exist_ok=True)
+
+     data_frames = []
+     df_names = []
+     for idx, file in enumerate(input_files):
+         # Skip empty files
+         if os.path.getsize(file) == 0:
+             continue
+         df = pd.read_csv(file)
+         data_frames.append(df)
+         df_name = f'df{idx+1}'
+         df_names.append(df_name)
+
+     if not data_frames:
+         return json.dumps({"files": []})
+
+     schema_info = ""
+     for df_name, df in zip(df_names, data_frames):
+         schema_info += f"DataFrame '{df_name}' columns: {', '.join(df.columns)}\n"
+
+     error_message = ""
+
+     for attempt in range(max_retries):
+         message = f"""
+         You are an expert data analyst. Given the following DataFrames and their schemas:
+
+         {schema_info}
+
+         Write a pandas query to answer the following question:
+
+         \"\"\"{user_query}\"\"\"
+
+         Your query should use the provided DataFrames ({', '.join(df_names)}) and produce a DataFrame named 'result_df'. Do not include any imports or explanations; only provide the pandas query code that assigns the result to 'result_df'.
+         """
+         if error_message:
+             message += f"\nThe previous query returned the following error:\n{error_message}\nPlease fix the query."
+
+         pandas_query_result, status = await get_structured_output(message, PandasQuery)
+         if status == 'SUCCESS' and pandas_query_result and pandas_query_result.pandas_query:
+             pandas_query = pandas_query_result.pandas_query
+             local_vars = {name: df for name, df in zip(df_names, data_frames)}
+             global_vars = {}
+             try:
+                 exec(pandas_query, global_vars, local_vars)
+                 result_df = local_vars.get('result_df')
+                 if result_df is None:
+                     error_message = "The query did not produce a DataFrame named 'result_df'."
+                     if attempt == max_retries - 1:
+                         return json.dumps({"error": error_message})
+                     continue
+                 break
+             except Exception as e:
+                 error_message = str(e)
+                 if attempt == max_retries - 1:
+                     return json.dumps({"error": error_message})
+                 continue
+         else:
+             if attempt == max_retries - 1:
+                 return json.dumps({"error": "Failed to get a valid pandas query after multiple attempts."})
+             continue
+
+     result_df.to_csv(output_file_path, index=False)
+
+     file_list = FileList(files=[FileItem(file_path=output_file_path)])
+
+     def file_item_to_dict(file_item):
+         return {"file_path": file_item.file_path}
+
+     file_list_dict = {
+         "files": [file_item_to_dict(file_item) for file_item in file_list.files]
+     }
+     file_list_json = json.dumps(file_list_dict, indent=2)
+     return file_list_json
+
+ @assistant_tool
+ async def load_csv_file(input_file_path: str):
+     """
+     Loads data from a CSV file and returns it as a list of dictionaries.
+
+     Args:
+         input_file_path (str): The path to the input CSV file.
+
+     Returns:
+         List[Dict[str, Any]]: List of rows from the CSV file, where each row is a dictionary.
+     """
+     with open(input_file_path, newline='') as csvfile:
+         reader = csv.DictReader(csvfile)
+         return [row for row in reader]
+
+
+ @assistant_tool
+ async def merge_csv_files(
+     input_folder_path: str,
+     extension: str,
+     required_fields: Optional[List[str]] = None,
+     dedup_by_fields: Optional[List[str]] = None,
+     sort_by_fields: Optional[List[str]] = None,
+     output_file_path: str = ""
+ ) -> str:
+     """
+     Merge all CSV files in a folder into a single CSV, with optional filtering on
+     required fields, deduplication, and sorting. Returns the output file path.
+     """
+     # Step 1: List all CSV files in the input folder with the given extension
+     all_files = glob.glob(os.path.join(input_folder_path, f"*.{extension}"))
+
+     # Step 2: Read each CSV file into a DataFrame
+     df_list = []
+     for file in all_files:
+         df = pd.read_csv(file)
+         df_list.append(df)
+
+     # Step 3: Concatenate all DataFrames
+     merged_df = pd.concat(df_list, ignore_index=True)
+
+     # Step 4: Filter rows where required fields are not empty
+     if required_fields:
+         merged_df = merged_df.dropna(subset=required_fields)
+
+     # Step 5: Remove duplicate rows based on the dedup fields
+     if dedup_by_fields:
+         merged_df = merged_df.drop_duplicates(subset=dedup_by_fields)
+
+     # Step 6: Sort the DataFrame by the sort fields
+     if sort_by_fields:
+         merged_df = merged_df.sort_values(by=sort_by_fields)
+
+     # Step 7: Write the final DataFrame to the output file
+     merged_df.to_csv(output_file_path, index=False)
+
+     return output_file_path
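Per the file list, this hunk corresponds to dhisana/utils/dataframe_tools.py. A minimal sketch of the natural-language-to-pandas flow, assuming an OPENAI_API_KEY is set so get_structured_output can reach the API; the CSV paths and query text are illustrative only.

    import asyncio
    import json

    from dhisana.utils.dataframe_tools import query_dataframes

    async def main() -> None:
        # The tool loads each CSV as df1, df2, ..., asks the model for a pandas
        # query that assigns to result_df, retries up to 3 times on errors, and
        # writes the result to a CSV under /tmp/run_interim_outputs/.
        result_str = await query_dataframes(
            user_query="Join df1 and df2 on the 'email' column and keep rows where 'title' contains 'VP'",
            input_files=["/tmp/leads.csv", "/tmp/enrichment.csv"],  # hypothetical inputs
        )
        print(json.loads(result_str))  # {"files": [{"file_path": "..."}]} on success

    asyncio.run(main())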
@@ -0,0 +1,45 @@
+ # A set of domains that should be excluded because they are social or bio/link aggregator services.
+ import tldextract
+
+
+ EXCLUDED_LINK_DOMAINS = [
+     "beacon.ai",
+     "tap.bio",
+     "campsite.bio",
+     "shor.by",
+     "milkshake.app",
+     "lnk.bio",
+     "carrd.co",
+     "bio.fm",
+     "withkoji.com",
+     "flowcode.com",
+     "biolinky.co",
+     "contactinbio.com",
+     "linktr.ee",
+     "linkedin.com",
+     "facebook.com",
+     "youtube.com",
+ ]
+
+ def get_domain_from_website(website: str) -> str:
+     """
+     Extracts the domain from a given website URL using tldextract.
+     Returns an empty string if no website is provided.
+
+     :param website: The full URL from which to extract the domain.
+     :return: Extracted domain in the form 'example.com', or '' if none.
+     """
+     if not website:
+         return ""
+     extracted = tldextract.extract(website)
+     return f"{extracted.domain}.{extracted.suffix}"
+
+
+ def is_excluded_domain(domain: str) -> bool:
+     """
+     Checks if the domain is in the EXCLUDED_LINK_DOMAINS list.
+
+     :param domain: The domain (without protocol) to be checked.
+     :return: True if the domain is excluded, False otherwise.
+     """
+     return domain.lower() in EXCLUDED_LINK_DOMAINS
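This final hunk corresponds to dhisana/utils/domain_parser.py. The two helpers compose naturally; a small sketch (the URLs are illustrative):

    from dhisana.utils.domain_parser import get_domain_from_website, is_excluded_domain

    # Link aggregators and social profiles are filtered out...
    print(get_domain_from_website("https://linktr.ee/somebody"))   # "linktr.ee"
    print(is_excluded_domain("linktr.ee"))                         # True

    # ...while ordinary company websites pass through.
    print(get_domain_from_website("https://www.example.com/about"))  # "example.com"
    print(is_excluded_domain("example.com"))                         # False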