dhisana 0.0.1.dev243__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. dhisana/__init__.py +1 -0
  2. dhisana/cli/__init__.py +1 -0
  3. dhisana/cli/cli.py +20 -0
  4. dhisana/cli/datasets.py +27 -0
  5. dhisana/cli/models.py +26 -0
  6. dhisana/cli/predictions.py +20 -0
  7. dhisana/schemas/__init__.py +1 -0
  8. dhisana/schemas/common.py +399 -0
  9. dhisana/schemas/sales.py +965 -0
  10. dhisana/ui/__init__.py +1 -0
  11. dhisana/ui/components.py +472 -0
  12. dhisana/utils/__init__.py +1 -0
  13. dhisana/utils/add_mapping.py +352 -0
  14. dhisana/utils/agent_tools.py +51 -0
  15. dhisana/utils/apollo_tools.py +1597 -0
  16. dhisana/utils/assistant_tool_tag.py +4 -0
  17. dhisana/utils/built_with_api_tools.py +282 -0
  18. dhisana/utils/cache_output_tools.py +98 -0
  19. dhisana/utils/cache_output_tools_local.py +78 -0
  20. dhisana/utils/check_email_validity_tools.py +717 -0
  21. dhisana/utils/check_for_intent_signal.py +107 -0
  22. dhisana/utils/check_linkedin_url_validity.py +209 -0
  23. dhisana/utils/clay_tools.py +43 -0
  24. dhisana/utils/clean_properties.py +135 -0
  25. dhisana/utils/company_utils.py +60 -0
  26. dhisana/utils/compose_salesnav_query.py +259 -0
  27. dhisana/utils/compose_search_query.py +759 -0
  28. dhisana/utils/compose_three_step_workflow.py +234 -0
  29. dhisana/utils/composite_tools.py +137 -0
  30. dhisana/utils/dataframe_tools.py +237 -0
  31. dhisana/utils/domain_parser.py +45 -0
  32. dhisana/utils/email_body_utils.py +72 -0
  33. dhisana/utils/email_parse_helpers.py +132 -0
  34. dhisana/utils/email_provider.py +375 -0
  35. dhisana/utils/enrich_lead_information.py +933 -0
  36. dhisana/utils/extract_email_content_for_llm.py +101 -0
  37. dhisana/utils/fetch_openai_config.py +129 -0
  38. dhisana/utils/field_validators.py +426 -0
  39. dhisana/utils/g2_tools.py +104 -0
  40. dhisana/utils/generate_content.py +41 -0
  41. dhisana/utils/generate_custom_message.py +271 -0
  42. dhisana/utils/generate_email.py +278 -0
  43. dhisana/utils/generate_email_response.py +465 -0
  44. dhisana/utils/generate_flow.py +102 -0
  45. dhisana/utils/generate_leads_salesnav.py +303 -0
  46. dhisana/utils/generate_linkedin_connect_message.py +224 -0
  47. dhisana/utils/generate_linkedin_response_message.py +317 -0
  48. dhisana/utils/generate_structured_output_internal.py +462 -0
  49. dhisana/utils/google_custom_search.py +267 -0
  50. dhisana/utils/google_oauth_tools.py +727 -0
  51. dhisana/utils/google_workspace_tools.py +1294 -0
  52. dhisana/utils/hubspot_clearbit.py +96 -0
  53. dhisana/utils/hubspot_crm_tools.py +2440 -0
  54. dhisana/utils/instantly_tools.py +149 -0
  55. dhisana/utils/linkedin_crawler.py +168 -0
  56. dhisana/utils/lusha_tools.py +333 -0
  57. dhisana/utils/mailgun_tools.py +156 -0
  58. dhisana/utils/mailreach_tools.py +123 -0
  59. dhisana/utils/microsoft365_tools.py +455 -0
  60. dhisana/utils/openai_assistant_and_file_utils.py +267 -0
  61. dhisana/utils/openai_helpers.py +977 -0
  62. dhisana/utils/openapi_spec_to_tools.py +45 -0
  63. dhisana/utils/openapi_tool/__init__.py +1 -0
  64. dhisana/utils/openapi_tool/api_models.py +633 -0
  65. dhisana/utils/openapi_tool/convert_openai_spec_to_tool.py +271 -0
  66. dhisana/utils/openapi_tool/openapi_tool.py +319 -0
  67. dhisana/utils/parse_linkedin_messages_txt.py +100 -0
  68. dhisana/utils/profile.py +37 -0
  69. dhisana/utils/proxy_curl_tools.py +1226 -0
  70. dhisana/utils/proxycurl_search_leads.py +426 -0
  71. dhisana/utils/python_function_to_tools.py +83 -0
  72. dhisana/utils/research_lead.py +176 -0
  73. dhisana/utils/sales_navigator_crawler.py +1103 -0
  74. dhisana/utils/salesforce_crm_tools.py +477 -0
  75. dhisana/utils/search_router.py +131 -0
  76. dhisana/utils/search_router_jobs.py +51 -0
  77. dhisana/utils/sendgrid_tools.py +162 -0
  78. dhisana/utils/serarch_router_local_business.py +75 -0
  79. dhisana/utils/serpapi_additional_tools.py +290 -0
  80. dhisana/utils/serpapi_google_jobs.py +117 -0
  81. dhisana/utils/serpapi_google_search.py +188 -0
  82. dhisana/utils/serpapi_local_business_search.py +129 -0
  83. dhisana/utils/serpapi_search_tools.py +852 -0
  84. dhisana/utils/serperdev_google_jobs.py +125 -0
  85. dhisana/utils/serperdev_local_business.py +154 -0
  86. dhisana/utils/serperdev_search.py +233 -0
  87. dhisana/utils/smtp_email_tools.py +582 -0
  88. dhisana/utils/test_connect.py +2087 -0
  89. dhisana/utils/trasform_json.py +173 -0
  90. dhisana/utils/web_download_parse_tools.py +189 -0
  91. dhisana/utils/workflow_code_model.py +5 -0
  92. dhisana/utils/zoominfo_tools.py +357 -0
  93. dhisana/workflow/__init__.py +1 -0
  94. dhisana/workflow/agent.py +18 -0
  95. dhisana/workflow/flow.py +44 -0
  96. dhisana/workflow/task.py +43 -0
  97. dhisana/workflow/test.py +90 -0
  98. dhisana-0.0.1.dev243.dist-info/METADATA +43 -0
  99. dhisana-0.0.1.dev243.dist-info/RECORD +102 -0
  100. dhisana-0.0.1.dev243.dist-info/WHEEL +5 -0
  101. dhisana-0.0.1.dev243.dist-info/entry_points.txt +2 -0
  102. dhisana-0.0.1.dev243.dist-info/top_level.txt +1 -0
@@ -0,0 +1,234 @@
+ # import json
+ # import logging
+ # from typing import Any, Dict, List, Optional, Tuple
+
+ # from pydantic import BaseModel
+
+ # from dhisana.utils.generate_structured_output_internal import get_structured_output_internal
+ # from dhisana.utils.workflow_code_model import WorkflowPythonCode
+
+ # # Example imports: adapt paths to your actual modules
+ # from dhisana.utils.generate_leads import generate_leads
+ # from dhisana.utils.qualify_leads import qualify_leads
+ # from dhisana.utils.compose_cadence import generate_campaign_cadence_workflow_and_execute
+
+ # # Initialize logger
+ # logging.basicConfig(level=logging.INFO)
+ # logger = logging.getLogger(__name__)
+
+
+ # class ThreeStepWorkflow(BaseModel):
+ #     """
+ #     Pydantic model representing the three-step workflow generated by the LLM.
+ #     """
+ #     step_1_fetch_the_list: str
+ #     step_2_qualify_the_leads: str
+ #     step_3_execute_campaign_cadence: str
+
+
+ # async def generate_three_step_workflow(
+ #     english_description: str,
+ #     tool_config: Optional[List[Dict[str, Any]]] = None,
+ # ) -> Tuple[Dict[str, Any], str]:
+ #     """
+ #     Given an input description, split it into three steps:
+ #     1) Step 1: Describe how to fetch the list of leads with Sales Navigator.
+ #     2) Step 2: Describe how to qualify the leads (in plain English).
+ #     3) Step 3: Describe how to execute the LinkedIn/email campaign cadence.
+
+ #     Returns:
+ #         A tuple of:
+ #         - A dictionary matching the ThreeStepWorkflow model:
+ #           {
+ #               "step_1_fetch_the_list": ...,
+ #               "step_2_qualify_the_leads": ...,
+ #               "step_3_execute_campaign_cadence": ...
+ #           }
+ #         - A string representing status, "SUCCESS" or "ERROR".
+ #     """
+ #     # A robust user prompt describing the required JSON structure
+ #     user_prompt = f"""
+ #     You are a helpful assistant. Please analyze the user's English description and produce a JSON object with
+ #     three fields: step_1_fetch_the_list, step_2_qualify_the_leads, step_3_execute_campaign_cadence.
+
+ #     The user wants a 3-step workflow:
+ #     1) Step 1: Describe how to fetch the list of leads with LinkedIn Sales Navigator. If the user has provided a Sales Navigator URL, include it in the instructions.
+ #     2) Step 2: Describe how to qualify the leads. Extract how the user has specified lead qualification and set it here.
+ #     3) Step 3: Describe how to engage the leads and execute the campaign cadence. Summarize how the user wants to engage leads on LinkedIn, email, or both (cadence rules, templates, etc.).
+
+ #     Output MUST be valid JSON, for example:
+ #     {{
+ #         "step_1_fetch_the_list": "How to fetch leads from Sales Navigator. Include the Sales Navigator URL if provided.",
+ #         "step_2_qualify_the_leads": "Lead qualification instructions...",
+ #         "step_3_execute_campaign_cadence": "Instructions on how to engage the leads..."
+ #     }}
+
+ #     Double-check that you produce valid JSON with exactly these three keys.
+
+ #     -- The user provided description is below --
+ #     {english_description}
+ #     """
+
+ #     # Use get_structured_output_internal to parse LLM output into ThreeStepWorkflow
+ #     response_obj, status = await get_structured_output_internal(
+ #         user_prompt,
+ #         ThreeStepWorkflow,
+ #         tool_config=tool_config
+ #     )
+
+ #     if status != "SUCCESS":
+ #         # Return a simple error response
+ #         return {"error": "Failed to generate three-step workflow."}, "ERROR"
+
+ #     # Convert the Pydantic model to a dictionary for easy usage
+ #     return response_obj.model_dump(), status
+
+
+ # async def generate_three_step_workflow_execute(
+ #     user_query: str,
+ #     input_leads_list: List[Dict[str, Any]],
+ #     tool_config: Optional[List[Dict[str, Any]]] = None
+ # ) -> str:
+ #     """
+ #     1. Generates a three-step workflow from the user query.
+ #     2. Executes:
+ #        - Step 1: Fetch leads from either Sales Navigator or a provided input list.
+ #        - Step 2: Qualify leads (Smart List).
+ #        - Step 3: Execute a LinkedIn/email cadence (Campaign Cadence).
+ #     3. Returns a JSON string describing success or error, including final leads if relevant.
+ #     """
+
+ #     try:
+ #         # Generate the 3-step instructions
+ #         three_step_response, status = await generate_three_step_workflow(
+ #             user_query,
+ #             tool_config=tool_config
+ #         )
+ #         if status != "SUCCESS":
+ #             return json.dumps({
+ #                 "status": "ERROR",
+ #                 "error": three_step_response.get("error", "Failed to generate three-step workflow.")
+ #             })
+
+ #         # Extract the step instructions
+ #         step_1_instructions = three_step_response.get("step_1_fetch_the_list", "")
+ #         step_2_instructions = three_step_response.get("step_2_qualify_the_leads", "")
+ #         step_3_instructions = three_step_response.get("step_3_execute_campaign_cadence", "")
+
+ #         # ==============================
+ #         # Step 1: Fetch leads
+ #         # ==============================
+ #         leads_list: List[Dict[str, Any]] = []
+ #         step_1_status = "SUCCESS"
+ #         if step_1_instructions == "use_input_file":
+ #             # We rely on the leads passed in as input
+ #             leads_list = input_leads_list
+ #             logger.info("Using leads from input file (already provided).")
+ #         else:
+ #             # We interpret step_1_instructions as a user query for Sales Navigator
+ #             logger.info("Generating leads from Sales Navigator instructions: %s", step_1_instructions)
+ #             result_str = await generate_leads(
+ #                 user_query=step_1_instructions,
+ #                 tool_config=tool_config
+ #             )
+ #             # Parse the JSON result to extract leads
+ #             try:
+ #                 result_json = json.loads(result_str)
+ #             except json.JSONDecodeError as e:
+ #                 logger.exception("Failed to parse Step 1 JSON output.")
+ #                 return json.dumps({
+ #                     "status": "ERROR",
+ #                     "error": f"Failed to parse Step 1 output as JSON: {e}"
+ #                 })
+
+ #             step_1_status = result_json.get("status", "ERROR")
+ #             if step_1_status != "SUCCESS":
+ #                 # Return early if step 1 fails
+ #                 return json.dumps({
+ #                     "status": "ERROR",
+ #                     "step": "step_1_fetch_the_list",
+ #                     "error": result_json.get("error", "Failed to fetch leads in step 1.")
+ #                 })
+
+ #             leads_list = result_json.get("leads", [])
+ #             logger.info("Step 1 completed with %d leads", len(leads_list))
+
+ #         # ==============================
+ #         # Step 2: Qualify leads
+ #         # ==============================
+ #         step_2_status = "SUCCESS"
+ #         if step_2_instructions:
+ #             logger.info("Executing smart list creation with instructions: %s", step_2_instructions)
+ #             try:
+ #                 # qualify_leads typically returns JSON:
+ #                 # { "status": "SUCCESS", "qualified_leads": [...] }
+ #                 result_str = await qualify_leads(
+ #                     user_query=step_2_instructions,
+ #                     input_leads_list=leads_list,
+ #                     tool_config=tool_config
+ #                 )
+ #                 result_json = json.loads(result_str)
+ #                 step_2_status = result_json.get("status", "ERROR")
+ #                 if step_2_status != "SUCCESS":
+ #                     return json.dumps({
+ #                         "status": step_2_status,
+ #                         "step": "step_2_qualify_the_leads",
+ #                         "error": result_json.get("error", "Failed to qualify leads in step 2."),
+ #                         "leads_before_qualification": len(leads_list)
+ #                     })
+ #                 leads_list = result_json.get("qualified_leads", [])
+ #                 logger.info("Step 2 completed with %d leads after qualification", len(leads_list))
+ #             except Exception as exc:
+ #                 logger.exception("Exception during Step 2 (smart list).")
+ #                 return json.dumps({
+ #                     "status": "ERROR",
+ #                     "step": "step_2_qualify_the_leads",
+ #                     "error": str(exc)
+ #                 })
+
+ #         # ==============================
+ #         # Step 3: Execute campaign cadence
+ #         # ==============================
+ #         step_3_status = "SUCCESS"
+ #         if step_3_instructions:
+ #             logger.info("Executing campaign cadence with instructions: %s", step_3_instructions)
+ #             try:
+ #                 # generate_campaign_cadence_workflow_and_execute might return JSON:
+ #                 # { "status": "SUCCESS", "result": "Campaign done" } or similar
+ #                 result_str = await generate_campaign_cadence_workflow_and_execute(
+ #                     instructions=step_3_instructions,
+ #                     input_leads=leads_list,
+ #                     tool_config=tool_config
+ #                 )
+ #                 result_json = json.loads(result_str)
+ #                 step_3_status = result_json.get("status", "ERROR")
+ #                 if step_3_status != "SUCCESS":
+ #                     return json.dumps({
+ #                         "status": step_3_status,
+ #                         "step": "step_3_execute_campaign_cadence",
+ #                         "error": result_json.get("error", "Failed to run campaign in step 3.")
+ #                     })
+ #                 logger.info("Step 3 completed successfully.")
+ #             except Exception as exc:
+ #                 logger.exception("Exception during Step 3 (campaign cadence).")
+ #                 return json.dumps({
+ #                     "status": "ERROR",
+ #                     "step": "step_3_execute_campaign_cadence",
+ #                     "error": str(exc)
+ #                 })
+
+ #         # If all steps succeed, return final leads
+ #         return json.dumps({
+ #             "status": "SUCCESS",
+ #             "steps": {
+ #                 "step_1_status": step_1_status,
+ #                 "step_2_status": step_2_status,
+ #                 "step_3_status": step_3_status
+ #             },
+ #             "final_leads_count": len(leads_list),
+ #             "leads_list": leads_list
+ #         })
+
+ #     except Exception as e:
+ #         logger.exception("Exception in generate_three_step_workflow_execute.")
+ #         return json.dumps({"status": "ERROR", "error": str(e)})
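Note: this module (compose_three_step_workflow.py) ships fully commented out in this release. For orientation only, here is a minimal, hypothetical driver showing how the entry point would be invoked if the module and the example imports it names were re-enabled; the query text and empty input list are illustrative, not part of the package.

    import asyncio
    import json

    # Hypothetical usage: assumes the commented-out module above is re-enabled
    # and that generate_leads / qualify_leads /
    # generate_campaign_cadence_workflow_and_execute exist at the example paths.
    from dhisana.utils.compose_three_step_workflow import generate_three_step_workflow_execute

    async def main() -> None:
        result_str = await generate_three_step_workflow_execute(
            user_query=(
                "Fetch leads from my Sales Navigator saved search, qualify anyone "
                "with a VP or Director title, then run a 3-touch email cadence."
            ),
            input_leads_list=[],  # only used when step 1 resolves to "use_input_file"
            tool_config=None,
        )
        print(json.dumps(json.loads(result_str), indent=2))

    asyncio.run(main())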
@@ -0,0 +1,137 @@
+ import json
+ from pydantic import BaseModel, Field
+ from dhisana.utils.assistant_tool_tag import assistant_tool
+ from dhisana.utils.built_with_api_tools import (
+     get_company_info_from_builtwith,
+     get_company_info_from_builtwith_by_name,
+ )
+ from dhisana.utils.dataframe_tools import get_structured_output
+ from dhisana.utils.google_custom_search import search_google_custom
+
+
+ class QualifyCompanyBasedOnTechUsage(BaseModel):
+     company_name: str = Field(..., description="Name of the company")
+     company_domain: str = Field(..., description="Domain of the company")
+     built_with_technology_to_check: str = Field(
+         ..., description="Which technology we are checking for usage in the company."
+     )
+     is_built_with_technology: bool = Field(
+         ..., description="True if the input technology is used by the company, based on the input data."
+     )
+     reasoning_on_built_with: str = Field(
+         ..., description="Summary of the technology found at the company and why is_built_with_technology is set to True or False."
+     )
+
+
+ def get_technologies(data, keyword):
+     """
+     Check if the keyword is found in the data JSON string.
+
+     Args:
+         data (dict): The data returned by the BuiltWith API.
+         keyword (str): The keyword to search for.
+
+     Returns:
+         bool: True if the keyword is found, False otherwise.
+     """
+     data_str = json.dumps(data).lower()
+     keyword_lower = keyword.lower()
+     return keyword_lower in data_str
+
+ @assistant_tool
+ async def find_tech_usage_in_company(
+     company_domain: str,
+     company_name: str,
+     technology_to_look_for: str,
+     company_information: str
+ ):
+     """
+     Determine if a company is using a specific technology.
+
+     Args:
+         company_domain (str): The domain name of the company's website.
+         company_name (str): The name of the company.
+         technology_to_look_for (str): The technology to look for.
+         company_information (str): Additional company information.
+
+     Returns:
+         str: A JSON string containing the structured output.
+     """
+     if not company_name:
+         return json.dumps({
+             "company_name": company_name,
+             "is_built_with_technology": False,
+             "reasoning_on_built_with": "Company name is missing."
+         })
+
+     if not company_domain:
+         company_data_buildwith = await get_company_info_from_builtwith_by_name(company_name)
+         company_domain = company_data_buildwith.get('Lookup', '')
+     else:
+         company_data_buildwith = await get_company_info_from_builtwith(company_domain)
+
+     # Search for job postings on the company's website mentioning the technology
+     search_google_results = ""
+     if company_domain:
+         company_domain_search = f"site:{company_domain} \"{company_name}\" \"{technology_to_look_for}\""
+         search_google_results = await search_google_custom(company_domain_search, 2)
+
+     # Search LinkedIn for people at the company with skills in the technology
+     linked_in_search = f"site:linkedin.com/in \"{company_name}\" \"{technology_to_look_for}\" intitle:\"{company_name}\" -intitle:\"followers\" -intitle:\"connections\" -intitle:\"profiles\" -inurl:\"dir/+\""
+     people_with_skills_results = await search_google_custom(linked_in_search, 2)
+
+     # Search LinkedIn for posts mentioning the company and technology
+     linked_in_posts_search = f"site:linkedin.com/posts \"{company_name}\" \"{technology_to_look_for}\" intitle:\"{company_name}\" -intitle:\"members\" -intitle:\"connections\""
+     linkedin_posts_search = await search_google_custom(linked_in_posts_search, 4)
+
+     # Search Twitter/X for posts mentioning the company and technology
+     twitter_posts_search_query = f'site:x.com "{company_name}" "{technology_to_look_for}" -intitle:"status"'
+     twitter_posts_search_results = await search_google_custom(twitter_posts_search_query, 4)
+
+     # General search results
+     general_search_results_query = f"\"{company_name}\" \"{technology_to_look_for}\""
+     general_search_results = await search_google_custom(general_search_results_query, 4)
+
+     # Check whether BuiltWith reports the technology for the company
+     tech_found_in_builtwith = False
+     if company_domain:
+         tech_found_in_builtwith = get_technologies(company_data_buildwith, technology_to_look_for)
+
+     # Prepare the prompt for structured output
+     prompt = f"""
+     Mark the company as qualified in is_built_with_technology if the company {company_name} is using the technology {technology_to_look_for}.
+     DO NOT make up information.
+     Give reasoning for why is_built_with_technology is set, based on one of these signals:
+     1. There is a job posting on the company website for that technology.
+     2. People with that skill at the given company were found via the LinkedIn Google search.
+     3. BuiltWith shows the company uses the input technology.
+     4. LinkedIn posts search results show a strong indication of the company using the technology.
+     5. Twitter/X posts search results show a strong indication of the company using the technology.
+     6. General search results show a strong indication of the company using the technology.
+
+     Input Company Name: {company_name}
+     Technology to look for: {technology_to_look_for}
+     Google search results on company website for technology:
+     {search_google_results}
+
+     Google search on LinkedIn for people with skills:
+     {people_with_skills_results}
+
+     LinkedIn posts search for the company:
+     {linkedin_posts_search}
+
+     Twitter/X posts search for the company:
+     {twitter_posts_search_results}
+
+     General search results:
+     {general_search_results}
+
+     Input Company Details To Lookup:
+     {company_information}
+
+     BuiltWith shows technology used: {tech_found_in_builtwith}
+     """
+
+     # Get structured output based on the prompt
+     output, _ = await get_structured_output(prompt, QualifyCompanyBasedOnTechUsage)
+     return json.dumps(output.dict())
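Per the file list, this hunk corresponds to dhisana/utils/composite_tools.py. A minimal usage sketch, assuming BuiltWith and Google Custom Search credentials are already configured for the package; the company details below are illustrative only.

    import asyncio
    import json

    from dhisana.utils.composite_tools import find_tech_usage_in_company

    async def main() -> None:
        # Hypothetical inputs; the tool fans out to BuiltWith plus several
        # Google/LinkedIn/X searches, then asks the LLM for a structured verdict.
        result_str = await find_tech_usage_in_company(
            company_domain="example.com",
            company_name="Example Corp",
            technology_to_look_for="Snowflake",
            company_information="B2B analytics vendor, ~500 employees",
        )
        verdict = json.loads(result_str)
        print(verdict["is_built_with_technology"], verdict["reasoning_on_built_with"])

    asyncio.run(main())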
@@ -0,0 +1,237 @@
+ # Tools to manipulate dataframes: convert a natural-language query into a pandas query and process dataframes.
+
+ import logging
+ import os
+ import json
+ import csv
+ import time
+ import hashlib
+ import glob
+ from typing import List, Optional
+ from fastapi import HTTPException
+ import pandas as pd
+ from pydantic import BaseModel
+ from openai import LengthFinishReasonError, AsyncOpenAI, OpenAIError
+
+ from dhisana.utils.assistant_tool_tag import assistant_tool
+ from dhisana.utils.cache_output_tools import cache_output, retrieve_output
+
+ class FileItem:
+     def __init__(self, file_path: str):
+         self.file_path = file_path
+
+ class FileList:
+     def __init__(self, files: List[FileItem]):
+         self.files = files
+
+ class PandasQuery(BaseModel):
+     pandas_query: str
+
+
+ @assistant_tool
+ async def get_structured_output(message: str, response_type, model: str = "gpt-5.1-chat"):
+     """
+     Asynchronously retrieves structured output from the OpenAI API based on the input message.
+
+     :param message: The input message to be processed by the OpenAI API.
+     :param response_type: The expected format of the response (e.g., a Pydantic model).
+     :param model: The model to be used for processing the input message. Defaults to "gpt-5.1-chat".
+     :return: A tuple containing the parsed response and a status string ('SUCCESS' or 'FAIL').
+     """
+     try:
+         # Use the class name instead of serializing the class
+         response_type_str = response_type.__name__
+
+         # Create unique hashes for message and response_type
+         message_hash = hashlib.md5(message.encode('utf-8')).hexdigest()
+         response_type_hash = hashlib.md5(response_type_str.encode('utf-8')).hexdigest()
+
+         # Generate the cache key and short-circuit on a cache hit
+         cache_key = f"{message_hash}:{response_type_hash}"
+         cached_response = retrieve_output("get_structured_output", cache_key)
+         if cached_response is not None:
+             parsed_cached_response = response_type.parse_raw(cached_response)
+             return parsed_cached_response, 'SUCCESS'
+
+         client = AsyncOpenAI()
+         completion = await client.beta.chat.completions.parse(
+             model=model,
+             messages=[
+                 {"role": "system", "content": "Extract structured content from input. Output is in JSON format."},
+                 {"role": "user", "content": message},
+             ],
+             response_format=response_type,
+             temperature=0.0
+         )
+
+         response = completion.choices[0].message
+         if response.parsed:
+             # Cache the successful response
+             cache_output("get_structured_output", cache_key, response.parsed.json())
+             return response.parsed, 'SUCCESS'
+         elif response.refusal:
+             logging.warning("ERROR: Refusal response: %s", response.refusal)
+             return response.refusal, 'FAIL'
+         # Neither a parsed result nor an explicit refusal came back
+         return None, 'FAIL'
+
+     except LengthFinishReasonError as e:
+         logging.error(f"Too many tokens: {e}")
+         raise HTTPException(status_code=502, detail="The request exceeded the maximum token limit.")
+     except OpenAIError as e:
+         logging.error(f"OpenAI API error: {e}")
+         raise HTTPException(status_code=502, detail="Error communicating with the OpenAI API.")
+     except Exception as e:
+         logging.error(f"Unexpected error: {e}")
+         return {"error": str(e)}, 'FAIL'
+
+ @assistant_tool
+ async def query_dataframes(user_query: str, input_files: Optional[List[str]], output_file_path: Optional[str] = None) -> str:
+     """
+     Query multiple dataframes based on a user query and write the resulting dataframe to the output file path.
+
+     Args:
+         user_query (str): User query in natural language.
+         input_files (List[str]): List of paths to CSV files to be loaded into dataframes.
+         output_file_path (Optional[str]): Path to the output file where the resulting dataframe will be saved.
+             If not specified, a unique file path is generated in '/tmp/run_interim_outputs/'.
+
+     Returns:
+         str: A JSON string representing the FileList containing the path to the output file if created,
+             or an error message if an error occurred.
+     """
+     max_retries = 3
+     if not input_files or not user_query:
+         return json.dumps({"files": []})
+
+     if not output_file_path:
+         output_folder = '/tmp/run_interim_outputs/'
+         os.makedirs(output_folder, exist_ok=True)
+         unique_number = int(time.time() * 1000)
+         output_file_name = f'query_dataframe_{unique_number}.csv'
+         output_file_path = os.path.join(output_folder, output_file_name)
+     else:
+         output_folder = os.path.dirname(output_file_path)
+         if output_folder:
+             os.makedirs(output_folder, exist_ok=True)
+
+     data_frames = []
+     df_names = []
+     for idx, file in enumerate(input_files):
+         # Skip empty files
+         if os.path.getsize(file) == 0:
+             continue
+         df = pd.read_csv(file)
+         data_frames.append(df)
+         df_name = f'df{idx+1}'
+         df_names.append(df_name)
+
+     if not data_frames:
+         return json.dumps({"files": []})
+
+     schema_info = ""
+     for df_name, df in zip(df_names, data_frames):
+         schema_info += f"DataFrame '{df_name}' columns: {', '.join(df.columns)}\n"
+
+     error_message = ""
+
+     for attempt in range(max_retries):
+         message = f"""
+         You are an expert data analyst. Given the following DataFrames and their schemas:
+
+         {schema_info}
+
+         Write a pandas query to answer the following question:
+
+         \"\"\"{user_query}\"\"\"
+
+         Your query should use the provided DataFrames ({', '.join(df_names)}) and produce a DataFrame named 'result_df'. Do not include any imports or explanations; only provide the pandas query code that assigns the result to 'result_df'.
+         """
+         if error_message:
+             message += f"\nThe previous query returned the following error:\n{error_message}\nPlease fix the query."
+
+         pandas_query_result, status = await get_structured_output(message, PandasQuery)
+         if status == 'SUCCESS' and pandas_query_result and pandas_query_result.pandas_query:
+             pandas_query = pandas_query_result.pandas_query
+             local_vars = {name: df for name, df in zip(df_names, data_frames)}
+             global_vars = {}
+             try:
+                 exec(pandas_query, global_vars, local_vars)
+                 result_df = local_vars.get('result_df')
+                 if result_df is None:
+                     error_message = "The query did not produce a DataFrame named 'result_df'."
+                     if attempt == max_retries - 1:
+                         return json.dumps({"error": error_message})
+                     continue
+                 break
+             except Exception as e:
+                 error_message = str(e)
+                 if attempt == max_retries - 1:
+                     return json.dumps({"error": error_message})
+                 continue
+         else:
+             if attempt == max_retries - 1:
+                 return json.dumps({"error": "Failed to get a valid pandas query after multiple attempts."})
+             continue
+
+     result_df.to_csv(output_file_path, index=False)
+
+     file_list = FileList(files=[FileItem(file_path=output_file_path)])
+
+     def file_item_to_dict(file_item):
+         return {"file_path": file_item.file_path}
+
+     file_list_dict = {
+         "files": [file_item_to_dict(file_item) for file_item in file_list.files]
+     }
+     file_list_json = json.dumps(file_list_dict, indent=2)
+     return file_list_json
+
+ @assistant_tool
+ async def load_csv_file(input_file_path: str):
+     """
+     Loads data from a CSV file and returns it as a list of dictionaries.
+
+     Args:
+         input_file_path (str): The path to the input CSV file.
+
+     Returns:
+         List[Dict[str, Any]]: List of rows from the CSV file, where each row is a dictionary.
+     """
+     with open(input_file_path, newline='') as csvfile:
+         reader = csv.DictReader(csvfile)
+         return [row for row in reader]
+
+
+ @assistant_tool
+ async def merge_csv_files(
+     input_folder_path: str,
+     extension: str,
+     required_fields: Optional[List[str]] = None,
+     dedup_by_fields: Optional[List[str]] = None,
+     sort_by_fields: Optional[List[str]] = None,
+     output_file_path: str = ""
+ ) -> str:
+     """
+     Merge all CSV files in a folder into a single CSV, with optional filtering on
+     required fields, deduplication, and sorting. Returns the output file path.
+     """
+     # Step 1: List all CSV files in the input folder with the given extension
+     all_files = glob.glob(os.path.join(input_folder_path, f"*.{extension}"))
+
+     # Step 2: Read each CSV file into a DataFrame
+     df_list = []
+     for file in all_files:
+         df = pd.read_csv(file)
+         df_list.append(df)
+
+     # Step 3: Concatenate all DataFrames
+     merged_df = pd.concat(df_list, ignore_index=True)
+
+     # Step 4: Filter rows where required fields are not empty
+     if required_fields:
+         merged_df = merged_df.dropna(subset=required_fields)
+
+     # Step 5: Remove duplicate rows based on the dedup fields
+     if dedup_by_fields:
+         merged_df = merged_df.drop_duplicates(subset=dedup_by_fields)
+
+     # Step 6: Sort the DataFrame by the sort fields
+     if sort_by_fields:
+         merged_df = merged_df.sort_values(by=sort_by_fields)
+
+     # Step 7: Write the final DataFrame to the output file
+     merged_df.to_csv(output_file_path, index=False)
+
+     return output_file_path
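Per the file list, this hunk corresponds to dhisana/utils/dataframe_tools.py. A minimal sketch of the natural-language-to-pandas flow, assuming an OPENAI_API_KEY is set so get_structured_output can reach the API; the CSV paths and query text are illustrative only.

    import asyncio
    import json

    from dhisana.utils.dataframe_tools import query_dataframes

    async def main() -> None:
        # The tool loads each CSV as df1, df2, ..., asks the model for a pandas
        # query that assigns to result_df, retries up to 3 times on errors, and
        # writes the result to a CSV under /tmp/run_interim_outputs/.
        result_str = await query_dataframes(
            user_query="Join df1 and df2 on the 'email' column and keep rows where 'title' contains 'VP'",
            input_files=["/tmp/leads.csv", "/tmp/enrichment.csv"],  # hypothetical inputs
        )
        print(json.loads(result_str))  # {"files": [{"file_path": "..."}]} on success

    asyncio.run(main())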
@@ -0,0 +1,45 @@
+ # A set of domains that should be excluded because they are social or bio/link aggregator services.
+ import tldextract
+
+
+ EXCLUDED_LINK_DOMAINS = [
+     "beacon.ai",
+     "tap.bio",
+     "campsite.bio",
+     "shor.by",
+     "milkshake.app",
+     "lnk.bio",
+     "carrd.co",
+     "bio.fm",
+     "withkoji.com",
+     "flowcode.com",
+     "biolinky.co",
+     "contactinbio.com",
+     "linktr.ee",
+     "linkedin.com",
+     "facebook.com",
+     "youtube.com",
+ ]
+
+ def get_domain_from_website(website: str) -> str:
+     """
+     Extracts the domain from a given website URL using tldextract.
+     Returns an empty string if no website is provided.
+
+     :param website: The full URL from which to extract the domain.
+     :return: Extracted domain in the form 'example.com', or '' if none.
+     """
+     if not website:
+         return ""
+     extracted = tldextract.extract(website)
+     return f"{extracted.domain}.{extracted.suffix}"
+
+
+ def is_excluded_domain(domain: str) -> bool:
+     """
+     Checks if the domain is in the EXCLUDED_LINK_DOMAINS list.
+
+     :param domain: The domain (without protocol) to be checked.
+     :return: True if the domain is excluded, False otherwise.
+     """
+     return domain.lower() in EXCLUDED_LINK_DOMAINS
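This final hunk corresponds to dhisana/utils/domain_parser.py. The two helpers compose naturally; a small sketch (the URLs are illustrative):

    from dhisana.utils.domain_parser import get_domain_from_website, is_excluded_domain

    # Link aggregators and social profiles are filtered out...
    print(get_domain_from_website("https://linktr.ee/somebody"))   # "linktr.ee"
    print(is_excluded_domain("linktr.ee"))                         # True

    # ...while ordinary company websites pass through.
    print(get_domain_from_website("https://www.example.com/about"))  # "example.com"
    print(is_excluded_domain("example.com"))                         # False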