dhisana 0.0.1.dev243__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. dhisana/__init__.py +1 -0
  2. dhisana/cli/__init__.py +1 -0
  3. dhisana/cli/cli.py +20 -0
  4. dhisana/cli/datasets.py +27 -0
  5. dhisana/cli/models.py +26 -0
  6. dhisana/cli/predictions.py +20 -0
  7. dhisana/schemas/__init__.py +1 -0
  8. dhisana/schemas/common.py +399 -0
  9. dhisana/schemas/sales.py +965 -0
  10. dhisana/ui/__init__.py +1 -0
  11. dhisana/ui/components.py +472 -0
  12. dhisana/utils/__init__.py +1 -0
  13. dhisana/utils/add_mapping.py +352 -0
  14. dhisana/utils/agent_tools.py +51 -0
  15. dhisana/utils/apollo_tools.py +1597 -0
  16. dhisana/utils/assistant_tool_tag.py +4 -0
  17. dhisana/utils/built_with_api_tools.py +282 -0
  18. dhisana/utils/cache_output_tools.py +98 -0
  19. dhisana/utils/cache_output_tools_local.py +78 -0
  20. dhisana/utils/check_email_validity_tools.py +717 -0
  21. dhisana/utils/check_for_intent_signal.py +107 -0
  22. dhisana/utils/check_linkedin_url_validity.py +209 -0
  23. dhisana/utils/clay_tools.py +43 -0
  24. dhisana/utils/clean_properties.py +135 -0
  25. dhisana/utils/company_utils.py +60 -0
  26. dhisana/utils/compose_salesnav_query.py +259 -0
  27. dhisana/utils/compose_search_query.py +759 -0
  28. dhisana/utils/compose_three_step_workflow.py +234 -0
  29. dhisana/utils/composite_tools.py +137 -0
  30. dhisana/utils/dataframe_tools.py +237 -0
  31. dhisana/utils/domain_parser.py +45 -0
  32. dhisana/utils/email_body_utils.py +72 -0
  33. dhisana/utils/email_parse_helpers.py +132 -0
  34. dhisana/utils/email_provider.py +375 -0
  35. dhisana/utils/enrich_lead_information.py +933 -0
  36. dhisana/utils/extract_email_content_for_llm.py +101 -0
  37. dhisana/utils/fetch_openai_config.py +129 -0
  38. dhisana/utils/field_validators.py +426 -0
  39. dhisana/utils/g2_tools.py +104 -0
  40. dhisana/utils/generate_content.py +41 -0
  41. dhisana/utils/generate_custom_message.py +271 -0
  42. dhisana/utils/generate_email.py +278 -0
  43. dhisana/utils/generate_email_response.py +465 -0
  44. dhisana/utils/generate_flow.py +102 -0
  45. dhisana/utils/generate_leads_salesnav.py +303 -0
  46. dhisana/utils/generate_linkedin_connect_message.py +224 -0
  47. dhisana/utils/generate_linkedin_response_message.py +317 -0
  48. dhisana/utils/generate_structured_output_internal.py +462 -0
  49. dhisana/utils/google_custom_search.py +267 -0
  50. dhisana/utils/google_oauth_tools.py +727 -0
  51. dhisana/utils/google_workspace_tools.py +1294 -0
  52. dhisana/utils/hubspot_clearbit.py +96 -0
  53. dhisana/utils/hubspot_crm_tools.py +2440 -0
  54. dhisana/utils/instantly_tools.py +149 -0
  55. dhisana/utils/linkedin_crawler.py +168 -0
  56. dhisana/utils/lusha_tools.py +333 -0
  57. dhisana/utils/mailgun_tools.py +156 -0
  58. dhisana/utils/mailreach_tools.py +123 -0
  59. dhisana/utils/microsoft365_tools.py +455 -0
  60. dhisana/utils/openai_assistant_and_file_utils.py +267 -0
  61. dhisana/utils/openai_helpers.py +977 -0
  62. dhisana/utils/openapi_spec_to_tools.py +45 -0
  63. dhisana/utils/openapi_tool/__init__.py +1 -0
  64. dhisana/utils/openapi_tool/api_models.py +633 -0
  65. dhisana/utils/openapi_tool/convert_openai_spec_to_tool.py +271 -0
  66. dhisana/utils/openapi_tool/openapi_tool.py +319 -0
  67. dhisana/utils/parse_linkedin_messages_txt.py +100 -0
  68. dhisana/utils/profile.py +37 -0
  69. dhisana/utils/proxy_curl_tools.py +1226 -0
  70. dhisana/utils/proxycurl_search_leads.py +426 -0
  71. dhisana/utils/python_function_to_tools.py +83 -0
  72. dhisana/utils/research_lead.py +176 -0
  73. dhisana/utils/sales_navigator_crawler.py +1103 -0
  74. dhisana/utils/salesforce_crm_tools.py +477 -0
  75. dhisana/utils/search_router.py +131 -0
  76. dhisana/utils/search_router_jobs.py +51 -0
  77. dhisana/utils/sendgrid_tools.py +162 -0
  78. dhisana/utils/serarch_router_local_business.py +75 -0
  79. dhisana/utils/serpapi_additional_tools.py +290 -0
  80. dhisana/utils/serpapi_google_jobs.py +117 -0
  81. dhisana/utils/serpapi_google_search.py +188 -0
  82. dhisana/utils/serpapi_local_business_search.py +129 -0
  83. dhisana/utils/serpapi_search_tools.py +852 -0
  84. dhisana/utils/serperdev_google_jobs.py +125 -0
  85. dhisana/utils/serperdev_local_business.py +154 -0
  86. dhisana/utils/serperdev_search.py +233 -0
  87. dhisana/utils/smtp_email_tools.py +582 -0
  88. dhisana/utils/test_connect.py +2087 -0
  89. dhisana/utils/trasform_json.py +173 -0
  90. dhisana/utils/web_download_parse_tools.py +189 -0
  91. dhisana/utils/workflow_code_model.py +5 -0
  92. dhisana/utils/zoominfo_tools.py +357 -0
  93. dhisana/workflow/__init__.py +1 -0
  94. dhisana/workflow/agent.py +18 -0
  95. dhisana/workflow/flow.py +44 -0
  96. dhisana/workflow/task.py +43 -0
  97. dhisana/workflow/test.py +90 -0
  98. dhisana-0.0.1.dev243.dist-info/METADATA +43 -0
  99. dhisana-0.0.1.dev243.dist-info/RECORD +102 -0
  100. dhisana-0.0.1.dev243.dist-info/WHEEL +5 -0
  101. dhisana-0.0.1.dev243.dist-info/entry_points.txt +2 -0
  102. dhisana-0.0.1.dev243.dist-info/top_level.txt +1 -0
dhisana/utils/enrich_lead_information.py
@@ -0,0 +1,933 @@
+ """
+ This module provides a set of functions to enrich lead and organization information
+ using various enrichment tools such as Apollo or ProxyCurl. It also allows
+ extraction and validation of domains from user-provided links or company websites.
+ """
+
+ import re
+ from typing import Any, Dict, Iterator, List, Optional
+ from urllib.parse import urlparse
+
+ from pydantic import BaseModel, Field
+ import mdformat
+
+ from dhisana.utils.check_email_validity_tools import process_email_properties
+ from dhisana.utils.company_utils import normalize_company_name
+ from dhisana.utils.field_validators import (
+     normalize_linkedin_url,
+     normalize_linkedin_company_url,
+     normalize_salesnav_url,
+     normalize_linkedin_company_salesnav_url,
+     validate_and_clean_email,
+     validation_organization_domain,
+     validate_website_url
+ )
+ from dhisana.utils.apollo_tools import enrich_user_info_with_apollo
+ from dhisana.utils.assistant_tool_tag import assistant_tool
+ from dhisana.utils.domain_parser import get_domain_from_website, is_excluded_domain
+ from dhisana.utils.generate_structured_output_internal import get_structured_output_internal
+ from dhisana.utils.proxy_curl_tools import (
+     enrich_job_info_from_proxycurl,
+     enrich_organization_info_from_proxycurl,
+     enrich_user_info_with_proxy_curl,
+ )
+ from dhisana.utils.research_lead import research_company_with_full_info_ai, research_lead_with_full_info_ai
+ from dhisana.utils.serpapi_search_tools import (
+     find_organization_linkedin_url_with_google_search,
+     find_user_linkedin_url_by_email_google,
+     find_user_linkedin_url_google,
+     find_user_linkedin_url_with_serper,
+     get_company_website_from_linkedin_url,
+ )
+
+ import logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ # ----------------------------------------------------------------------
+ # Allowed Enrichment Tools
+ # ----------------------------------------------------------------------
+ ALLOWED_ENRICHMENT_TOOLS = ["proxycurl", "apollo", "zoominfo"]
+
+ USER_LOOKUP_TOOL_NAME_TO_FUNCTION_MAP = {
+     "apollo": enrich_user_info_with_apollo,
+     "proxycurl": enrich_user_info_with_proxy_curl,
+ }
+
+
+ # ----------------------------------------------------------------------
+ # BasicLeadInformation model
+ # ----------------------------------------------------------------------
+ class BasicLeadInformation(BaseModel):
+     full_name: str = Field(..., description="Full name of the lead")
+     first_name: str = Field(..., description="First name of the lead")
+     last_name: str = Field(..., description="Last name of the lead")
+     email: str = Field(..., description="Email address of the lead")
+     primary_domain_of_organization: str = Field(..., description="Primary domain of the organization")
+     job_title: str = Field(..., description="Job Title of the lead")
+     phone: str = Field(..., description="Phone number of the lead")
+     headline: str = Field(..., description="Headline of the lead")
+     lead_location: str = Field(..., description="Location of the lead")
+     organization_name: str = Field(..., description="Current Company where lead works")
+     common_connections: int = Field(..., description="Number of common connections with the lead. Default 0")
+     followers_count: int = Field(..., description="Number of followers of the lead. Default 0")
+     tenure_in_current_role: str = Field(..., description="Tenure in the current role")
+     tenure_in_current_company: str = Field(..., description="Tenure in the current company")
+     connection_degree: str = Field(..., description="Degree of connection with the lead (1st, 2nd, 3rd)")
+     is_premium_account: bool = Field(..., description="Is the lead a premium account. Default is false.")
+     country_code: str = Field(..., description="Alpha-2 ISO3166 country code eg. US")
+
+
+ # ----------------------------------------------------------------------
+ # Helper: chunkify
+ # ----------------------------------------------------------------------
+ def chunkify(items: List[Any], chunk_size: int) -> Iterator[List[Any]]:
+     """
+     Splits a list into sublists (chunks) of size `chunk_size`.
+     """
+     for i in range(0, len(items), chunk_size):
+         yield items[i : i + chunk_size]
+
+
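A quick sketch of how the generator behaves (the trailing chunk may be shorter than `chunk_size`):

    batches = list(chunkify([1, 2, 3, 4, 5], 2))
    # -> [[1, 2], [3, 4], [5]]
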
+ # ----------------------------------------------------------------------
+ # Function: cleanup_user_name
+ # ----------------------------------------------------------------------
+ def cleanup_user_name(cloned_properties: dict) -> dict:
+     """
+     Cleans up user name fields: 'full_name', 'first_name', 'last_name'.
+     Returns the updated dictionary. If values are invalid or placeholders, sets them to ''.
+     """
+     if not isinstance(cloned_properties, dict):
+         return {}
+
+     def normalize(name: str) -> str:
+         if not name or not isinstance(name, str):
+             return ""
+         # Common placeholders or invalid tokens
+         invalid_tokens = [
+             "null", "none", "na", "n.a", "notfound", "error",
+             "na.", "na,", "notavilable", "notavailable", ""
+         ]
+         stripped = name.strip().lower()
+         if stripped in invalid_tokens:
+             return ""
+
+         # Remove anything in parentheses
+         stripped = re.sub(r"\(.*?\)", "", stripped)
+         # Remove anything after '|'
+         stripped = stripped.split("|", 1)[0]
+         # Remove extra non-alphanumeric characters (but allow whitespace)
+         stripped = re.sub(r"[^a-zA-Z0-9\s]", "", stripped)
+
+         # Capitalize the first letter of each word, and lowercase the rest
+         return " ".join(word.capitalize() for word in stripped.strip().split())
+
+     full_name = normalize(cloned_properties.get("full_name"))
+     first_name = normalize(cloned_properties.get("first_name"))
+     last_name = normalize(cloned_properties.get("last_name"))
+
+     # If full_name is empty, build from first_name + last_name
+     if first_name and last_name and not full_name:
+         full_name = (first_name + " " + last_name).strip()
+
+     cloned_properties["full_name"] = full_name
+     cloned_properties["first_name"] = first_name
+     cloned_properties["last_name"] = last_name
+
+     return cloned_properties
+
+
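For example, the normalization strips parenthesized fragments (such as pronouns), drops anything after a '|', and title-cases what remains:

    row = {"full_name": "jane doe (she/her) | Acme", "first_name": "JANE", "last_name": "doe"}
    cleanup_user_name(row)
    # -> {"full_name": "Jane Doe", "first_name": "Jane", "last_name": "Doe"}
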
+ # ----------------------------------------------------------------------
+ # LLM-based cleanup for single lead
+ # ----------------------------------------------------------------------
+ async def get_clean_lead_info_with_llm(lead_info_str: str, tool_config: Optional[dict]) -> Dict[str, Any]:
+     """
+     Takes a JSON string representation of partial lead info,
+     returns a cleaned-up lead dictionary matching BasicLeadInformation fields.
+     """
+     prompt = f"""
+     Given the following data about a lead and the organization they work for,
+     extract and clean up the lead information.
+     - Format 'full_name' properly.
+     - Format 'first_name' and 'last_name' so they're capitalized properly if available.
+     - Make sure 'organization_name' is properly capitalized if provided.
+     - Do not invent data that isn't provided.
+
+     Data:
+     {lead_info_str}
+
+     The output format is in JSON. The expected fields match BasicLeadInformation.
+     """
+     lead_info, status = await get_structured_output_internal(
+         prompt,
+         BasicLeadInformation,
+         model="gpt-5.1-chat",
+         tool_config=tool_config
+     )
+     if status == "ERROR":
+         return {}
+     return lead_info.model_dump()
+
+
+ # ----------------------------------------------------------------------
+ # Helper: is_personal_email_domain
+ # ----------------------------------------------------------------------
+ def is_personal_email_domain(domain: str) -> bool:
+     """
+     Very simple check to see if the domain is one of the common free/personal
+     email providers. Could expand this list or integrate a third-party API
+     for more accuracy.
+     """
+     common_free_domains = {
+         "gmail.com", "yahoo.com", "hotmail.com", "outlook.com",
+         "protonmail.com", "icloud.com", "aol.com", "mail.com",
+         "pm.me", "yandex.com", "gmx.com"
+     }
+     domain = domain.strip().lower()
+     return (domain in common_free_domains) or domain.endswith(".edu")
+
+
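Note that any .edu address is treated as non-corporate as well:

    assert is_personal_email_domain("gmail.com")
    assert is_personal_email_domain("cs.stanford.edu")  # *.edu counts as personal here
    assert not is_personal_email_domain("acme.io")      # treated as corporate
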
+ # ----------------------------------------------------------------------
+ # Main validation & cleanup function
+ # ----------------------------------------------------------------------
+ async def validate_and_cleanup(
+     cloned_properties: dict,
+     tool_config: Optional[dict] = None,
+     use_strict_check: bool = False
+ ) -> dict:
+     """
+     Wrapper to validate & normalize various properties in a dictionary.
+
+     1) Clean up/validate typical fields.
+     2) If name fields appear invalid, fall back to LLM-based name inference.
+     3) If 'primary_domain_of_organization' AND 'organization_website' are both empty,
+        but there's a valid corporate email, use that as the domain.
+     4) (Optional) Enrich the organization info from the name if needed.
+     """
+
+     if not isinstance(cloned_properties, dict):
+         return {}
+
+     # ------------------------------------------------------------------
+     # Step 1: Normalize typical fields
+     # ------------------------------------------------------------------
+     cloned_properties["user_linkedin_url"] = normalize_linkedin_url(
+         cloned_properties.get("user_linkedin_url")
+     )
+     cloned_properties["user_linkedin_salesnav_url"] = normalize_salesnav_url(
+         cloned_properties.get("user_linkedin_salesnav_url")
+     )
+     cloned_properties["organization_linkedin_url"] = normalize_linkedin_company_url(
+         cloned_properties.get("organization_linkedin_url")
+     )
+     cloned_properties["organization_linkedin_salesnav_url"] = normalize_linkedin_company_salesnav_url(
+         cloned_properties.get("organization_linkedin_salesnav_url")
+     )
+     cloned_properties["email"] = validate_and_clean_email(
+         cloned_properties.get("email")
+     )
+     cloned_properties["primary_domain_of_organization"] = validation_organization_domain(
+         cloned_properties.get("primary_domain_of_organization")
+     )
+     cloned_properties["organization_website"] = validate_website_url(
+         cloned_properties.get("organization_website")
+     )
+     cloned_properties["organization_name"] = normalize_company_name(
+         cloned_properties.get("organization_name")
+     )
+
+     # ------------------------------------------------------------------
+     # Step 2: Basic name-check. If invalid => LLM fallback.
+     # ------------------------------------------------------------------
+     def has_special_characters(val: str) -> bool:
+         return bool(re.search(r"[^a-zA-Z0-9\s]", val))
+
+     def is_invalid_name(val: str) -> bool:
+         return (len(val.strip()) < 3) or has_special_characters(val)
+
+     full_name = cloned_properties.get("full_name", "")
+     first_name = cloned_properties.get("first_name", "")
+     last_name = cloned_properties.get("last_name", "")
+     if not full_name or full_name.startswith("None"):
+         full_name = ""
+     if not first_name or first_name.startswith("None"):
+         first_name = ""
+     if not last_name or last_name.startswith("None"):
+         last_name = ""
+
+     if (
+         is_invalid_name(full_name)
+         or is_invalid_name(first_name)
+         or is_invalid_name(last_name)
+     ):
+         # Check if we have a valid LinkedIn URL - if so, skip LLM as ProxyCurl will fill the data
+         user_linkedin_url = (cloned_properties.get("user_linkedin_url") or "").strip()
+         if not user_linkedin_url:
+             lead_info_str = str(cloned_properties)
+             logger.info(
+                 "Detected invalid name fields. Using LLM to infer/correct name fields."
+             )
+             # Attempt LLM-based cleanup
+             new_lead_info = await get_clean_lead_info_with_llm(lead_info_str, tool_config=tool_config)
+             if new_lead_info:
+                 cloned_properties["full_name"] = new_lead_info.get("full_name", "")
+                 cloned_properties["first_name"] = new_lead_info.get("first_name", "")
+                 cloned_properties["last_name"] = new_lead_info.get("last_name", "")
+         else:
+             logger.info("Valid LinkedIn URL found. Skipping LLM cleanup as ProxyCurl will enrich the data.")
+     else:
+         # Use the cheaper logic
+         cloned_properties = cleanup_user_name(cloned_properties)
+
+     # ------------------------------------------------------------------
+     # Step 3: If domain & website are empty but there's a corporate email
+     # ------------------------------------------------------------------
+     # - If email is present, check if domain is personal or corporate
+     # - If corporate, set primary_domain_of_organization from email domain
+     # ------------------------------------------------------------------
+     domain_empty = not cloned_properties.get("primary_domain_of_organization")
+     website_empty = not cloned_properties.get("organization_website")
+     email = cloned_properties.get("email", "")
+
+     if domain_empty and website_empty and email:
+         # parse domain from email
+         extracted_domain = email.split("@")[-1].strip().lower()
+         if extracted_domain and (not is_personal_email_domain(extracted_domain)):
+             # This is a "corporate" email domain, so use it
+             cloned_properties["primary_domain_of_organization"] = extracted_domain
+             cloned_properties["organization_website"] = f"https://www.{extracted_domain}"
+             logger.info("Set primary_domain_of_organization from corporate email domain.")
+
+     if domain_empty and not website_empty:
+         parsed_website = urlparse(cloned_properties["organization_website"])
+         possible_domain = parsed_website.netloc.replace("www.", "")
+         if possible_domain:
+             cloned_properties["primary_domain_of_organization"] = possible_domain
+             logger.info("Set primary_domain_of_organization from organization_website domain.")
+     return cloned_properties
+
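A sketch of the Step 3 inference, inside an async context and assuming the field validators pass a well-formed record through unchanged:

    row = {"full_name": "Pat Lee", "first_name": "Pat", "last_name": "Lee",
           "email": "pat@acme.io"}
    row = await validate_and_cleanup(row)
    # row["primary_domain_of_organization"] -> "acme.io"
    # row["organization_website"] -> "https://www.acme.io"
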
+ @assistant_tool
+ async def enrich_lead_information(
+     user_properties: Dict[str, Any],
+     use_strict_check: bool = True,
+     get_valid_email: bool = True,
+     company_research_instructions: str = "",
+     lead_research_instructions: str = "",
+     enrich_company_information: bool = True,
+     enrich_lead_information: bool = True,
+     tool_config: Optional[List[Dict[str, Any]]] = None,
+ ) -> Dict[str, Any]:
+     logger.debug("Starting enrich_lead_information with user_properties: %s", user_properties)
+     cloned_properties = dict(user_properties)
+
+     cloned_properties = await validate_and_cleanup(cloned_properties, tool_config=tool_config, use_strict_check=use_strict_check)
+
+     cloned_properties = await enrich_user_info(
+         input_properties=cloned_properties,
+         use_strict_check=use_strict_check,
+         tool_config=tool_config,
+     )
+     if use_strict_check and not cloned_properties.get("user_linkedin_url") and not cloned_properties.get("email"):
+         return cloned_properties
+
+     await enrich_organization_info_from_name(
+         row=cloned_properties,
+         use_strict_check=use_strict_check,
+         tool_config=tool_config,
+     )
+
+     cloned_properties = await enrich_with_provider(cloned_properties, tool_config)
+
+     await enrich_organization_info_from_name(
+         row=cloned_properties,
+         use_strict_check=use_strict_check,
+         tool_config=tool_config,
+     )
+
+     if get_valid_email:
+         await process_email_properties(cloned_properties, tool_config)
+
+     # ------------------------------------------------------------------
+     # Supplement missing follower count or name information using Serper
+     # ------------------------------------------------------------------
+     linkedin_url = (cloned_properties.get("user_linkedin_url") or "").strip()
+     follower_count = cloned_properties.get("linkedin_follower_count")
+     first_name = cloned_properties.get("first_name")
+     if (
+         linkedin_url
+         and (follower_count is None or (isinstance(follower_count, str) and not follower_count.strip()) or not first_name)
+     ):
+         serper_result = await find_user_linkedin_url_with_serper(
+             linkedin_url, tool_config=tool_config
+         )
+         if serper_result:
+             if follower_count is None or (
+                 isinstance(follower_count, str) and not follower_count.strip()
+             ):
+                 cloned_properties["linkedin_follower_count"] = serper_result.get(
+                     "linkedin_follower_count", 0
+                 )
+             if not first_name:
+                 cloned_properties["first_name"] = serper_result.get("first_name", "")
+                 cloned_properties["last_name"] = serper_result.get("last_name", "")
+
+     cloned_properties = await validate_and_cleanup(
+         cloned_properties, tool_config=tool_config, use_strict_check=use_strict_check
+     )
+
+     research_summary = cloned_properties.get("research_summary", "")
+
+     if enrich_lead_information:
+         summary = await research_lead_with_full_info_ai(
+             cloned_properties, lead_research_instructions, tool_config=tool_config
+         )
+         if summary:
+             research_summary = summary.get("research_summary", "")
+
+     if enrich_company_information:
+         company_properties = {
+             "organization_name": cloned_properties.get("organization_name", ""),
+             "primary_domain_of_organization": cloned_properties.get("primary_domain_of_organization", ""),
+             "organization_website": cloned_properties.get("organization_website", ""),
+         }
+         company_summary = await research_company_with_full_info_ai(
+             company_properties,
+             company_research_instructions,
+             tool_config=tool_config,
+         )
+         if company_summary:
+             markdown_text = research_summary + "\n\n#### " + company_summary.get(
+                 "research_summary", ""
+             )
+             formatted_markdown = mdformat.text(markdown_text)
+             research_summary = re.sub(
+                 r'^(#{1,6})\s+', '##### ', formatted_markdown, flags=re.MULTILINE
+             )
+
+     cloned_properties["research_summary"] = research_summary
+     return cloned_properties
+
+
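A minimal end-to-end sketch. The tool name shown is illustrative; real tool_config entries presumably also carry the provider credentials the underlying helpers expect:

    import asyncio

    lead = {"full_name": "Pat Lee", "organization_name": "Acme", "email": "pat@acme.io"}
    tool_config = [{"name": "proxycurl"}]  # hypothetical config entry

    enriched = asyncio.run(
        enrich_lead_information(lead, use_strict_check=False, tool_config=tool_config)
    )
    print(enriched.get("user_linkedin_url"), enriched.get("primary_domain_of_organization"))
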
+ class UserInfoFromGithubProfileId(BaseModel):
+     first_name: str
+     last_name: str
+     full_name: str
+     linkedin_url: str
+     github_url: str
+     email: str
+     twitter_handle: str
+     website: str
+     location: str
+
+
+ def extract_id_from_salesnav_url(url_key: str) -> str:
+     """
+     Extract the Sales Navigator lead ID from a URL like
+     'https://www.linkedin.com/sales/lead/<ID>?...'
+     """
+     if not url_key:
+         return ""
+     match = re.search(r"linkedin\.com/sales/lead/([^/?#,]+)", url_key, re.IGNORECASE)
+     if not match:
+         return ""
+     # strip out any non-word or hyphen chars
+     return re.sub(r"[^\w-]", "", match.group(1))
+
+
+ def proxy_linkedin_url(user_linkedin_salesnav_url: str) -> str:
+     """
+     Given a Sales Navigator URL, return the corresponding public LinkedIn URL.
+     Raises ValueError if the ID cannot be extracted.
+     """
+     salesnav_id = extract_id_from_salesnav_url(user_linkedin_salesnav_url)
+     if not salesnav_id:
+         raise ValueError("Could not extract ID from Sales Nav URL.")
+     return f"https://www.linkedin.com/in/{salesnav_id}"
+
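The fallback URL is purely mechanical (the extracted lead ID is reused as the /in/ slug), so it is a best-effort guess rather than a verified public profile:

    proxy_linkedin_url("https://www.linkedin.com/sales/lead/ACwAAA12345,NAME?trk=x")
    # -> "https://www.linkedin.com/in/ACwAAA12345"
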
+
+ # -------------------------------------------------------------------
+ # (Pseudo) get_structured_output_internal, find_user_linkedin_url_google
+ # and other references assumed to exist in your environment.
+ # -------------------------------------------------------------------
+
+ async def get_user_linkedin_url_from_github_profile(
+     github_profile_id: str,
+     lead_properties: dict,
+     instructions: str,
+     tool_config: Optional[List[Dict]] = None
+ ) -> Dict[str, Any]:
+     """
+     Attempt to locate a user's LinkedIn profile URL from their GitHub profile ID via web search.
+     Also gather basic user info (first/last name) if possible.
+     """
+     instructions = f"""
+     Given the user's GitHub handle, gather user information and try to locate
+     the LinkedIn profile URL for the user using web search.
+     ---
+     Github profile id:
+     {github_profile_id}
+     Company data including name, domain, and website:
+     {lead_properties}
+
+     Instructions:
+     {instructions}
+     ---
+     Use websearch to locate the LinkedIn profile url for the user if present.
+
+     **Output**:
+     Return your final output as valid JSON with the following structure:
+     {{
+         "first_name": "...",
+         "last_name": "...",
+         "full_name": "...",
+         "linkedin_url": "...",
+         "github_url": "...",
+         "email": "...",
+         "twitter_handle": "...",
+         "website": "...",
+         "location": "..."
+     }}
+     """
+
+     # Example call to structured output function
+     response, status = await get_structured_output_internal(
+         instructions,
+         UserInfoFromGithubProfileId,
+         model="gpt-5.1-chat",
+         use_web_search=True,
+         tool_config=tool_config
+     )
+     if status == "SUCCESS":
+         # Return a plain dict so callers can use .get(), matching the other helpers
+         return response.model_dump()
+     else:
+         return {}
+
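Illustrative call, inside an async context (the handle and lead data are made up; tool_config is defined elsewhere):

    info = await get_user_linkedin_url_from_github_profile(
        github_profile_id="octocat",
        lead_properties={"organization_name": "GitHub"},
        instructions="",
        tool_config=tool_config,
    )
    linkedin_url = info.get("linkedin_url", "")
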
+ async def enrich_user_info(
+     input_properties: Dict[str, Any],
+     use_strict_check: bool,
+     tool_config: Optional[List[Dict[str, Any]]] = None,
+ ) -> Dict[str, Any]:
+     """
+     Attempt to find or fix a user's LinkedIn URL using name, title, location,
+     company info or GitHub profile handle if present. If still not found,
+     but user_linkedin_salesnav_url exists, we fall back to creating a
+     proxy URL from the Sales Navigator link.
+     """
+     logger.debug("Starting enrich_user_info for: %s", input_properties.get("full_name"))
+     user_linkedin_url = (input_properties.get("user_linkedin_url") or "").strip()
+     input_properties["linkedin_url_match"] = False
+     github_profile_id = (input_properties.get("github_profile_id") or "").strip()
+
+     # 1) If we do not have a user_linkedin_url, try getting it from GitHub
+     if not user_linkedin_url:
+         if github_profile_id:
+             response = await get_user_linkedin_url_from_github_profile(
+                 github_profile_id=github_profile_id,
+                 lead_properties=input_properties,
+                 instructions="Use web search to find the user's LinkedIn profile from GitHub handle if present.",
+                 tool_config=tool_config,
+             )
+             user_linkedin_url = response.get("linkedin_url", "")
+             if user_linkedin_url:
+                 input_properties["user_linkedin_url"] = user_linkedin_url
+                 if not input_properties.get("first_name"):
+                     input_properties["first_name"] = response.get("first_name", "")
+                 if not input_properties.get("last_name"):
+                     input_properties["last_name"] = response.get("last_name", "")
+                 if not input_properties.get("email"):
+                     input_properties["email"] = response.get("email", "")
+                 if not input_properties.get("lead_location"):
+                     input_properties["lead_location"] = response.get("location", "")
+                 return input_properties
+
+         # 2) If still no LinkedIn URL, try name/title/org searching
+         full_name = (input_properties.get("full_name") or "").strip()
+         if not full_name:
+             first_name = (input_properties.get("first_name", "") or "").strip()
+             last_name = (input_properties.get("last_name", "") or "").strip()
+             full_name = f"{first_name} {last_name}".strip()
+
+         title = input_properties.get("job_title", "") or ""
+         location = input_properties.get("lead_location", "") or ""
+         org_name = (input_properties.get("organization_name", "") or "").strip()
+         org_domain = (input_properties.get("primary_domain_of_organization", "") or "").strip()
+         email = (input_properties.get("email") or "").strip()
+
+         if full_name and (org_name or org_domain or title):
+             # This function does a google-based search for the user's LinkedIn
+             found_linkedin_url = await find_user_linkedin_url_google(
+                 user_name=full_name,
+                 user_title=title,
+                 user_location=location,
+                 user_company=org_name,
+                 user_company_domain=org_domain,
+                 use_strict_check=use_strict_check,
+                 tool_config=tool_config,
+             )
+             if found_linkedin_url:
+                 user_linkedin_url = found_linkedin_url
+                 input_properties["user_linkedin_url"] = user_linkedin_url
+         if not user_linkedin_url and email:
+             # If we still have no LinkedIn URL but do have an email, try searching by email
+             email_lookup_result = await find_user_linkedin_url_by_email_google(
+                 email=email,
+                 user_name=full_name,
+                 user_title=title,
+                 user_location=location,
+                 user_company=org_name,
+                 tool_config=tool_config,
+             )
+             if email_lookup_result and email_lookup_result.get("linkedin_url"):
+                 user_linkedin_url = email_lookup_result["linkedin_url"]
+                 input_properties["user_linkedin_url"] = user_linkedin_url
+                 confidence = email_lookup_result.get("confidence", 0.0)
+                 reasoning = email_lookup_result.get("reasoning", "")
+                 input_properties["user_linkedin_url_confidence"] = confidence
+                 input_properties["user_linkedin_url_reasoning"] = reasoning
+
+                 additional_properties = input_properties.get("additional_properties") or {}
+                 additional_properties["user_linkedin_url_confidence"] = confidence
+                 if reasoning:
+                     additional_properties["user_linkedin_url_reasoning"] = reasoning
+                 input_properties["additional_properties"] = additional_properties
+
+     # 3) Final fallback: if STILL no user_linkedin_url,
+     #    but user_linkedin_salesnav_url is present, use proxy
+     if not input_properties.get("user_linkedin_url"):
+         salesnav_url = input_properties.get("user_linkedin_salesnav_url", "")
+         if salesnav_url:
+             try:
+                 proxy_url = proxy_linkedin_url(salesnav_url)
+                 input_properties["user_linkedin_url"] = proxy_url
+                 logger.debug("Falling back to proxy LinkedIn URL from SalesNav: %s", proxy_url)
+             except ValueError:
+                 # If we can't parse an ID from the sales nav URL, skip
+                 logger.warning("Could not parse ID from user_linkedin_salesnav_url: %s", salesnav_url)
+
+     return input_properties
+
+
+ async def enrich_with_provider(
+     cloned_properties: Dict[str, Any],
+     tool_config: Optional[List[Dict[str, Any]]],
+ ) -> Dict[str, Any]:
+     """
+     Enrich user/lead data using one of the allowed provider tools (e.g., Apollo, ZoomInfo).
+     The tool_config should specify which tool(s) to use.
+
+     :param cloned_properties: Dictionary containing user/lead details to be enriched.
+     :param tool_config: List of tool configuration dicts, e.g. [{"name": "apollo"}, ...].
+     :return: The updated dictionary after enrichment.
+     :raises ValueError: If no tool_config is provided or no suitable enrichment tool is found.
+     """
+     if not tool_config:
+         raise ValueError("No tool configuration found.")
+
+     chosen_tool_func = None
+     for allowed_tool_name in ALLOWED_ENRICHMENT_TOOLS:
+         for item in tool_config:
+             logger.debug("Checking tool: %s", item.get("name"))
+             if item.get("name") == allowed_tool_name and allowed_tool_name in USER_LOOKUP_TOOL_NAME_TO_FUNCTION_MAP:
+                 chosen_tool_func = USER_LOOKUP_TOOL_NAME_TO_FUNCTION_MAP[allowed_tool_name]
+                 break
+         if chosen_tool_func:
+             break
+
+     if not chosen_tool_func:
+         raise ValueError("No suitable enrichment tool found in tool_config.")
+
+     return await chosen_tool_func(cloned_properties, tool_config)
+
+
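Because the outer loop walks ALLOWED_ENRICHMENT_TOOLS in order, "proxycurl" wins over "apollo" regardless of how the config is ordered. Inside an async context:

    lead_row = {"full_name": "Pat Lee"}
    tool_config = [{"name": "apollo"}, {"name": "proxycurl"}]
    enriched = await enrich_with_provider(lead_row, tool_config)
    # dispatches to enrich_user_info_with_proxy_curl
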
+ async def enrich_organization_info_from_name(
+     row: Dict[str, str],
+     use_strict_check: bool = True,
+     tool_config: Optional[List[Dict[str, Any]]] = None,
+ ) -> None:
+     """
+     Given a dictionary (treated like a CSV row) containing 'organization_name',
+     'organization_linkedin_url', and 'website' keys, enrich the row only if the
+     domain and website are currently empty.
+     """
+     org_name_key = "organization_name"
+     org_domain_key = "primary_domain_of_organization"
+     website_key = "organization_website"
+
+     org_name = (row.get(org_name_key) or "").strip()
+     logger.debug("Enriching organization info from name: %s", org_name)
+     if org_name.lower() in ["none", "freelance"]:
+         row[org_name_key] = ""
+         org_name = ""
+
+     # If there's no organization name, just return
+     if not org_name:
+         return
+
+     # If domain or website is already present, we consider it enriched
+     if row.get(org_domain_key) or row.get(website_key):
+         return
+     await set_organization_domain(row, use_strict_check, tool_config)
+
+
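Placeholder company names short-circuit the lookup:

    row = {"organization_name": "Freelance"}
    await enrich_organization_info_from_name(row)
    # row["organization_name"] -> "" and no domain lookup is attempted
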
+ async def set_organization_domain(
+     row: Dict[str, str],
+     use_strict_check: bool = True,
+     tool_config: Optional[List[Dict[str, Any]]] = None,
+ ) -> None:
+     """
+     Update the row with a 'primary_domain_of_organization' based on 'website' or
+     search results if the domain is absent.
+     """
+     org_name_key = "organization_name"
+     org_domain_key = "primary_domain_of_organization"
+     website_key = "organization_website"
+     linkedin_url_key = "organization_linkedin_url"
+
+     existing_domain = (row.get(org_domain_key) or "").strip()
+     org_name = (row.get(org_name_key) or "").strip()
+     logger.debug("Setting organization domain for organization: %s", org_name)
+     logger.debug("Check existing_domain: %s", existing_domain)
+     logger.debug("Check org_name: %s", org_name)
+
+     if not existing_domain:
+         company_website = (row.get(website_key) or "").strip()
+         logger.debug("Check company_website: %s", company_website)
+         extracted_domain = ""
+         logger.debug("Initial extracted_domain: %s", extracted_domain)
+         if not company_website and row.get(linkedin_url_key):
+             company_website = await get_company_website_from_linkedin_url(row.get(linkedin_url_key))
+             if company_website:
+                 logger.debug("Found company website from LinkedIn URL: %s", company_website)
+                 row[website_key] = company_website
+
+         if company_website:
+             extracted_domain = get_domain_from_website(company_website)
+             logger.debug("extracted domain from website: %s", extracted_domain)
+             if extracted_domain and is_excluded_domain(extracted_domain):
+                 extracted_domain = ""
+                 company_website = ""
+                 # clear the stored website too so an excluded domain doesn't survive below
+                 row[website_key] = ""
+
+         if not extracted_domain and not use_strict_check and org_name:
+             logger.debug("Performing Google search to find domain for org_name: %s", org_name)
+             company_info = await get_company_domain_from_llm_web_search(
+                 company_name=org_name,
+                 lead_info=row,
+                 location="US",
+                 tool_config=tool_config
+             )
+             if company_info and isinstance(company_info, dict):
+                 # If the LLM found a domain, set it
+                 if company_info.get("primary_domain_of_organization") and not row.get(org_domain_key):
+                     row[org_domain_key] = company_info["primary_domain_of_organization"]
+
+                 # If the LLM found an organization website, set it
+                 if company_info.get("organization_website") and not row.get(website_key):
+                     row[website_key] = company_info["organization_website"]
+
+                 # If there's a LinkedIn URL from LLM, set it
+                 if company_info.get("organization_linkedin_url") and not row.get(linkedin_url_key):
+                     row[linkedin_url_key] = company_info["organization_linkedin_url"]
+
+                 if company_info.get("organization_name") and not row.get(org_name_key):
+                     row[org_name_key] = company_info["organization_name"]
+
+         # Don't clobber anything the LLM lookup may have just set
+         row[org_domain_key] = row.get(org_domain_key) or extracted_domain or ""
+         logger.debug("Final domain selected: %s", row[org_domain_key])
+         row[website_key] = row.get(website_key) or company_website or ""
+
+     # If there's still no website but we have a domain, set a default website
+     final_domain = (row.get(org_domain_key) or "").strip()
+     company_website = (row.get(website_key) or "").strip()
+     if final_domain and not company_website:
+         row[website_key] = f"https://www.{final_domain}"
+
+
+ async def get_organization_linkedin_url(lead: Dict[str, Any], tools: Optional[List[Dict[str, Any]]]) -> str:
+     """
+     Retrieve the organization's LinkedIn URL using the company name, domain, and search tools.
+     Returns an empty string if the organization name is missing.
+     """
+     name = (lead.get("organization_name") or "").strip()
+     if not name:
+         return ""
+
+     linkedin_url = await find_organization_linkedin_url_with_google_search(
+         name,
+         company_location="US",
+         company_domain=lead.get("primary_domain_of_organization"),
+         use_strict_check=True,
+         tool_config=tools,
+     )
+     return linkedin_url
+
+
+ async def enrich_organization_info_from_company_url(
+     organization_linkedin_url: str,
+     use_strict_check: bool = True,
+     tool_config: Optional[List[Dict[str, Any]]] = None,
+     categories: Optional[bool] = None,
+     funding_data: Optional[bool] = None,
+     exit_data: Optional[bool] = None,
+     acquisitions: Optional[bool] = None,
+     extra: Optional[bool] = None,
+     use_cache: Optional[str] = "if-present",
+     fallback_to_cache: Optional[str] = "on-error",
+ ) -> Dict[str, Any]:
+     """
+     Given an organization LinkedIn URL, attempt to enrich its data (e.g. name, website)
+     via ProxyCurl. Additional Proxycurl Company API boolean flags (categories, funding_data, etc.)
+     can be supplied to control the returned payload (True -> "include"). If data is found,
+     set domain, then return the dict. Otherwise, return {}.
+     """
+
+     # Call ProxyCurl to enrich
+     company_data = await enrich_organization_info_from_proxycurl(
+         organization_linkedin_url=organization_linkedin_url,
+         tool_config=tool_config,
+         categories=categories,
+         funding_data=funding_data,
+         exit_data=exit_data,
+         acquisitions=acquisitions,
+         extra=extra,
+         use_cache=use_cache,
+         fallback_to_cache=fallback_to_cache,
+     )
+
+     # If ProxyCurl returned any data, set domain, then return
+     if company_data and isinstance(company_data, dict):
+         await set_organization_domain(company_data, use_strict_check, tool_config)
+         summary = await research_company_with_full_info_ai(company_data, "", tool_config=tool_config)
+         if summary:
+             company_data["organization_details"] = summary.get("research_summary", "")
+         return company_data
+
+     return {}
+
+
+ async def enrich_organization_info_from_job_url(
+     job_url: str,
+     use_strict_check: bool = True,
+     tool_config: Optional[List[Dict[str, Any]]] = None,
+ ) -> Dict[str, Any]:
+     """
+     Given a LinkedIn job posting URL, fetch job details using Proxycurl.
+     If job details are successfully retrieved, extract organization information
+     and return them in a dictionary. If not found, return {}.
+     """
+     # Validate the job URL.
+     if "linkedin.com/jobs/view/" not in job_url:
+         logger.debug("URL is not a valid LinkedIn job posting; skipping enrichment.")
+         return {}
+
+     # Normalize the job URL to use 'www.linkedin.com'
+     parsed = urlparse(job_url)
+     normalized_job_url = parsed._replace(netloc="www.linkedin.com").geturl()
+
+     logger.debug(f"Fetching job info from Proxycurl for URL: {normalized_job_url}")
+     try:
+         job_info = await enrich_job_info_from_proxycurl(
+             normalized_job_url, tool_config=tool_config
+         )
+     except Exception:
+         logger.exception("Exception occurred while fetching job info from Proxycurl.")
+         return {}
+
+     if not job_info:
+         logger.debug("No job info returned from Proxycurl; skipping enrichment.")
+         return {}
+
+     # Extract organization details from the 'company' key.
+     company_data = job_info.get("company", {})
+
+     # Make sure we have a company name before proceeding
+     if company_data and company_data.get("name", ""):
+         result = {
+             "organization_name": company_data.get("name", ""),
+             "organization_linkedin_url": company_data.get("url", ""),
+             # Include the website if provided
+             "organization_website": company_data.get("website", "")
+         }
+
+         # Refine domain and possibly fix the website
+         await set_organization_domain(result, use_strict_check, tool_config)
+         return result
+
+     return {}
+
+
+ class CompanyInfoFromName(BaseModel):
+     organization_name: str
+     primary_domain_of_organization: str
+     organization_website: str
+     organization_linkedin_url: str
+
+
+ @assistant_tool
+ async def get_company_domain_from_llm_web_search(
+     company_name: str,
+     lead_info: dict,
+     location: Optional[str] = None,
+     tool_config: Optional[List[Dict]] = None
+ ) -> Dict[str, Any]:
+     """
+     Tries to find relevant company info (name, domain, website, LinkedIn URL) from the company name
+     using an LLM with web search. Returns a dictionary with keys:
+     {
+         "organization_name": str,
+         "primary_domain_of_organization": str,
+         "organization_website": str,
+         "organization_linkedin_url": str
+     }
+     or an empty dict on failure.
+     """
+     logger.info("Entering get_company_domain_from_llm_web_search")
+
+     cleaned_name = company_name.replace(" ", "")
+     if not cleaned_name or company_name.lower() in ["none", "freelance"]:
+         logger.debug("Invalid or excluded company_name provided.")
+         return {}
+
+     query = f"\"{company_name}\" official website"
+     if location:
+         query += f", {location}"
+
+     try:
+         logger.debug(f"Performing LLM search with query: {query}")
+         # Build instructions for the LLM
+         instructions = f"""
+         Given the following information, find the company name, website, and domain information.
+         ---
+         Company name:
+         {company_name}
+
+         Additional lead info:
+         {lead_info}
+
+         Search and gather any domain/website info or LinkedIn details.
+         DO NOT make up information about the company.
+         Use the domain from the lead's email if it is a corporate email, or the company name
+         if specified, to find the company name, website, and domain.
+
+         **Output**:
+         Return your final output as valid JSON with the following structure:
+         {{
+             "organization_name": "...",
+             "primary_domain_of_organization": "...",
+             "organization_website": "...",
+             "organization_linkedin_url": "..."
+         }}
+         """
+         response, status = await get_structured_output_internal(
+             instructions,
+             CompanyInfoFromName,
+             model="gpt-5.1-chat",
+             use_web_search=True,
+             tool_config=tool_config
+         )
+         if status == "SUCCESS":
+             # Return the dictionary form of the model
+             return response.model_dump()
+         else:
+             return {}
+     except Exception:
+         logger.exception("Exception during get_company_domain_from_llm_web_search.")
+         return {}
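
Standalone usage sketch; the tool_config contents are assumed to include whatever credentials get_structured_output_internal needs for web search:

    import asyncio

    result = asyncio.run(
        get_company_domain_from_llm_web_search(
            company_name="Acme Corporation",
            lead_info={"email": "pat@acme.io"},
            location="US",
            tool_config=tool_config,
        )
    )
    # result -> {"organization_name": ..., "primary_domain_of_organization": ..., ...} or {}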