dhisana 0.0.1.dev85__py3-none-any.whl → 0.0.1.dev236__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. dhisana/schemas/common.py +33 -0
  2. dhisana/schemas/sales.py +224 -23
  3. dhisana/utils/add_mapping.py +72 -63
  4. dhisana/utils/apollo_tools.py +739 -109
  5. dhisana/utils/built_with_api_tools.py +4 -2
  6. dhisana/utils/cache_output_tools.py +23 -23
  7. dhisana/utils/check_email_validity_tools.py +456 -458
  8. dhisana/utils/check_for_intent_signal.py +1 -2
  9. dhisana/utils/check_linkedin_url_validity.py +34 -8
  10. dhisana/utils/clay_tools.py +3 -2
  11. dhisana/utils/clean_properties.py +3 -1
  12. dhisana/utils/compose_salesnav_query.py +0 -1
  13. dhisana/utils/compose_search_query.py +7 -3
  14. dhisana/utils/composite_tools.py +0 -1
  15. dhisana/utils/dataframe_tools.py +2 -2
  16. dhisana/utils/email_body_utils.py +72 -0
  17. dhisana/utils/email_provider.py +375 -0
  18. dhisana/utils/enrich_lead_information.py +585 -85
  19. dhisana/utils/fetch_openai_config.py +129 -0
  20. dhisana/utils/field_validators.py +1 -1
  21. dhisana/utils/g2_tools.py +0 -1
  22. dhisana/utils/generate_content.py +0 -1
  23. dhisana/utils/generate_email.py +69 -16
  24. dhisana/utils/generate_email_response.py +298 -41
  25. dhisana/utils/generate_flow.py +0 -1
  26. dhisana/utils/generate_linkedin_connect_message.py +19 -6
  27. dhisana/utils/generate_linkedin_response_message.py +156 -65
  28. dhisana/utils/generate_structured_output_internal.py +351 -131
  29. dhisana/utils/google_custom_search.py +150 -44
  30. dhisana/utils/google_oauth_tools.py +721 -0
  31. dhisana/utils/google_workspace_tools.py +391 -25
  32. dhisana/utils/hubspot_clearbit.py +3 -1
  33. dhisana/utils/hubspot_crm_tools.py +771 -167
  34. dhisana/utils/instantly_tools.py +3 -1
  35. dhisana/utils/lusha_tools.py +10 -7
  36. dhisana/utils/mailgun_tools.py +150 -0
  37. dhisana/utils/microsoft365_tools.py +447 -0
  38. dhisana/utils/openai_assistant_and_file_utils.py +121 -177
  39. dhisana/utils/openai_helpers.py +19 -16
  40. dhisana/utils/parse_linkedin_messages_txt.py +2 -3
  41. dhisana/utils/profile.py +37 -0
  42. dhisana/utils/proxy_curl_tools.py +507 -206
  43. dhisana/utils/proxycurl_search_leads.py +426 -0
  44. dhisana/utils/research_lead.py +121 -68
  45. dhisana/utils/sales_navigator_crawler.py +1 -6
  46. dhisana/utils/salesforce_crm_tools.py +323 -50
  47. dhisana/utils/search_router.py +131 -0
  48. dhisana/utils/search_router_jobs.py +51 -0
  49. dhisana/utils/sendgrid_tools.py +126 -91
  50. dhisana/utils/serarch_router_local_business.py +75 -0
  51. dhisana/utils/serpapi_additional_tools.py +290 -0
  52. dhisana/utils/serpapi_google_jobs.py +117 -0
  53. dhisana/utils/serpapi_google_search.py +188 -0
  54. dhisana/utils/serpapi_local_business_search.py +129 -0
  55. dhisana/utils/serpapi_search_tools.py +363 -432
  56. dhisana/utils/serperdev_google_jobs.py +125 -0
  57. dhisana/utils/serperdev_local_business.py +154 -0
  58. dhisana/utils/serperdev_search.py +233 -0
  59. dhisana/utils/smtp_email_tools.py +576 -0
  60. dhisana/utils/test_connect.py +1765 -92
  61. dhisana/utils/trasform_json.py +95 -16
  62. dhisana/utils/web_download_parse_tools.py +0 -1
  63. dhisana/utils/zoominfo_tools.py +2 -3
  64. dhisana/workflow/test.py +1 -1
  65. {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/METADATA +5 -2
  66. dhisana-0.0.1.dev236.dist-info/RECORD +100 -0
  67. {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/WHEEL +1 -1
  68. dhisana-0.0.1.dev85.dist-info/RECORD +0 -81
  69. {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/entry_points.txt +0 -0
  70. {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/top_level.txt +0 -0
@@ -8,61 +8,105 @@ import re
  from typing import Any, Dict, List, Optional
  from urllib.parse import urlparse
 
- import tldextract
+ from pydantic import BaseModel, Field
+ import mdformat
 
  from dhisana.utils.check_email_validity_tools import process_email_properties
  from dhisana.utils.company_utils import normalize_company_name
  from dhisana.utils.field_validators import (
-     normalize_linkedin_url, normalize_linkedin_company_url, normalize_salesnav_url, normalize_linkedin_company_salesnav_url)
+     normalize_linkedin_url,
+     normalize_linkedin_company_url,
+     normalize_salesnav_url,
+     normalize_linkedin_company_salesnav_url,
+     validate_and_clean_email,
+     validation_organization_domain,
+     validate_website_url
+ )
  from dhisana.utils.apollo_tools import enrich_user_info_with_apollo
  from dhisana.utils.assistant_tool_tag import assistant_tool
  from dhisana.utils.domain_parser import get_domain_from_website, is_excluded_domain
+ from dhisana.utils.generate_structured_output_internal import get_structured_output_internal
  from dhisana.utils.proxy_curl_tools import (
      enrich_job_info_from_proxycurl,
      enrich_organization_info_from_proxycurl,
      enrich_user_info_with_proxy_curl,
  )
+ from dhisana.utils.research_lead import research_company_with_full_info_ai, research_lead_with_full_info_ai
  from dhisana.utils.serpapi_search_tools import (
      find_organization_linkedin_url_with_google_search,
+     find_user_linkedin_url_by_email_google,
      find_user_linkedin_url_google,
-     get_company_domain_from_google_search,
+     find_user_linkedin_url_with_serper,
      get_company_website_from_linkedin_url,
  )
- from dhisana.utils.field_validators import (
-     validate_and_clean_email,
-     validation_organization_domain,
-     validate_website_url
- )
 
- # The enrichment tools that are permissible for usage.
+ import logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ # ----------------------------------------------------------------------
+ # Allowed Enrichment Tools
+ # ----------------------------------------------------------------------
  ALLOWED_ENRICHMENT_TOOLS = ["proxycurl", "apollo", "zoominfo"]
 
- # A map from tool name to the corresponding function that will enrich user info.
  USER_LOOKUP_TOOL_NAME_TO_FUNCTION_MAP = {
      "apollo": enrich_user_info_with_apollo,
      "proxycurl": enrich_user_info_with_proxy_curl,
  }
 
- import logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
 
+ # ----------------------------------------------------------------------
+ # BasicLeadInformation model
+ # ----------------------------------------------------------------------
+ class BasicLeadInformation(BaseModel):
+     full_name: str = Field(..., description="Full name of the lead")
+     first_name: str = Field(..., description="First name of the lead")
+     last_name: str = Field(..., description="Last name of the lead")
+     email: str = Field(..., description="Email address of the lead")
+     primary_domain_of_organization: str = Field(..., description="Primary domain of the organization")
+     job_title: str = Field(..., description="Job Title of the lead")
+     phone: str = Field(..., description="Phone number of the lead")
+     headline: str = Field(..., description="Headline of the lead")
+     lead_location: str = Field(..., description="Location of the lead")
+     organization_name: str = Field(..., description="Current Company where lead works")
+     common_connections: int = Field(..., description="Number of common connections with the lead. Default 0")
+     followers_count: int = Field(..., description="Number of followers of the lead. Default 0")
+     tenure_in_current_role: str = Field(..., description="Tenure in the current role")
+     tenure_in_current_company: str = Field(..., description="Tenure in the current company")
+     connection_degree: str = Field(..., description="Degree of connection with the lead (1st, 2nd, 3rd)")
+     is_premium_account: bool = Field(..., description="Is the lead a premium account. Default is false.")
+     country_code: str = Field(..., description="Alpha-2 ISO3166 country code eg. US")
+
+
+ # ----------------------------------------------------------------------
+ # Helper: chunkify
+ # ----------------------------------------------------------------------
+ def chunkify(items: List[Any], chunk_size: int) -> List[List[Any]]:
+     """
+     Splits a list into sublists (chunks) of size `chunk_size`.
+     """
+     for i in range(0, len(items), chunk_size):
+         yield items[i : i + chunk_size]
 
+
+ # ----------------------------------------------------------------------
+ # Function: cleanup_user_name
+ # ----------------------------------------------------------------------
  def cleanup_user_name(cloned_properties: dict) -> dict:
      """
      Cleans up user name fields: 'full_name', 'first_name', 'last_name'.
      Returns the updated dictionary. If values are invalid or placeholders, sets them to ''.
      """
-
      if not isinstance(cloned_properties, dict):
          return {}
 
-     def normalize(name) -> str:
+     def normalize(name: str) -> str:
          if not name or not isinstance(name, str):
              return ""
          # Common placeholders or invalid tokens
         invalid_tokens = [
-             "null", "none", "na", "n.a", "notfound", "error",
+             "null", "none", "na", "n.a", "notfound", "error",
              "na.", "na,", "notavilable", "notavailable", ""
          ]
          stripped = name.strip().lower()
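The new `chunkify` helper above is annotated as returning `List[List[Any]]` but is implemented as a generator, so callers should iterate it or wrap it in `list()`. A minimal usage sketch, assuming this hunk belongs to `dhisana/utils/enrich_lead_information.py` (file 18 in the list above):

```python
# Usage sketch only; the import path is assumed from this diff.
from dhisana.utils.enrich_lead_information import chunkify

leads = [{"email": f"user{i}@example.com"} for i in range(10)]

# chunkify yields lists of up to chunk_size items; the tail chunk may be shorter.
batches = list(chunkify(leads, 4))
print([len(b) for b in batches])  # [4, 4, 2]
```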
@@ -75,33 +119,99 @@ def cleanup_user_name(cloned_properties: dict) -> dict:
          stripped = stripped.split("|", 1)[0]
          # Remove extra non-alphanumeric characters (but allow whitespace)
          stripped = re.sub(r"[^a-zA-Z0-9\s]", "", stripped)
-         # Capitalize first letter, lowercase the rest
-         return stripped.strip().capitalize()
+
+         # Capitalize the first letter of each word, and lowercase the rest
+         return " ".join(word.capitalize() for word in stripped.strip().split())
 
      full_name = normalize(cloned_properties.get("full_name"))
      first_name = normalize(cloned_properties.get("first_name"))
      last_name = normalize(cloned_properties.get("last_name"))
 
      # If full_name is empty, build from first_name + last_name
-     if first_name and last_name:
+     if first_name and last_name and not full_name:
          full_name = (first_name + " " + last_name).strip()
 
      cloned_properties["full_name"] = full_name
      cloned_properties["first_name"] = first_name
      cloned_properties["last_name"] = last_name
+
      return cloned_properties
 
 
- def validate_and_cleanup(cloned_properties: dict) -> dict:
+ # ----------------------------------------------------------------------
+ # LLM-based cleanup for single lead
+ # ----------------------------------------------------------------------
+ async def get_clean_lead_info_with_llm(lead_info_str: str, tool_config: Optional[dict]) -> Dict[str, Any]:
+     """
+     Takes a JSON string representation of partial lead info,
+     returns a cleaned-up lead dictionary matching BasicLeadInformation fields.
+     """
+     prompt = f"""
+     Given the following data about a lead and the organization they work for,
+     extract and clean up the lead information.
+     - Format 'full_name' properly.
+     - Format 'first_name' and 'last_name' so they're capitalized properly if available.
+     - Make sure 'organization_name' is properly capitalized if provided.
+     - Do not invent data that isn't provided.
+
+     Data:
+     {lead_info_str}
+
+     The output format is in JSON. The expected fields match BasicLeadInformation.
+     """
+     lead_info, status = await get_structured_output_internal(
+         prompt,
+         BasicLeadInformation,
+         model="gpt-5.1-chat",
+         tool_config=tool_config
+     )
+     if status == "ERROR":
+         return {}
+     return lead_info.model_dump()
+
+
+ # ----------------------------------------------------------------------
+ # Helper: is_personal_email_domain
+ # ----------------------------------------------------------------------
+ def is_personal_email_domain(domain: str) -> bool:
+     """
+     Very simple check to see if the domain is one of the common free/personal
+     email providers. Could expand this list or integrate a third-party API
+     for more accuracy.
+     """
+     common_free_domains = {
+         "gmail.com", "yahoo.com", "hotmail.com", "outlook.com",
+         "protonmail.com", "icloud.com", "aol.com", "mail.com",
+         "pm.me", "yandex.com", "gmx.com"
+     }
+     domain = domain.strip().lower()
+     return (domain in common_free_domains) or domain.endswith(".edu")
+
+
+ # ----------------------------------------------------------------------
+ # Main validation & cleanup function
+ # ----------------------------------------------------------------------
+ async def validate_and_cleanup(
+     cloned_properties: dict,
+     tool_config: Optional[dict] = None,
+     use_strict_check: bool = False
+ ) -> dict:
      """
      Wrapper to validate & normalize various properties in a dictionary.
-     Safe against None, non-dict, or missing keys. Returns a cleaned dict.
+
+     1) Clean up/validate typical fields.
+     2) If name fields appear invalid, fallback to LLM-based name inference.
+     3) If 'primary_domain_of_organization' AND 'organization_website' are both empty,
+        but there's a valid corporate email, use that as the domain.
+     4) (Optional) Enrich the organization info from the name if needed.
      """
 
      if not isinstance(cloned_properties, dict):
          return {}
 
-     # Safely fetch each key, process, and reassign
+     # ------------------------------------------------------------------
+     # Step 1: Normalize typical fields
+     # ------------------------------------------------------------------
      cloned_properties["user_linkedin_url"] = normalize_linkedin_url(
          cloned_properties.get("user_linkedin_url")
      )
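Two behaviors are worth noting in this hunk: `is_personal_email_domain` is a pure set-membership check (with `.edu` treated as personal), and `validate_and_cleanup` is now a coroutine with an LLM fallback for mangled names. A sketch under the assumptions that the imported field validators tolerate missing keys (returning empty strings) and that well-formed names take the cheap non-LLM path:

```python
import asyncio

# Import path assumed from this diff (dhisana/utils/enrich_lead_information.py).
from dhisana.utils.enrich_lead_information import (
    is_personal_email_domain,
    validate_and_cleanup,
)

# Pure function: free-provider set membership plus an .edu suffix test.
assert is_personal_email_domain("Gmail.COM")      # normalized via strip().lower()
assert is_personal_email_domain("mit.edu")        # .edu counts as personal
assert not is_personal_email_domain("acme.com")   # anything else is "corporate"

async def main() -> None:
    # Valid names avoid the LLM branch; the corporate email should let
    # step 3 derive the organization domain and a default website.
    lead = {
        "full_name": "Jane Doe",
        "first_name": "Jane",
        "last_name": "Doe",
        "email": "jane@acme.com",
    }
    cleaned = await validate_and_cleanup(lead, tool_config=None)
    print(cleaned.get("primary_domain_of_organization"))  # acme.com
    print(cleaned.get("organization_website"))            # https://www.acme.com

asyncio.run(main())
```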
@@ -127,75 +237,310 @@ def validate_and_cleanup(cloned_properties: dict) -> dict:
          cloned_properties.get("organization_name")
      )
 
-     # Clean up user name fields
-     cloned_properties = cleanup_user_name(cloned_properties)
-
+     # ------------------------------------------------------------------
+     # Step 2: Basic name-check. If invalid => LLM fallback.
+     # ------------------------------------------------------------------
+     def has_special_characters(val: str) -> bool:
+         return bool(re.search(r"[^a-zA-Z0-9\s]", val))
+
+     def is_invalid_name(val: str) -> bool:
+         return (len(val.strip()) < 3) or has_special_characters(val)
+
+     full_name = cloned_properties.get("full_name", "")
+     first_name = cloned_properties.get("first_name", "")
+     last_name = cloned_properties.get("last_name", "")
+     if (not full_name or full_name.startswith("None")):
+         full_name = ""
+     if (not first_name or first_name.startswith("None")):
+         first_name = ""
+     if (not last_name or last_name.startswith("None")):
+         last_name = ""
+
+     if (
+         is_invalid_name(full_name)
+         or is_invalid_name(first_name)
+         or is_invalid_name(last_name)
+     ):
+         # Check if we have a valid LinkedIn URL - if so, skip LLM as ProxyCurl will fill the data
+         user_linkedin_url = cloned_properties.get("user_linkedin_url", "").strip()
+         if not user_linkedin_url:
+             lead_info_str = str(cloned_properties)
+             logger.info(
+                 "Detected invalid name fields. Using LLM to infer/correct name fields."
+             )
+             # Attempt LLM-based cleanup
+             new_lead_info = await get_clean_lead_info_with_llm(lead_info_str, tool_config=tool_config)
+             if new_lead_info:
+                 cloned_properties["full_name"] = new_lead_info.get("full_name", "")
+                 cloned_properties["first_name"] = new_lead_info.get("first_name", "")
+                 cloned_properties["last_name"] = new_lead_info.get("last_name", "")
+         else:
+             logger.info("Valid LinkedIn URL found. Skipping LLM cleanup as ProxyCurl will enrich the data.")
+     else:
+         # Use the cheaper logic
+         cloned_properties = cleanup_user_name(cloned_properties)
+
+     # ------------------------------------------------------------------
+     # Step 3: If domain & website are empty but there's a corporate email
+     # ------------------------------------------------------------------
+     # - If email is present, check if domain is personal or corporate
+     # - If corporate, set primary_domain_of_organization from email domain
+     # ------------------------------------------------------------------
+     domain_empty = not cloned_properties.get("primary_domain_of_organization")
+     website_empty = not cloned_properties.get("organization_website")
+     email = cloned_properties.get("email", "")
+
+     if domain_empty and website_empty and email:
+         # parse domain from email
+         extracted_domain = email.split("@")[-1].strip().lower()
+         if extracted_domain and (not is_personal_email_domain(extracted_domain)):
+             # This is a "corporate" email domain, so use it
+             cloned_properties["primary_domain_of_organization"] = extracted_domain
+             cloned_properties["organization_website"] = f"https://www.{extracted_domain}"
+             logger.info("Set primary_domain_of_organization from corporate email domain.")
+
+     if domain_empty and not website_empty:
+         from urllib.parse import urlparse
+         parsed_website = urlparse(cloned_properties["organization_website"])
+         possible_domain = parsed_website.netloc.replace("www.", "")
+         if possible_domain:
+             cloned_properties["primary_domain_of_organization"] = possible_domain
+             logger.info("Set primary_domain_of_organization from organization_website domain.")
      return cloned_properties
 
-
  @assistant_tool
  async def enrich_lead_information(
      user_properties: Dict[str, Any],
      use_strict_check: bool = True,
      get_valid_email: bool = True,
+     company_research_instructions: str = "",
+     lead_research_instructions: str = "",
+     enrich_company_information: bool = True,
+     enrich_lead_information: bool = True,
      tool_config: Optional[List[Dict[str, Any]]] = None,
  ) -> Dict[str, Any]:
-     """
-     Enrich lead information including company details and LinkedIn URL.
-     Steps performed:
-       1) Enrich organization information (primary domain, LinkedIn URL, website).
-       2) Attempt to fix/find user LinkedIn URL if not present.
-       3) Enrich with additional provider data and validate matches (e.g., Apollo).
-
-     :param user_properties: Dictionary containing user/lead details to be enriched.
-     :param use_strict_check: Whether to use strict matching in certain search functions.
-     :param tool_config: Optional list of tool configuration dicts (e.g., [{"name": "apollo"}, ...]).
-     :return: Enriched user_properties dictionary.
-     """
      logger.debug("Starting enrich_lead_information with user_properties: %s", user_properties)
      cloned_properties = dict(user_properties)
 
-     cloned_properties = validate_and_cleanup(cloned_properties)
+     cloned_properties = await validate_and_cleanup(cloned_properties, tool_config=tool_config, use_strict_check=use_strict_check)
 
      cloned_properties = await enrich_user_info(
          input_properties=cloned_properties,
          use_strict_check=use_strict_check,
          tool_config=tool_config,
      )
+     if use_strict_check and not cloned_properties.get("user_linkedin_url") and not cloned_properties.get("email"):
+         return cloned_properties
+
+     await enrich_organization_info_from_name(
+         row=cloned_properties,
+         use_strict_check=use_strict_check,
+         tool_config=tool_config,
+     )
 
      cloned_properties = await enrich_with_provider(cloned_properties, tool_config)
 
-     await set_organization_domain(
+     await enrich_organization_info_from_name(
          row=cloned_properties,
          use_strict_check=use_strict_check,
          tool_config=tool_config,
      )
-
+
      if get_valid_email:
          await process_email_properties(cloned_properties, tool_config)
-
-     cloned_properties = validate_and_cleanup(cloned_properties)
+
+     # ------------------------------------------------------------------
+     # Supplement missing follower count or name information using Serper
+     # ------------------------------------------------------------------
+     linkedin_url = cloned_properties.get("user_linkedin_url", "").strip()
+     follower_count = cloned_properties.get("linkedin_follower_count")
+     first_name = cloned_properties.get("first_name")
+     if (
+         linkedin_url
+         and (follower_count is None or (isinstance(follower_count, str) and not follower_count.strip()) or not first_name)
+     ):
+         serper_result = await find_user_linkedin_url_with_serper(
+             linkedin_url, tool_config=tool_config
+         )
+         if serper_result:
+             if follower_count is None or (
+                 isinstance(follower_count, str) and not follower_count.strip()
+             ):
+                 cloned_properties["linkedin_follower_count"] = serper_result.get(
+                     "linkedin_follower_count", 0
+                 )
+             if not first_name:
+                 cloned_properties["first_name"] = serper_result.get("first_name", "")
+                 cloned_properties["last_name"] = serper_result.get("last_name", "")
+
+     cloned_properties = await validate_and_cleanup(
+         cloned_properties, tool_config=tool_config, use_strict_check=use_strict_check
+     )
+
+     research_summary = cloned_properties.get("research_summary", "")
+
+     if enrich_lead_information:
+         summary = await research_lead_with_full_info_ai(
+             cloned_properties, lead_research_instructions, tool_config=tool_config
+         )
+         if summary:
+             research_summary = summary.get("research_summary", "")
+
+     if enrich_company_information:
+         company_company_properties = {
+             "organization_name": cloned_properties.get("organization_name", ""),
+             "primary_domain_of_organization": cloned_properties.get("primary_domain_of_organization", ""),
+             "organization_website": cloned_properties.get("organization_website", ""),
+         }
+         company_summary = await research_company_with_full_info_ai(
+             company_company_properties,
+             company_research_instructions,
+             tool_config=tool_config,
+         )
+         if company_summary:
+             markdown_text = research_summary + "\n\n#### " + company_summary.get(
+                 "research_summary", ""
+             )
+             formatted_markdown = mdformat.text(markdown_text)
+             research_summary = re.sub(
+                 r'^(#{1,6})\s+', '##### ', formatted_markdown, flags=re.MULTILINE
+             )
+
+     cloned_properties["research_summary"] = research_summary
      return cloned_properties
 
 
+ class UserInfoFromGithubProfileId(BaseModel):
+     first_name: str
+     last_name: str
+     full_name: str
+     linkedin_url: str
+     github_url: str
+     email: str
+     twitter_handle: str
+     website: str
+     location: str
+
+
+ def extract_id_from_salesnav_url(url_key: str) -> str:
+     """
+     Extract the Sales Navigator lead ID from a URL like
+     'https://www.linkedin.com/sales/lead/<ID>?...'
+     """
+     if not url_key:
+         return ""
+     match = re.search(r"linkedin\.com/sales/lead/([^/?#,]+)", url_key, re.IGNORECASE)
+     if not match:
+         return ""
+     # strip out any non-word or hyphen chars
+     return re.sub(r"[^\w-]", "", match.group(1))
+
+ def proxy_linkedin_url(user_linkedin_salesnav_url: str) -> str:
+     """
+     Given a Sales Navigator URL, return the corresponding public LinkedIn URL.
+     Raises ValueError if the ID cannot be extracted.
+     """
+     salesnav_id = extract_id_from_salesnav_url(user_linkedin_salesnav_url)
+     if not salesnav_id:
+         raise ValueError("Could not extract ID from Sales Nav URL.")
+     return f"https://www.linkedin.com/in/{salesnav_id}"
+
+ # -------------------------------------------------------------------
+ # (Pseudo) get_structured_output_internal, find_user_linkedin_url_google
+ # and other references assumed to exist in your environment.
+ # -------------------------------------------------------------------
+
+ async def get_user_linkedin_url_from_github_profile(
+     github_profile_id: str,
+     lead_properties: dict,
+     instructions: str,
+     tool_config: Optional[List[Dict]] = None
+ ) -> Dict[str, Any]:
+     """
+     Attempt to locate a user's LinkedIn profile URL from their GitHub profile ID via web search.
+     Also gather basic user info (first/last name) if possible.
+     """
+     instructions = f"""
+     Give user information from user GitHub handle; try to locate the LinkedIn profile URL
+     for the user using web search.
+     ---
+     Github profile id:
+     {github_profile_id}
+     Company Data include name, domain and website:
+     {lead_properties}
+
+     Instructions:
+     {instructions}
+     ---
+     Use websearch to locate the LinkedIn profile url for the user if present.
+
+     **Output**:
+     Return your final output as valid JSON with the following structure:
+     {{
+         "first_name": "...",
+         "last_name": "...",
+         "full_name": "...",
+         "linkedin_url": "...",
+         "github_url": "...",
+         "email": "...",
+         "twitter_handle": "...",
+         "website": "...",
+         "location": "..."
+     }}
+     """
+
+     # Example call to structured output function
+     response, status = await get_structured_output_internal(
+         instructions,
+         UserInfoFromGithubProfileId,
+         model="gpt-5.1-chat",
+         use_web_search=True,
+         tool_config=tool_config
+     )
+     if status == "SUCCESS":
+         return response
+     else:
+         return {}
+
  async def enrich_user_info(
      input_properties: Dict[str, Any],
      use_strict_check: bool,
      tool_config: Optional[List[Dict[str, Any]]] = None,
  ) -> Dict[str, Any]:
      """
-     Attempt to find or fix a user's LinkedIn URL using name, title, location, and company information.
-
-     :param input_properties: Dictionary containing user/lead details.
-     :param use_strict_check: Whether to use strict matching during searches.
-     :param tool_config: Optional list of tool configurations dicts.
-     :return: Updated dictionary with user LinkedIn URL if found.
+     Attempt to find or fix a user's LinkedIn URL using name, title, location,
+     company info or GitHub profile handle if present. If still not found,
+     but user_linkedin_salesnav_url exists, we fall back to creating a
+     proxy URL from the Sales Navigator link.
      """
      logger.debug("Starting enrich_user_info for: %s", input_properties.get("full_name"))
      user_linkedin_url = (input_properties.get("user_linkedin_url") or "").strip()
      input_properties["linkedin_url_match"] = False
+     github_profile_id = (input_properties.get("github_profile_id") or "").strip()
 
+     # 1) If we do not have a user_linkedin_url, try getting it from GitHub
      if not user_linkedin_url:
+         if github_profile_id:
+             response = await get_user_linkedin_url_from_github_profile(
+                 github_profile_id=github_profile_id,
+                 lead_properties=input_properties,
+                 instructions="Use web search to find the user's LinkedIn profile from GitHub handle if present.",
+                 tool_config=tool_config,
+             )
+             user_linkedin_url = response.get("linkedin_url", "")
+             if user_linkedin_url:
+                 input_properties["user_linkedin_url"] = user_linkedin_url
+             if not input_properties.get("first_name"):
+                 input_properties["first_name"] = response.get("first_name", "")
+             if not input_properties.get("last_name"):
+                 input_properties["last_name"] = response.get("last_name", "")
+             if not input_properties.get("email"):
+                 input_properties["email"] = response.get("email", "")
+             if not input_properties.get("lead_location"):
+                 input_properties["lead_location"] = response.get("location", "")
+             return input_properties
+
+         # 2) If still no LinkedIn URL, try name/title/org searching
          full_name = (input_properties.get("full_name") or "").strip()
          if not full_name:
              first_name = (input_properties.get("first_name", "") or "").strip()
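A hedged call sketch for the widened `enrich_lead_information` signature. The `tool_config` shape follows the removed docstring (`[{"name": "apollo"}, ...]`); real entries carry provider credentials, the call performs network lookups, and it assumes the `@assistant_tool` decorator leaves the coroutine directly awaitable, so treat this as illustrative rather than runnable offline:

```python
import asyncio

from dhisana.utils.enrich_lead_information import enrich_lead_information

async def main() -> None:
    lead = {
        "full_name": "Jane Doe",
        "organization_name": "Acme",
        "email": "jane@acme.com",
    }
    enriched = await enrich_lead_information(
        lead,
        use_strict_check=True,
        get_valid_email=True,
        lead_research_instructions="Focus on recent role changes.",
        company_research_instructions="Summarize the product line.",
        enrich_company_information=True,
        enrich_lead_information=True,  # flag, not recursion; it gates lead research
        tool_config=[{"name": "proxycurl"}, {"name": "apollo"}],  # illustrative entries
    )
    print(enriched.get("research_summary", ""))

asyncio.run(main())
```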
@@ -205,20 +550,64 @@ async def enrich_user_info(
          title = input_properties.get("job_title", "") or ""
          location = input_properties.get("lead_location", "") or ""
          org_name = (input_properties.get("organization_name", "") or "").strip()
-         if full_name and org_name:
-             user_linkedin_url = await find_user_linkedin_url_google(
+         org_domain = (input_properties.get("primary_domain_of_organization", "") or "").strip()
+         email = (input_properties.get("email") or "").strip()
+
+         if full_name and (org_name or org_domain or title):
+             # This function does a google-based search for the user's LinkedIn
+             found_linkedin_url = await find_user_linkedin_url_google(
                  user_name=full_name,
                  user_title=title,
                  user_location=location,
                  user_company=org_name,
+                 user_company_domain=org_domain,
                  use_strict_check=use_strict_check,
                  tool_config=tool_config,
              )
-             input_properties["user_linkedin_url"] = user_linkedin_url
+             if found_linkedin_url:
+                 user_linkedin_url = found_linkedin_url
+                 input_properties["user_linkedin_url"] = user_linkedin_url
+         if not user_linkedin_url and email:
+             # If we have an email but no name, try searching by email
+             email_lookup_result = await find_user_linkedin_url_by_email_google(
+                 email=email,
+                 user_name=full_name,
+                 user_title=title,
+                 user_location=location,
+                 user_company=org_name,
+                 tool_config=tool_config,
+             )
+             if email_lookup_result and email_lookup_result.get("linkedin_url"):
+                 user_linkedin_url = email_lookup_result["linkedin_url"]
+                 input_properties["user_linkedin_url"] = user_linkedin_url
+                 confidence = email_lookup_result.get("confidence", 0.0)
+                 reasoning = email_lookup_result.get("reasoning", "")
+                 input_properties["user_linkedin_url_confidence"] = confidence
+                 input_properties["user_linkedin_url_reasoning"] = reasoning
+
+                 additional_properties = input_properties.get("additional_properties") or {}
+                 additional_properties["user_linkedin_url_confidence"] = confidence
+                 if reasoning:
+                     additional_properties["user_linkedin_url_reasoning"] = reasoning
+                 input_properties["additional_properties"] = additional_properties
+
+     # 3) Final fallback: if STILL no user_linkedin_url,
+     #    but user_linkedin_salesnav_url is present, use proxy
+     if not input_properties.get("user_linkedin_url"):
+         salesnav_url = input_properties.get("user_linkedin_salesnav_url", "")
+         if salesnav_url:
+             try:
+                 proxy_url = proxy_linkedin_url(salesnav_url)
+                 input_properties["user_linkedin_url"] = proxy_url
+                 logger.debug("Falling back to proxy LinkedIn URL from SalesNav: %s", proxy_url)
+             except ValueError:
+                 # If we can't parse an ID from the sales nav URL, skip
+                 logger.warning("Could not parse ID from user_linkedin_salesnav_url: %s", salesnav_url)
 
      return input_properties
 
 
+
  async def enrich_with_provider(
      cloned_properties: Dict[str, Any],
      tool_config: Optional[List[Dict[str, Any]]],
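The step-3 fallback above leans on the two pure helpers added earlier in this diff. A small round-trip sketch (the lead ID below is made up, and the resulting `/in/` URL is a best-effort proxy, not a guaranteed public vanity slug):

```python
from dhisana.utils.enrich_lead_information import (
    extract_id_from_salesnav_url,
    proxy_linkedin_url,
)

# Hypothetical Sales Navigator URL; the regex captures up to the first / ? # or ,
salesnav = "https://www.linkedin.com/sales/lead/ACwAAAbc123,NAME_SEARCH,Xy9z?_ntb=abc"

print(extract_id_from_salesnav_url(salesnav))  # ACwAAAbc123
print(proxy_linkedin_url(salesnav))            # https://www.linkedin.com/in/ACwAAAbc123

# proxy_linkedin_url raises ValueError when no ID can be extracted,
# which enrich_user_info catches and logs as a warning.
try:
    proxy_linkedin_url("https://www.linkedin.com/in/janedoe")
except ValueError as exc:
    print(exc)  # Could not extract ID from Sales Nav URL.
```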
@@ -260,14 +649,9 @@ async def enrich_organization_info_from_name(
      Given a dictionary (treated like a CSV row) containing 'organization_name',
      'organization_linkedin_url', and 'website' keys, enrich the row only if the
      domain and website are currently empty.
-
-     :param row: Dictionary representing a lead or company record.
-     :param use_strict_check: Whether to use strict matching for searches.
-     :param tool_config: Optional list of tool configuration dicts.
      """
      org_name_key = "organization_name"
      org_domain_key = "primary_domain_of_organization"
-     linkedin_url_key = "organization_linkedin_url"
      website_key = "organization_website"
 
      org_name = (row.get(org_name_key) or "").strip()
@@ -276,26 +660,14 @@
          row[org_name_key] = ""
          org_name = ""
 
+     # If there's no organization name, just return
      if not org_name:
          return
 
+     # If domain or website is already present, we consider it enriched
      if row.get(org_domain_key) or row.get(website_key):
          return
-
-     linkedin_url = row.get(linkedin_url_key, "").strip()
-     if not linkedin_url:
-         linkedin_url = await find_organization_linkedin_url_with_google_search(
-             org_name,
-             company_location="US",
-             use_strict_check=use_strict_check,
-             tool_config=tool_config,
-         )
-
-     if linkedin_url:
-         row[linkedin_url_key] = linkedin_url
-         await set_organization_domain(row, use_strict_check, tool_config)
-     else:
-         row[org_domain_key] = ""
+     await set_organization_domain(row, use_strict_check, tool_config)
 
 
  async def set_organization_domain(
@@ -306,10 +678,6 @@ async def set_organization_domain(
      """
      Update the row with a 'primary_domain_of_organization' based on 'website' or
      search results if the domain is absent.
-
-     :param row: Dictionary representing a lead or company record.
-     :param use_strict_check: Whether to use strict matching for searches.
-     :param tool_config: Optional list of tool configuration dicts.
      """
      org_name_key = "organization_name"
      org_domain_key = "primary_domain_of_organization"
@@ -342,40 +710,95 @@
 
      if not extracted_domain and not use_strict_check and org_name:
          logger.debug("Performing Google search to find domain for org_name: %s", org_name)
-         extracted_domain = await get_company_domain_from_google_search(
-             org_name,
-             "US",
-             tool_config=tool_config,
+         company_info = await get_company_domain_from_llm_web_search(
+             company_name=org_name,
+             lead_info=row,
+             location="US",
+             tool_config=tool_config
          )
-         logger.debug("Found domain from Google search: %s", extracted_domain)
+         if company_info and isinstance(company_info, dict):
+             # If the LLM found a domain, set it
+             if company_info.get("primary_domain_of_organization") and not row[org_domain_key]:
+                 row[org_domain_key] = company_info["primary_domain_of_organization"]
+
+             # If the LLM found an organization website, set it
+             if company_info.get("organization_website") and not row[website_key]:
+                 row[website_key] = company_info["organization_website"]
+
+             # If there's a LinkedIn URL from LLM, set it
+             if company_info.get("organization_linkedin_url") and not row[linkedin_url_key]:
+                 row[linkedin_url_key] = company_info["organization_linkedin_url"]
+
+             if company_info.get("organization_name") and not row[org_name_key]:
+                 row[org_name_key] = company_info["organization_name"]
 
      row[org_domain_key] = extracted_domain or ""
      logger.debug("Final domain selected: %s", row[org_domain_key])
      row[website_key] = company_website or ""
+
+     # If there's still no website but we have a domain, set a default website
      company_website = (row.get(website_key) or "").strip()
      if existing_domain and not company_website:
          row[website_key] = f"https://www.{existing_domain}"
 
 
+ async def get_organization_linkedin_url(lead: Dict[str, Any], tools: Optional[List[Dict[str, Any]]]) -> str:
+     """
+     Retrieve the organization's LinkedIn URL using the company name, domain, and search tools.
+     Returns an empty string if the organization name is missing.
+     """
+     name = lead.get("organization_name", "").strip()
+     if not name:
+         return ""
+
+     linkedin_url = await find_organization_linkedin_url_with_google_search(
+         name,
+         company_location="US",
+         company_domain=lead.get("primary_domain_of_organization"),
+         use_strict_check=True,
+         tool_config=tools,
+     )
+     return linkedin_url
+
+
  async def enrich_organization_info_from_company_url(
      organization_linkedin_url: str,
      use_strict_check: bool = True,
      tool_config: Optional[List[Dict[str, Any]]] = None,
+     categories: Optional[bool] = None,
+     funding_data: Optional[bool] = None,
+     exit_data: Optional[bool] = None,
+     acquisitions: Optional[bool] = None,
+     extra: Optional[bool] = None,
+     use_cache: Optional[str] = "if-present",
+     fallback_to_cache: Optional[str] = "on-error",
  ) -> Dict[str, Any]:
      """
      Given an organization LinkedIn URL, attempt to enrich its data (e.g. name, website)
-     via ProxyCurl. If data is found, return the dict with domain set. Otherwise, return {}.
+     via ProxyCurl. Additional Proxycurl Company API boolean flags (categories, funding_data, etc.)
+     can be supplied to control the returned payload (True -> "include"). If data is found,
+     set domain, then return the dict. Otherwise, return {}.
      """
 
      # Call ProxyCurl to enrich
      company_data = await enrich_organization_info_from_proxycurl(
          organization_linkedin_url=organization_linkedin_url,
-         tool_config=tool_config
+         tool_config=tool_config,
+         categories=categories,
+         funding_data=funding_data,
+         exit_data=exit_data,
+         acquisitions=acquisitions,
+         extra=extra,
+         use_cache=use_cache,
+         fallback_to_cache=fallback_to_cache,
      )
 
      # If ProxyCurl returned any data, set domain, then return
      if company_data and isinstance(company_data, dict):
          await set_organization_domain(company_data, use_strict_check, tool_config)
+         summary = await research_company_with_full_info_ai(company_data, "", tool_config=tool_config)
+         if summary:
+             company_data["organization_details"] = summary.get("research_summary", "")
          return company_data
 
      return {}
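The extra keyword arguments on `enrich_organization_info_from_company_url` are forwarded verbatim to `enrich_organization_info_from_proxycurl`; per the docstring, boolean flags map to Proxycurl's "include" toggles. An illustrative call (the company URL and config entry are placeholders):

```python
import asyncio

from dhisana.utils.enrich_lead_information import enrich_organization_info_from_company_url

async def main() -> None:
    company = await enrich_organization_info_from_company_url(
        "https://www.linkedin.com/company/examplecorp/",  # placeholder URL
        funding_data=True,            # True -> "include" per the docstring
        use_cache="if-present",       # cache knobs pass through unchanged
        tool_config=[{"name": "proxycurl"}],  # illustrative config entry
    )
    # On success the dict also gains "organization_details" from the
    # research_company_with_full_info_ai summary; on failure it is {}.
    print(company.get("primary_domain_of_organization", ""))

asyncio.run(main())
```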
@@ -389,7 +812,6 @@ async def enrich_organization_info_from_job_url(
      """
      Given a LinkedIn job posting URL, fetch job details using Proxycurl.
      If job details are successfully retrieved, extract organization information
-     (organization_name, organization_linkedin_url, primary_domain_of_organization, organization_website)
      and return them in a dictionary. If not found, return {}.
      """
      # Validate the job URL.
@@ -406,7 +828,7 @@
          job_info = await enrich_job_info_from_proxycurl(
              normalized_job_url, tool_config=tool_config
          )
-     except Exception as e:
+     except Exception:
          logger.exception("Exception occurred while fetching job info from Proxycurl.")
          return {}
 
@@ -431,3 +853,81 @@
          return result
 
      return {}
+
+
+ class CompanyInfoFromName(BaseModel):
+     organization_name: str
+     primary_domain_of_organization: str
+     organization_website: str
+     organization_linkedin_url: str
+
+
+ @assistant_tool
+ async def get_company_domain_from_llm_web_search(
+     company_name: str,
+     lead_info: dict,
+     location: Optional[str] = None,
+     tool_config: Optional[List[Dict]] = None
+ ) -> Dict[str, Any]:
+     """
+     Tries to find relevant company info (name, domain, website, LinkedIn URL) from the company name
+     using an LLM with web search. Returns a dictionary with keys:
+     {
+         "organization_name": str,
+         "primary_domain_of_organization": str,
+         "organization_website": str,
+         "organization_linkedin_url": str
+     }
+     or an empty dict on failure.
+     """
+     logger.info("Entering get_company_domain_from_llm_web_search")
+
+     cleaned_name = company_name.replace(" ", "")
+     if not cleaned_name or company_name.lower() in ["none", "freelance"]:
+         logger.debug("Invalid or excluded company_name provided.")
+         return {}
+
+     query = f"\"{company_name}\" official website"
+     if location:
+         query += f", {location}"
+
+     try:
+         logger.debug(f"Performing LLM search with query: {query}")
+         # Build instructions for the LLM
+         instructions = f"""
+         Given the following information, find the company name, website, and domain information.
+         ---
+         Company name:
+         {company_name}
+
+         Additional lead info:
+         {lead_info}
+
+         Search and gather any domain/website info or LinkedIn details.
+         DO NOT make up information about company.
+         Find based on the domain in the leads email if its a corporate email, company name if sepcified to find the company name, website and domain.
+
+         **Output**:
+         Return your final output as valid JSON with the following structure:
+         {{
+             "organization_name": "...",
+             "primary_domain_of_organization": "...",
+             "organization_website": "...",
+             "organization_linkedin_url": "..."
+         }}
+         """
+         response, status = await get_structured_output_internal(
+             instructions,
+             CompanyInfoFromName,
+             model="gpt-5.1-chat",
+             use_web_search=True,
+             tool_config=tool_config
+         )
+         if status == "SUCCESS":
+             # Return the dictionary form of the model
+             return response.model_dump()
+         else:
+             return {}
+     except Exception:
+         logger.exception("Exception during get_company_domain_from_llm_web_search.")
+         return {}
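A closing sketch for the new `get_company_domain_from_llm_web_search` tool. It short-circuits to `{}` for empty, "none", or "freelance" names before any LLM call, so that much is checkable offline (assuming `@assistant_tool` leaves the coroutine directly callable); a real lookup needs a `tool_config` carrying LLM credentials:

```python
import asyncio

from dhisana.utils.enrich_lead_information import get_company_domain_from_llm_web_search

async def main() -> None:
    # Guard-clause path: excluded names return {} without any network call.
    assert await get_company_domain_from_llm_web_search("freelance", lead_info={}) == {}

    # Real path (requires LLM credentials in tool_config; illustrative only).
    info = await get_company_domain_from_llm_web_search(
        company_name="Acme Corp",
        lead_info={"email": "jane@acme.com"},
        location="US",
        tool_config=None,
    )
    # Either {} on failure or the four CompanyInfoFromName fields.
    print(info.get("primary_domain_of_organization", ""))

asyncio.run(main())
```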