dhisana-0.0.1.dev243-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. dhisana/__init__.py +1 -0
  2. dhisana/cli/__init__.py +1 -0
  3. dhisana/cli/cli.py +20 -0
  4. dhisana/cli/datasets.py +27 -0
  5. dhisana/cli/models.py +26 -0
  6. dhisana/cli/predictions.py +20 -0
  7. dhisana/schemas/__init__.py +1 -0
  8. dhisana/schemas/common.py +399 -0
  9. dhisana/schemas/sales.py +965 -0
  10. dhisana/ui/__init__.py +1 -0
  11. dhisana/ui/components.py +472 -0
  12. dhisana/utils/__init__.py +1 -0
  13. dhisana/utils/add_mapping.py +352 -0
  14. dhisana/utils/agent_tools.py +51 -0
  15. dhisana/utils/apollo_tools.py +1597 -0
  16. dhisana/utils/assistant_tool_tag.py +4 -0
  17. dhisana/utils/built_with_api_tools.py +282 -0
  18. dhisana/utils/cache_output_tools.py +98 -0
  19. dhisana/utils/cache_output_tools_local.py +78 -0
  20. dhisana/utils/check_email_validity_tools.py +717 -0
  21. dhisana/utils/check_for_intent_signal.py +107 -0
  22. dhisana/utils/check_linkedin_url_validity.py +209 -0
  23. dhisana/utils/clay_tools.py +43 -0
  24. dhisana/utils/clean_properties.py +135 -0
  25. dhisana/utils/company_utils.py +60 -0
  26. dhisana/utils/compose_salesnav_query.py +259 -0
  27. dhisana/utils/compose_search_query.py +759 -0
  28. dhisana/utils/compose_three_step_workflow.py +234 -0
  29. dhisana/utils/composite_tools.py +137 -0
  30. dhisana/utils/dataframe_tools.py +237 -0
  31. dhisana/utils/domain_parser.py +45 -0
  32. dhisana/utils/email_body_utils.py +72 -0
  33. dhisana/utils/email_parse_helpers.py +132 -0
  34. dhisana/utils/email_provider.py +375 -0
  35. dhisana/utils/enrich_lead_information.py +933 -0
  36. dhisana/utils/extract_email_content_for_llm.py +101 -0
  37. dhisana/utils/fetch_openai_config.py +129 -0
  38. dhisana/utils/field_validators.py +426 -0
  39. dhisana/utils/g2_tools.py +104 -0
  40. dhisana/utils/generate_content.py +41 -0
  41. dhisana/utils/generate_custom_message.py +271 -0
  42. dhisana/utils/generate_email.py +278 -0
  43. dhisana/utils/generate_email_response.py +465 -0
  44. dhisana/utils/generate_flow.py +102 -0
  45. dhisana/utils/generate_leads_salesnav.py +303 -0
  46. dhisana/utils/generate_linkedin_connect_message.py +224 -0
  47. dhisana/utils/generate_linkedin_response_message.py +317 -0
  48. dhisana/utils/generate_structured_output_internal.py +462 -0
  49. dhisana/utils/google_custom_search.py +267 -0
  50. dhisana/utils/google_oauth_tools.py +727 -0
  51. dhisana/utils/google_workspace_tools.py +1294 -0
  52. dhisana/utils/hubspot_clearbit.py +96 -0
  53. dhisana/utils/hubspot_crm_tools.py +2440 -0
  54. dhisana/utils/instantly_tools.py +149 -0
  55. dhisana/utils/linkedin_crawler.py +168 -0
  56. dhisana/utils/lusha_tools.py +333 -0
  57. dhisana/utils/mailgun_tools.py +156 -0
  58. dhisana/utils/mailreach_tools.py +123 -0
  59. dhisana/utils/microsoft365_tools.py +455 -0
  60. dhisana/utils/openai_assistant_and_file_utils.py +267 -0
  61. dhisana/utils/openai_helpers.py +977 -0
  62. dhisana/utils/openapi_spec_to_tools.py +45 -0
  63. dhisana/utils/openapi_tool/__init__.py +1 -0
  64. dhisana/utils/openapi_tool/api_models.py +633 -0
  65. dhisana/utils/openapi_tool/convert_openai_spec_to_tool.py +271 -0
  66. dhisana/utils/openapi_tool/openapi_tool.py +319 -0
  67. dhisana/utils/parse_linkedin_messages_txt.py +100 -0
  68. dhisana/utils/profile.py +37 -0
  69. dhisana/utils/proxy_curl_tools.py +1226 -0
  70. dhisana/utils/proxycurl_search_leads.py +426 -0
  71. dhisana/utils/python_function_to_tools.py +83 -0
  72. dhisana/utils/research_lead.py +176 -0
  73. dhisana/utils/sales_navigator_crawler.py +1103 -0
  74. dhisana/utils/salesforce_crm_tools.py +477 -0
  75. dhisana/utils/search_router.py +131 -0
  76. dhisana/utils/search_router_jobs.py +51 -0
  77. dhisana/utils/sendgrid_tools.py +162 -0
  78. dhisana/utils/serarch_router_local_business.py +75 -0
  79. dhisana/utils/serpapi_additional_tools.py +290 -0
  80. dhisana/utils/serpapi_google_jobs.py +117 -0
  81. dhisana/utils/serpapi_google_search.py +188 -0
  82. dhisana/utils/serpapi_local_business_search.py +129 -0
  83. dhisana/utils/serpapi_search_tools.py +852 -0
  84. dhisana/utils/serperdev_google_jobs.py +125 -0
  85. dhisana/utils/serperdev_local_business.py +154 -0
  86. dhisana/utils/serperdev_search.py +233 -0
  87. dhisana/utils/smtp_email_tools.py +582 -0
  88. dhisana/utils/test_connect.py +2087 -0
  89. dhisana/utils/trasform_json.py +173 -0
  90. dhisana/utils/web_download_parse_tools.py +189 -0
  91. dhisana/utils/workflow_code_model.py +5 -0
  92. dhisana/utils/zoominfo_tools.py +357 -0
  93. dhisana/workflow/__init__.py +1 -0
  94. dhisana/workflow/agent.py +18 -0
  95. dhisana/workflow/flow.py +44 -0
  96. dhisana/workflow/task.py +43 -0
  97. dhisana/workflow/test.py +90 -0
  98. dhisana-0.0.1.dev243.dist-info/METADATA +43 -0
  99. dhisana-0.0.1.dev243.dist-info/RECORD +102 -0
  100. dhisana-0.0.1.dev243.dist-info/WHEEL +5 -0
  101. dhisana-0.0.1.dev243.dist-info/entry_points.txt +2 -0
  102. dhisana-0.0.1.dev243.dist-info/top_level.txt +1 -0
dhisana/utils/serpapi_search_tools.py
@@ -0,0 +1,852 @@
+ import json
+ import logging
+ import re
+ import urllib.parse
+ from typing import Any, Dict, List, Optional, Set
+ from urllib.parse import urlparse
+
+ import aiohttp
+ from bs4 import BeautifulSoup
+ from pydantic import BaseModel
+
+ from dhisana.utils.assistant_tool_tag import assistant_tool
+ from dhisana.utils.generate_structured_output_internal import (
+     get_structured_output_internal,
+ )
+ from dhisana.utils.search_router import search_google_with_tools
+ from dhisana.utils.serperdev_search import search_google_serper
+ from dhisana.utils.web_download_parse_tools import fetch_html_content
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ class LeadSearchResult(BaseModel):
+     first_name: str = ""
+     last_name: str = ""
+     full_name: str = ""
+     job_title: str = ""
+     linkedin_follower_count: int = 0
+     lead_location: str = ""
+     summary_about_lead: str = ""
+     user_linkedin_url: str = ""
+
+
+ class LinkedinCandidateChoice(BaseModel):
+     chosen_link: str = ""
+     confidence: float = 0.0
+     reasoning: str = ""
+
+
+ async def get_structured_output(text: str, tool_config: Optional[List[Dict]] = None) -> LeadSearchResult:
+     """Parse a text snippet into a ``LeadSearchResult`` using OpenAI."""
+
+     prompt = (
+         "Extract lead details from the text below.\n"
+         "If follower counts are mentioned, convert values like '1.5k+ followers' to an integer (e.g. 1500).\n"
+         f"Return JSON matching this schema:\n{json.dumps(LeadSearchResult.model_json_schema(), indent=2)}\n\n"
+         f"Text:\n{text}"
+     )
+     result, status = await get_structured_output_internal(
+         prompt, LeadSearchResult, model="gpt-5.1-chat", tool_config=tool_config
+     )
+     if status != "SUCCESS" or result is None:
+         return LeadSearchResult()
+     return result
+
+
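The prompt above offloads follower-count normalization ("1.5k+ followers" → 1500) to the model. For reference, a minimal deterministic equivalent might look like this; `parse_follower_count` is a hypothetical helper for illustration, not part of the package:

```python
import re

def parse_follower_count(text: str) -> int:
    """Convert strings like '1.5k+ followers' or '2,300 followers' to an int.

    Hypothetical helper mirroring the normalization the prompt asks the
    LLM to perform; not part of dhisana itself.
    """
    match = re.search(r"([\d.,]+)\s*([km]?)\+?\s*followers", text.lower())
    if not match:
        return 0
    number = float(match.group(1).replace(",", ""))
    multiplier = {"k": 1_000, "m": 1_000_000}.get(match.group(2), 1)
    return int(number * multiplier)

assert parse_follower_count("1.5k+ followers") == 1500
assert parse_follower_count("2,300 followers") == 2300
```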
+ @assistant_tool
+ async def find_user_linkedin_url_with_serper(
+     user_linkedin_url: str,
+     tool_config: Optional[List[Dict]] = None,
+ ) -> Optional[Dict]:
+     """Search Google via Serper.dev for ``user_linkedin_url`` and parse lead details."""
+
+     if not user_linkedin_url:
+         return None
+
+     normalized_input = extract_user_linkedin_page(user_linkedin_url)
+     results = await search_google_serper(user_linkedin_url, 10, tool_config=tool_config)
+     for item_json in results:
+         try:
+             item = json.loads(item_json)
+         except Exception:
+             continue
+         link = item.get("link", "")
+         if not link:
+             continue
+         if extract_user_linkedin_page(link) == normalized_input:
+             text = " ".join(
+                 [item.get("title", ""), item.get("subtitle", ""), item.get("snippet", "")]
+             ).strip()
+             structured = await get_structured_output(text, tool_config=tool_config)
+             structured.user_linkedin_url = normalized_input
+             return json.loads(structured.model_dump_json())
+     return None
+
+
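A usage sketch. The module path is inferred from the file listing above (dhisana/utils/serpapi_search_tools.py is the only file with +852 lines), and the `tool_config` contents are environment-specific, so only a placeholder is shown:

```python
import asyncio

# Path inferred from the listing above; verify against the installed package.
from dhisana.utils.serpapi_search_tools import find_user_linkedin_url_with_serper

async def main() -> None:
    lead = await find_user_linkedin_url_with_serper(
        "https://www.linkedin.com/in/some-profile",
        tool_config=None,  # supply Serper.dev credentials here in practice
    )
    print(lead)  # parsed lead dict, or None if no result matches the profile

asyncio.run(main())
```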
+ async def pick_best_linkedin_candidate_with_llm(
+     email: str,
+     user_name: str,
+     user_title: str,
+     user_location: str,
+     user_company: str,
+     candidates: List[Dict],
+     tool_config: Optional[List[Dict]] = None,
+ ) -> Optional[LinkedinCandidateChoice]:
+     """Ask the LLM to assess candidate LinkedIn URLs and pick the best match."""
+
+     if not candidates:
+         return None
+
+     # Only the three most recently collected candidates are scored.
+     recent_candidates = candidates[-3:]
+     candidate_lines = []
+     for idx, candidate in enumerate(recent_candidates, start=1):
+         candidate_lines.append(
+             "\n".join(
+                 [
+                     f"Candidate {idx}:",
+                     f" Link: {candidate.get('link', '')}",
+                     f" Title: {candidate.get('title', '')}",
+                     f" Snippet: {candidate.get('snippet', '')}",
+                     f" Subtitle: {candidate.get('subtitle', '')}",
+                     f" Query: {candidate.get('query', '')}",
+                 ]
+             )
+         )
+
+     prompt = (
+         "You are validating LinkedIn profile matches for a lead enrichment workflow.\n"
+         "Given the lead context and candidate search results, pick the most likely LinkedIn profile.\n"
+         "If no candidate seems appropriate, return an empty link and confidence 0.\n"
+         "Consider whether the email, name, company, title, or location aligns with the candidate.\n"
+         "Lead context:\n"
+         f"- Email: {email or 'unknown'}\n"
+         f"- Name: {user_name or 'unknown'}\n"
+         f"- Title: {user_title or 'unknown'}\n"
+         f"- Company: {user_company or 'unknown'}\n"
+         f"- Location: {user_location or 'unknown'}\n\n"
+         "Candidates:\n"
+         f"{chr(10).join(candidate_lines)}\n\n"
+         "Return JSON with fields: chosen_link (string), confidence (0-1 float), reasoning (short string)."
+     )
+
+     result, status = await get_structured_output_internal(
+         prompt,
+         LinkedinCandidateChoice,
+         model="gpt-5.1-chat",
+         tool_config=tool_config,
+     )
+
+     if status != "SUCCESS" or result is None:
+         return None
+
+     return result
+
+
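For context, each candidate dict carries the raw search-result fields collected later in this module. A hedged call sketch with illustrative values:

```python
# Hypothetical candidate record, shaped like the search results gathered
# by find_user_linkedin_url_by_email_google below.
candidates = [{
    "link": "https://www.linkedin.com/in/jane-doe",
    "title": "Jane Doe - VP Engineering - Acme | LinkedIn",
    "snippet": "Jane Doe. VP Engineering at Acme. San Francisco Bay Area.",
    "subtitle": "",
    "query": '"jane@acme.com" "linkedin.com/in"',
}]

async def demo() -> None:
    choice = await pick_best_linkedin_candidate_with_llm(
        email="jane@acme.com",
        user_name="Jane Doe",
        user_title="VP Engineering",
        user_location="San Francisco",
        user_company="Acme",
        candidates=candidates,
        tool_config=None,  # LLM credentials go here in practice
    )
    if choice and choice.confidence >= 0.8:
        print(choice.chosen_link, choice.reasoning)
```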
+ @assistant_tool
+ async def get_company_domain_from_google_search(
+     company_name: str,
+     location: Optional[str] = None,
+     tool_config: Optional[List[Dict]] = None
+ ) -> str:
+     """
+     Try to find the company domain from the company name using Google (SerpAPI or Serper.dev).
+     """
+     logger.info("Entering get_company_domain_from_google_search")
+
+     company_name_no_spaces = company_name.replace(" ", "")
+     if not company_name_no_spaces or company_name.lower() in ["none", "freelance"]:
+         logger.debug("Invalid or excluded company_name provided.")
+         return ""
+
+     # Skip well-known consumer sites before spending a search query on them.
+     excluded_company_names = ["linkedin", "wikipedia", "facebook", "instagram", "twitter", "youtube", "netflix"]
+     if any(excluded_name in company_name.lower() for excluded_name in excluded_company_names):
+         logger.debug("Company name is in excluded list, returning empty domain.")
+         return ""
+
+     query = f"\"{company_name}\" official website"
+     if location:
+         query = f"\"{company_name}\" official website, {location}"
+
+     try:
+         logger.debug(f"Performing search with query: {query}")
+         result = await search_google_with_tools(query, 1, tool_config=tool_config)
+         if not isinstance(result, list) or len(result) == 0:
+             logger.debug("No results for first attempt, retrying with fallback query.")
+             query = f"{company_name} official website"
+             result = await search_google_with_tools(query, 1, tool_config=tool_config)
+             if not isinstance(result, list) or len(result) == 0:
+                 logger.debug("No results from fallback query either.")
+                 return ''
+     except Exception:
+         logger.exception("Exception during get_company_domain_from_google_search.")
+         return ''
+
+     try:
+         result_json = json.loads(result[0])
+     except (json.JSONDecodeError, IndexError) as e:
+         logger.debug(f"Failed to parse the JSON from the result: {str(e)}")
+         return ''
+
+     link = result_json.get('link', '')
+     if not link:
+         logger.debug("No link found in the first search result.")
+         return ''
+
+     parsed_url = urlparse(link)
+     domain = parsed_url.netloc.lower()
+     if domain.startswith('www.'):
+         domain = domain[4:]
+
+     excluded_domains = [
+         "linkedin.com", "wikipedia.org", "usa.gov", "facebook.com",
+         "instagram.com", "twitter.com", "x.com", "google.com", "youtube.com",
+         "netflix.com", "freelance.com", "zoominfo.com", "reddit.com"
+     ]
+     excluded_domains_lower = [d.lower() for d in excluded_domains]
+
+     if any(domain == d or domain.endswith(f".{d}") for d in excluded_domains_lower):
+         logger.debug(f"Domain {domain} is in the excluded list.")
+         return ""
+
+     logger.info(f"Found domain {domain}")
+     return domain
+
+
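The tail of the function reduces a result link to a bare domain and filters aggregator sites. The same normalization in isolation, as a minimal sketch (`normalize_result_domain` and the shortened exclusion set are illustrative, not package API):

```python
from urllib.parse import urlparse

EXCLUDED = {"linkedin.com", "wikipedia.org", "facebook.com", "google.com"}

def normalize_result_domain(link: str) -> str:
    """Strip scheme and 'www.', then drop known aggregator domains."""
    domain = urlparse(link).netloc.lower()
    if domain.startswith("www."):
        domain = domain[4:]
    if any(domain == d or domain.endswith(f".{d}") for d in EXCLUDED):
        return ""
    return domain

assert normalize_result_domain("https://www.acme.com/about") == "acme.com"
assert normalize_result_domain("https://en.wikipedia.org/wiki/Acme") == ""
```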
+ @assistant_tool
+ async def get_signal_strength(
+     domain_to_search: str,
+     keywords: List[str],
+     in_title: Optional[List[str]] = None,
+     not_in_title: Optional[List[str]] = None,
+     negative_keywords: Optional[List[str]] = None,
+     tool_config: Optional[List[Dict]] = None
+ ) -> int:
+     """
+     Score how strongly the keywords match in search by counting how many
+     of the top search results contain all desired keywords in the snippet.
+     """
+     logger.info("Entering get_signal_strength")
+
+     # Avoid mutable default arguments.
+     in_title = in_title or []
+     not_in_title = not_in_title or []
+     negative_keywords = negative_keywords or []
+
+     if not keywords and not domain_to_search:
+         logger.warning("No domain to search or keywords provided.")
+         return 0
+
+     query_parts = []
+     if domain_to_search:
+         query_parts.append(f"site:{domain_to_search}")
+     for kw in keywords:
+         query_parts.append(f"\"{kw}\"")
+     for kw in in_title:
+         query_parts.append(f'intitle:"{kw}"')
+     for kw in not_in_title:
+         query_parts.append(f'-intitle:"{kw}"')
+     for kw in negative_keywords:
+         query_parts.append(f'-"{kw}"')
+
+     final_query = " ".join(query_parts).strip()
+     if not final_query:
+         logger.debug("Constructed query is empty, returning score=0.")
+         return 0
+
+     logger.debug(f"Performing get_signal_strength search with query: {final_query}")
+     try:
+         results = await search_google_with_tools(final_query, 5, tool_config=tool_config)
+     except Exception:
+         logger.exception("Exception occurred while searching for signal strength.")
+         return 0
+
+     if not isinstance(results, list) or len(results) == 0:
+         logger.debug("No results found; returning 0.")
+         return 0
+
+     score = 0
+     for result_item in results:
+         try:
+             result_json = json.loads(result_item)
+             snippet_text = result_json.get('snippet', '').lower()
+             if all(kw.lower() in snippet_text for kw in keywords):
+                 logger.debug(f"Found match in snippet: {snippet_text[:60]}...")
+                 score += 1
+                 if score == 5:
+                     break
+         except (json.JSONDecodeError, KeyError):
+             logger.debug("Failed to decode or parse snippet from a result.")
+             continue
+
+     logger.info(f"Final signal strength score: {score}")
+     return score
+
+
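A worked example of the query composition, showing what `final_query` looks like for typical inputs (all values illustrative):

```python
# Illustrative composition only; get_signal_strength builds this internally
# from domain_to_search, keywords, in_title, and negative_keywords.
query_parts = ["site:acme.com", '"kubernetes"', '"migration"',
               'intitle:"case study"', '-"jobs"']
final_query = " ".join(query_parts)
print(final_query)
# site:acme.com "kubernetes" "migration" intitle:"case study" -"jobs"
# The returned score (0-5) counts how many of the top-5 result snippets
# contain every keyword.
```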
+ def extract_user_linkedin_page(url: str) -> str:
+     """
+     Extract and return the user page part of a LinkedIn URL.
+     Ensures the domain is www.linkedin.com and removes any suffix path or query parameters.
+     """
+     logger.debug(f"Entering extract_user_linkedin_page with URL: {url}")
+     if not url:
+         return ""
+
+     normalized_url = re.sub(r"^(https?://)?([\w\-]+\.)?linkedin\.com", "https://www.linkedin.com", url)
+     match = re.match(r"https://www\.linkedin\.com/in/([^/?#]+)", normalized_url)
+     if match:
+         page = f"https://www.linkedin.com/in/{match.group(1)}"
+         logger.debug(f"Extracted user LinkedIn page: {page}")
+         return page
+
+     logger.debug("No valid LinkedIn user page found.")
+     return ""
+
+
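The normalization accepts regional subdomains and scheme-less hosts; a few illustrative cases run against the function above:

```python
# Behaviour of the normalizer on representative inputs.
assert extract_user_linkedin_page(
    "https://in.linkedin.com/in/jane-doe/?trk=public_profile"
) == "https://www.linkedin.com/in/jane-doe"
assert extract_user_linkedin_page(
    "linkedin.com/in/jane-doe/details/experience"
) == "https://www.linkedin.com/in/jane-doe"
# Company URLs are rejected by this helper (see extract_company_page below).
assert extract_user_linkedin_page("https://www.linkedin.com/company/acme") == ""
```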
+ @assistant_tool
+ async def find_user_linkedin_url_google(
+     user_name: str,
+     user_title: str,
+     user_location: str,
+     user_company: str,
+     user_company_domain: str = "",
+     use_strict_check: bool = True,
+     tool_config: Optional[List[Dict]] = None
+ ) -> str:
+     """
+     Find the LinkedIn URL for a user based on their name, title, location, and company.
+     """
+     logger.info("Entering find_user_linkedin_url_google")
+
+     if not user_name:
+         logger.warning("No user_name provided.")
+         return ""
+
+     if use_strict_check:
+         queries = [
+             f'site:linkedin.com/in ("{user_name}") ({user_company} | {user_company_domain}) ( {user_title} | ) intitle:"{user_name}" -intitle:"profiles"'
+         ]
+     else:
+         queries = [
+             f'site:linkedin.com/in "{user_name}" "{user_location}" "{user_title}" "{user_company}" intitle:"{user_name}" -intitle:"profiles"',
+             f'site:linkedin.com/in "{user_name}" "{user_location}" "{user_company}" intitle:"{user_name}" -intitle:"profiles"',
+             f'site:linkedin.com/in "{user_name}", {user_location} intitle:"{user_name}" -intitle:"profiles"',
+             f'site:linkedin.com/in "{user_name}" intitle:"{user_name}"'
+         ]
+
+     for query in queries:
+         if not query.strip():
+             continue
+         logger.debug(f"Searching with query: {query}")
+         try:
+             results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
+         except Exception:
+             logger.exception("Error searching for LinkedIn user URL.")
+             continue
+
+         if not isinstance(results, list) or len(results) == 0:
+             logger.debug("No results for this query, moving to next.")
+             continue
+
+         try:
+             result_json = json.loads(results[0])
+         except (json.JSONDecodeError, IndexError):
+             logger.debug("Failed to parse JSON from the search result.")
+             continue
+
+         link = result_json.get('link', '')
+         if not link:
+             logger.debug("No link in first search result.")
+             continue
+
+         parsed_url = urlparse(link)
+         if 'linkedin.com/in' in (parsed_url.netloc + parsed_url.path):
+             link = extract_user_linkedin_page(link)
+             logger.info(f"Found LinkedIn user page: {link}")
+             return link
+
+     logger.info("No matching LinkedIn user page found.")
+     return ""
+
+
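A hedged usage sketch; `tool_config` must route to SerpAPI or Serper.dev, and its exact shape is environment-specific and not shown in this diff:

```python
# Inside an async context.
url = await find_user_linkedin_url_google(
    user_name="Jane Doe",
    user_title="VP Engineering",
    user_location="San Francisco",
    user_company="Acme",
    user_company_domain="acme.com",
    use_strict_check=False,  # fall through progressively looser queries
    tool_config=tool_config,
)
# e.g. "https://www.linkedin.com/in/jane-doe", or "" if no linkedin.com/in
# link appears in the first hit of any query.
```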
+ @assistant_tool
+ async def find_user_linkedin_url_by_email_google(
+     email: str,
+     user_name: str = "",
+     user_title: str = "",
+     user_location: str = "",
+     user_company: str = "",
+     tool_config: Optional[List[Dict]] = None,
+ ) -> Optional[Dict[str, Any]]:
+     """
+     Find the LinkedIn URL for a user based primarily on their email address.
+
+     Additional profile hints (name, title, location, company) improve query precision
+     when supplied. Returns a dict with the best LinkedIn URL, LLM confidence score,
+     and short reasoning when a match clears the confidence threshold; otherwise ``None``.
+     """
+     logger.info("Entering find_user_linkedin_url_by_email_google")
+
+     if not email:
+         logger.warning("No email provided.")
+         return None
+
+     normalized_email = email.strip().lower()
+     email_local_part = normalized_email.split("@")[0] if "@" in normalized_email else normalized_email
+     email_local_humanized = re.sub(r"[._-]+", " ", email_local_part).strip()
+
+     queries: List[str] = []
+
+     def add_query(query: str) -> None:
+         query = query.strip()
+         if query and query not in queries:
+             queries.append(query)
+
+     def add_query_parts(*parts: str) -> None:
+         tokens = [part.strip() for part in parts if part and part.strip()]
+         if not tokens:
+             return
+         add_query(" ".join(tokens))
+
+     enriched_terms = []
+     if user_name:
+         enriched_terms.append(f'"{user_name}"')
+     if user_company:
+         enriched_terms.append(f'"{user_company}"')
+     if user_title:
+         enriched_terms.append(f'"{user_title}"')
+     if user_location:
+         enriched_terms.append(f'"{user_location}"')
+     base_hint = " ".join(enriched_terms)
+
+     # Prioritise the direct email search variants before broader fallbacks.
+     add_query_parts(normalized_email, "linkedin.com/in", base_hint)
+     add_query_parts(normalized_email, "linkedin.com", base_hint)
+     add_query_parts(normalized_email, "linkedin", base_hint)
+     add_query_parts(normalized_email, base_hint)
+     add_query(f'"{normalized_email}" "linkedin.com/in" {base_hint}')
+     add_query(f'"{normalized_email}" "linkedin.com" {base_hint}')
+     add_query(f'"{normalized_email}" linkedin {base_hint}')
+
+     if email_local_part and email_local_part != normalized_email:
+         add_query_parts(email_local_part, "linkedin.com/in", base_hint)
+         add_query_parts(email_local_part, "linkedin.com", base_hint)
+         add_query_parts(email_local_part, "linkedin", base_hint)
+         add_query(f'"{email_local_part}" "linkedin.com/in" {base_hint}')
+         add_query(f'"{email_local_part}" "linkedin.com" {base_hint}')
+
+     if email_local_humanized and email_local_humanized not in {email_local_part, normalized_email}:
+         add_query_parts(email_local_humanized, "linkedin", base_hint)
+         add_query(f'"{email_local_humanized}" linkedin {base_hint}')
+
+     if normalized_email:
+         add_query(f'site:linkedin.com/in "{normalized_email}" {base_hint}')
+
+     if email_local_part:
+         add_query(f'site:linkedin.com/in "{email_local_part}" {base_hint}')
+
+     if email_local_humanized and email_local_humanized != email_local_part:
+         add_query(f'site:linkedin.com/in "{email_local_humanized}" {base_hint}')
+
+     if base_hint:
+         lookup_hint = user_name or email_local_humanized or email_local_part or normalized_email
+         add_query(
+             f'site:linkedin.com/in "{normalized_email}" {base_hint} '
+             f'intitle:"{lookup_hint}" -intitle:"profiles"'
+         )
+         if email_local_humanized:
+             add_query(
+                 f'site:linkedin.com/in "{email_local_humanized}" {base_hint} '
+                 f'intitle:"{lookup_hint}" -intitle:"profiles"'
+             )
+
+     candidate_records: List[Dict[str, str]] = []
+     seen_links: Set[str] = set()
+     best_llm_choice: Optional[LinkedinCandidateChoice] = None
+     best_llm_link: str = ""
+     HIGH_CONFIDENCE_THRESHOLD = 0.8
+     MIN_CONFIDENCE_THRESHOLD = 0.75
+
+     async def evaluate_with_llm() -> Optional[LinkedinCandidateChoice]:
+         nonlocal best_llm_choice, best_llm_link
+
+         llm_choice = await pick_best_linkedin_candidate_with_llm(
+             email=email,
+             user_name=user_name,
+             user_title=user_title,
+             user_location=user_location,
+             user_company=user_company,
+             candidates=candidate_records,
+             tool_config=tool_config,
+         )
+
+         if not llm_choice or not llm_choice.chosen_link:
+             return None
+
+         chosen_link = extract_user_linkedin_page(llm_choice.chosen_link)
+         if not chosen_link:
+             return None
+
+         llm_choice.chosen_link = chosen_link
+
+         if best_llm_choice is None or llm_choice.confidence > best_llm_choice.confidence:
+             best_llm_choice = llm_choice
+             best_llm_link = chosen_link
+             logger.debug(
+                 "LLM updated best candidate: %s (confidence %.2f) reason: %s",
+                 chosen_link,
+                 llm_choice.confidence,
+                 llm_choice.reasoning,
+             )
+
+         if llm_choice.confidence >= HIGH_CONFIDENCE_THRESHOLD:
+             logger.info(
+                 "Returning LinkedIn user page by email via LLM scoring: %s (confidence %.2f)",
+                 chosen_link,
+                 llm_choice.confidence,
+             )
+             return llm_choice
+
+         return None
+
+     for query in queries:
+         query = query.strip()
+         if not query:
+             continue
+         logger.debug(f"Searching with query: {query}")
+
+         try:
+             results = await search_google_with_tools(query, 5, tool_config=tool_config)
+         except Exception:
+             logger.exception("Error searching for LinkedIn user URL by email.")
+             continue
+
+         if not isinstance(results, list) or len(results) == 0:
+             logger.debug("No results for this query, moving to next.")
+             continue
+
+         for result_item in results:
+             try:
+                 result_json = json.loads(result_item)
+             except (json.JSONDecodeError, IndexError):
+                 logger.debug("Failed to parse JSON from the search result.")
+                 continue
+
+             link = result_json.get('link', '')
+             if not link:
+                 continue
+
+             parsed_url = urlparse(link)
+             if 'linkedin.com/in' in (parsed_url.netloc + parsed_url.path):
+                 link = extract_user_linkedin_page(link)
+                 if not link or link in seen_links:
+                     continue
+
+                 title = result_json.get('title', '')
+                 snippet = result_json.get('snippet', '')
+                 subtitle = result_json.get('subtitle', '')
+
+                 candidate_records.append(
+                     {
+                         "link": link,
+                         "title": title,
+                         "snippet": snippet,
+                         "subtitle": subtitle,
+                         "query": query,
+                     }
+                 )
+                 # Keep a sliding window of at most six recent candidates.
+                 if len(candidate_records) > 6:
+                     candidate_records.pop(0)
+                 seen_links.add(link)
+
+         high_conf_choice = await evaluate_with_llm()
+         if high_conf_choice:
+             return {
+                 "linkedin_url": high_conf_choice.chosen_link,
+                 "confidence": high_conf_choice.confidence,
+                 "reasoning": high_conf_choice.reasoning,
+             }
+
+     if best_llm_choice and best_llm_link and best_llm_choice.confidence >= MIN_CONFIDENCE_THRESHOLD:
+         logger.info(
+             "Returning LinkedIn user page by email via LLM scoring (best overall): %s (confidence %.2f)",
+             best_llm_link,
+             best_llm_choice.confidence,
+         )
+         return {
+             "linkedin_url": best_llm_link,
+             "confidence": best_llm_choice.confidence,
+             "reasoning": best_llm_choice.reasoning,
+         }
+
+     logger.info("No matching LinkedIn user page found using email queries.")
+     return None
+
+
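A hedged sketch of the result contract: the function returns early on any candidate scoring at or above 0.8, falls back to the best candidate if it scored at least 0.75, and otherwise returns None. All values below are illustrative:

```python
# Inside an async context.
result = await find_user_linkedin_url_by_email_google(
    email="jane@acme.com",
    user_name="Jane Doe",
    user_company="Acme",
    tool_config=tool_config,  # search + LLM credentials, environment-specific
)
if result:
    # e.g. {"linkedin_url": "https://www.linkedin.com/in/jane-doe",
    #       "confidence": 0.86, "reasoning": "Email local part matches name"}
    print(result["linkedin_url"], result["confidence"])
```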
+ @assistant_tool
+ async def find_user_linkedin_url_by_job_title_google(
+     user_title: str,
+     user_location: str,
+     user_company: str,
+     tool_config: Optional[List[Dict]] = None
+ ) -> str:
+     """
+     Find the LinkedIn URL for a user based on their job title, location, and company.
+     """
+     logger.info("Entering find_user_linkedin_url_by_job_title_google")
+
+     # Note: user_location is currently not used in the query composition.
+     queries = [
+         f'site:linkedin.com/in "{user_company}" AND "{user_title}" -intitle:"profiles"',
+     ]
+
+     for query in queries:
+         if not query.strip():
+             continue
+         logger.debug(f"Searching with query: {query}")
+
+         try:
+             results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
+         except Exception:
+             logger.exception("Error searching for LinkedIn URL by job title.")
+             continue
+
+         if not isinstance(results, list) or len(results) == 0:
+             logger.debug("No results for this query, moving to next.")
+             continue
+
+         try:
+             result_json = json.loads(results[0])
+         except (json.JSONDecodeError, IndexError):
+             logger.debug("Failed to parse JSON from the search result.")
+             continue
+
+         link = result_json.get('link', '')
+         if not link:
+             logger.debug("No link in the first search result.")
+             continue
+
+         parsed_url = urlparse(link)
+         if 'linkedin.com/in' in (parsed_url.netloc + parsed_url.path):
+             link = extract_user_linkedin_page(link)
+             logger.info(f"Found LinkedIn user page by job title: {link}")
+             return link
+
+     logger.info("No matching LinkedIn user page found by job title.")
+     return ""
+
+
+ @assistant_tool
+ async def find_user_linkedin_url_by_google_search(
+     queries: List[str],
+     number_of_results: int = 5,
+     tool_config: Optional[List[Dict]] = None
+ ) -> List[str]:
+     """
+     Find LinkedIn user URLs based on the provided Google search queries.
+     """
+     logger.info("Entering find_user_linkedin_url_by_google_search")
+     found_urls = []
+
+     for query in queries:
+         if not query.strip():
+             continue
+         logger.debug(f"Searching with query: {query}")
+
+         try:
+             results = await search_google_with_tools(query.strip(), number_of_results, tool_config=tool_config)
+         except Exception:
+             logger.exception("Error searching for LinkedIn URL using Google search.")
+             continue
+
+         if not isinstance(results, list) or len(results) == 0:
+             logger.debug("No results for this query, moving to next.")
+             continue
+
+         # Only the first result of each query is inspected, even when
+         # number_of_results asks the search backend for more.
+         try:
+             result_json = json.loads(results[0])
+         except (json.JSONDecodeError, IndexError):
+             logger.debug("Failed to parse JSON from the search result.")
+             continue
+
+         link = result_json.get('link', '')
+         if not link:
+             logger.debug("No link in the first search result.")
+             continue
+
+         parsed_url = urlparse(link)
+         if 'linkedin.com/in' in (parsed_url.netloc + parsed_url.path):
+             link = extract_user_linkedin_page(link)
+             logger.info(f"Found LinkedIn user page: {link}")
+             found_urls.append(link)
+
+     if not found_urls:
+         logger.info("No matching LinkedIn user page found based on provided queries.")
+     return found_urls
+
+
+ def extract_company_page(url: str) -> str:
+     """
+     Extract and return the company page part of a LinkedIn URL.
+     Ensures the domain is www.linkedin.com and removes any suffix path or query parameters.
+     """
+     logger.debug(f"Entering extract_company_page with URL: {url}")
+     if not url:
+         return ""
+
+     normalized_url = re.sub(r"^(https?://)?([\w\-]+\.)?linkedin\.com", "https://www.linkedin.com", url)
+     match = re.match(r"https://www\.linkedin\.com/company/([\w\-]+)", normalized_url)
+     if match:
+         company_page = f"https://www.linkedin.com/company/{match.group(1)}"
+         logger.debug(f"Extracted LinkedIn company page: {company_page}")
+         return company_page
+
+     logger.debug("No valid LinkedIn company page found.")
+     return ""
+
+
+ @assistant_tool
+ async def find_organization_linkedin_url_with_google_search(
+     company_name: str,
+     company_location: Optional[str] = None,
+     company_domain: Optional[str] = None,
+     use_strict_check: bool = True,
+     tool_config: Optional[List[Dict]] = None,
+ ) -> str:
+     """
+     Find the LinkedIn URL for a company based on its name and optional location using Google search.
+     """
+     logger.info("Entering find_organization_linkedin_url_with_google_search")
+
+     if not company_name:
+         logger.warning("No company_name provided.")
+         return ""
+
+     if use_strict_check:
+         # Guard against interpolating the literal string "None" into the query.
+         queries = [f'site:linkedin.com/company "{company_name}" {company_domain or ""}']
+     else:
+         if company_location:
+             queries = [
+                 f'site:linkedin.com/company "{company_name}" {company_location} -intitle:"jobs"',
+                 f'site:linkedin.com/company "{company_name}" -intitle:"jobs"',
+                 f'site:linkedin.com/company {company_name} {company_location} -intitle:"jobs"',
+             ]
+         else:
+             queries = [
+                 f'site:linkedin.com/company "{company_name}" -intitle:"jobs"',
+                 f'site:linkedin.com/company {company_name} -intitle:"jobs"'
+             ]
+
+     for query in queries:
+         if not query.strip():
+             continue
+
+         logger.debug(f"Searching with query: {query}")
+         try:
+             results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
+         except Exception:
+             logger.exception("Error searching for organization LinkedIn URL.")
+             continue
+
+         if not isinstance(results, list) or len(results) == 0:
+             logger.debug("No results for this query, moving to next.")
+             continue
+
+         try:
+             result_json = json.loads(results[0])
+         except (json.JSONDecodeError, IndexError):
+             logger.debug("Failed to parse JSON from the search result.")
+             continue
+
+         link = result_json.get('link', '')
+         if not link:
+             logger.debug("No link found in the first result.")
+             continue
+
+         parsed_url = urlparse(link)
+         if 'linkedin.com/company' in (parsed_url.netloc + parsed_url.path):
+             link = extract_company_page(link)
+             logger.info(f"Found LinkedIn company page: {link}")
+             return link
+
+     logger.info("No matching LinkedIn company page found.")
+     return ""
+
+
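A hedged usage sketch with illustrative values; as elsewhere, the `tool_config` contents are environment-specific:

```python
# Inside an async context.
org_url = await find_organization_linkedin_url_with_google_search(
    company_name="Acme",
    company_location="Austin, TX",
    company_domain="acme.com",
    use_strict_check=True,
    tool_config=tool_config,
)
# e.g. "https://www.linkedin.com/company/acme", or "" when no
# linkedin.com/company link survives the first-hit check.
```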
+ async def get_external_links(url: str) -> List[str]:
+     """
+     Fetch external links from a given URL by parsing its HTML content.
+     """
+     logger.debug(f"Entering get_external_links for URL: {url}")
+     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}
+
+     try:
+         async with aiohttp.ClientSession(headers=headers) as session:
+             async with session.get(url, allow_redirects=True) as response:
+                 logger.debug(f"Received status for external links: {response.status}")
+                 if response.status == 200:
+                     content = await response.text()
+                     soup = BeautifulSoup(content, "html.parser")
+                     external_links = []
+                     for link in soup.find_all('a', href=True):
+                         href = link['href']
+                         if href.startswith('http') and not href.startswith(url):
+                             external_links.append(href)
+                     logger.debug(f"Found {len(external_links)} external links.")
+                     return external_links
+                 else:
+                     logger.warning(f"Non-200 status ({response.status}) while fetching external links.")
+                     return []
+     except Exception:
+         logger.exception("Exception occurred while fetching external links.")
+         return []
+
+
+ async def get_resolved_linkedin_links(url: str) -> List[str]:
+     """
+     Fetch HTML content from a URL and return any linkedin.com/company links found.
+     """
+     logger.debug(f"Entering get_resolved_linkedin_links for URL: {url}")
+     try:
+         content = await fetch_html_content(url)
+     except Exception:
+         logger.exception("Exception occurred while fetching HTML content.")
+         return []
+
+     # Stop at quotes and angle brackets so trailing HTML is not captured.
+     linkedin_links = re.findall(r'https://www\.linkedin\.com/company/[^\s<>"]+', content)
+     unique_links = list(set(linkedin_links))
+     logger.debug(f"Found {len(unique_links)} LinkedIn links.")
+     return unique_links
+
+
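A quick illustration of the extraction and de-duplication on raw HTML (hypothetical input):

```python
import re

html = (
    '<a href="https://www.linkedin.com/company/acme">Acme</a> '
    '<a href="https://www.linkedin.com/company/acme">duplicate</a>'
)
links = re.findall(r'https://www\.linkedin\.com/company/[^\s<>"]+', html)
print(sorted(set(links)))  # ['https://www.linkedin.com/company/acme']
```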
+ @assistant_tool
+ async def get_company_website_from_linkedin_url(linkedin_url: str) -> str:
+     """
+     Attempt to extract a company's website from its LinkedIn URL by
+     scanning external links that contain "trk=about_website".
+     """
+     logger.info("Entering get_company_website_from_linkedin_url")
+
+     if not linkedin_url:
+         logger.debug("Empty LinkedIn URL provided, returning empty string.")
+         return ""
+
+     try:
+         links = await get_external_links(linkedin_url)
+     except Exception:
+         logger.exception("Exception occurred while getting external links for LinkedIn URL.")
+         return ""
+
+     for link in links:
+         if 'trk=about_website' in link:
+             parsed_link = urllib.parse.urlparse(link)
+             query_params = urllib.parse.parse_qs(parsed_link.query)
+             if 'url' in query_params:
+                 encoded_url = query_params['url'][0]
+                 company_website = urllib.parse.unquote(encoded_url)
+                 logger.info(f"Extracted company website: {company_website}")
+                 return company_website
+
+     logger.debug("No company website link found with 'trk=about_website'.")
+     return ""
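The "about" website button on a LinkedIn company page goes through a redirect whose `url` query parameter holds the percent-encoded target. The decoding step in isolation; the redirect path below is hypothetical, only the parameter handling matters:

```python
import urllib.parse

# Hypothetical redirect link of the kind scanned for above.
redirect = (
    "https://www.linkedin.com/redir/redirect"
    "?url=https%3A%2F%2Fwww.acme.com%2F&trk=about_website"
)
params = urllib.parse.parse_qs(urllib.parse.urlparse(redirect).query)
# parse_qs already percent-decodes values; the extra unquote in the
# function above is a harmless no-op for inputs like this.
website = urllib.parse.unquote(params["url"][0])
print(website)  # https://www.acme.com/
```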