dhisana 0.0.1.dev85__py3-none-any.whl → 0.0.1.dev236__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. dhisana/schemas/common.py +33 -0
  2. dhisana/schemas/sales.py +224 -23
  3. dhisana/utils/add_mapping.py +72 -63
  4. dhisana/utils/apollo_tools.py +739 -109
  5. dhisana/utils/built_with_api_tools.py +4 -2
  6. dhisana/utils/cache_output_tools.py +23 -23
  7. dhisana/utils/check_email_validity_tools.py +456 -458
  8. dhisana/utils/check_for_intent_signal.py +1 -2
  9. dhisana/utils/check_linkedin_url_validity.py +34 -8
  10. dhisana/utils/clay_tools.py +3 -2
  11. dhisana/utils/clean_properties.py +3 -1
  12. dhisana/utils/compose_salesnav_query.py +0 -1
  13. dhisana/utils/compose_search_query.py +7 -3
  14. dhisana/utils/composite_tools.py +0 -1
  15. dhisana/utils/dataframe_tools.py +2 -2
  16. dhisana/utils/email_body_utils.py +72 -0
  17. dhisana/utils/email_provider.py +375 -0
  18. dhisana/utils/enrich_lead_information.py +585 -85
  19. dhisana/utils/fetch_openai_config.py +129 -0
  20. dhisana/utils/field_validators.py +1 -1
  21. dhisana/utils/g2_tools.py +0 -1
  22. dhisana/utils/generate_content.py +0 -1
  23. dhisana/utils/generate_email.py +69 -16
  24. dhisana/utils/generate_email_response.py +298 -41
  25. dhisana/utils/generate_flow.py +0 -1
  26. dhisana/utils/generate_linkedin_connect_message.py +19 -6
  27. dhisana/utils/generate_linkedin_response_message.py +156 -65
  28. dhisana/utils/generate_structured_output_internal.py +351 -131
  29. dhisana/utils/google_custom_search.py +150 -44
  30. dhisana/utils/google_oauth_tools.py +721 -0
  31. dhisana/utils/google_workspace_tools.py +391 -25
  32. dhisana/utils/hubspot_clearbit.py +3 -1
  33. dhisana/utils/hubspot_crm_tools.py +771 -167
  34. dhisana/utils/instantly_tools.py +3 -1
  35. dhisana/utils/lusha_tools.py +10 -7
  36. dhisana/utils/mailgun_tools.py +150 -0
  37. dhisana/utils/microsoft365_tools.py +447 -0
  38. dhisana/utils/openai_assistant_and_file_utils.py +121 -177
  39. dhisana/utils/openai_helpers.py +19 -16
  40. dhisana/utils/parse_linkedin_messages_txt.py +2 -3
  41. dhisana/utils/profile.py +37 -0
  42. dhisana/utils/proxy_curl_tools.py +507 -206
  43. dhisana/utils/proxycurl_search_leads.py +426 -0
  44. dhisana/utils/research_lead.py +121 -68
  45. dhisana/utils/sales_navigator_crawler.py +1 -6
  46. dhisana/utils/salesforce_crm_tools.py +323 -50
  47. dhisana/utils/search_router.py +131 -0
  48. dhisana/utils/search_router_jobs.py +51 -0
  49. dhisana/utils/sendgrid_tools.py +126 -91
  50. dhisana/utils/serarch_router_local_business.py +75 -0
  51. dhisana/utils/serpapi_additional_tools.py +290 -0
  52. dhisana/utils/serpapi_google_jobs.py +117 -0
  53. dhisana/utils/serpapi_google_search.py +188 -0
  54. dhisana/utils/serpapi_local_business_search.py +129 -0
  55. dhisana/utils/serpapi_search_tools.py +363 -432
  56. dhisana/utils/serperdev_google_jobs.py +125 -0
  57. dhisana/utils/serperdev_local_business.py +154 -0
  58. dhisana/utils/serperdev_search.py +233 -0
  59. dhisana/utils/smtp_email_tools.py +576 -0
  60. dhisana/utils/test_connect.py +1765 -92
  61. dhisana/utils/trasform_json.py +95 -16
  62. dhisana/utils/web_download_parse_tools.py +0 -1
  63. dhisana/utils/zoominfo_tools.py +2 -3
  64. dhisana/workflow/test.py +1 -1
  65. {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/METADATA +5 -2
  66. dhisana-0.0.1.dev236.dist-info/RECORD +100 -0
  67. {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/WHEEL +1 -1
  68. dhisana-0.0.1.dev85.dist-info/RECORD +0 -81
  69. {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/entry_points.txt +0 -0
  70. {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/top_level.txt +0 -0
@@ -1,384 +1,150 @@
1
1
  import json
2
- import os
3
2
  import re
4
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Set
5
4
  from urllib.parse import urlparse
5
+ import urllib.parse
6
6
  import aiohttp
7
7
  from bs4 import BeautifulSoup
8
8
  import urllib
9
+ from pydantic import BaseModel
9
10
 
10
- from dhisana.utils.assistant_tool_tag import assistant_tool
11
- from dhisana.utils.cache_output_tools import cache_output, retrieve_output
12
- from dhisana.utils.web_download_parse_tools import fetch_html_content, get_html_content_from_url
11
+ from dhisana.utils.serperdev_search import search_google_serper
12
+ from dhisana.utils.generate_structured_output_internal import (
13
+ get_structured_output_internal,
14
+ )
13
15
 
14
16
  import logging
17
+
15
18
  logging.basicConfig(level=logging.INFO)
16
19
  logger = logging.getLogger(__name__)
17
20
 
21
+ from dhisana.utils.search_router import search_google_with_tools
22
+ from dhisana.utils.assistant_tool_tag import assistant_tool
18
23
 
19
- def get_serp_api_access_token(tool_config: Optional[List[Dict]] = None) -> str:
20
- """
21
- Retrieves the SERPAPI_KEY access token from the provided tool configuration.
22
-
23
- Args:
24
- tool_config (list): A list of dictionaries containing the tool configuration.
25
- Each dictionary should have a "name" key and a "configuration" key,
26
- where "configuration" is a list of dictionaries containing "name" and "value" keys.
27
-
28
- Returns:
29
- str: The SERPAPI_KEY access token.
30
-
31
- Raises:
32
- ValueError: If the access token is not found in the tool configuration or environment variable.
33
- """
34
- logger.info("Entering get_serp_api_access_token")
35
- SERPAPI_KEY = None
36
-
37
- if tool_config:
38
- logger.debug(f"Tool config provided: {tool_config}")
39
- serpapi_config = next(
40
- (item for item in tool_config if item.get("name") == "serpapi"), None
41
- )
42
- if serpapi_config:
43
- config_map = {
44
- item["name"]: item["value"]
45
- for item in serpapi_config.get("configuration", [])
46
- if item
47
- }
48
- SERPAPI_KEY = config_map.get("apiKey")
49
- else:
50
- logger.warning("No 'serpapi' config item found in tool_config.")
51
- else:
52
- logger.debug("No tool_config provided or it's None.")
53
-
54
- SERPAPI_KEY = SERPAPI_KEY or os.getenv("SERPAPI_KEY")
55
- if not SERPAPI_KEY:
56
- logger.error("SERPAPI_KEY not found in configuration or environment.")
57
- raise ValueError("SERPAPI_KEY access token not found in tool_config or environment variable")
58
-
59
- logger.info("Retrieved SERPAPI_KEY successfully.")
60
- return SERPAPI_KEY
61
-
62
-
63
- @assistant_tool
64
- async def search_google(
65
- query: str,
66
- number_of_results: int = 10,
67
- offset: int = 0,
68
- tool_config: Optional[List[Dict]] = None,
69
- as_oq: Optional[str] = None # <-- NEW PARAM for optional keywords
70
- ) -> List[str]:
71
- """
72
- Search Google using SERP API, supporting pagination and an explicit 'offset'
73
- parameter to start from a specific result index.
74
- Now also supports 'as_oq' for optional query terms in SERP API.
75
-
76
- Parameters:
77
- - query (str): The search query.
78
- - number_of_results (int): The total number of results to return. Default is 10.
79
- - offset (int): The starting index for the first result returned (Google pagination).
80
- - tool_config (Optional[List[Dict]]): Configuration containing SERP API token, etc.
81
- - as_oq (Optional[str]): Optional query terms for SerpAPI (if supported).
82
-
83
- Returns:
84
- - List[str]: A list of organic search results, each serialized as a JSON string.
85
- """
86
- logger.info("Entering search_google")
87
- if not query:
88
- logger.warning("Empty query string provided.")
89
- return []
90
-
91
- # Use 'as_oq' in the cache key too, so different optional terms don't conflict
92
- cache_key = f"{query}_{number_of_results}_{offset}_{as_oq or ''}"
93
- cached_response = retrieve_output("search_google_serp", cache_key)
94
- if cached_response is not None:
95
- logger.info("Cache hit for search_google.")
96
- return cached_response
97
-
98
- SERPAPI_KEY = get_serp_api_access_token(tool_config)
99
- url = "https://serpapi.com/search"
100
-
101
- page_size = 100
102
- all_results: List[Dict[str, Any]] = []
103
- start_index = offset
104
-
105
- logger.debug(f"Requesting up to {number_of_results} results for '{query}' starting at offset {offset}.")
106
-
107
- async with aiohttp.ClientSession() as session:
108
- while len(all_results) < number_of_results:
109
- to_fetch = min(page_size, number_of_results - len(all_results))
110
- params = {
111
- "q": query,
112
- "num": to_fetch,
113
- "start": start_index,
114
- "api_key": SERPAPI_KEY,
115
- "location": "United States"
116
- }
117
-
118
- # If we have optional terms, add them
119
- if as_oq:
120
- params["as_oq"] = as_oq
121
-
122
- logger.debug(f"SERP API GET request with params: {params}")
123
-
124
- try:
125
- async with session.get(url, params=params) as response:
126
- logger.debug(f"Received response status: {response.status}")
127
- if response.status != 200:
128
- try:
129
- error_content = await response.json()
130
- except Exception:
131
- error_content = await response.text()
132
- logger.warning(f"Non-200 response from SERP API: {error_content}")
133
- return [json.dumps({"error": error_content})]
134
-
135
- result = await response.json()
136
- except Exception as e:
137
- logger.exception("Exception during SERP API request.")
138
- return [json.dumps({"error": str(e)})]
139
-
140
- organic_results = result.get('organic_results', [])
141
- if not organic_results:
142
- logger.debug("No more organic results returned; stopping.")
143
- break
144
-
145
- all_results.extend(organic_results)
146
- start_index += to_fetch
147
-
148
- if len(all_results) >= number_of_results:
149
- break
150
-
151
- all_results = all_results[:number_of_results]
152
- logger.info(f"Found {len(all_results)} results for query '{query}'.")
153
-
154
- serialized_results = [json.dumps(item) for item in all_results]
155
- cache_output("search_google_serp", cache_key, serialized_results)
156
- return serialized_results
157
-
158
-
159
- @assistant_tool
160
- async def search_google_maps(
161
- query: str,
162
- number_of_results: int = 3,
163
- tool_config: Optional[List[Dict]] = None
164
- ) -> List[str]:
165
- """
166
- Search Google Maps using SERP API and return the results as an array of serialized JSON strings.
167
-
168
- Parameters:
169
- - query (str): The search query.
170
- - number_of_results (int): The number of results to return.
171
- """
172
- logger.info("Entering search_google_maps")
173
- if not query:
174
- logger.warning("Empty query string provided for search_google_maps.")
175
- return []
176
-
177
- SERPAPI_KEY = get_serp_api_access_token(tool_config)
178
- params = {
179
- "q": query,
180
- "num": number_of_results,
181
- "api_key": SERPAPI_KEY,
182
- "engine": "google_maps"
183
- }
184
- url = "https://serpapi.com/search"
185
-
186
- logger.debug(f"Searching Google Maps with params: {params}")
187
- try:
188
- async with aiohttp.ClientSession() as session:
189
- async with session.get(url, params=params) as response:
190
- logger.debug(f"Received status: {response.status}")
191
- result = await response.json()
192
- if response.status != 200:
193
- logger.warning(f"Non-200 response from SERP API: {result}")
194
- return [json.dumps({"error": result})]
195
-
196
- serialized_results = [json.dumps(item) for item in result.get('local_results', [])]
197
- logger.info(f"Returning {len(serialized_results)} map results.")
198
- return serialized_results
199
- except Exception as e:
200
- logger.exception("Exception during search_google_maps request.")
201
- return [json.dumps({"error": str(e)})]
202
-
24
+ from dhisana.utils.web_download_parse_tools import fetch_html_content
203
25
 
204
- @assistant_tool
205
- async def search_google_news(
206
- query: str,
207
- number_of_results: int = 3,
208
- tool_config: Optional[List[Dict]] = None
209
- ) -> List[str]:
210
- """
211
- Search Google News using SERP API and return the results as an array of serialized JSON strings.
212
-
213
- Parameters:
214
- - query (str): The search query.
215
- - number_of_results (int): The number of results to return.
216
- """
217
- logger.info("Entering search_google_news")
218
- if not query:
219
- logger.warning("Empty query string provided for search_google_news.")
220
- return []
221
26
 
222
- SERPAPI_KEY = get_serp_api_access_token(tool_config)
223
- params = {
224
- "q": query,
225
- "num": number_of_results,
226
- "api_key": SERPAPI_KEY,
227
- "engine": "google_news"
228
- }
229
- url = "https://serpapi.com/search"
27
+ class LeadSearchResult(BaseModel):
28
+ first_name: str = ""
29
+ last_name: str = ""
30
+ full_name: str = ""
31
+ job_title: str = ""
32
+ linkedin_follower_count: int = 0
33
+ lead_location: str = ""
34
+ summary_about_lead: str = ""
35
+ user_linkedin_url: str = ""
230
36
 
231
- logger.debug(f"Searching Google News with params: {params}")
232
- try:
233
- async with aiohttp.ClientSession() as session:
234
- async with session.get(url, params=params) as response:
235
- logger.debug(f"Received status: {response.status}")
236
- result = await response.json()
237
- if response.status != 200:
238
- logger.warning(f"Non-200 response from SERP API: {result}")
239
- return [json.dumps({"error": result})]
240
-
241
- serialized_results = [json.dumps(item) for item in result.get('news_results', [])]
242
- logger.info(f"Returning {len(serialized_results)} news results.")
243
- return serialized_results
244
- except Exception as e:
245
- logger.exception("Exception during search_google_news request.")
246
- return [json.dumps({"error": str(e)})]
247
37
 
38
+ class LinkedinCandidateChoice(BaseModel):
39
+ chosen_link: str = ""
40
+ confidence: float = 0.0
41
+ reasoning: str = ""
248
42
 
249
- @assistant_tool
250
- async def search_job_postings(
251
- query: str,
252
- number_of_results: int,
253
- tool_config: Optional[List[Dict]] = None
254
- ) -> List[str]:
255
- """
256
- Search for job postings using SERP API and return the results as an array of serialized JSON strings.
257
-
258
- Parameters:
259
- - query (str): The search query.
260
- - number_of_results (int): The number of results to return.
261
- """
262
- logger.info("Entering search_job_postings")
263
- if not query:
264
- logger.warning("Empty query string provided for search_job_postings.")
265
- return []
266
43
 
267
- SERPAPI_KEY = get_serp_api_access_token(tool_config)
268
- params = {
269
- "q": query,
270
- "num": number_of_results,
271
- "api_key": SERPAPI_KEY,
272
- "engine": "google_jobs"
273
- }
274
- url = "https://serpapi.com/search"
44
+ async def get_structured_output(text: str, tool_config: Optional[List[Dict]] = None) -> LeadSearchResult:
45
+ """Parse text snippet into ``LeadSearchResult`` using OpenAI."""
275
46
 
276
- logger.debug(f"Searching Google Jobs with params: {params}")
277
- try:
278
- async with aiohttp.ClientSession() as session:
279
- async with session.get(url, params=params) as response:
280
- logger.debug(f"Received status: {response.status}")
281
- result = await response.json()
282
- if response.status != 200:
283
- logger.warning(f"Non-200 response from SERP API: {result}")
284
- return [json.dumps({"error": result})]
285
-
286
- serialized_results = [json.dumps(item) for item in result.get('jobs_results', [])]
287
- logger.info(f"Returning {len(serialized_results)} job posting results.")
288
- return serialized_results
289
- except Exception as e:
290
- logger.exception("Exception during search_job_postings request.")
291
- return [json.dumps({"error": str(e)})]
47
+ prompt = (
48
+ "Extract lead details from the text below.\n"
49
+ "If follower counts are mentioned, convert values like '1.5k+ followers' to an integer (e.g. 1500).\n"
50
+ f"Return JSON matching this schema:\n{json.dumps(LeadSearchResult.model_json_schema(), indent=2)}\n\n"
51
+ f"Text:\n{text}"
52
+ )
53
+ result, status = await get_structured_output_internal(
54
+ prompt, LeadSearchResult, model = "gpt-5.1-chat", tool_config=tool_config
55
+ )
56
+ if status != "SUCCESS" or result is None:
57
+ return LeadSearchResult()
58
+ return result
292
59
 
293
60
 
294
61
  @assistant_tool
295
- async def search_google_images(
296
- query: str,
297
- number_of_results: int,
298
- tool_config: Optional[List[Dict]] = None
299
- ) -> List[str]:
300
- """
301
- Search Google Images using SERP API and return the results as an array of serialized JSON strings.
302
-
303
- Parameters:
304
- - query (str): The search query.
305
- - number_of_results (int): The number of results to return.
306
- """
307
- logger.info("Entering search_google_images")
308
- if not query:
309
- logger.warning("Empty query string provided for search_google_images.")
310
- return []
311
-
312
- SERPAPI_KEY = get_serp_api_access_token(tool_config)
313
- params = {
314
- "q": query,
315
- "num": number_of_results,
316
- "api_key": SERPAPI_KEY,
317
- "engine": "google_images"
318
- }
319
- url = "https://serpapi.com/search"
320
-
321
- logger.debug(f"Searching Google Images with params: {params}")
322
- try:
323
- async with aiohttp.ClientSession() as session:
324
- async with session.get(url, params=params) as response:
325
- logger.debug(f"Received status: {response.status}")
326
- result = await response.json()
327
- if response.status != 200:
328
- logger.warning(f"Non-200 response from SERP API: {result}")
329
- return [json.dumps({"error": result})]
330
-
331
- serialized_results = [json.dumps(item) for item in result.get('images_results', [])]
332
- logger.info(f"Returning {len(serialized_results)} image results.")
333
- return serialized_results
334
- except Exception as e:
335
- logger.exception("Exception during search_google_images request.")
336
- return [json.dumps({"error": str(e)})]
337
-
62
+ async def find_user_linkedin_url_with_serper(
63
+ user_linkedin_url: str,
64
+ tool_config: Optional[List[Dict]] = None,
65
+ ) -> Optional[Dict]:
66
+ """Search Google via Serper.dev for ``user_linkedin_url`` and parse lead details."""
338
67
 
339
- @assistant_tool
340
- async def search_google_videos(
341
- query: str,
342
- number_of_results: int,
343
- tool_config: Optional[List[Dict]] = None
344
- ) -> List[str]:
345
- """
346
- Search Google Videos using SERP API and return the results as an array of serialized JSON strings.
347
-
348
- Parameters:
349
- - query (str): The search query.
350
- - number_of_results (int): The number of results to return.
351
- """
352
- logger.info("Entering search_google_videos")
353
- if not query:
354
- logger.warning("Empty query string provided for search_google_videos.")
355
- return []
68
+ if not user_linkedin_url:
69
+ return None
356
70
 
357
- SERPAPI_KEY = get_serp_api_access_token(tool_config)
358
- params = {
359
- "q": query,
360
- "num": number_of_results,
361
- "api_key": SERPAPI_KEY,
362
- "engine": "google_videos"
363
- }
364
- url = "https://serpapi.com/search"
71
+ normalized_input = extract_user_linkedin_page(user_linkedin_url)
72
+ results = await search_google_serper(user_linkedin_url, 10, tool_config=tool_config)
73
+ for item_json in results:
74
+ try:
75
+ item = json.loads(item_json)
76
+ except Exception:
77
+ continue
78
+ link = item.get("link", "")
79
+ if not link:
80
+ continue
81
+ if extract_user_linkedin_page(link) == normalized_input:
82
+ text = " ".join(
83
+ [item.get("title", ""), item.get("subtitle", ""), item.get("snippet", "")]
84
+ ).strip()
85
+ structured = await get_structured_output(text, tool_config=tool_config)
86
+ structured.user_linkedin_url = normalized_input
87
+ return json.loads(structured.model_dump_json())
88
+ return None
89
+
90
+
91
+ async def pick_best_linkedin_candidate_with_llm(
92
+ email: str,
93
+ user_name: str,
94
+ user_title: str,
95
+ user_location: str,
96
+ user_company: str,
97
+ candidates: List[Dict],
98
+ tool_config: Optional[List[Dict]] = None,
99
+ ) -> Optional[LinkedinCandidateChoice]:
100
+ """Ask the LLM to assess candidate LinkedIn URLs and pick the best match."""
101
+
102
+ if not candidates:
103
+ return None
104
+
105
+ candidates_sorted = candidates[-3:]
106
+ candidate_lines = []
107
+ for idx, candidate in enumerate(candidates_sorted, start=1):
108
+ candidate_lines.append(
109
+ "\n".join(
110
+ [
111
+ f"Candidate {idx}:",
112
+ f" Link: {candidate.get('link', '')}",
113
+ f" Title: {candidate.get('title', '')}",
114
+ f" Snippet: {candidate.get('snippet', '')}",
115
+ f" Subtitle: {candidate.get('subtitle', '')}",
116
+ f" Query: {candidate.get('query', '')}",
117
+ ]
118
+ )
119
+ )
365
120
 
366
- logger.debug(f"Searching Google Videos with params: {params}")
367
- try:
368
- async with aiohttp.ClientSession() as session:
369
- async with session.get(url, params=params) as response:
370
- logger.debug(f"Received status: {response.status}")
371
- result = await response.json()
372
- if response.status != 200:
373
- logger.warning(f"Non-200 response from SERP API: {result}")
374
- return [json.dumps({"error": result})]
375
-
376
- serialized_results = [json.dumps(item) for item in result.get('video_results', [])]
377
- logger.info(f"Returning {len(serialized_results)} video results.")
378
- return serialized_results
379
- except Exception as e:
380
- logger.exception("Exception during search_google_videos request.")
381
- return [json.dumps({"error": str(e)})]
121
+ prompt = (
122
+ "You are validating LinkedIn profile matches for a lead enrichment workflow.\n"
123
+ "Given the lead context and candidate search results, pick the most likely LinkedIn profile.\n"
124
+ "If no candidate seems appropriate, return an empty link and confidence 0.\n"
125
+ "Consider whether the email, name, company, title, or location aligns with the candidate.\n"
126
+ "Lead context:\n"
127
+ f"- Email: {email or 'unknown'}\n"
128
+ f"- Name: {user_name or 'unknown'}\n"
129
+ f"- Title: {user_title or 'unknown'}\n"
130
+ f"- Company: {user_company or 'unknown'}\n"
131
+ f"- Location: {user_location or 'unknown'}\n\n"
132
+ "Candidates:\n"
133
+ f"{chr(10).join(candidate_lines)}\n\n"
134
+ "Return JSON with fields: chosen_link (string), confidence (0-1 float), reasoning (short string)."
135
+ )
136
+
137
+ result, status = await get_structured_output_internal(
138
+ prompt,
139
+ LinkedinCandidateChoice,
140
+ model="gpt-5.1-chat",
141
+ tool_config=tool_config,
142
+ )
143
+
144
+ if status != "SUCCESS" or result is None:
145
+ return None
146
+
147
+ return result
382
148
 
383
149
 
384
150
  @assistant_tool
@@ -388,14 +154,7 @@ async def get_company_domain_from_google_search(
388
154
  tool_config: Optional[List[Dict]] = None
389
155
  ) -> str:
390
156
  """
391
- Tries to find the company domain from the company name using Google search.
392
-
393
- Args:
394
- company_name (str): The name of the company to search for.
395
- location (str, optional): A location to include in the query.
396
-
397
- Returns:
398
- str: The domain of the company's official website if found, otherwise an empty string.
157
+ Tries to find the company domain from the company name using Google (SerpAPI or Serper.dev).
399
158
  """
400
159
  logger.info("Entering get_company_domain_from_google_search")
401
160
 
@@ -404,22 +163,21 @@ async def get_company_domain_from_google_search(
404
163
  logger.debug("Invalid or excluded company_name provided.")
405
164
  return ""
406
165
 
407
- exclude_company_names = ["linkedin", "wikipedia", "facebook", "instagram", "twitter", "youtube", "netflix", "zoominfo", "reditt"]
408
166
  query = f"\"{company_name}\" official website"
409
167
  if location:
410
168
  query = f"\"{company_name}\" official website, {location}"
411
169
 
412
170
  try:
413
171
  logger.debug(f"Performing search with query: {query}")
414
- result = await search_google(query, 1, tool_config=tool_config)
172
+ result = await search_google_with_tools(query, 1, tool_config=tool_config)
415
173
  if not isinstance(result, list) or len(result) == 0:
416
174
  logger.debug("No results for first attempt, retrying with fallback query.")
417
175
  query = f"{company_name} official website"
418
- result = await search_google(query, 1, tool_config=tool_config)
176
+ result = await search_google_with_tools(query, 1, tool_config=tool_config)
419
177
  if not isinstance(result, list) or len(result) == 0:
420
178
  logger.debug("No results from fallback query either.")
421
179
  return ''
422
- except Exception as e:
180
+ except Exception:
423
181
  logger.exception("Exception during get_company_domain_from_google_search.")
424
182
  return ''
425
183
 
@@ -471,16 +229,6 @@ async def get_signal_strength(
471
229
  """
472
230
  Find how strong a match for the keywords in search is by checking
473
231
  how many search results contain all desired keywords in the snippet.
474
-
475
- Args:
476
- domain_to_search (str): The domain to search inside.
477
- keywords (List[str]): The keywords to search for.
478
- in_title (List[str]): Keywords that must appear in the title.
479
- not_in_title (List[str]): Keywords that must not appear in the title.
480
- negative_keywords (List[str]): Keywords to exclude from results.
481
-
482
- Returns:
483
- int: A strength score on a scale of 0 to 5.
484
232
  """
485
233
  logger.info("Entering get_signal_strength")
486
234
 
@@ -507,8 +255,8 @@ async def get_signal_strength(
507
255
 
508
256
  logger.debug(f"Performing get_signal_strength search with query: {final_query}")
509
257
  try:
510
- results = await search_google(final_query, 5, tool_config=tool_config)
511
- except Exception as e:
258
+ results = await search_google_with_tools(final_query, 5, tool_config=tool_config)
259
+ except Exception:
512
260
  logger.exception("Exception occurred while searching for signal strength.")
513
261
  return 0
514
262
 
@@ -517,9 +265,9 @@ async def get_signal_strength(
517
265
  return 0
518
266
 
519
267
  score = 0
520
- for result in results:
268
+ for result_item in results:
521
269
  try:
522
- result_json = json.loads(result)
270
+ result_json = json.loads(result_item)
523
271
  snippet_text = result_json.get('snippet', '').lower()
524
272
  if all(kw.lower() in snippet_text for kw in keywords):
525
273
  logger.debug(f"Found match in snippet: {snippet_text[:60]}...")
@@ -543,8 +291,8 @@ def extract_user_linkedin_page(url: str) -> str:
543
291
  if not url:
544
292
  return ""
545
293
 
546
- normalized_url = re.sub(r"(https?://)?([\w\-]+\.)?linkedin\.com", "https://www.linkedin.com", url)
547
- match = re.match(r"https://www.linkedin.com/in/([\w\-]+)", normalized_url)
294
+ normalized_url = re.sub(r"^(https?://)?([\w\-]+\.)?linkedin\.com", "https://www.linkedin.com", url)
295
+ match = re.match(r"https://www\.linkedin\.com/in/([^/?#]+)", normalized_url)
548
296
  if match:
549
297
  page = f"https://www.linkedin.com/in/{match.group(1)}"
550
298
  logger.debug(f"Extracted user LinkedIn page: {page}")
@@ -560,21 +308,12 @@ async def find_user_linkedin_url_google(
560
308
  user_title: str,
561
309
  user_location: str,
562
310
  user_company: str,
311
+ user_company_domain: str = "",
563
312
  use_strict_check: bool = True,
564
313
  tool_config: Optional[List[Dict]] = None
565
314
  ) -> str:
566
315
  """
567
316
  Find the LinkedIn URL for a user based on their name, title, location, and company.
568
-
569
- Args:
570
- user_name (str): The name of the user.
571
- user_title (str): The title of the user.
572
- user_location (str): The location of the user.
573
- user_company (str): The company of the user.
574
- use_strict_check (bool): Whether to use a strict single query or a series of relaxed queries.
575
-
576
- Returns:
577
- str: The LinkedIn URL if found, otherwise an empty string.
578
317
  """
579
318
  logger.info("Entering find_user_linkedin_url_google")
580
319
 
@@ -584,7 +323,7 @@ async def find_user_linkedin_url_google(
584
323
 
585
324
  if use_strict_check:
586
325
  queries = [
587
- f'site:linkedin.com/in "{user_name}" "{user_location}" "{user_title}" "{user_company}" intitle:"{user_name}" -intitle:"profiles" '
326
+ f'site:linkedin.com/in ("{user_name}") ({user_company} | {user_company_domain}) ( {user_title} | ) intitle:"{user_name}" -intitle:"profiles" '
588
327
  ]
589
328
  else:
590
329
  queries = [
@@ -594,14 +333,14 @@ async def find_user_linkedin_url_google(
594
333
  f'site:linkedin.com/in "{user_name}" intitle:"{user_name}"'
595
334
  ]
596
335
 
597
- async with aiohttp.ClientSession() as session: # Not strictly necessary here, but kept for parallel structure
336
+ async with aiohttp.ClientSession() as session:
598
337
  for query in queries:
599
338
  if not query.strip():
600
339
  continue
601
340
  logger.debug(f"Searching with query: {query}")
602
341
  try:
603
- results = await search_google(query.strip(), 1, tool_config=tool_config)
604
- except Exception as e:
342
+ results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
343
+ except Exception:
605
344
  logger.exception("Error searching for LinkedIn user URL.")
606
345
  continue
607
346
 
@@ -630,6 +369,221 @@ async def find_user_linkedin_url_google(
630
369
  return ""
631
370
 
632
371
 
372
@assistant_tool
async def find_user_linkedin_url_by_email_google(
    email: str,
    user_name: str = "",
    user_title: str = "",
    user_location: str = "",
    user_company: str = "",
    tool_config: Optional[List[Dict]] = None,
) -> Optional[Dict[str, Any]]:
    """
    Find the LinkedIn URL for a user based primarily on their email address.

    A prioritised list of Google queries is built from the email (full address,
    local part, and a "humanized" local part with ``.``/``_``/``-`` replaced by
    spaces), optionally narrowed by the supplied profile hints. LinkedIn
    ``/in`` results are accumulated in a small rolling window and handed to
    ``pick_best_linkedin_candidate_with_llm`` for scoring.

    Args:
        email: Email address to look up. Blank or whitespace-only values
            short-circuit to ``None``.
        user_name: Optional full name hint.
        user_title: Optional job title hint.
        user_location: Optional location hint.
        user_company: Optional company hint.
        tool_config: Optional tool configuration forwarded to the search and
            LLM helpers.

    Returns:
        A dict with keys ``linkedin_url``, ``confidence`` and ``reasoning``
        when a candidate reaches the confidence threshold; otherwise ``None``.
    """
    logger.info("Entering find_user_linkedin_url_by_email_google")

    # Normalise before validating so that whitespace-only input is rejected
    # instead of producing generic queries such as a bare "linkedin.com/in".
    normalized_email = (email or "").strip().lower()
    if not normalized_email:
        logger.warning("No email provided.")
        return None

    email_local_part = normalized_email.split("@")[0] if "@" in normalized_email else normalized_email
    email_local_humanized = re.sub(r"[._-]+", " ", email_local_part).strip()

    queries: List[str] = []

    def add_query(query: str) -> None:
        # Keep insertion order (it encodes priority) while dropping duplicates.
        query = query.strip()
        if query and query not in queries:
            queries.append(query)

    def add_query_parts(*parts: str) -> None:
        # Join only the non-blank fragments so empty hints leave no gaps.
        tokens = [part.strip() for part in parts if part and part.strip()]
        if tokens:
            add_query(" ".join(tokens))

    # Quoted profile hints, in fixed order: name, company, title, location.
    base_hint = " ".join(
        f'"{hint}"'
        for hint in (user_name, user_company, user_title, user_location)
        if hint
    )

    # Prioritise the direct email search variants before broader fallbacks.
    add_query_parts(normalized_email, "linkedin.com/in", base_hint)
    add_query_parts(normalized_email, "linkedin.com", base_hint)
    add_query_parts(normalized_email, "linkedin", base_hint)
    add_query_parts(normalized_email, base_hint)
    add_query(f'"{normalized_email}" "linkedin.com/in" {base_hint}')
    add_query(f'"{normalized_email}" "linkedin.com" {base_hint}')
    add_query(f'"{normalized_email}" linkedin {base_hint}')

    if email_local_part and email_local_part != normalized_email:
        add_query_parts(email_local_part, "linkedin.com/in", base_hint)
        add_query_parts(email_local_part, "linkedin.com", base_hint)
        add_query_parts(email_local_part, "linkedin", base_hint)
        add_query(f'"{email_local_part}" "linkedin.com/in" {base_hint}')
        add_query(f'"{email_local_part}" "linkedin.com" {base_hint}')

    if email_local_humanized and email_local_humanized not in {email_local_part, normalized_email}:
        add_query_parts(email_local_humanized, "linkedin", base_hint)
        add_query(f'"{email_local_humanized}" linkedin {base_hint}')

    # Site-restricted variants come after the open-web ones.
    add_query(f'site:linkedin.com/in "{normalized_email}" {base_hint}')

    if email_local_part:
        add_query(f'site:linkedin.com/in "{email_local_part}" {base_hint}')

    if email_local_humanized and email_local_humanized != email_local_part:
        add_query(f'site:linkedin.com/in "{email_local_humanized}" {base_hint}')

    if base_hint:
        lookup_hint = user_name or email_local_humanized or email_local_part or normalized_email
        add_query(
            f'site:linkedin.com/in "{normalized_email}" {base_hint} '
            f'intitle:"{lookup_hint}" -intitle:"profiles"'
        )
        if email_local_humanized:
            add_query(
                f'site:linkedin.com/in "{email_local_humanized}" {base_hint} '
                f'intitle:"{lookup_hint}" -intitle:"profiles"'
            )

    candidate_records: List[Dict[str, str]] = []
    seen_links: Set[str] = set()
    best_llm_choice: Optional[LinkedinCandidateChoice] = None
    best_llm_link: str = ""
    # At or above this confidence the search stops and the match is returned.
    HIGH_CONFIDENCE_THRESHOLD = 0.8
    # Minimum confidence accepted for the best match once all queries are spent.
    MIN_CONFIDENCE_THRESHOLD = 0.75
    # Size of the rolling candidate window handed to the LLM.
    MAX_CANDIDATES = 6
    RESULTS_PER_QUERY = 5

    async def evaluate_with_llm() -> Optional[LinkedinCandidateChoice]:
        # Score the current window; return the choice only when it is a
        # high-confidence match, while remembering the best one seen so far.
        nonlocal best_llm_choice, best_llm_link

        llm_choice = await pick_best_linkedin_candidate_with_llm(
            email=email,
            user_name=user_name,
            user_title=user_title,
            user_location=user_location,
            user_company=user_company,
            candidates=candidate_records,
            tool_config=tool_config,
        )

        if not llm_choice or not llm_choice.chosen_link:
            return None

        chosen_link = extract_user_linkedin_page(llm_choice.chosen_link)
        if not chosen_link:
            return None

        llm_choice.chosen_link = chosen_link

        if best_llm_choice is None or llm_choice.confidence > best_llm_choice.confidence:
            best_llm_choice = llm_choice
            best_llm_link = chosen_link
            logger.debug(
                "LLM updated best candidate: %s (confidence %.2f) reason: %s",
                chosen_link,
                llm_choice.confidence,
                llm_choice.reasoning,
            )

        if llm_choice.confidence >= HIGH_CONFIDENCE_THRESHOLD:
            logger.info(
                "Returning LinkedIn user page by email via LLM scoring: %s (confidence %.2f)",
                chosen_link,
                llm_choice.confidence,
            )
            return llm_choice

        return None

    for query in queries:
        logger.debug("Searching with query: %s", query)

        try:
            results = await search_google_with_tools(query, RESULTS_PER_QUERY, tool_config=tool_config)
        except Exception:
            logger.exception("Error searching for LinkedIn user URL by email.")
            continue

        if not isinstance(results, list) or len(results) == 0:
            logger.debug("No results for this query, moving to next.")
            continue

        added_new_candidate = False
        for result_item in results:
            try:
                result_json = json.loads(result_item)
            except (json.JSONDecodeError, TypeError):
                logger.debug("Failed to parse JSON from the search result.")
                continue
            if not isinstance(result_json, dict):
                continue

            link = result_json.get('link', '')
            if not link:
                continue

            parsed_url = urlparse(link)
            if 'linkedin.com/in' not in (parsed_url.netloc + parsed_url.path):
                continue

            link = extract_user_linkedin_page(link)
            if not link or link in seen_links:
                continue

            candidate_records.append(
                {
                    "link": link,
                    "title": result_json.get('title', ''),
                    "snippet": result_json.get('snippet', ''),
                    "subtitle": result_json.get('subtitle', ''),
                    "query": query,
                }
            )
            if len(candidate_records) > MAX_CANDIDATES:
                candidate_records.pop(0)
            seen_links.add(link)
            added_new_candidate = True

        # Re-scoring an unchanged (or empty) window would repeat the same LLM
        # request, so only evaluate when this query contributed something new.
        if not added_new_candidate:
            continue

        high_conf_choice = await evaluate_with_llm()
        if high_conf_choice:
            return {
                "linkedin_url": high_conf_choice.chosen_link,
                "confidence": high_conf_choice.confidence,
                "reasoning": high_conf_choice.reasoning,
            }

    if best_llm_choice and best_llm_link and best_llm_choice.confidence >= MIN_CONFIDENCE_THRESHOLD:
        logger.info(
            "Returning LinkedIn user page by email via LLM scoring (best overall): %s (confidence %.2f)",
            best_llm_link,
            best_llm_choice.confidence,
        )
        return {
            "linkedin_url": best_llm_link,
            "confidence": best_llm_choice.confidence,
            "reasoning": best_llm_choice.reasoning,
        }

    logger.info("No matching LinkedIn user page found using email queries.")
    return None
585
+
586
+
633
587
  @assistant_tool
634
588
  async def find_user_linkedin_url_by_job_title_google(
635
589
  user_title: str,
@@ -639,14 +593,6 @@ async def find_user_linkedin_url_by_job_title_google(
639
593
  ) -> str:
640
594
  """
641
595
  Find the LinkedIn URL for a user based on their job_title, location, and company.
642
-
643
- Args:
644
- user_title (str): The title of the user.
645
- user_location (str): The location of the user.
646
- user_company (str): The company of the user.
647
-
648
- Returns:
649
- str: The LinkedIn URL if found, otherwise an empty string.
650
596
  """
651
597
  logger.info("Entering find_user_linkedin_url_by_job_title_google")
652
598
 
@@ -654,15 +600,15 @@ async def find_user_linkedin_url_by_job_title_google(
654
600
  f'site:linkedin.com/in "{user_company}" AND "{user_title}" -intitle:"profiles" ',
655
601
  ]
656
602
 
657
- async with aiohttp.ClientSession() as session:
603
+ async with aiohttp.ClientSession() as session:
658
604
  for query in queries:
659
605
  if not query.strip():
660
606
  continue
661
607
  logger.debug(f"Searching with query: {query}")
662
608
 
663
609
  try:
664
- results = await search_google(query.strip(), 1, tool_config=tool_config)
665
- except Exception as e:
610
+ results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
611
+ except Exception:
666
612
  logger.exception("Error searching for LinkedIn URL by job title.")
667
613
  continue
668
614
 
@@ -699,14 +645,6 @@ async def find_user_linkedin_url_by_google_search(
699
645
  ) -> List[str]:
700
646
  """
701
647
  Find LinkedIn user URLs based on provided Google search queries.
702
-
703
- Args:
704
- queries (List[str]): A list of Google search queries.
705
- number_of_results (int): Number of results to return from each query (default is 5).
706
- tool_config (Optional[List[Dict]]): Optional configuration for the SERP API.
707
-
708
- Returns:
709
- List[str]: A list of matching LinkedIn user URLs found, or an empty list if none.
710
648
  """
711
649
  logger.info("Entering find_user_linkedin_url_by_google_search")
712
650
  found_urls = []
@@ -717,8 +655,8 @@ async def find_user_linkedin_url_by_google_search(
717
655
  logger.debug(f"Searching with query: {query}")
718
656
 
719
657
  try:
720
- results = await search_google(query.strip(), number_of_results, tool_config=tool_config)
721
- except Exception as e:
658
+ results = await search_google_with_tools(query.strip(), number_of_results, tool_config=tool_config)
659
+ except Exception:
722
660
  logger.exception("Error searching for LinkedIn URL using Google search.")
723
661
  continue
724
662
 
@@ -772,19 +710,12 @@ def extract_company_page(url: str) -> str:
772
710
  async def find_organization_linkedin_url_with_google_search(
773
711
  company_name: str,
774
712
  company_location: Optional[str] = None,
713
+ company_domain: Optional[str] = None,
775
714
  use_strict_check: bool = True,
776
715
  tool_config: Optional[List[Dict]] = None,
777
716
  ) -> str:
778
717
  """
779
718
  Find the LinkedIn URL for a company based on its name and optional location using Google search.
780
-
781
- Args:
782
- company_name (str): The name of the company.
783
- company_location (str, optional): The location of the company.
784
- use_strict_check (bool): Whether to use stricter or multiple queries.
785
-
786
- Returns:
787
- str: The LinkedIn URL if found, otherwise an empty string.
788
719
  """
789
720
  logger.info("Entering find_organization_linkedin_url_with_google_search")
790
721
 
@@ -793,7 +724,7 @@ async def find_organization_linkedin_url_with_google_search(
793
724
  return ""
794
725
 
795
726
  if use_strict_check:
796
- queries = [f'site:linkedin.com/company "{company_name}" {company_location} -intitle:"jobs" ']
727
+ queries = [f'site:linkedin.com/company "{company_name}" {company_domain} ']
797
728
  else:
798
729
  if company_location:
799
730
  queries = [
@@ -814,8 +745,8 @@ async def find_organization_linkedin_url_with_google_search(
814
745
 
815
746
  logger.debug(f"Searching with query: {query}")
816
747
  try:
817
- results = await search_google(query.strip(), 1, tool_config=tool_config)
818
- except Exception as e:
748
+ results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
749
+ except Exception:
819
750
  logger.exception("Error searching for organization LinkedIn URL.")
820
751
  continue
821
752
 
@@ -868,7 +799,7 @@ async def get_external_links(url: str) -> List[str]:
868
799
  else:
869
800
  logger.warning(f"Non-200 status ({response.status}) while fetching external links.")
870
801
  return []
871
- except Exception as e:
802
+ except Exception:
872
803
  logger.exception("Exception occurred while fetching external links.")
873
804
  return []
874
805
 
@@ -880,7 +811,7 @@ async def get_resolved_linkedin_links(url: str) -> List[str]:
880
811
  logger.debug(f"Entering get_resolved_linkedin_links for URL: {url}")
881
812
  try:
882
813
  content = await fetch_html_content(url)
883
- except Exception as e:
814
+ except Exception:
884
815
  logger.exception("Exception occurred while fetching HTML content.")
885
816
  return []
886
817
 
@@ -904,7 +835,7 @@ async def get_company_website_from_linkedin_url(linkedin_url: str) -> str:
904
835
 
905
836
  try:
906
837
  links = await get_external_links(linkedin_url)
907
- except Exception as e:
838
+ except Exception:
908
839
  logger.exception("Exception occurred while getting external links for LinkedIn URL.")
909
840
  return ""
910
841