dhisana 0.0.1.dev116__py3-none-any.whl → 0.0.1.dev236__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. dhisana/schemas/common.py +10 -1
  2. dhisana/schemas/sales.py +203 -22
  3. dhisana/utils/add_mapping.py +0 -2
  4. dhisana/utils/apollo_tools.py +739 -119
  5. dhisana/utils/built_with_api_tools.py +4 -2
  6. dhisana/utils/check_email_validity_tools.py +35 -18
  7. dhisana/utils/check_for_intent_signal.py +1 -2
  8. dhisana/utils/check_linkedin_url_validity.py +34 -8
  9. dhisana/utils/clay_tools.py +3 -2
  10. dhisana/utils/clean_properties.py +1 -4
  11. dhisana/utils/compose_salesnav_query.py +0 -1
  12. dhisana/utils/compose_search_query.py +7 -3
  13. dhisana/utils/composite_tools.py +0 -1
  14. dhisana/utils/dataframe_tools.py +2 -2
  15. dhisana/utils/email_body_utils.py +72 -0
  16. dhisana/utils/email_provider.py +174 -35
  17. dhisana/utils/enrich_lead_information.py +183 -53
  18. dhisana/utils/fetch_openai_config.py +129 -0
  19. dhisana/utils/field_validators.py +1 -1
  20. dhisana/utils/g2_tools.py +0 -1
  21. dhisana/utils/generate_content.py +0 -1
  22. dhisana/utils/generate_email.py +68 -23
  23. dhisana/utils/generate_email_response.py +294 -46
  24. dhisana/utils/generate_flow.py +0 -1
  25. dhisana/utils/generate_linkedin_connect_message.py +9 -2
  26. dhisana/utils/generate_linkedin_response_message.py +137 -66
  27. dhisana/utils/generate_structured_output_internal.py +317 -164
  28. dhisana/utils/google_custom_search.py +150 -44
  29. dhisana/utils/google_oauth_tools.py +721 -0
  30. dhisana/utils/google_workspace_tools.py +278 -54
  31. dhisana/utils/hubspot_clearbit.py +3 -1
  32. dhisana/utils/hubspot_crm_tools.py +718 -272
  33. dhisana/utils/instantly_tools.py +3 -1
  34. dhisana/utils/lusha_tools.py +10 -7
  35. dhisana/utils/mailgun_tools.py +150 -0
  36. dhisana/utils/microsoft365_tools.py +447 -0
  37. dhisana/utils/openai_assistant_and_file_utils.py +121 -177
  38. dhisana/utils/openai_helpers.py +8 -6
  39. dhisana/utils/parse_linkedin_messages_txt.py +1 -3
  40. dhisana/utils/profile.py +37 -0
  41. dhisana/utils/proxy_curl_tools.py +377 -76
  42. dhisana/utils/proxycurl_search_leads.py +426 -0
  43. dhisana/utils/research_lead.py +3 -3
  44. dhisana/utils/sales_navigator_crawler.py +1 -6
  45. dhisana/utils/salesforce_crm_tools.py +323 -50
  46. dhisana/utils/search_router.py +131 -0
  47. dhisana/utils/search_router_jobs.py +51 -0
  48. dhisana/utils/sendgrid_tools.py +126 -91
  49. dhisana/utils/serarch_router_local_business.py +75 -0
  50. dhisana/utils/serpapi_additional_tools.py +290 -0
  51. dhisana/utils/serpapi_google_jobs.py +117 -0
  52. dhisana/utils/serpapi_google_search.py +188 -0
  53. dhisana/utils/serpapi_local_business_search.py +129 -0
  54. dhisana/utils/serpapi_search_tools.py +360 -432
  55. dhisana/utils/serperdev_google_jobs.py +125 -0
  56. dhisana/utils/serperdev_local_business.py +154 -0
  57. dhisana/utils/serperdev_search.py +233 -0
  58. dhisana/utils/smtp_email_tools.py +178 -18
  59. dhisana/utils/test_connect.py +1603 -130
  60. dhisana/utils/trasform_json.py +3 -3
  61. dhisana/utils/web_download_parse_tools.py +0 -1
  62. dhisana/utils/zoominfo_tools.py +2 -3
  63. dhisana/workflow/test.py +1 -1
  64. {dhisana-0.0.1.dev116.dist-info → dhisana-0.0.1.dev236.dist-info}/METADATA +1 -1
  65. dhisana-0.0.1.dev236.dist-info/RECORD +100 -0
  66. {dhisana-0.0.1.dev116.dist-info → dhisana-0.0.1.dev236.dist-info}/WHEEL +1 -1
  67. dhisana-0.0.1.dev116.dist-info/RECORD +0 -83
  68. {dhisana-0.0.1.dev116.dist-info → dhisana-0.0.1.dev236.dist-info}/entry_points.txt +0 -0
  69. {dhisana-0.0.1.dev116.dist-info → dhisana-0.0.1.dev236.dist-info}/top_level.txt +0 -0
dhisana/utils/serpapi_search_tools.py

@@ -1,385 +1,150 @@
  import json
- import os
  import re
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Set
  from urllib.parse import urlparse
+ import urllib.parse
  import aiohttp
  from bs4 import BeautifulSoup
  import urllib
+ from pydantic import BaseModel

- from dhisana.utils.assistant_tool_tag import assistant_tool
- from dhisana.utils.cache_output_tools import cache_output, retrieve_output
- from dhisana.utils.web_download_parse_tools import fetch_html_content, get_html_content_from_url
+ from dhisana.utils.serperdev_search import search_google_serper
+ from dhisana.utils.generate_structured_output_internal import (
+     get_structured_output_internal,
+ )

  import logging
+
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ from dhisana.utils.search_router import search_google_with_tools
+ from dhisana.utils.assistant_tool_tag import assistant_tool

- def get_serp_api_access_token(tool_config: Optional[List[Dict]] = None) -> str:
-     """
-     Retrieves the SERPAPI_KEY access token from the provided tool configuration.
-
-     Args:
-         tool_config (list): A list of dictionaries containing the tool configuration.
-             Each dictionary should have a "name" key and a "configuration" key,
-             where "configuration" is a list of dictionaries containing "name" and "value" keys.
-
-     Returns:
-         str: The SERPAPI_KEY access token.
-
-     Raises:
-         ValueError: If the access token is not found in the tool configuration or environment variable.
-     """
-     logger.info("Entering get_serp_api_access_token")
-     SERPAPI_KEY = None
-
-     if tool_config:
-         logger.debug(f"Tool config provided: {tool_config}")
-         serpapi_config = next(
-             (item for item in tool_config if item.get("name") == "serpapi"), None
-         )
-         if serpapi_config:
-             config_map = {
-                 item["name"]: item["value"]
-                 for item in serpapi_config.get("configuration", [])
-                 if item
-             }
-             SERPAPI_KEY = config_map.get("apiKey")
-         else:
-             logger.warning("No 'serpapi' config item found in tool_config.")
-     else:
-         logger.debug("No tool_config provided or it's None.")
-
-     SERPAPI_KEY = SERPAPI_KEY or os.getenv("SERPAPI_KEY")
-     if not SERPAPI_KEY:
-         logger.error("SERPAPI_KEY not found in configuration or environment.")
-         raise ValueError("SERPAPI_KEY access token not found in tool_config or environment variable")
-
-     logger.info("Retrieved SERPAPI_KEY successfully.")
-     return SERPAPI_KEY
-
-
- @assistant_tool
- async def search_google(
-     query: str,
-     number_of_results: int = 10,
-     offset: int = 0,
-     tool_config: Optional[List[Dict]] = None,
-     as_oq: Optional[str] = None # <-- NEW PARAM for optional keywords
- ) -> List[str]:
-     """
-     Search Google using SERP API, supporting pagination and an explicit 'offset'
-     parameter to start from a specific result index.
-     Now also supports 'as_oq' for optional query terms in SERP API.
-
-     Parameters:
-     - query (str): The search query.
-     - number_of_results (int): The total number of results to return. Default is 10.
-     - offset (int): The starting index for the first result returned (Google pagination).
-     - tool_config (Optional[List[Dict]]): Configuration containing SERP API token, etc.
-     - as_oq (Optional[str]): Optional query terms for SerpAPI (if supported).
-
-     Returns:
-     - List[str]: A list of organic search results, each serialized as a JSON string.
-     """
-     logger.info("Entering search_google")
-     if not query:
-         logger.warning("Empty query string provided.")
-         return []
-
-     # Use 'as_oq' in the cache key too, so different optional terms don't conflict
-     cache_key = f"{query}_{number_of_results}_{offset}_{as_oq or ''}"
-     cached_response = retrieve_output("search_google_serp", cache_key)
-     if cached_response is not None:
-         logger.info("Cache hit for search_google.")
-         return cached_response
-
-     SERPAPI_KEY = get_serp_api_access_token(tool_config)
-     url = "https://serpapi.com/search"
-
-     page_size = 100
-     all_results: List[Dict[str, Any]] = []
-     start_index = offset
-
-     logger.debug(f"Requesting up to {number_of_results} results for '{query}' starting at offset {offset}.")
-
-     async with aiohttp.ClientSession() as session:
-         while len(all_results) < number_of_results:
-             to_fetch = min(page_size, number_of_results - len(all_results))
-             params = {
-                 "q": query,
-                 "num": to_fetch,
-                 "start": start_index,
-                 "api_key": SERPAPI_KEY,
-                 "engine": "google",
-                 "location": "United States"
-             }
-
-             # If we have optional terms, add them
-             if as_oq:
-                 params["as_oq"] = as_oq
-
-             logger.debug(f"SERP API GET request with params: {params}")
-
-             try:
-                 async with session.get(url, params=params) as response:
-                     logger.debug(f"Received response status: {response.status}")
-                     if response.status != 200:
-                         try:
-                             error_content = await response.json()
-                         except Exception:
-                             error_content = await response.text()
-                         logger.warning(f"Non-200 response from SERP API: {error_content}")
-                         return [json.dumps({"error": error_content})]
-
-                     result = await response.json()
-             except Exception as e:
-                 logger.exception("Exception during SERP API request.")
-                 return [json.dumps({"error": str(e)})]
-
-             organic_results = result.get('organic_results', [])
-             if not organic_results:
-                 logger.debug("No more organic results returned; stopping.")
-                 break
-
-             all_results.extend(organic_results)
-             start_index += to_fetch
-
-             if len(all_results) >= number_of_results:
-                 break
-
-     all_results = all_results[:number_of_results]
-     logger.info(f"Found {len(all_results)} results for query '{query}'.")
-
-     serialized_results = [json.dumps(item) for item in all_results]
-     cache_output("search_google_serp", cache_key, serialized_results)
-     return serialized_results
-
-
- @assistant_tool
- async def search_google_maps(
-     query: str,
-     number_of_results: int = 3,
-     tool_config: Optional[List[Dict]] = None
- ) -> List[str]:
-     """
-     Search Google Maps using SERP API and return the results as an array of serialized JSON strings.
-
-     Parameters:
-     - query (str): The search query.
-     - number_of_results (int): The number of results to return.
-     """
-     logger.info("Entering search_google_maps")
-     if not query:
-         logger.warning("Empty query string provided for search_google_maps.")
-         return []
-
-     SERPAPI_KEY = get_serp_api_access_token(tool_config)
-     params = {
-         "q": query,
-         "num": number_of_results,
-         "api_key": SERPAPI_KEY,
-         "engine": "google_maps"
-     }
-     url = "https://serpapi.com/search"
-
-     logger.debug(f"Searching Google Maps with params: {params}")
-     try:
-         async with aiohttp.ClientSession() as session:
-             async with session.get(url, params=params) as response:
-                 logger.debug(f"Received status: {response.status}")
-                 result = await response.json()
-                 if response.status != 200:
-                     logger.warning(f"Non-200 response from SERP API: {result}")
-                     return [json.dumps({"error": result})]
-
-                 serialized_results = [json.dumps(item) for item in result.get('local_results', [])]
-                 logger.info(f"Returning {len(serialized_results)} map results.")
-                 return serialized_results
-     except Exception as e:
-         logger.exception("Exception during search_google_maps request.")
-         return [json.dumps({"error": str(e)})]
-
+ from dhisana.utils.web_download_parse_tools import fetch_html_content

- @assistant_tool
- async def search_google_news(
-     query: str,
-     number_of_results: int = 3,
-     tool_config: Optional[List[Dict]] = None
- ) -> List[str]:
-     """
-     Search Google News using SERP API and return the results as an array of serialized JSON strings.
-
-     Parameters:
-     - query (str): The search query.
-     - number_of_results (int): The number of results to return.
-     """
-     logger.info("Entering search_google_news")
-     if not query:
-         logger.warning("Empty query string provided for search_google_news.")
-         return []

-     SERPAPI_KEY = get_serp_api_access_token(tool_config)
-     params = {
-         "q": query,
-         "num": number_of_results,
-         "api_key": SERPAPI_KEY,
-         "engine": "google_news"
-     }
-     url = "https://serpapi.com/search"
+ class LeadSearchResult(BaseModel):
+     first_name: str = ""
+     last_name: str = ""
+     full_name: str = ""
+     job_title: str = ""
+     linkedin_follower_count: int = 0
+     lead_location: str = ""
+     summary_about_lead: str = ""
+     user_linkedin_url: str = ""

-     logger.debug(f"Searching Google News with params: {params}")
-     try:
-         async with aiohttp.ClientSession() as session:
-             async with session.get(url, params=params) as response:
-                 logger.debug(f"Received status: {response.status}")
-                 result = await response.json()
-                 if response.status != 200:
-                     logger.warning(f"Non-200 response from SERP API: {result}")
-                     return [json.dumps({"error": result})]
-
-                 serialized_results = [json.dumps(item) for item in result.get('news_results', [])]
-                 logger.info(f"Returning {len(serialized_results)} news results.")
-                 return serialized_results
-     except Exception as e:
-         logger.exception("Exception during search_google_news request.")
-         return [json.dumps({"error": str(e)})]

+ class LinkedinCandidateChoice(BaseModel):
+     chosen_link: str = ""
+     confidence: float = 0.0
+     reasoning: str = ""

- @assistant_tool
- async def search_job_postings(
-     query: str,
-     number_of_results: int,
-     tool_config: Optional[List[Dict]] = None
- ) -> List[str]:
-     """
-     Search for job postings using SERP API and return the results as an array of serialized JSON strings.
-
-     Parameters:
-     - query (str): The search query.
-     - number_of_results (int): The number of results to return.
-     """
-     logger.info("Entering search_job_postings")
-     if not query:
-         logger.warning("Empty query string provided for search_job_postings.")
-         return []

-     SERPAPI_KEY = get_serp_api_access_token(tool_config)
-     params = {
-         "q": query,
-         "num": number_of_results,
-         "api_key": SERPAPI_KEY,
-         "engine": "google_jobs"
-     }
-     url = "https://serpapi.com/search"
+ async def get_structured_output(text: str, tool_config: Optional[List[Dict]] = None) -> LeadSearchResult:
+     """Parse text snippet into ``LeadSearchResult`` using OpenAI."""

-     logger.debug(f"Searching Google Jobs with params: {params}")
-     try:
-         async with aiohttp.ClientSession() as session:
-             async with session.get(url, params=params) as response:
-                 logger.debug(f"Received status: {response.status}")
-                 result = await response.json()
-                 if response.status != 200:
-                     logger.warning(f"Non-200 response from SERP API: {result}")
-                     return [json.dumps({"error": result})]
-
-                 serialized_results = [json.dumps(item) for item in result.get('jobs_results', [])]
-                 logger.info(f"Returning {len(serialized_results)} job posting results.")
-                 return serialized_results
-     except Exception as e:
-         logger.exception("Exception during search_job_postings request.")
-         return [json.dumps({"error": str(e)})]
+     prompt = (
+         "Extract lead details from the text below.\n"
+         "If follower counts are mentioned, convert values like '1.5k+ followers' to an integer (e.g. 1500).\n"
+         f"Return JSON matching this schema:\n{json.dumps(LeadSearchResult.model_json_schema(), indent=2)}\n\n"
+         f"Text:\n{text}"
+     )
+     result, status = await get_structured_output_internal(
+         prompt, LeadSearchResult, model = "gpt-5.1-chat", tool_config=tool_config
+     )
+     if status != "SUCCESS" or result is None:
+         return LeadSearchResult()
+     return result


  @assistant_tool
- async def search_google_images(
-     query: str,
-     number_of_results: int,
-     tool_config: Optional[List[Dict]] = None
- ) -> List[str]:
-     """
-     Search Google Images using SERP API and return the results as an array of serialized JSON strings.
-
-     Parameters:
-     - query (str): The search query.
-     - number_of_results (int): The number of results to return.
-     """
-     logger.info("Entering search_google_images")
-     if not query:
-         logger.warning("Empty query string provided for search_google_images.")
-         return []
-
-     SERPAPI_KEY = get_serp_api_access_token(tool_config)
-     params = {
-         "q": query,
-         "num": number_of_results,
-         "api_key": SERPAPI_KEY,
-         "engine": "google_images"
-     }
-     url = "https://serpapi.com/search"
-
-     logger.debug(f"Searching Google Images with params: {params}")
-     try:
-         async with aiohttp.ClientSession() as session:
-             async with session.get(url, params=params) as response:
-                 logger.debug(f"Received status: {response.status}")
-                 result = await response.json()
-                 if response.status != 200:
-                     logger.warning(f"Non-200 response from SERP API: {result}")
-                     return [json.dumps({"error": result})]
-
-                 serialized_results = [json.dumps(item) for item in result.get('images_results', [])]
-                 logger.info(f"Returning {len(serialized_results)} image results.")
-                 return serialized_results
-     except Exception as e:
-         logger.exception("Exception during search_google_images request.")
-         return [json.dumps({"error": str(e)})]
-
+ async def find_user_linkedin_url_with_serper(
+     user_linkedin_url: str,
+     tool_config: Optional[List[Dict]] = None,
+ ) -> Optional[Dict]:
+     """Search Google via Serper.dev for ``user_linkedin_url`` and parse lead details."""

- @assistant_tool
- async def search_google_videos(
-     query: str,
-     number_of_results: int,
-     tool_config: Optional[List[Dict]] = None
- ) -> List[str]:
-     """
-     Search Google Videos using SERP API and return the results as an array of serialized JSON strings.
-
-     Parameters:
-     - query (str): The search query.
-     - number_of_results (int): The number of results to return.
-     """
-     logger.info("Entering search_google_videos")
-     if not query:
-         logger.warning("Empty query string provided for search_google_videos.")
-         return []
+     if not user_linkedin_url:
+         return None

-     SERPAPI_KEY = get_serp_api_access_token(tool_config)
-     params = {
-         "q": query,
-         "num": number_of_results,
-         "api_key": SERPAPI_KEY,
-         "engine": "google_videos"
-     }
-     url = "https://serpapi.com/search"
+     normalized_input = extract_user_linkedin_page(user_linkedin_url)
+     results = await search_google_serper(user_linkedin_url, 10, tool_config=tool_config)
+     for item_json in results:
+         try:
+             item = json.loads(item_json)
+         except Exception:
+             continue
+         link = item.get("link", "")
+         if not link:
+             continue
+         if extract_user_linkedin_page(link) == normalized_input:
+             text = " ".join(
+                 [item.get("title", ""), item.get("subtitle", ""), item.get("snippet", "")]
+             ).strip()
+             structured = await get_structured_output(text, tool_config=tool_config)
+             structured.user_linkedin_url = normalized_input
+             return json.loads(structured.model_dump_json())
+     return None
+
+
+ async def pick_best_linkedin_candidate_with_llm(
+     email: str,
+     user_name: str,
+     user_title: str,
+     user_location: str,
+     user_company: str,
+     candidates: List[Dict],
+     tool_config: Optional[List[Dict]] = None,
+ ) -> Optional[LinkedinCandidateChoice]:
+     """Ask the LLM to assess candidate LinkedIn URLs and pick the best match."""
+
+     if not candidates:
+         return None
+
+     candidates_sorted = candidates[-3:]
+     candidate_lines = []
+     for idx, candidate in enumerate(candidates_sorted, start=1):
+         candidate_lines.append(
+             "\n".join(
+                 [
+                     f"Candidate {idx}:",
+                     f" Link: {candidate.get('link', '')}",
+                     f" Title: {candidate.get('title', '')}",
+                     f" Snippet: {candidate.get('snippet', '')}",
+                     f" Subtitle: {candidate.get('subtitle', '')}",
+                     f" Query: {candidate.get('query', '')}",
+                 ]
+             )
+         )

-     logger.debug(f"Searching Google Videos with params: {params}")
-     try:
-         async with aiohttp.ClientSession() as session:
-             async with session.get(url, params=params) as response:
-                 logger.debug(f"Received status: {response.status}")
-                 result = await response.json()
-                 if response.status != 200:
-                     logger.warning(f"Non-200 response from SERP API: {result}")
-                     return [json.dumps({"error": result})]
-
-                 serialized_results = [json.dumps(item) for item in result.get('video_results', [])]
-                 logger.info(f"Returning {len(serialized_results)} video results.")
-                 return serialized_results
-     except Exception as e:
-         logger.exception("Exception during search_google_videos request.")
-         return [json.dumps({"error": str(e)})]
+     prompt = (
+         "You are validating LinkedIn profile matches for a lead enrichment workflow.\n"
+         "Given the lead context and candidate search results, pick the most likely LinkedIn profile.\n"
+         "If no candidate seems appropriate, return an empty link and confidence 0.\n"
+         "Consider whether the email, name, company, title, or location aligns with the candidate.\n"
+         "Lead context:\n"
+         f"- Email: {email or 'unknown'}\n"
+         f"- Name: {user_name or 'unknown'}\n"
+         f"- Title: {user_title or 'unknown'}\n"
+         f"- Company: {user_company or 'unknown'}\n"
+         f"- Location: {user_location or 'unknown'}\n\n"
+         "Candidates:\n"
+         f"{chr(10).join(candidate_lines)}\n\n"
+         "Return JSON with fields: chosen_link (string), confidence (0-1 float), reasoning (short string)."
+     )
+
+     result, status = await get_structured_output_internal(
+         prompt,
+         LinkedinCandidateChoice,
+         model="gpt-5.1-chat",
+         tool_config=tool_config,
+     )
+
+     if status != "SUCCESS" or result is None:
+         return None
+
+     return result


  @assistant_tool
@@ -389,14 +154,7 @@ async def get_company_domain_from_google_search(
      tool_config: Optional[List[Dict]] = None
  ) -> str:
      """
-     Tries to find the company domain from the company name using Google search.
-
-     Args:
-         company_name (str): The name of the company to search for.
-         location (str, optional): A location to include in the query.
-
-     Returns:
-         str: The domain of the company's official website if found, otherwise an empty string.
+     Tries to find the company domain from the company name using Google (SerpAPI or Serper.dev).
      """
      logger.info("Entering get_company_domain_from_google_search")

@@ -405,22 +163,21 @@ async def get_company_domain_from_google_search(
          logger.debug("Invalid or excluded company_name provided.")
          return ""

-     exclude_company_names = ["linkedin", "wikipedia", "facebook", "instagram", "twitter", "youtube", "netflix", "zoominfo", "reditt"]
      query = f"\"{company_name}\" official website"
      if location:
          query = f"\"{company_name}\" official website, {location}"

      try:
          logger.debug(f"Performing search with query: {query}")
-         result = await search_google(query, 1, tool_config=tool_config)
+         result = await search_google_with_tools(query, 1, tool_config=tool_config)
          if not isinstance(result, list) or len(result) == 0:
              logger.debug("No results for first attempt, retrying with fallback query.")
              query = f"{company_name} official website"
-             result = await search_google(query, 1, tool_config=tool_config)
+             result = await search_google_with_tools(query, 1, tool_config=tool_config)
              if not isinstance(result, list) or len(result) == 0:
                  logger.debug("No results from fallback query either.")
                  return ''
-     except Exception as e:
+     except Exception:
          logger.exception("Exception during get_company_domain_from_google_search.")
          return ''

@@ -472,16 +229,6 @@ async def get_signal_strength(
      """
      Find how strong a match for the keywords in search is by checking
      how many search results contain all desired keywords in the snippet.
-
-     Args:
-         domain_to_search (str): The domain to search inside.
-         keywords (List[str]): The keywords to search for.
-         in_title (List[str]): Keywords that must appear in the title.
-         not_in_title (List[str]): Keywords that must not appear in the title.
-         negative_keywords (List[str]): Keywords to exclude from results.
-
-     Returns:
-         int: A strength score on a scale of 0 to 5.
      """
      logger.info("Entering get_signal_strength")

@@ -508,8 +255,8 @@ async def get_signal_strength(

      logger.debug(f"Performing get_signal_strength search with query: {final_query}")
      try:
-         results = await search_google(final_query, 5, tool_config=tool_config)
-     except Exception as e:
+         results = await search_google_with_tools(final_query, 5, tool_config=tool_config)
+     except Exception:
          logger.exception("Exception occurred while searching for signal strength.")
          return 0

@@ -518,9 +265,9 @@ async def get_signal_strength(
          return 0

      score = 0
-     for result in results:
+     for result_item in results:
          try:
-             result_json = json.loads(result)
+             result_json = json.loads(result_item)
              snippet_text = result_json.get('snippet', '').lower()
              if all(kw.lower() in snippet_text for kw in keywords):
                  logger.debug(f"Found match in snippet: {snippet_text[:60]}...")
@@ -544,8 +291,8 @@ def extract_user_linkedin_page(url: str) -> str:
      if not url:
          return ""

-     normalized_url = re.sub(r"(https?://)?([\w\-]+\.)?linkedin\.com", "https://www.linkedin.com", url)
-     match = re.match(r"https://www.linkedin.com/in/([\w\-]+)", normalized_url)
+     normalized_url = re.sub(r"^(https?://)?([\w\-]+\.)?linkedin\.com", "https://www.linkedin.com", url)
+     match = re.match(r"https://www\.linkedin\.com/in/([^/?#]+)", normalized_url)
      if match:
          page = f"https://www.linkedin.com/in/{match.group(1)}"
          logger.debug(f"Extracted user LinkedIn page: {page}")
@@ -567,16 +314,6 @@ async def find_user_linkedin_url_google(
  ) -> str:
      """
      Find the LinkedIn URL for a user based on their name, title, location, and company.
-
-     Args:
-         user_name (str): The name of the user.
-         user_title (str): The title of the user.
-         user_location (str): The location of the user.
-         user_company (str): The company of the user.
-         use_strict_check (bool): Whether to use a strict single query or a series of relaxed queries.
-
-     Returns:
-         str: The LinkedIn URL if found, otherwise an empty string.
      """
      logger.info("Entering find_user_linkedin_url_google")

@@ -596,14 +333,14 @@ async def find_user_linkedin_url_google(
          f'site:linkedin.com/in "{user_name}" intitle:"{user_name}"'
      ]

-     async with aiohttp.ClientSession() as session: # Not strictly necessary here, but kept for parallel structure
+     async with aiohttp.ClientSession() as session:
          for query in queries:
              if not query.strip():
                  continue
              logger.debug(f"Searching with query: {query}")
              try:
-                 results = await search_google(query.strip(), 1, tool_config=tool_config)
-             except Exception as e:
+                 results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
+             except Exception:
                  logger.exception("Error searching for LinkedIn user URL.")
                  continue

@@ -632,6 +369,221 @@ async def find_user_linkedin_url_google(
      return ""


+ @assistant_tool
+ async def find_user_linkedin_url_by_email_google(
+     email: str,
+     user_name: str = "",
+     user_title: str = "",
+     user_location: str = "",
+     user_company: str = "",
+     tool_config: Optional[List[Dict]] = None,
+ ) -> Optional[Dict[str, Any]]:
+     """
+     Find the LinkedIn URL for a user based primarily on their email address.
+
+     Additional profile hints (name, title, location, company) improve query precision
+     when supplied. Returns a dict with the best LinkedIn URL, LLM confidence score,
+     and short reasoning when a match clears the confidence threshold; otherwise ``None``.
+     """
+     logger.info("Entering find_user_linkedin_url_by_email_google")
+
+     if not email:
+         logger.warning("No email provided.")
+         return None
+
+     normalized_email = email.strip().lower()
+     email_local_part = normalized_email.split("@")[0] if "@" in normalized_email else normalized_email
+     email_local_humanized = re.sub(r"[._-]+", " ", email_local_part).strip()
+
+     queries: List[str] = []
+
+     def add_query(query: str) -> None:
+         query = query.strip()
+         if query and query not in queries:
+             queries.append(query)
+
+     def add_query_parts(*parts: str) -> None:
+         tokens = [part.strip() for part in parts if part and part.strip()]
+         if not tokens:
+             return
+         add_query(" ".join(tokens))
+
+     enriched_terms = []
+     if user_name:
+         enriched_terms.append(f'"{user_name}"')
+     if user_company:
+         enriched_terms.append(f'"{user_company}"')
+     if user_title:
+         enriched_terms.append(f'"{user_title}"')
+     if user_location:
+         enriched_terms.append(f'"{user_location}"')
+     base_hint = " ".join(enriched_terms)
+
+     # Prioritise the direct email search variants before broader fallbacks.
+     add_query_parts(normalized_email, "linkedin.com/in", base_hint)
+     add_query_parts(normalized_email, "linkedin.com", base_hint)
+     add_query_parts(normalized_email, "linkedin", base_hint)
+     add_query_parts(normalized_email, base_hint)
+     add_query(f'"{normalized_email}" "linkedin.com/in" {base_hint}')
+     add_query(f'"{normalized_email}" "linkedin.com" {base_hint}')
+     add_query(f'"{normalized_email}" linkedin {base_hint}')
+
+     if email_local_part and email_local_part != normalized_email:
+         add_query_parts(email_local_part, "linkedin.com/in", base_hint)
+         add_query_parts(email_local_part, "linkedin.com", base_hint)
+         add_query_parts(email_local_part, "linkedin", base_hint)
+         add_query(f'"{email_local_part}" "linkedin.com/in" {base_hint}')
+         add_query(f'"{email_local_part}" "linkedin.com" {base_hint}')
+
+     if email_local_humanized and email_local_humanized not in {email_local_part, normalized_email}:
+         add_query_parts(email_local_humanized, "linkedin", base_hint)
+         add_query(f'"{email_local_humanized}" linkedin {base_hint}')
+
+     if normalized_email:
+         add_query(f'site:linkedin.com/in "{normalized_email}" {base_hint}')
+
+     if email_local_part:
+         add_query(f'site:linkedin.com/in "{email_local_part}" {base_hint}')
+
+     if email_local_humanized and email_local_humanized != email_local_part:
+         add_query(f'site:linkedin.com/in "{email_local_humanized}" {base_hint}')
+
+     if base_hint:
+         lookup_hint = user_name or email_local_humanized or email_local_part or normalized_email
+         add_query(
+             f'site:linkedin.com/in "{normalized_email}" {base_hint} '
+             f'intitle:"{lookup_hint}" -intitle:"profiles"'
+         )
+         if email_local_humanized:
+             add_query(
+                 f'site:linkedin.com/in "{email_local_humanized}" {base_hint} '
+                 f'intitle:"{lookup_hint}" -intitle:"profiles"'
+             )
+
+     candidate_records: List[Dict[str, str]] = []
+     seen_links: Set[str] = set()
+     best_llm_choice: Optional[LinkedinCandidateChoice] = None
+     best_llm_link: str = ""
+     HIGH_CONFIDENCE_THRESHOLD = 0.8
+     MIN_CONFIDENCE_THRESHOLD = 0.75
+
+     async def evaluate_with_llm() -> Optional[LinkedinCandidateChoice]:
+         nonlocal best_llm_choice, best_llm_link
+
+         llm_choice = await pick_best_linkedin_candidate_with_llm(
+             email=email,
+             user_name=user_name,
+             user_title=user_title,
+             user_location=user_location,
+             user_company=user_company,
+             candidates=candidate_records,
+             tool_config=tool_config,
+         )
+
+         if not llm_choice or not llm_choice.chosen_link:
+             return None
+
+         chosen_link = extract_user_linkedin_page(llm_choice.chosen_link)
+         if not chosen_link:
+             return None
+
+         llm_choice.chosen_link = chosen_link
+
+         if best_llm_choice is None or llm_choice.confidence > best_llm_choice.confidence:
+             best_llm_choice = llm_choice
+             best_llm_link = chosen_link
+             logger.debug(
+                 "LLM updated best candidate: %s (confidence %.2f) reason: %s",
+                 chosen_link,
+                 llm_choice.confidence,
+                 llm_choice.reasoning,
+             )
+
+         if llm_choice.confidence >= HIGH_CONFIDENCE_THRESHOLD:
+             logger.info(
+                 "Returning LinkedIn user page by email via LLM scoring: %s (confidence %.2f)",
+                 chosen_link,
+                 llm_choice.confidence,
+             )
+             return llm_choice
+
+         return None
+
+     async with aiohttp.ClientSession() as session:
+         for query in queries:
+             query = query.strip()
+             if not query:
+                 continue
+             logger.debug(f"Searching with query: {query}")
+
+             try:
+                 results = await search_google_with_tools(query, 5, tool_config=tool_config)
+             except Exception:
+                 logger.exception("Error searching for LinkedIn user URL by email.")
+                 continue
+
+             if not isinstance(results, list) or len(results) == 0:
+                 logger.debug("No results for this query, moving to next.")
+                 continue
+
+             for result_item in results:
+                 try:
+                     result_json = json.loads(result_item)
+                 except (json.JSONDecodeError, IndexError):
+                     logger.debug("Failed to parse JSON from the search result.")
+                     continue
+
+                 link = result_json.get('link', '')
+                 if not link:
+                     continue
+
+                 parsed_url = urlparse(link)
+                 if 'linkedin.com/in' in (parsed_url.netloc + parsed_url.path):
+                     link = extract_user_linkedin_page(link)
+                     if not link or link in seen_links:
+                         continue
+
+                     title = result_json.get('title', '')
+                     snippet = result_json.get('snippet', '')
+                     subtitle = result_json.get('subtitle', '')
+
+                     candidate_records.append(
+                         {
+                             "link": link,
+                             "title": title,
+                             "snippet": snippet,
+                             "subtitle": subtitle,
+                             "query": query,
+                         }
+                     )
+                     if len(candidate_records) > 6:
+                         candidate_records.pop(0)
+                     seen_links.add(link)
+
+                     high_conf_choice = await evaluate_with_llm()
+                     if high_conf_choice:
+                         return {
+                             "linkedin_url": high_conf_choice.chosen_link,
+                             "confidence": high_conf_choice.confidence,
+                             "reasoning": high_conf_choice.reasoning,
+                         }
+
+     if best_llm_choice and best_llm_link and best_llm_choice.confidence >= MIN_CONFIDENCE_THRESHOLD:
+         logger.info(
+             "Returning LinkedIn user page by email via LLM scoring (best overall): %s (confidence %.2f)",
+             best_llm_link,
+             best_llm_choice.confidence,
+         )
+         return {
+             "linkedin_url": best_llm_link,
+             "confidence": best_llm_choice.confidence,
+             "reasoning": best_llm_choice.reasoning,
+         }
+
+     logger.info("No matching LinkedIn user page found using email queries.")
+     return None
+
+
  @assistant_tool
  async def find_user_linkedin_url_by_job_title_google(
      user_title: str,
@@ -641,14 +593,6 @@ async def find_user_linkedin_url_by_job_title_google(
  ) -> str:
      """
      Find the LinkedIn URL for a user based on their job_title, location, and company.
-
-     Args:
-         user_title (str): The title of the user.
-         user_location (str): The location of the user.
-         user_company (str): The company of the user.
-
-     Returns:
-         str: The LinkedIn URL if found, otherwise an empty string.
      """
      logger.info("Entering find_user_linkedin_url_by_job_title_google")

@@ -656,15 +600,15 @@ async def find_user_linkedin_url_by_job_title_google(
          f'site:linkedin.com/in "{user_company}" AND "{user_title}" -intitle:"profiles" ',
      ]

-     async with aiohttp.ClientSession() as session:
+     async with aiohttp.ClientSession() as session:
          for query in queries:
              if not query.strip():
                  continue
              logger.debug(f"Searching with query: {query}")

              try:
-                 results = await search_google(query.strip(), 1, tool_config=tool_config)
-             except Exception as e:
+                 results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
+             except Exception:
                  logger.exception("Error searching for LinkedIn URL by job title.")
                  continue

@@ -701,14 +645,6 @@ async def find_user_linkedin_url_by_google_search(
  ) -> List[str]:
      """
      Find LinkedIn user URLs based on provided Google search queries.
-
-     Args:
-         queries (List[str]): A list of Google search queries.
-         number_of_results (int): Number of results to return from each query (default is 5).
-         tool_config (Optional[List[Dict]]): Optional configuration for the SERP API.
-
-     Returns:
-         List[str]: A list of matching LinkedIn user URLs found, or an empty list if none.
      """
      logger.info("Entering find_user_linkedin_url_by_google_search")
      found_urls = []
@@ -719,8 +655,8 @@
          logger.debug(f"Searching with query: {query}")

          try:
-             results = await search_google(query.strip(), number_of_results, tool_config=tool_config)
-         except Exception as e:
+             results = await search_google_with_tools(query.strip(), number_of_results, tool_config=tool_config)
+         except Exception:
              logger.exception("Error searching for LinkedIn URL using Google search.")
              continue

@@ -780,14 +716,6 @@ async def find_organization_linkedin_url_with_google_search(
  ) -> str:
      """
      Find the LinkedIn URL for a company based on its name and optional location using Google search.
-
-     Args:
-         company_name (str): The name of the company.
-         company_location (str, optional): The location of the company.
-         use_strict_check (bool): Whether to use stricter or multiple queries.
-
-     Returns:
-         str: The LinkedIn URL if found, otherwise an empty string.
      """
      logger.info("Entering find_organization_linkedin_url_with_google_search")

@@ -796,7 +724,7 @@
          return ""

      if use_strict_check:
-         queries = [f'site:linkedin.com/company "{company_name}" {company_domain} -intitle:"jobs" ']
+         queries = [f'site:linkedin.com/company "{company_name}" {company_domain} ']
      else:
          if company_location:
              queries = [
@@ -817,8 +745,8 @@

          logger.debug(f"Searching with query: {query}")
          try:
-             results = await search_google(query.strip(), 1, tool_config=tool_config)
-         except Exception as e:
+             results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
+         except Exception:
              logger.exception("Error searching for organization LinkedIn URL.")
              continue

@@ -871,7 +799,7 @@ async def get_external_links(url: str) -> List[str]:
                  else:
                      logger.warning(f"Non-200 status ({response.status}) while fetching external links.")
                      return []
-     except Exception as e:
+     except Exception:
          logger.exception("Exception occurred while fetching external links.")
          return []

@@ -883,7 +811,7 @@ async def get_resolved_linkedin_links(url: str) -> List[str]:
      logger.debug(f"Entering get_resolved_linkedin_links for URL: {url}")
      try:
          content = await fetch_html_content(url)
-     except Exception as e:
+     except Exception:
          logger.exception("Exception occurred while fetching HTML content.")
          return []

@@ -907,7 +835,7 @@ async def get_company_website_from_linkedin_url(linkedin_url: str) -> str:

      try:
          links = await get_external_links(linkedin_url)
-     except Exception as e:
+     except Exception:
          logger.exception("Exception occurred while getting external links for LinkedIn URL.")
          return ""

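
For orientation, here is a minimal usage sketch (not part of the diff) of the new email-based lookup added to dhisana/utils/serpapi_search_tools.py in this release. The function name, parameters, return shape, and the 0.8/0.75 confidence thresholds come from the diff above; the "serperdev" entry name and "apiKey" field in tool_config are assumptions modeled on the {name, configuration} layout parsed by the removed get_serp_api_access_token helper, not something this diff confirms.

import asyncio

from dhisana.utils.serpapi_search_tools import find_user_linkedin_url_by_email_google

# Assumed provider entry: the "serperdev"/"apiKey" keys mirror the
# {name, configuration} shape the removed get_serp_api_access_token
# helper parsed, and are hypothetical here.
tool_config = [
    {
        "name": "serperdev",
        "configuration": [{"name": "apiKey", "value": "YOUR_SERPER_KEY"}],
    }
]

async def main() -> None:
    match = await find_user_linkedin_url_by_email_google(
        email="jane.doe@acme.com",   # required; hypothetical lead
        user_name="Jane Doe",        # optional hints sharpen the queries
        user_company="Acme",
        tool_config=tool_config,
    )
    # Per the diff, a dict {"linkedin_url", "confidence", "reasoning"} is
    # returned only when the LLM score clears the 0.8 (per-candidate) or
    # 0.75 (best overall) threshold; otherwise the tool returns None.
    print(match)

if __name__ == "__main__":
    asyncio.run(main())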