dhisana 0.0.1.dev85__py3-none-any.whl → 0.0.1.dev236__py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- dhisana/schemas/common.py +33 -0
- dhisana/schemas/sales.py +224 -23
- dhisana/utils/add_mapping.py +72 -63
- dhisana/utils/apollo_tools.py +739 -109
- dhisana/utils/built_with_api_tools.py +4 -2
- dhisana/utils/cache_output_tools.py +23 -23
- dhisana/utils/check_email_validity_tools.py +456 -458
- dhisana/utils/check_for_intent_signal.py +1 -2
- dhisana/utils/check_linkedin_url_validity.py +34 -8
- dhisana/utils/clay_tools.py +3 -2
- dhisana/utils/clean_properties.py +3 -1
- dhisana/utils/compose_salesnav_query.py +0 -1
- dhisana/utils/compose_search_query.py +7 -3
- dhisana/utils/composite_tools.py +0 -1
- dhisana/utils/dataframe_tools.py +2 -2
- dhisana/utils/email_body_utils.py +72 -0
- dhisana/utils/email_provider.py +375 -0
- dhisana/utils/enrich_lead_information.py +585 -85
- dhisana/utils/fetch_openai_config.py +129 -0
- dhisana/utils/field_validators.py +1 -1
- dhisana/utils/g2_tools.py +0 -1
- dhisana/utils/generate_content.py +0 -1
- dhisana/utils/generate_email.py +69 -16
- dhisana/utils/generate_email_response.py +298 -41
- dhisana/utils/generate_flow.py +0 -1
- dhisana/utils/generate_linkedin_connect_message.py +19 -6
- dhisana/utils/generate_linkedin_response_message.py +156 -65
- dhisana/utils/generate_structured_output_internal.py +351 -131
- dhisana/utils/google_custom_search.py +150 -44
- dhisana/utils/google_oauth_tools.py +721 -0
- dhisana/utils/google_workspace_tools.py +391 -25
- dhisana/utils/hubspot_clearbit.py +3 -1
- dhisana/utils/hubspot_crm_tools.py +771 -167
- dhisana/utils/instantly_tools.py +3 -1
- dhisana/utils/lusha_tools.py +10 -7
- dhisana/utils/mailgun_tools.py +150 -0
- dhisana/utils/microsoft365_tools.py +447 -0
- dhisana/utils/openai_assistant_and_file_utils.py +121 -177
- dhisana/utils/openai_helpers.py +19 -16
- dhisana/utils/parse_linkedin_messages_txt.py +2 -3
- dhisana/utils/profile.py +37 -0
- dhisana/utils/proxy_curl_tools.py +507 -206
- dhisana/utils/proxycurl_search_leads.py +426 -0
- dhisana/utils/research_lead.py +121 -68
- dhisana/utils/sales_navigator_crawler.py +1 -6
- dhisana/utils/salesforce_crm_tools.py +323 -50
- dhisana/utils/search_router.py +131 -0
- dhisana/utils/search_router_jobs.py +51 -0
- dhisana/utils/sendgrid_tools.py +126 -91
- dhisana/utils/serarch_router_local_business.py +75 -0
- dhisana/utils/serpapi_additional_tools.py +290 -0
- dhisana/utils/serpapi_google_jobs.py +117 -0
- dhisana/utils/serpapi_google_search.py +188 -0
- dhisana/utils/serpapi_local_business_search.py +129 -0
- dhisana/utils/serpapi_search_tools.py +363 -432
- dhisana/utils/serperdev_google_jobs.py +125 -0
- dhisana/utils/serperdev_local_business.py +154 -0
- dhisana/utils/serperdev_search.py +233 -0
- dhisana/utils/smtp_email_tools.py +576 -0
- dhisana/utils/test_connect.py +1765 -92
- dhisana/utils/trasform_json.py +95 -16
- dhisana/utils/web_download_parse_tools.py +0 -1
- dhisana/utils/zoominfo_tools.py +2 -3
- dhisana/workflow/test.py +1 -1
- {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/METADATA +5 -2
- dhisana-0.0.1.dev236.dist-info/RECORD +100 -0
- {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/WHEEL +1 -1
- dhisana-0.0.1.dev85.dist-info/RECORD +0 -81
- {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/entry_points.txt +0 -0
- {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/top_level.txt +0 -0
```diff
@@ -1,384 +1,150 @@
 import json
-import os
 import re
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Set
 from urllib.parse import urlparse
+import urllib.parse
 import aiohttp
 from bs4 import BeautifulSoup
 import urllib
+from pydantic import BaseModel
 
-from dhisana.utils.
-from dhisana.utils.
-
+from dhisana.utils.serperdev_search import search_google_serper
+from dhisana.utils.generate_structured_output_internal import (
+    get_structured_output_internal,
+)
 
 import logging
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+from dhisana.utils.search_router import search_google_with_tools
+from dhisana.utils.assistant_tool_tag import assistant_tool
 
-
-    """
-    Retrieves the SERPAPI_KEY access token from the provided tool configuration.
-
-    Args:
-        tool_config (list): A list of dictionaries containing the tool configuration.
-            Each dictionary should have a "name" key and a "configuration" key,
-            where "configuration" is a list of dictionaries containing "name" and "value" keys.
-
-    Returns:
-        str: The SERPAPI_KEY access token.
-
-    Raises:
-        ValueError: If the access token is not found in the tool configuration or environment variable.
-    """
-    logger.info("Entering get_serp_api_access_token")
-    SERPAPI_KEY = None
-
-    if tool_config:
-        logger.debug(f"Tool config provided: {tool_config}")
-        serpapi_config = next(
-            (item for item in tool_config if item.get("name") == "serpapi"), None
-        )
-        if serpapi_config:
-            config_map = {
-                item["name"]: item["value"]
-                for item in serpapi_config.get("configuration", [])
-                if item
-            }
-            SERPAPI_KEY = config_map.get("apiKey")
-        else:
-            logger.warning("No 'serpapi' config item found in tool_config.")
-    else:
-        logger.debug("No tool_config provided or it's None.")
-
-    SERPAPI_KEY = SERPAPI_KEY or os.getenv("SERPAPI_KEY")
-    if not SERPAPI_KEY:
-        logger.error("SERPAPI_KEY not found in configuration or environment.")
-        raise ValueError("SERPAPI_KEY access token not found in tool_config or environment variable")
-
-    logger.info("Retrieved SERPAPI_KEY successfully.")
-    return SERPAPI_KEY
-
-
```
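For context, the removed helper resolved the key from the `tool_config` shape its own docstring described (a list of tool entries, each with a `name` and a `configuration` list of name/value pairs), falling back to the `SERPAPI_KEY` environment variable. A minimal sketch of that shape; the key value is a placeholder:

```python
# Sketch of the tool_config shape described by the removed docstring.
# The "serpapi" entry carried the key under "apiKey"; the value here is
# a placeholder, not a real credential.
tool_config = [
    {
        "name": "serpapi",
        "configuration": [
            {"name": "apiKey", "value": "YOUR_SERPAPI_KEY"},
        ],
    }
]
```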
```diff
-@assistant_tool
-async def search_google(
-    query: str,
-    number_of_results: int = 10,
-    offset: int = 0,
-    tool_config: Optional[List[Dict]] = None,
-    as_oq: Optional[str] = None # <-- NEW PARAM for optional keywords
-) -> List[str]:
-    """
-    Search Google using SERP API, supporting pagination and an explicit 'offset'
-    parameter to start from a specific result index.
-    Now also supports 'as_oq' for optional query terms in SERP API.
-
-    Parameters:
-    - query (str): The search query.
-    - number_of_results (int): The total number of results to return. Default is 10.
-    - offset (int): The starting index for the first result returned (Google pagination).
-    - tool_config (Optional[List[Dict]]): Configuration containing SERP API token, etc.
-    - as_oq (Optional[str]): Optional query terms for SerpAPI (if supported).
-
-    Returns:
-    - List[str]: A list of organic search results, each serialized as a JSON string.
-    """
-    logger.info("Entering search_google")
-    if not query:
-        logger.warning("Empty query string provided.")
-        return []
-
-    # Use 'as_oq' in the cache key too, so different optional terms don't conflict
-    cache_key = f"{query}_{number_of_results}_{offset}_{as_oq or ''}"
-    cached_response = retrieve_output("search_google_serp", cache_key)
-    if cached_response is not None:
-        logger.info("Cache hit for search_google.")
-        return cached_response
-
-    SERPAPI_KEY = get_serp_api_access_token(tool_config)
-    url = "https://serpapi.com/search"
-
-    page_size = 100
-    all_results: List[Dict[str, Any]] = []
-    start_index = offset
-
-    logger.debug(f"Requesting up to {number_of_results} results for '{query}' starting at offset {offset}.")
-
-    async with aiohttp.ClientSession() as session:
-        while len(all_results) < number_of_results:
-            to_fetch = min(page_size, number_of_results - len(all_results))
-            params = {
-                "q": query,
-                "num": to_fetch,
-                "start": start_index,
-                "api_key": SERPAPI_KEY,
-                "location": "United States"
-            }
-
-            # If we have optional terms, add them
-            if as_oq:
-                params["as_oq"] = as_oq
-
-            logger.debug(f"SERP API GET request with params: {params}")
-
-            try:
-                async with session.get(url, params=params) as response:
-                    logger.debug(f"Received response status: {response.status}")
-                    if response.status != 200:
-                        try:
-                            error_content = await response.json()
-                        except Exception:
-                            error_content = await response.text()
-                        logger.warning(f"Non-200 response from SERP API: {error_content}")
-                        return [json.dumps({"error": error_content})]
-
-                    result = await response.json()
-            except Exception as e:
-                logger.exception("Exception during SERP API request.")
-                return [json.dumps({"error": str(e)})]
-
-            organic_results = result.get('organic_results', [])
-            if not organic_results:
-                logger.debug("No more organic results returned; stopping.")
-                break
-
-            all_results.extend(organic_results)
-            start_index += to_fetch
-
-            if len(all_results) >= number_of_results:
-                break
-
-    all_results = all_results[:number_of_results]
-    logger.info(f"Found {len(all_results)} results for query '{query}'.")
-
-    serialized_results = [json.dumps(item) for item in all_results]
-    cache_output("search_google_serp", cache_key, serialized_results)
-    return serialized_results
-
-
```
|
-
@assistant_tool
|
|
160
|
-
async def search_google_maps(
|
|
161
|
-
query: str,
|
|
162
|
-
number_of_results: int = 3,
|
|
163
|
-
tool_config: Optional[List[Dict]] = None
|
|
164
|
-
) -> List[str]:
|
|
165
|
-
"""
|
|
166
|
-
Search Google Maps using SERP API and return the results as an array of serialized JSON strings.
|
|
167
|
-
|
|
168
|
-
Parameters:
|
|
169
|
-
- query (str): The search query.
|
|
170
|
-
- number_of_results (int): The number of results to return.
|
|
171
|
-
"""
|
|
172
|
-
logger.info("Entering search_google_maps")
|
|
173
|
-
if not query:
|
|
174
|
-
logger.warning("Empty query string provided for search_google_maps.")
|
|
175
|
-
return []
|
|
176
|
-
|
|
177
|
-
SERPAPI_KEY = get_serp_api_access_token(tool_config)
|
|
178
|
-
params = {
|
|
179
|
-
"q": query,
|
|
180
|
-
"num": number_of_results,
|
|
181
|
-
"api_key": SERPAPI_KEY,
|
|
182
|
-
"engine": "google_maps"
|
|
183
|
-
}
|
|
184
|
-
url = "https://serpapi.com/search"
|
|
185
|
-
|
|
186
|
-
logger.debug(f"Searching Google Maps with params: {params}")
|
|
187
|
-
try:
|
|
188
|
-
async with aiohttp.ClientSession() as session:
|
|
189
|
-
async with session.get(url, params=params) as response:
|
|
190
|
-
logger.debug(f"Received status: {response.status}")
|
|
191
|
-
result = await response.json()
|
|
192
|
-
if response.status != 200:
|
|
193
|
-
logger.warning(f"Non-200 response from SERP API: {result}")
|
|
194
|
-
return [json.dumps({"error": result})]
|
|
195
|
-
|
|
196
|
-
serialized_results = [json.dumps(item) for item in result.get('local_results', [])]
|
|
197
|
-
logger.info(f"Returning {len(serialized_results)} map results.")
|
|
198
|
-
return serialized_results
|
|
199
|
-
except Exception as e:
|
|
200
|
-
logger.exception("Exception during search_google_maps request.")
|
|
201
|
-
return [json.dumps({"error": str(e)})]
|
|
202
|
-
|
|
24
|
+
from dhisana.utils.web_download_parse_tools import fetch_html_content
|
|
203
25
|
|
|
204
|
-
@assistant_tool
|
|
205
|
-
async def search_google_news(
|
|
206
|
-
query: str,
|
|
207
|
-
number_of_results: int = 3,
|
|
208
|
-
tool_config: Optional[List[Dict]] = None
|
|
209
|
-
) -> List[str]:
|
|
210
|
-
"""
|
|
211
|
-
Search Google News using SERP API and return the results as an array of serialized JSON strings.
|
|
212
|
-
|
|
213
|
-
Parameters:
|
|
214
|
-
- query (str): The search query.
|
|
215
|
-
- number_of_results (int): The number of results to return.
|
|
216
|
-
"""
|
|
217
|
-
logger.info("Entering search_google_news")
|
|
218
|
-
if not query:
|
|
219
|
-
logger.warning("Empty query string provided for search_google_news.")
|
|
220
|
-
return []
|
|
221
26
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
27
|
+
class LeadSearchResult(BaseModel):
|
|
28
|
+
first_name: str = ""
|
|
29
|
+
last_name: str = ""
|
|
30
|
+
full_name: str = ""
|
|
31
|
+
job_title: str = ""
|
|
32
|
+
linkedin_follower_count: int = 0
|
|
33
|
+
lead_location: str = ""
|
|
34
|
+
summary_about_lead: str = ""
|
|
35
|
+
user_linkedin_url: str = ""
|
|
230
36
|
|
|
231
|
-
logger.debug(f"Searching Google News with params: {params}")
|
|
232
|
-
try:
|
|
233
|
-
async with aiohttp.ClientSession() as session:
|
|
234
|
-
async with session.get(url, params=params) as response:
|
|
235
|
-
logger.debug(f"Received status: {response.status}")
|
|
236
|
-
result = await response.json()
|
|
237
|
-
if response.status != 200:
|
|
238
|
-
logger.warning(f"Non-200 response from SERP API: {result}")
|
|
239
|
-
return [json.dumps({"error": result})]
|
|
240
|
-
|
|
241
|
-
serialized_results = [json.dumps(item) for item in result.get('news_results', [])]
|
|
242
|
-
logger.info(f"Returning {len(serialized_results)} news results.")
|
|
243
|
-
return serialized_results
|
|
244
|
-
except Exception as e:
|
|
245
|
-
logger.exception("Exception during search_google_news request.")
|
|
246
|
-
return [json.dumps({"error": str(e)})]
|
|
247
37
|
|
|
38
|
+
class LinkedinCandidateChoice(BaseModel):
|
|
39
|
+
chosen_link: str = ""
|
|
40
|
+
confidence: float = 0.0
|
|
41
|
+
reasoning: str = ""
|
|
248
42
|
|
|
249
|
-
@assistant_tool
|
|
250
|
-
async def search_job_postings(
|
|
251
|
-
query: str,
|
|
252
|
-
number_of_results: int,
|
|
253
|
-
tool_config: Optional[List[Dict]] = None
|
|
254
|
-
) -> List[str]:
|
|
255
|
-
"""
|
|
256
|
-
Search for job postings using SERP API and return the results as an array of serialized JSON strings.
|
|
257
|
-
|
|
258
|
-
Parameters:
|
|
259
|
-
- query (str): The search query.
|
|
260
|
-
- number_of_results (int): The number of results to return.
|
|
261
|
-
"""
|
|
262
|
-
logger.info("Entering search_job_postings")
|
|
263
|
-
if not query:
|
|
264
|
-
logger.warning("Empty query string provided for search_job_postings.")
|
|
265
|
-
return []
|
|
266
43
|
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
"q": query,
|
|
270
|
-
"num": number_of_results,
|
|
271
|
-
"api_key": SERPAPI_KEY,
|
|
272
|
-
"engine": "google_jobs"
|
|
273
|
-
}
|
|
274
|
-
url = "https://serpapi.com/search"
|
|
44
|
+
async def get_structured_output(text: str, tool_config: Optional[List[Dict]] = None) -> LeadSearchResult:
|
|
45
|
+
"""Parse text snippet into ``LeadSearchResult`` using OpenAI."""
|
|
275
46
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
return serialized_results
|
|
289
|
-
except Exception as e:
|
|
290
|
-
logger.exception("Exception during search_job_postings request.")
|
|
291
|
-
return [json.dumps({"error": str(e)})]
|
|
47
|
+
prompt = (
|
|
48
|
+
"Extract lead details from the text below.\n"
|
|
49
|
+
"If follower counts are mentioned, convert values like '1.5k+ followers' to an integer (e.g. 1500).\n"
|
|
50
|
+
f"Return JSON matching this schema:\n{json.dumps(LeadSearchResult.model_json_schema(), indent=2)}\n\n"
|
|
51
|
+
f"Text:\n{text}"
|
|
52
|
+
)
|
|
53
|
+
result, status = await get_structured_output_internal(
|
|
54
|
+
prompt, LeadSearchResult, model = "gpt-5.1-chat", tool_config=tool_config
|
|
55
|
+
)
|
|
56
|
+
if status != "SUCCESS" or result is None:
|
|
57
|
+
return LeadSearchResult()
|
|
58
|
+
return result
|
|
292
59
|
|
|
293
60
|
|
|
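A hypothetical call to the new snippet parser. The snippet text is made-up illustration, `my_tool_config` stands in for a config with OpenAI credentials, and the import assumes these helpers live in `dhisana.utils.serpapi_search_tools`, which the file list above suggests; on any failure the function returns an empty `LeadSearchResult`:

```python
import asyncio

from dhisana.utils.serpapi_search_tools import get_structured_output  # assumed module path

snippet = "Jane Doe - VP Marketing at Acme Corp | San Francisco | 2.3k+ followers"
lead = asyncio.run(get_structured_output(snippet, tool_config=my_tool_config))
print(lead.full_name, lead.job_title, lead.linkedin_follower_count)
# e.g. "Jane Doe VP Marketing 2300" if the model follows the prompt's
# instruction to convert "2.3k+ followers" to an integer.
```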
```diff
 @assistant_tool
-async def
-
-
-
-
-    """
-    Search Google Images using SERP API and return the results as an array of serialized JSON strings.
-
-    Parameters:
-    - query (str): The search query.
-    - number_of_results (int): The number of results to return.
-    """
-    logger.info("Entering search_google_images")
-    if not query:
-        logger.warning("Empty query string provided for search_google_images.")
-        return []
-
-    SERPAPI_KEY = get_serp_api_access_token(tool_config)
-    params = {
-        "q": query,
-        "num": number_of_results,
-        "api_key": SERPAPI_KEY,
-        "engine": "google_images"
-    }
-    url = "https://serpapi.com/search"
-
-    logger.debug(f"Searching Google Images with params: {params}")
-    try:
-        async with aiohttp.ClientSession() as session:
-            async with session.get(url, params=params) as response:
-                logger.debug(f"Received status: {response.status}")
-                result = await response.json()
-                if response.status != 200:
-                    logger.warning(f"Non-200 response from SERP API: {result}")
-                    return [json.dumps({"error": result})]
-
-                serialized_results = [json.dumps(item) for item in result.get('images_results', [])]
-                logger.info(f"Returning {len(serialized_results)} image results.")
-                return serialized_results
-    except Exception as e:
-        logger.exception("Exception during search_google_images request.")
-        return [json.dumps({"error": str(e)})]
-
+async def find_user_linkedin_url_with_serper(
+    user_linkedin_url: str,
+    tool_config: Optional[List[Dict]] = None,
+) -> Optional[Dict]:
+    """Search Google via Serper.dev for ``user_linkedin_url`` and parse lead details."""
 
-
-
-    query: str,
-    number_of_results: int,
-    tool_config: Optional[List[Dict]] = None
-) -> List[str]:
-    """
-    Search Google Videos using SERP API and return the results as an array of serialized JSON strings.
-
-    Parameters:
-    - query (str): The search query.
-    - number_of_results (int): The number of results to return.
-    """
-    logger.info("Entering search_google_videos")
-    if not query:
-        logger.warning("Empty query string provided for search_google_videos.")
-        return []
+    if not user_linkedin_url:
+        return None
 
-
-
-
-
-
-
-
-
+    normalized_input = extract_user_linkedin_page(user_linkedin_url)
+    results = await search_google_serper(user_linkedin_url, 10, tool_config=tool_config)
+    for item_json in results:
+        try:
+            item = json.loads(item_json)
+        except Exception:
+            continue
+        link = item.get("link", "")
+        if not link:
+            continue
+        if extract_user_linkedin_page(link) == normalized_input:
+            text = " ".join(
+                [item.get("title", ""), item.get("subtitle", ""), item.get("snippet", "")]
+            ).strip()
+            structured = await get_structured_output(text, tool_config=tool_config)
+            structured.user_linkedin_url = normalized_input
+            return json.loads(structured.model_dump_json())
+    return None
+
+
```
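A hypothetical invocation of the Serper-backed lookup (same module-path assumption as above; `my_tool_config` needs Serper.dev and OpenAI entries). On a match it returns a dict mirroring `LeadSearchResult`, with `user_linkedin_url` set to the normalized input:

```python
import asyncio

from dhisana.utils.serpapi_search_tools import find_user_linkedin_url_with_serper  # assumed path

profile = asyncio.run(
    find_user_linkedin_url_with_serper(
        "https://www.linkedin.com/in/janedoe/", tool_config=my_tool_config
    )
)
# e.g. {"first_name": "Jane", ..., "user_linkedin_url": "https://www.linkedin.com/in/janedoe"}
# or None when no search result normalizes to the same /in/ page.
```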
```diff
+async def pick_best_linkedin_candidate_with_llm(
+    email: str,
+    user_name: str,
+    user_title: str,
+    user_location: str,
+    user_company: str,
+    candidates: List[Dict],
+    tool_config: Optional[List[Dict]] = None,
+) -> Optional[LinkedinCandidateChoice]:
+    """Ask the LLM to assess candidate LinkedIn URLs and pick the best match."""
+
+    if not candidates:
+        return None
+
+    candidates_sorted = candidates[-3:]
+    candidate_lines = []
+    for idx, candidate in enumerate(candidates_sorted, start=1):
+        candidate_lines.append(
+            "\n".join(
+                [
+                    f"Candidate {idx}:",
+                    f" Link: {candidate.get('link', '')}",
+                    f" Title: {candidate.get('title', '')}",
+                    f" Snippet: {candidate.get('snippet', '')}",
+                    f" Subtitle: {candidate.get('subtitle', '')}",
+                    f" Query: {candidate.get('query', '')}",
+                ]
+            )
+        )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    prompt = (
+        "You are validating LinkedIn profile matches for a lead enrichment workflow.\n"
+        "Given the lead context and candidate search results, pick the most likely LinkedIn profile.\n"
+        "If no candidate seems appropriate, return an empty link and confidence 0.\n"
+        "Consider whether the email, name, company, title, or location aligns with the candidate.\n"
+        "Lead context:\n"
+        f"- Email: {email or 'unknown'}\n"
+        f"- Name: {user_name or 'unknown'}\n"
+        f"- Title: {user_title or 'unknown'}\n"
+        f"- Company: {user_company or 'unknown'}\n"
+        f"- Location: {user_location or 'unknown'}\n\n"
+        "Candidates:\n"
+        f"{chr(10).join(candidate_lines)}\n\n"
+        "Return JSON with fields: chosen_link (string), confidence (0-1 float), reasoning (short string)."
+    )
+
+    result, status = await get_structured_output_internal(
+        prompt,
+        LinkedinCandidateChoice,
+        model="gpt-5.1-chat",
+        tool_config=tool_config,
+    )
+
+    if status != "SUCCESS" or result is None:
+        return None
+
+    return result
 
 
 @assistant_tool
```
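A hypothetical call to the candidate scorer. Note that despite the name, `candidates_sorted = candidates[-3:]` simply keeps the three most recently appended records rather than sorting; candidate dicts use the keys the prompt builder reads (`link`, `title`, `snippet`, `subtitle`, `query`):

```python
import asyncio

from dhisana.utils.serpapi_search_tools import pick_best_linkedin_candidate_with_llm  # assumed path

candidates = [{
    "link": "https://www.linkedin.com/in/janedoe",
    "title": "Jane Doe - VP Marketing - Acme Corp | LinkedIn",
    "snippet": "San Francisco Bay Area | VP Marketing at Acme Corp",
    "subtitle": "",
    "query": '"jane.doe@acme.com" "linkedin.com/in"',
}]
choice = asyncio.run(pick_best_linkedin_candidate_with_llm(
    email="jane.doe@acme.com",
    user_name="Jane Doe",
    user_title="VP Marketing",
    user_location="San Francisco",
    user_company="Acme Corp",
    candidates=candidates,
    tool_config=my_tool_config,  # hypothetical OpenAI-capable config
))
# choice is a LinkedinCandidateChoice (chosen_link, confidence, reasoning) or None.
```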
```diff
@@ -388,14 +154,7 @@ async def get_company_domain_from_google_search(
     tool_config: Optional[List[Dict]] = None
 ) -> str:
     """
-    Tries to find the company domain from the company name using Google
-
-    Args:
-        company_name (str): The name of the company to search for.
-        location (str, optional): A location to include in the query.
-
-    Returns:
-        str: The domain of the company's official website if found, otherwise an empty string.
+    Tries to find the company domain from the company name using Google (SerpAPI or Serper.dev).
     """
     logger.info("Entering get_company_domain_from_google_search")
 
@@ -404,22 +163,21 @@ async def get_company_domain_from_google_search(
         logger.debug("Invalid or excluded company_name provided.")
         return ""
 
-    exclude_company_names = ["linkedin", "wikipedia", "facebook", "instagram", "twitter", "youtube", "netflix", "zoominfo", "reditt"]
     query = f"\"{company_name}\" official website"
     if location:
         query = f"\"{company_name}\" official website, {location}"
 
     try:
         logger.debug(f"Performing search with query: {query}")
-        result = await
+        result = await search_google_with_tools(query, 1, tool_config=tool_config)
         if not isinstance(result, list) or len(result) == 0:
             logger.debug("No results for first attempt, retrying with fallback query.")
             query = f"{company_name} official website"
-            result = await
+            result = await search_google_with_tools(query, 1, tool_config=tool_config)
             if not isinstance(result, list) or len(result) == 0:
                 logger.debug("No results from fallback query either.")
                 return ''
-    except Exception
+    except Exception:
         logger.exception("Exception during get_company_domain_from_google_search.")
         return ''
 
@@ -471,16 +229,6 @@ async def get_signal_strength(
     """
     Find how strong a match for the keywords in search is by checking
    how many search results contain all desired keywords in the snippet.
-
-    Args:
-        domain_to_search (str): The domain to search inside.
-        keywords (List[str]): The keywords to search for.
-        in_title (List[str]): Keywords that must appear in the title.
-        not_in_title (List[str]): Keywords that must not appear in the title.
-        negative_keywords (List[str]): Keywords to exclude from results.
-
-    Returns:
-        int: A strength score on a scale of 0 to 5.
     """
     logger.info("Entering get_signal_strength")
 
@@ -507,8 +255,8 @@ async def get_signal_strength(
 
     logger.debug(f"Performing get_signal_strength search with query: {final_query}")
     try:
-        results = await
-    except Exception
+        results = await search_google_with_tools(final_query, 5, tool_config=tool_config)
+    except Exception:
         logger.exception("Exception occurred while searching for signal strength.")
         return 0
 
@@ -517,9 +265,9 @@ async def get_signal_strength(
         return 0
 
     score = 0
-    for
+    for result_item in results:
         try:
-            result_json = json.loads(
+            result_json = json.loads(result_item)
             snippet_text = result_json.get('snippet', '').lower()
             if all(kw.lower() in snippet_text for kw in keywords):
                 logger.debug(f"Found match in snippet: {snippet_text[:60]}...")
@@ -543,8 +291,8 @@ def extract_user_linkedin_page(url: str) -> str:
     if not url:
         return ""
 
-    normalized_url = re.sub(r"(https?://)?([\w\-]+\.)?linkedin\.com", "https://www.linkedin.com", url)
-    match = re.match(r"https://www
+    normalized_url = re.sub(r"^(https?://)?([\w\-]+\.)?linkedin\.com", "https://www.linkedin.com", url)
+    match = re.match(r"https://www\.linkedin\.com/in/([^/?#]+)", normalized_url)
     if match:
         page = f"https://www.linkedin.com/in/{match.group(1)}"
         logger.debug(f"Extracted user LinkedIn page: {page}")
```
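The tightened regexes change behavior in two ways: the added `^` anchor only rewrites a linkedin.com host at the start of the string (previously a linkedin.com match anywhere in the URL could be rewritten), and the completed `/in/` capture stops at `/`, `?`, or `#`. Assuming the function goes on to return `page` on a match (the return is not shown in this hunk), the expected behavior:

```python
extract_user_linkedin_page("http://in.linkedin.com/in/janedoe?trk=pub")
# -> "https://www.linkedin.com/in/janedoe"   (subdomain and query string stripped)
extract_user_linkedin_page("https://www.linkedin.com/in/janedoe/details/experience/")
# -> "https://www.linkedin.com/in/janedoe"   (capture stops at the next "/")
```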
```diff
@@ -560,21 +308,12 @@ async def find_user_linkedin_url_google(
     user_title: str,
     user_location: str,
     user_company: str,
+    user_company_domain: str = "",
     use_strict_check: bool = True,
     tool_config: Optional[List[Dict]] = None
 ) -> str:
     """
     Find the LinkedIn URL for a user based on their name, title, location, and company.
-
-    Args:
-        user_name (str): The name of the user.
-        user_title (str): The title of the user.
-        user_location (str): The location of the user.
-        user_company (str): The company of the user.
-        use_strict_check (bool): Whether to use a strict single query or a series of relaxed queries.
-
-    Returns:
-        str: The LinkedIn URL if found, otherwise an empty string.
     """
     logger.info("Entering find_user_linkedin_url_google")
 
@@ -584,7 +323,7 @@ async def find_user_linkedin_url_google(
 
     if use_strict_check:
         queries = [
-            f'site:linkedin.com/in "{user_name}"
+            f'site:linkedin.com/in ("{user_name}") ({user_company} | {user_company_domain}) ( {user_title} | ) intitle:"{user_name}" -intitle:"profiles" '
         ]
     else:
         queries = [
@@ -594,14 +333,14 @@ async def find_user_linkedin_url_google(
             f'site:linkedin.com/in "{user_name}" intitle:"{user_name}"'
         ]
 
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession() as session:
         for query in queries:
             if not query.strip():
                 continue
             logger.debug(f"Searching with query: {query}")
             try:
-                results = await
-            except Exception
+                results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
+            except Exception:
                 logger.exception("Error searching for LinkedIn user URL.")
                 continue
 
@@ -630,6 +369,221 @@ async def find_user_linkedin_url_google(
     return ""
 
 
+@assistant_tool
+async def find_user_linkedin_url_by_email_google(
+    email: str,
+    user_name: str = "",
+    user_title: str = "",
+    user_location: str = "",
+    user_company: str = "",
+    tool_config: Optional[List[Dict]] = None,
+) -> Optional[Dict[str, Any]]:
+    """
+    Find the LinkedIn URL for a user based primarily on their email address.
+
+    Additional profile hints (name, title, location, company) improve query precision
+    when supplied. Returns a dict with the best LinkedIn URL, LLM confidence score,
+    and short reasoning when a match clears the confidence threshold; otherwise ``None``.
+    """
+    logger.info("Entering find_user_linkedin_url_by_email_google")
+
+    if not email:
+        logger.warning("No email provided.")
+        return None
+
+    normalized_email = email.strip().lower()
+    email_local_part = normalized_email.split("@")[0] if "@" in normalized_email else normalized_email
+    email_local_humanized = re.sub(r"[._-]+", " ", email_local_part).strip()
+
+    queries: List[str] = []
+
+    def add_query(query: str) -> None:
+        query = query.strip()
+        if query and query not in queries:
+            queries.append(query)
+
+    def add_query_parts(*parts: str) -> None:
+        tokens = [part.strip() for part in parts if part and part.strip()]
+        if not tokens:
+            return
+        add_query(" ".join(tokens))
+
+    enriched_terms = []
+    if user_name:
+        enriched_terms.append(f'"{user_name}"')
+    if user_company:
+        enriched_terms.append(f'"{user_company}"')
+    if user_title:
+        enriched_terms.append(f'"{user_title}"')
+    if user_location:
+        enriched_terms.append(f'"{user_location}"')
+    base_hint = " ".join(enriched_terms)
+
+    # Prioritise the direct email search variants before broader fallbacks.
+    add_query_parts(normalized_email, "linkedin.com/in", base_hint)
+    add_query_parts(normalized_email, "linkedin.com", base_hint)
+    add_query_parts(normalized_email, "linkedin", base_hint)
+    add_query_parts(normalized_email, base_hint)
+    add_query(f'"{normalized_email}" "linkedin.com/in" {base_hint}')
+    add_query(f'"{normalized_email}" "linkedin.com" {base_hint}')
+    add_query(f'"{normalized_email}" linkedin {base_hint}')
+
+    if email_local_part and email_local_part != normalized_email:
+        add_query_parts(email_local_part, "linkedin.com/in", base_hint)
+        add_query_parts(email_local_part, "linkedin.com", base_hint)
+        add_query_parts(email_local_part, "linkedin", base_hint)
+        add_query(f'"{email_local_part}" "linkedin.com/in" {base_hint}')
+        add_query(f'"{email_local_part}" "linkedin.com" {base_hint}')
+
+    if email_local_humanized and email_local_humanized not in {email_local_part, normalized_email}:
+        add_query_parts(email_local_humanized, "linkedin", base_hint)
+        add_query(f'"{email_local_humanized}" linkedin {base_hint}')
+
+    if normalized_email:
+        add_query(f'site:linkedin.com/in "{normalized_email}" {base_hint}')
+
+    if email_local_part:
+        add_query(f'site:linkedin.com/in "{email_local_part}" {base_hint}')
+
+    if email_local_humanized and email_local_humanized != email_local_part:
+        add_query(f'site:linkedin.com/in "{email_local_humanized}" {base_hint}')
+
+    if base_hint:
+        lookup_hint = user_name or email_local_humanized or email_local_part or normalized_email
+        add_query(
+            f'site:linkedin.com/in "{normalized_email}" {base_hint} '
+            f'intitle:"{lookup_hint}" -intitle:"profiles"'
+        )
+        if email_local_humanized:
+            add_query(
+                f'site:linkedin.com/in "{email_local_humanized}" {base_hint} '
+                f'intitle:"{lookup_hint}" -intitle:"profiles"'
+            )
+
+    candidate_records: List[Dict[str, str]] = []
+    seen_links: Set[str] = set()
+    best_llm_choice: Optional[LinkedinCandidateChoice] = None
+    best_llm_link: str = ""
+    HIGH_CONFIDENCE_THRESHOLD = 0.8
+    MIN_CONFIDENCE_THRESHOLD = 0.75
+
+    async def evaluate_with_llm() -> Optional[LinkedinCandidateChoice]:
+        nonlocal best_llm_choice, best_llm_link
+
+        llm_choice = await pick_best_linkedin_candidate_with_llm(
+            email=email,
+            user_name=user_name,
+            user_title=user_title,
+            user_location=user_location,
+            user_company=user_company,
+            candidates=candidate_records,
+            tool_config=tool_config,
+        )
+
+        if not llm_choice or not llm_choice.chosen_link:
+            return None
+
+        chosen_link = extract_user_linkedin_page(llm_choice.chosen_link)
+        if not chosen_link:
+            return None
+
+        llm_choice.chosen_link = chosen_link
+
+        if best_llm_choice is None or llm_choice.confidence > best_llm_choice.confidence:
+            best_llm_choice = llm_choice
+            best_llm_link = chosen_link
+            logger.debug(
+                "LLM updated best candidate: %s (confidence %.2f) reason: %s",
+                chosen_link,
+                llm_choice.confidence,
+                llm_choice.reasoning,
+            )
+
+        if llm_choice.confidence >= HIGH_CONFIDENCE_THRESHOLD:
+            logger.info(
+                "Returning LinkedIn user page by email via LLM scoring: %s (confidence %.2f)",
+                chosen_link,
+                llm_choice.confidence,
+            )
+            return llm_choice
+
+        return None
+
+    async with aiohttp.ClientSession() as session:
+        for query in queries:
+            query = query.strip()
+            if not query:
+                continue
+            logger.debug(f"Searching with query: {query}")
+
+            try:
+                results = await search_google_with_tools(query, 5, tool_config=tool_config)
+            except Exception:
+                logger.exception("Error searching for LinkedIn user URL by email.")
+                continue
+
+            if not isinstance(results, list) or len(results) == 0:
+                logger.debug("No results for this query, moving to next.")
+                continue
+
+            for result_item in results:
+                try:
+                    result_json = json.loads(result_item)
+                except (json.JSONDecodeError, IndexError):
+                    logger.debug("Failed to parse JSON from the search result.")
+                    continue
+
+                link = result_json.get('link', '')
+                if not link:
+                    continue
+
+                parsed_url = urlparse(link)
+                if 'linkedin.com/in' in (parsed_url.netloc + parsed_url.path):
+                    link = extract_user_linkedin_page(link)
+                    if not link or link in seen_links:
+                        continue
+
+                    title = result_json.get('title', '')
+                    snippet = result_json.get('snippet', '')
+                    subtitle = result_json.get('subtitle', '')
+
+                    candidate_records.append(
+                        {
+                            "link": link,
+                            "title": title,
+                            "snippet": snippet,
+                            "subtitle": subtitle,
+                            "query": query,
+                        }
+                    )
+                    if len(candidate_records) > 6:
+                        candidate_records.pop(0)
+                    seen_links.add(link)
+
+            high_conf_choice = await evaluate_with_llm()
+            if high_conf_choice:
+                return {
+                    "linkedin_url": high_conf_choice.chosen_link,
+                    "confidence": high_conf_choice.confidence,
+                    "reasoning": high_conf_choice.reasoning,
+                }
+
+    if best_llm_choice and best_llm_link and best_llm_choice.confidence >= MIN_CONFIDENCE_THRESHOLD:
+        logger.info(
+            "Returning LinkedIn user page by email via LLM scoring (best overall): %s (confidence %.2f)",
+            best_llm_link,
+            best_llm_choice.confidence,
+        )
+        return {
+            "linkedin_url": best_llm_link,
+            "confidence": best_llm_choice.confidence,
+            "reasoning": best_llm_choice.reasoning,
+        }
+
+    logger.info("No matching LinkedIn user page found using email queries.")
+    return None
+
+
```
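A hypothetical end-to-end call of the new email lookup (same module-path assumption as the earlier sketches); it needs a `tool_config` wired for both a search provider and OpenAI, and returns `None` when no candidate clears the 0.75 confidence floor. The output values below are illustrative:

```python
import asyncio

from dhisana.utils.serpapi_search_tools import find_user_linkedin_url_by_email_google  # assumed path

match = asyncio.run(find_user_linkedin_url_by_email_google(
    email="jane.doe@acme.com",
    user_name="Jane Doe",
    user_company="Acme Corp",
    tool_config=my_tool_config,  # hypothetical search + OpenAI config
))
# e.g. {"linkedin_url": "https://www.linkedin.com/in/janedoe",
#       "confidence": 0.92,
#       "reasoning": "Name and company in the result title match the lead."}
```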
```diff
 @assistant_tool
 async def find_user_linkedin_url_by_job_title_google(
     user_title: str,
@@ -639,14 +593,6 @@ async def find_user_linkedin_url_by_job_title_google(
 ) -> str:
     """
     Find the LinkedIn URL for a user based on their job_title, location, and company.
-
-    Args:
-        user_title (str): The title of the user.
-        user_location (str): The location of the user.
-        user_company (str): The company of the user.
-
-    Returns:
-        str: The LinkedIn URL if found, otherwise an empty string.
     """
     logger.info("Entering find_user_linkedin_url_by_job_title_google")
 
@@ -654,15 +600,15 @@ async def find_user_linkedin_url_by_job_title_google(
         f'site:linkedin.com/in "{user_company}" AND "{user_title}" -intitle:"profiles" ',
     ]
 
-    async with aiohttp.ClientSession() as session:
+    async with aiohttp.ClientSession() as session:
         for query in queries:
             if not query.strip():
                 continue
             logger.debug(f"Searching with query: {query}")
 
             try:
-                results = await
-            except Exception
+                results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
+            except Exception:
                 logger.exception("Error searching for LinkedIn URL by job title.")
                 continue
 
@@ -699,14 +645,6 @@ async def find_user_linkedin_url_by_google_search(
 ) -> List[str]:
     """
     Find LinkedIn user URLs based on provided Google search queries.
-
-    Args:
-        queries (List[str]): A list of Google search queries.
-        number_of_results (int): Number of results to return from each query (default is 5).
-        tool_config (Optional[List[Dict]]): Optional configuration for the SERP API.
-
-    Returns:
-        List[str]: A list of matching LinkedIn user URLs found, or an empty list if none.
     """
     logger.info("Entering find_user_linkedin_url_by_google_search")
     found_urls = []
@@ -717,8 +655,8 @@ async def find_user_linkedin_url_by_google_search(
             logger.debug(f"Searching with query: {query}")
 
             try:
-                results = await
-            except Exception
+                results = await search_google_with_tools(query.strip(), number_of_results, tool_config=tool_config)
+            except Exception:
                 logger.exception("Error searching for LinkedIn URL using Google search.")
                 continue
 
@@ -772,19 +710,12 @@ def extract_company_page(url: str) -> str:
 async def find_organization_linkedin_url_with_google_search(
     company_name: str,
     company_location: Optional[str] = None,
+    company_domain: Optional[str] = None,
     use_strict_check: bool = True,
     tool_config: Optional[List[Dict]] = None,
 ) -> str:
     """
     Find the LinkedIn URL for a company based on its name and optional location using Google search.
-
-    Args:
-        company_name (str): The name of the company.
-        company_location (str, optional): The location of the company.
-        use_strict_check (bool): Whether to use stricter or multiple queries.
-
-    Returns:
-        str: The LinkedIn URL if found, otherwise an empty string.
     """
     logger.info("Entering find_organization_linkedin_url_with_google_search")
 
@@ -793,7 +724,7 @@ async def find_organization_linkedin_url_with_google_search(
         return ""
 
     if use_strict_check:
-        queries = [f'site:linkedin.com/company "{company_name}" {
+        queries = [f'site:linkedin.com/company "{company_name}" {company_domain} ']
     else:
         if company_location:
             queries = [
@@ -814,8 +745,8 @@ async def find_organization_linkedin_url_with_google_search(
 
             logger.debug(f"Searching with query: {query}")
             try:
-                results = await
-            except Exception
+                results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
+            except Exception:
                 logger.exception("Error searching for organization LinkedIn URL.")
                 continue
 
@@ -868,7 +799,7 @@ async def get_external_links(url: str) -> List[str]:
                 else:
                     logger.warning(f"Non-200 status ({response.status}) while fetching external links.")
                     return []
-    except Exception
+    except Exception:
         logger.exception("Exception occurred while fetching external links.")
         return []
 
@@ -880,7 +811,7 @@ async def get_resolved_linkedin_links(url: str) -> List[str]:
     logger.debug(f"Entering get_resolved_linkedin_links for URL: {url}")
     try:
         content = await fetch_html_content(url)
-    except Exception
+    except Exception:
         logger.exception("Exception occurred while fetching HTML content.")
         return []
 
@@ -904,7 +835,7 @@ async def get_company_website_from_linkedin_url(linkedin_url: str) -> str:
 
     try:
         links = await get_external_links(linkedin_url)
-    except Exception
+    except Exception:
         logger.exception("Exception occurred while getting external links for LinkedIn URL.")
         return ""
```