dhisana 0.0.1.dev243__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dhisana/__init__.py +1 -0
- dhisana/cli/__init__.py +1 -0
- dhisana/cli/cli.py +20 -0
- dhisana/cli/datasets.py +27 -0
- dhisana/cli/models.py +26 -0
- dhisana/cli/predictions.py +20 -0
- dhisana/schemas/__init__.py +1 -0
- dhisana/schemas/common.py +399 -0
- dhisana/schemas/sales.py +965 -0
- dhisana/ui/__init__.py +1 -0
- dhisana/ui/components.py +472 -0
- dhisana/utils/__init__.py +1 -0
- dhisana/utils/add_mapping.py +352 -0
- dhisana/utils/agent_tools.py +51 -0
- dhisana/utils/apollo_tools.py +1597 -0
- dhisana/utils/assistant_tool_tag.py +4 -0
- dhisana/utils/built_with_api_tools.py +282 -0
- dhisana/utils/cache_output_tools.py +98 -0
- dhisana/utils/cache_output_tools_local.py +78 -0
- dhisana/utils/check_email_validity_tools.py +717 -0
- dhisana/utils/check_for_intent_signal.py +107 -0
- dhisana/utils/check_linkedin_url_validity.py +209 -0
- dhisana/utils/clay_tools.py +43 -0
- dhisana/utils/clean_properties.py +135 -0
- dhisana/utils/company_utils.py +60 -0
- dhisana/utils/compose_salesnav_query.py +259 -0
- dhisana/utils/compose_search_query.py +759 -0
- dhisana/utils/compose_three_step_workflow.py +234 -0
- dhisana/utils/composite_tools.py +137 -0
- dhisana/utils/dataframe_tools.py +237 -0
- dhisana/utils/domain_parser.py +45 -0
- dhisana/utils/email_body_utils.py +72 -0
- dhisana/utils/email_parse_helpers.py +132 -0
- dhisana/utils/email_provider.py +375 -0
- dhisana/utils/enrich_lead_information.py +933 -0
- dhisana/utils/extract_email_content_for_llm.py +101 -0
- dhisana/utils/fetch_openai_config.py +129 -0
- dhisana/utils/field_validators.py +426 -0
- dhisana/utils/g2_tools.py +104 -0
- dhisana/utils/generate_content.py +41 -0
- dhisana/utils/generate_custom_message.py +271 -0
- dhisana/utils/generate_email.py +278 -0
- dhisana/utils/generate_email_response.py +465 -0
- dhisana/utils/generate_flow.py +102 -0
- dhisana/utils/generate_leads_salesnav.py +303 -0
- dhisana/utils/generate_linkedin_connect_message.py +224 -0
- dhisana/utils/generate_linkedin_response_message.py +317 -0
- dhisana/utils/generate_structured_output_internal.py +462 -0
- dhisana/utils/google_custom_search.py +267 -0
- dhisana/utils/google_oauth_tools.py +727 -0
- dhisana/utils/google_workspace_tools.py +1294 -0
- dhisana/utils/hubspot_clearbit.py +96 -0
- dhisana/utils/hubspot_crm_tools.py +2440 -0
- dhisana/utils/instantly_tools.py +149 -0
- dhisana/utils/linkedin_crawler.py +168 -0
- dhisana/utils/lusha_tools.py +333 -0
- dhisana/utils/mailgun_tools.py +156 -0
- dhisana/utils/mailreach_tools.py +123 -0
- dhisana/utils/microsoft365_tools.py +455 -0
- dhisana/utils/openai_assistant_and_file_utils.py +267 -0
- dhisana/utils/openai_helpers.py +977 -0
- dhisana/utils/openapi_spec_to_tools.py +45 -0
- dhisana/utils/openapi_tool/__init__.py +1 -0
- dhisana/utils/openapi_tool/api_models.py +633 -0
- dhisana/utils/openapi_tool/convert_openai_spec_to_tool.py +271 -0
- dhisana/utils/openapi_tool/openapi_tool.py +319 -0
- dhisana/utils/parse_linkedin_messages_txt.py +100 -0
- dhisana/utils/profile.py +37 -0
- dhisana/utils/proxy_curl_tools.py +1226 -0
- dhisana/utils/proxycurl_search_leads.py +426 -0
- dhisana/utils/python_function_to_tools.py +83 -0
- dhisana/utils/research_lead.py +176 -0
- dhisana/utils/sales_navigator_crawler.py +1103 -0
- dhisana/utils/salesforce_crm_tools.py +477 -0
- dhisana/utils/search_router.py +131 -0
- dhisana/utils/search_router_jobs.py +51 -0
- dhisana/utils/sendgrid_tools.py +162 -0
- dhisana/utils/serarch_router_local_business.py +75 -0
- dhisana/utils/serpapi_additional_tools.py +290 -0
- dhisana/utils/serpapi_google_jobs.py +117 -0
- dhisana/utils/serpapi_google_search.py +188 -0
- dhisana/utils/serpapi_local_business_search.py +129 -0
- dhisana/utils/serpapi_search_tools.py +852 -0
- dhisana/utils/serperdev_google_jobs.py +125 -0
- dhisana/utils/serperdev_local_business.py +154 -0
- dhisana/utils/serperdev_search.py +233 -0
- dhisana/utils/smtp_email_tools.py +582 -0
- dhisana/utils/test_connect.py +2087 -0
- dhisana/utils/trasform_json.py +173 -0
- dhisana/utils/web_download_parse_tools.py +189 -0
- dhisana/utils/workflow_code_model.py +5 -0
- dhisana/utils/zoominfo_tools.py +357 -0
- dhisana/workflow/__init__.py +1 -0
- dhisana/workflow/agent.py +18 -0
- dhisana/workflow/flow.py +44 -0
- dhisana/workflow/task.py +43 -0
- dhisana/workflow/test.py +90 -0
- dhisana-0.0.1.dev243.dist-info/METADATA +43 -0
- dhisana-0.0.1.dev243.dist-info/RECORD +102 -0
- dhisana-0.0.1.dev243.dist-info/WHEEL +5 -0
- dhisana-0.0.1.dev243.dist-info/entry_points.txt +2 -0
- dhisana-0.0.1.dev243.dist-info/top_level.txt +1 -0
dhisana/utils/serpapi_search_tools.py
@@ -0,0 +1,852 @@
import json
import logging
import re
import urllib.parse
from typing import Any, Dict, List, Optional, Set
from urllib.parse import urlparse

import aiohttp
from bs4 import BeautifulSoup
from pydantic import BaseModel

from dhisana.utils.assistant_tool_tag import assistant_tool
from dhisana.utils.generate_structured_output_internal import (
    get_structured_output_internal,
)
from dhisana.utils.search_router import search_google_with_tools
from dhisana.utils.serperdev_search import search_google_serper
from dhisana.utils.web_download_parse_tools import fetch_html_content

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class LeadSearchResult(BaseModel):
    first_name: str = ""
    last_name: str = ""
    full_name: str = ""
    job_title: str = ""
    linkedin_follower_count: int = 0
    lead_location: str = ""
    summary_about_lead: str = ""
    user_linkedin_url: str = ""


class LinkedinCandidateChoice(BaseModel):
    chosen_link: str = ""
    confidence: float = 0.0
    reasoning: str = ""


async def get_structured_output(text: str, tool_config: Optional[List[Dict]] = None) -> LeadSearchResult:
    """Parse a text snippet into a ``LeadSearchResult`` using OpenAI."""
    prompt = (
        "Extract lead details from the text below.\n"
        "If follower counts are mentioned, convert values like '1.5k+ followers' to an integer (e.g. 1500).\n"
        f"Return JSON matching this schema:\n{json.dumps(LeadSearchResult.model_json_schema(), indent=2)}\n\n"
        f"Text:\n{text}"
    )
    result, status = await get_structured_output_internal(
        prompt, LeadSearchResult, model="gpt-5.1-chat", tool_config=tool_config
    )
    if status != "SUCCESS" or result is None:
        return LeadSearchResult()
    return result
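
# --- Editor's illustrative sketch (not part of the original file) ----------
# Parses a raw search-result snippet into a LeadSearchResult. The snippet text
# is made up, and tool_config=None assumes the package's default LLM-config
# resolution; a real config would carry the OpenAI credentials.
async def _example_get_structured_output() -> None:
    snippet = "Jane Doe - VP of Sales at Acme Corp | 2.5k+ followers | Austin, TX"
    lead = await get_structured_output(snippet, tool_config=None)
    print(lead.full_name, lead.job_title, lead.linkedin_follower_count)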
@assistant_tool
async def find_user_linkedin_url_with_serper(
    user_linkedin_url: str,
    tool_config: Optional[List[Dict]] = None,
) -> Optional[Dict]:
    """Search Google via Serper.dev for ``user_linkedin_url`` and parse lead details."""
    if not user_linkedin_url:
        return None

    normalized_input = extract_user_linkedin_page(user_linkedin_url)
    results = await search_google_serper(user_linkedin_url, 10, tool_config=tool_config)
    for item_json in results:
        try:
            item = json.loads(item_json)
        except Exception:
            continue
        link = item.get("link", "")
        if not link:
            continue
        if extract_user_linkedin_page(link) == normalized_input:
            text = " ".join(
                [item.get("title", ""), item.get("subtitle", ""), item.get("snippet", "")]
            ).strip()
            structured = await get_structured_output(text, tool_config=tool_config)
            structured.user_linkedin_url = normalized_input
            return json.loads(structured.model_dump_json())
    return None
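
# --- Editor's illustrative sketch (not part of the original file) ----------
# Resolves an already-known profile URL back into structured lead fields via a
# Serper.dev search; the URL is a placeholder. Returns a plain dict mirroring
# LeadSearchResult, or None when no search result matches the profile.
async def _example_find_user_linkedin_url_with_serper() -> None:
    profile = await find_user_linkedin_url_with_serper(
        "https://www.linkedin.com/in/some-profile", tool_config=None
    )
    if profile:
        print(profile["full_name"], profile["user_linkedin_url"])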
async def pick_best_linkedin_candidate_with_llm(
    email: str,
    user_name: str,
    user_title: str,
    user_location: str,
    user_company: str,
    candidates: List[Dict],
    tool_config: Optional[List[Dict]] = None,
) -> Optional[LinkedinCandidateChoice]:
    """Ask the LLM to assess candidate LinkedIn URLs and pick the best match."""
    if not candidates:
        return None

    # Only the three most recently collected candidates are scored.
    recent_candidates = candidates[-3:]
    candidate_lines = []
    for idx, candidate in enumerate(recent_candidates, start=1):
        candidate_lines.append(
            "\n".join(
                [
                    f"Candidate {idx}:",
                    f"  Link: {candidate.get('link', '')}",
                    f"  Title: {candidate.get('title', '')}",
                    f"  Snippet: {candidate.get('snippet', '')}",
                    f"  Subtitle: {candidate.get('subtitle', '')}",
                    f"  Query: {candidate.get('query', '')}",
                ]
            )
        )

    prompt = (
        "You are validating LinkedIn profile matches for a lead enrichment workflow.\n"
        "Given the lead context and candidate search results, pick the most likely LinkedIn profile.\n"
        "If no candidate seems appropriate, return an empty link and confidence 0.\n"
        "Consider whether the email, name, company, title, or location aligns with the candidate.\n"
        "Lead context:\n"
        f"- Email: {email or 'unknown'}\n"
        f"- Name: {user_name or 'unknown'}\n"
        f"- Title: {user_title or 'unknown'}\n"
        f"- Company: {user_company or 'unknown'}\n"
        f"- Location: {user_location or 'unknown'}\n\n"
        "Candidates:\n"
        f"{chr(10).join(candidate_lines)}\n\n"
        "Return JSON with fields: chosen_link (string), confidence (0-1 float), reasoning (short string)."
    )

    result, status = await get_structured_output_internal(
        prompt,
        LinkedinCandidateChoice,
        model="gpt-5.1-chat",
        tool_config=tool_config,
    )

    if status != "SUCCESS" or result is None:
        return None

    return result
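
# --- Editor's illustrative sketch (not part of the original file) ----------
# Candidate dicts carry the keys the email-search loop below collects: link,
# title, snippet, subtitle, and the query that produced them. Values here are
# made up for illustration.
async def _example_pick_best_candidate() -> None:
    candidates = [
        {
            "link": "https://www.linkedin.com/in/jane-doe",
            "title": "Jane Doe - VP of Sales - Acme Corp | LinkedIn",
            "snippet": "Austin, Texas * VP of Sales at Acme Corp",
            "subtitle": "",
            "query": '"jane.doe@acme.com" "linkedin.com/in"',
        }
    ]
    choice = await pick_best_linkedin_candidate_with_llm(
        email="jane.doe@acme.com",
        user_name="Jane Doe",
        user_title="VP of Sales",
        user_location="Austin, TX",
        user_company="Acme Corp",
        candidates=candidates,
        tool_config=None,
    )
    if choice:
        print(choice.chosen_link, choice.confidence, choice.reasoning)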
@assistant_tool
async def get_company_domain_from_google_search(
    company_name: str,
    location: Optional[str] = None,
    tool_config: Optional[List[Dict]] = None
) -> str:
    """
    Tries to find the company domain from the company name using Google (SerpAPI or Serper.dev).
    """
    logger.info("Entering get_company_domain_from_google_search")

    company_name_no_spaces = company_name.replace(" ", "")
    if not company_name_no_spaces or company_name.lower() in ["none", "freelance"]:
        logger.debug("Invalid or excluded company_name provided.")
        return ""

    query = f"\"{company_name}\" official website"
    if location:
        query = f"\"{company_name}\" official website, {location}"

    try:
        logger.debug(f"Performing search with query: {query}")
        result = await search_google_with_tools(query, 1, tool_config=tool_config)
        if not isinstance(result, list) or len(result) == 0:
            logger.debug("No results for first attempt, retrying with fallback query.")
            query = f"{company_name} official website"
            result = await search_google_with_tools(query, 1, tool_config=tool_config)
            if not isinstance(result, list) or len(result) == 0:
                logger.debug("No results from fallback query either.")
                return ''
    except Exception:
        logger.exception("Exception during get_company_domain_from_google_search.")
        return ''

    exclude_company_names = ["linkedin", "wikipedia", "facebook", "instagram", "twitter", "youtube", "netflix"]
    if any(exclude_name in company_name.lower() for exclude_name in exclude_company_names):
        logger.debug("Company name is in excluded list, returning empty domain.")
        return ""

    try:
        result_json = json.loads(result[0])
    except (json.JSONDecodeError, IndexError) as e:
        logger.debug(f"Failed to parse the JSON from the result: {str(e)}")
        return ''

    link = result_json.get('link', '')
    if not link:
        logger.debug("No link found in the first search result.")
        return ''

    parsed_url = urlparse(link)
    domain = parsed_url.netloc.lower()
    if domain.startswith('www.'):
        domain = domain[4:]

    excluded_domains = [
        "linkedin.com", "wikipedia.org", "usa.gov", "facebook.com",
        "instagram.com", "twitter.com", "x.com", "google.com", "youtube.com",
        "netflix.com", "freelance.com", "zoominfo.com", "reddit.com"
    ]
    excluded_domains_lower = [d.lower() for d in excluded_domains]

    if any(domain == d or domain.endswith(f".{d}") for d in excluded_domains_lower):
        logger.debug(f"Domain {domain} is in the excluded list.")
        return ""

    logger.info(f"Found domain {domain}")
    return domain
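
# --- Editor's illustrative sketch (not part of the original file) ----------
# Looks up a company's primary domain; the company name and location are
# placeholders, and a real tool_config with SerpAPI/Serper.dev credentials is
# expected in production.
async def _example_company_domain_lookup() -> None:
    domain = await get_company_domain_from_google_search(
        "Acme Corp", location="Austin, TX", tool_config=None
    )
    print(domain or "no domain found")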
@assistant_tool
async def get_signal_strength(
    domain_to_search: str,
    keywords: List[str],
    in_title: Optional[List[str]] = None,
    not_in_title: Optional[List[str]] = None,
    negative_keywords: Optional[List[str]] = None,
    tool_config: Optional[List[Dict]] = None
) -> int:
    """
    Score how strongly the keywords match by counting how many of the top
    search results contain every desired keyword in their snippet (max 5).
    """
    logger.info("Entering get_signal_strength")

    # Normalize optional list arguments (avoids mutable default arguments).
    in_title = in_title or []
    not_in_title = not_in_title or []
    negative_keywords = negative_keywords or []

    if not keywords and not domain_to_search:
        logger.warning("No domain to search or keywords provided.")
        return 0

    query_parts = []
    if domain_to_search:
        query_parts.append(f"site:{domain_to_search}")
    for kw in keywords:
        query_parts.append(f"\"{kw}\"")
    for kw in in_title:
        query_parts.append(f'intitle:"{kw}"')
    for kw in not_in_title:
        query_parts.append(f'-intitle:"{kw}"')
    for kw in negative_keywords:
        query_parts.append(f'-"{kw}"')

    final_query = " ".join(query_parts).strip()
    if not final_query:
        logger.debug("Constructed query is empty, returning score=0.")
        return 0

    logger.debug(f"Performing get_signal_strength search with query: {final_query}")
    try:
        results = await search_google_with_tools(final_query, 5, tool_config=tool_config)
    except Exception:
        logger.exception("Exception occurred while searching for signal strength.")
        return 0

    if not isinstance(results, list) or len(results) == 0:
        logger.debug("No results found; returning 0.")
        return 0

    score = 0
    for result_item in results:
        try:
            result_json = json.loads(result_item)
            snippet_text = result_json.get('snippet', '').lower()
            if all(kw.lower() in snippet_text for kw in keywords):
                logger.debug(f"Found match in snippet: {snippet_text[:60]}...")
                score += 1
                if score == 5:
                    break
        except (json.JSONDecodeError, KeyError):
            logger.debug("Failed to decode or parse snippet from a result.")
            continue

    logger.info(f"Final signal strength score: {score}")
    return score
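
# --- Editor's illustrative sketch (not part of the original file) ----------
# The query built above for this call would look like:
#   site:acme.com "series b" intitle:"funding" -"rumor"
# and the score counts how many of the top-5 snippets contain every keyword.
# Domain and keywords are placeholders.
async def _example_signal_strength() -> None:
    score = await get_signal_strength(
        "acme.com",
        keywords=["series b"],
        in_title=["funding"],
        negative_keywords=["rumor"],
        tool_config=None,
    )
    print(f"signal strength: {score}/5")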
def extract_user_linkedin_page(url: str) -> str:
    """
    Extracts and returns the user page part of a LinkedIn URL.
    Ensures the domain is www.linkedin.com and removes any suffix path or query parameters.
    """
    logger.debug(f"Entering extract_user_linkedin_page with URL: {url}")
    if not url:
        return ""

    normalized_url = re.sub(r"^(https?://)?([\w\-]+\.)?linkedin\.com", "https://www.linkedin.com", url)
    match = re.match(r"https://www\.linkedin\.com/in/([^/?#]+)", normalized_url)
    if match:
        page = f"https://www.linkedin.com/in/{match.group(1)}"
        logger.debug(f"Extracted user LinkedIn page: {page}")
        return page

    logger.debug("No valid LinkedIn user page found.")
    return ""
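
# --- Editor's illustrative sketch (not part of the original file) ----------
# Normalization behavior of extract_user_linkedin_page, shown on made-up URLs:
# regional subdomains collapse to www, and trailing paths/queries are dropped.
def _example_extract_user_linkedin_page() -> None:
    assert extract_user_linkedin_page(
        "http://in.linkedin.com/in/jane-doe/details?x=1"
    ) == "https://www.linkedin.com/in/jane-doe"
    # Company pages are not user pages, so they normalize to "".
    assert extract_user_linkedin_page("https://www.linkedin.com/company/acme") == ""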
@assistant_tool
async def find_user_linkedin_url_google(
    user_name: str,
    user_title: str,
    user_location: str,
    user_company: str,
    user_company_domain: str = "",
    use_strict_check: bool = True,
    tool_config: Optional[List[Dict]] = None
) -> str:
    """
    Find the LinkedIn URL for a user based on their name, title, location, and company.
    """
    logger.info("Entering find_user_linkedin_url_google")

    if not user_name:
        logger.warning("No user_name provided.")
        return ""

    if use_strict_check:
        queries = [
            f'site:linkedin.com/in ("{user_name}") ({user_company} | {user_company_domain}) ( {user_title} | ) intitle:"{user_name}" -intitle:"profiles" '
        ]
    else:
        queries = [
            f'site:linkedin.com/in "{user_name}" "{user_location}" "{user_title}" "{user_company}" intitle:"{user_name}" -intitle:"profiles" ',
            f'site:linkedin.com/in "{user_name}" "{user_location}" "{user_company}" intitle:"{user_name}" -intitle:"profiles" ',
            f'site:linkedin.com/in "{user_name}", {user_location} intitle:"{user_name}" -intitle:"profiles" ',
            f'site:linkedin.com/in "{user_name}" intitle:"{user_name}"'
        ]

    # NOTE: the session is opened but never used directly; all searches go
    # through search_google_with_tools.
    async with aiohttp.ClientSession() as session:
        for query in queries:
            if not query.strip():
                continue
            logger.debug(f"Searching with query: {query}")
            try:
                results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
            except Exception:
                logger.exception("Error searching for LinkedIn user URL.")
                continue

            if not isinstance(results, list) or len(results) == 0:
                logger.debug("No results for this query, moving to next.")
                continue

            try:
                result_json = json.loads(results[0])
            except (json.JSONDecodeError, IndexError):
                logger.debug("Failed to parse JSON from the search result.")
                continue

            link = result_json.get('link', '')
            if not link:
                logger.debug("No link in first search result.")
                continue

            parsed_url = urlparse(link)
            if 'linkedin.com/in' in (parsed_url.netloc + parsed_url.path):
                link = extract_user_linkedin_page(link)
                logger.info(f"Found LinkedIn user page: {link}")
                return link

    logger.info("No matching LinkedIn user page found.")
    return ""
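
# --- Editor's illustrative sketch (not part of the original file) ----------
# Strict mode issues one tightly-scoped query; use_strict_check=False falls
# back through progressively looser queries. All values are placeholders.
async def _example_find_user_linkedin_url_google() -> None:
    url = await find_user_linkedin_url_google(
        user_name="Jane Doe",
        user_title="VP of Sales",
        user_location="Austin, TX",
        user_company="Acme Corp",
        user_company_domain="acme.com",
        use_strict_check=True,
        tool_config=None,
    )
    print(url or "not found")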
@assistant_tool
async def find_user_linkedin_url_by_email_google(
    email: str,
    user_name: str = "",
    user_title: str = "",
    user_location: str = "",
    user_company: str = "",
    tool_config: Optional[List[Dict]] = None,
) -> Optional[Dict[str, Any]]:
    """
    Find the LinkedIn URL for a user based primarily on their email address.

    Additional profile hints (name, title, location, company) improve query precision
    when supplied. Returns a dict with the best LinkedIn URL, LLM confidence score,
    and short reasoning when a match clears the confidence threshold; otherwise ``None``.
    """
    logger.info("Entering find_user_linkedin_url_by_email_google")

    if not email:
        logger.warning("No email provided.")
        return None

    normalized_email = email.strip().lower()
    email_local_part = normalized_email.split("@")[0] if "@" in normalized_email else normalized_email
    email_local_humanized = re.sub(r"[._-]+", " ", email_local_part).strip()

    queries: List[str] = []

    def add_query(query: str) -> None:
        query = query.strip()
        if query and query not in queries:
            queries.append(query)

    def add_query_parts(*parts: str) -> None:
        tokens = [part.strip() for part in parts if part and part.strip()]
        if not tokens:
            return
        add_query(" ".join(tokens))

    enriched_terms = []
    if user_name:
        enriched_terms.append(f'"{user_name}"')
    if user_company:
        enriched_terms.append(f'"{user_company}"')
    if user_title:
        enriched_terms.append(f'"{user_title}"')
    if user_location:
        enriched_terms.append(f'"{user_location}"')
    base_hint = " ".join(enriched_terms)

    # Prioritise the direct email search variants before broader fallbacks.
    add_query_parts(normalized_email, "linkedin.com/in", base_hint)
    add_query_parts(normalized_email, "linkedin.com", base_hint)
    add_query_parts(normalized_email, "linkedin", base_hint)
    add_query_parts(normalized_email, base_hint)
    add_query(f'"{normalized_email}" "linkedin.com/in" {base_hint}')
    add_query(f'"{normalized_email}" "linkedin.com" {base_hint}')
    add_query(f'"{normalized_email}" linkedin {base_hint}')

    if email_local_part and email_local_part != normalized_email:
        add_query_parts(email_local_part, "linkedin.com/in", base_hint)
        add_query_parts(email_local_part, "linkedin.com", base_hint)
        add_query_parts(email_local_part, "linkedin", base_hint)
        add_query(f'"{email_local_part}" "linkedin.com/in" {base_hint}')
        add_query(f'"{email_local_part}" "linkedin.com" {base_hint}')

    if email_local_humanized and email_local_humanized not in {email_local_part, normalized_email}:
        add_query_parts(email_local_humanized, "linkedin", base_hint)
        add_query(f'"{email_local_humanized}" linkedin {base_hint}')

    if normalized_email:
        add_query(f'site:linkedin.com/in "{normalized_email}" {base_hint}')

    if email_local_part:
        add_query(f'site:linkedin.com/in "{email_local_part}" {base_hint}')

    if email_local_humanized and email_local_humanized != email_local_part:
        add_query(f'site:linkedin.com/in "{email_local_humanized}" {base_hint}')

    if base_hint:
        lookup_hint = user_name or email_local_humanized or email_local_part or normalized_email
        add_query(
            f'site:linkedin.com/in "{normalized_email}" {base_hint} '
            f'intitle:"{lookup_hint}" -intitle:"profiles"'
        )
        if email_local_humanized:
            add_query(
                f'site:linkedin.com/in "{email_local_humanized}" {base_hint} '
                f'intitle:"{lookup_hint}" -intitle:"profiles"'
            )

    candidate_records: List[Dict[str, str]] = []
    seen_links: Set[str] = set()
    best_llm_choice: Optional[LinkedinCandidateChoice] = None
    best_llm_link: str = ""
    HIGH_CONFIDENCE_THRESHOLD = 0.8
    MIN_CONFIDENCE_THRESHOLD = 0.75

    async def evaluate_with_llm() -> Optional[LinkedinCandidateChoice]:
        nonlocal best_llm_choice, best_llm_link

        llm_choice = await pick_best_linkedin_candidate_with_llm(
            email=email,
            user_name=user_name,
            user_title=user_title,
            user_location=user_location,
            user_company=user_company,
            candidates=candidate_records,
            tool_config=tool_config,
        )

        if not llm_choice or not llm_choice.chosen_link:
            return None

        chosen_link = extract_user_linkedin_page(llm_choice.chosen_link)
        if not chosen_link:
            return None

        llm_choice.chosen_link = chosen_link

        if best_llm_choice is None or llm_choice.confidence > best_llm_choice.confidence:
            best_llm_choice = llm_choice
            best_llm_link = chosen_link
            logger.debug(
                "LLM updated best candidate: %s (confidence %.2f) reason: %s",
                chosen_link,
                llm_choice.confidence,
                llm_choice.reasoning,
            )

        if llm_choice.confidence >= HIGH_CONFIDENCE_THRESHOLD:
            logger.info(
                "Returning LinkedIn user page by email via LLM scoring: %s (confidence %.2f)",
                chosen_link,
                llm_choice.confidence,
            )
            return llm_choice

        return None

    async with aiohttp.ClientSession() as session:
        for query in queries:
            query = query.strip()
            if not query:
                continue
            logger.debug(f"Searching with query: {query}")

            try:
                results = await search_google_with_tools(query, 5, tool_config=tool_config)
            except Exception:
                logger.exception("Error searching for LinkedIn user URL by email.")
                continue

            if not isinstance(results, list) or len(results) == 0:
                logger.debug("No results for this query, moving to next.")
                continue

            for result_item in results:
                try:
                    result_json = json.loads(result_item)
                except (json.JSONDecodeError, IndexError):
                    logger.debug("Failed to parse JSON from the search result.")
                    continue

                link = result_json.get('link', '')
                if not link:
                    continue

                parsed_url = urlparse(link)
                if 'linkedin.com/in' in (parsed_url.netloc + parsed_url.path):
                    link = extract_user_linkedin_page(link)
                    if not link or link in seen_links:
                        continue

                    title = result_json.get('title', '')
                    snippet = result_json.get('snippet', '')
                    subtitle = result_json.get('subtitle', '')

                    candidate_records.append(
                        {
                            "link": link,
                            "title": title,
                            "snippet": snippet,
                            "subtitle": subtitle,
                            "query": query,
                        }
                    )
                    # Keep only the six most recent candidates.
                    if len(candidate_records) > 6:
                        candidate_records.pop(0)
                    seen_links.add(link)

            high_conf_choice = await evaluate_with_llm()
            if high_conf_choice:
                return {
                    "linkedin_url": high_conf_choice.chosen_link,
                    "confidence": high_conf_choice.confidence,
                    "reasoning": high_conf_choice.reasoning,
                }

    if best_llm_choice and best_llm_link and best_llm_choice.confidence >= MIN_CONFIDENCE_THRESHOLD:
        logger.info(
            "Returning LinkedIn user page by email via LLM scoring (best overall): %s (confidence %.2f)",
            best_llm_link,
            best_llm_choice.confidence,
        )
        return {
            "linkedin_url": best_llm_link,
            "confidence": best_llm_choice.confidence,
            "reasoning": best_llm_choice.reasoning,
        }

    logger.info("No matching LinkedIn user page found using email queries.")
    return None
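
# --- Editor's illustrative sketch (not part of the original file) ----------
# The email-based lookup returns a dict (linkedin_url, confidence, reasoning)
# only when the LLM score clears the 0.8 per-query or 0.75 best-overall
# thresholds defined above. Values here are placeholders.
async def _example_find_by_email() -> None:
    match = await find_user_linkedin_url_by_email_google(
        email="jane.doe@acme.com",
        user_name="Jane Doe",
        user_company="Acme Corp",
        tool_config=None,
    )
    if match:
        print(match["linkedin_url"], match["confidence"], match["reasoning"])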
@assistant_tool
async def find_user_linkedin_url_by_job_title_google(
    user_title: str,
    user_location: str,
    user_company: str,
    tool_config: Optional[List[Dict]] = None
) -> str:
    """
    Find the LinkedIn URL for a user based on their job_title, location, and company.
    """
    logger.info("Entering find_user_linkedin_url_by_job_title_google")

    queries = [
        f'site:linkedin.com/in "{user_company}" AND "{user_title}" -intitle:"profiles" ',
    ]

    async with aiohttp.ClientSession() as session:
        for query in queries:
            if not query.strip():
                continue
            logger.debug(f"Searching with query: {query}")

            try:
                results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
            except Exception:
                logger.exception("Error searching for LinkedIn URL by job title.")
                continue

            if not isinstance(results, list) or len(results) == 0:
                logger.debug("No results for this query, moving to next.")
                continue

            try:
                result_json = json.loads(results[0])
            except (json.JSONDecodeError, IndexError):
                logger.debug("Failed to parse JSON from the search result.")
                continue

            link = result_json.get('link', '')
            if not link:
                logger.debug("No link in the first search result.")
                continue

            parsed_url = urlparse(link)
            if 'linkedin.com/in' in (parsed_url.netloc + parsed_url.path):
                link = extract_user_linkedin_page(link)
                logger.info(f"Found LinkedIn user page by job title: {link}")
                return link

    logger.info("No matching LinkedIn user page found by job title.")
    return ""
@assistant_tool
async def find_user_linkedin_url_by_google_search(
    queries: List[str],
    number_of_results: int = 5,
    tool_config: Optional[List[Dict]] = None
) -> List[str]:
    """
    Find LinkedIn user URLs based on provided Google search queries.
    """
    logger.info("Entering find_user_linkedin_url_by_google_search")
    found_urls = []

    for query in queries:
        if not query.strip():
            continue
        logger.debug(f"Searching with query: {query}")

        try:
            results = await search_google_with_tools(query.strip(), number_of_results, tool_config=tool_config)
        except Exception:
            logger.exception("Error searching for LinkedIn URL using Google search.")
            continue

        if not isinstance(results, list) or len(results) == 0:
            logger.debug("No results for this query, moving to next.")
            continue

        # Only the first result of each query is inspected.
        try:
            result_json = json.loads(results[0])
        except (json.JSONDecodeError, IndexError):
            logger.debug("Failed to parse JSON from the search result.")
            continue

        link = result_json.get('link', '')
        if not link:
            logger.debug("No link in the first search result.")
            continue

        parsed_url = urlparse(link)
        if 'linkedin.com/in' in (parsed_url.netloc + parsed_url.path):
            link = extract_user_linkedin_page(link)
            logger.info(f"Found LinkedIn user page: {link}")
            found_urls.append(link)

    if not found_urls:
        logger.info("No matching LinkedIn user page found based on provided queries.")
    return found_urls
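
# --- Editor's illustrative sketch (not part of the original file) ----------
# Caller-supplied queries give full control over the search operators; the
# queries below are placeholders following the patterns used in this module.
async def _example_custom_queries() -> None:
    urls = await find_user_linkedin_url_by_google_search(
        [
            'site:linkedin.com/in "Jane Doe" "Acme Corp" -intitle:"profiles"',
            'site:linkedin.com/in "Jane Doe" "Austin" -intitle:"profiles"',
        ],
        number_of_results=5,
        tool_config=None,
    )
    print(urls)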
def extract_company_page(url: str) -> str:
    """
    Extracts and returns the company page part of a LinkedIn URL.
    Ensures the domain is www.linkedin.com and removes any suffix path or query parameters.
    """
    logger.debug(f"Entering extract_company_page with URL: {url}")
    if not url:
        return ""

    normalized_url = re.sub(r"^(https?://)?([\w\-]+\.)?linkedin\.com", "https://www.linkedin.com", url)
    match = re.match(r"https://www\.linkedin\.com/company/([\w\-]+)", normalized_url)
    if match:
        company_page = f"https://www.linkedin.com/company/{match.group(1)}"
        logger.debug(f"Extracted LinkedIn company page: {company_page}")
        return company_page

    logger.debug("No valid LinkedIn company page found.")
    return ""
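
# --- Editor's illustrative sketch (not part of the original file) ----------
# Mirrors the user-page normalization for company URLs, shown on a made-up URL:
def _example_extract_company_page() -> None:
    assert extract_company_page(
        "https://uk.linkedin.com/company/acme/about/"
    ) == "https://www.linkedin.com/company/acme"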
@assistant_tool
async def find_organization_linkedin_url_with_google_search(
    company_name: str,
    company_location: Optional[str] = None,
    company_domain: Optional[str] = None,
    use_strict_check: bool = True,
    tool_config: Optional[List[Dict]] = None,
) -> str:
    """
    Find the LinkedIn URL for a company based on its name and optional location using Google search.
    """
    logger.info("Entering find_organization_linkedin_url_with_google_search")

    if not company_name:
        logger.warning("No company_name provided.")
        return ""

    if use_strict_check:
        # Guard against a None domain leaking the literal string "None" into the query.
        queries = [f'site:linkedin.com/company "{company_name}" {company_domain or ""} ']
    else:
        if company_location:
            queries = [
                f'site:linkedin.com/company "{company_name}" {company_location} -intitle:"jobs" ',
                f'site:linkedin.com/company "{company_name}" -intitle:"jobs" ',
                f'site:linkedin.com/company {company_name} {company_location} -intitle:"jobs" ',
            ]
        else:
            queries = [
                f'site:linkedin.com/company "{company_name}" -intitle:"jobs" ',
                f'site:linkedin.com/company {company_name} -intitle:"jobs" '
            ]

    async with aiohttp.ClientSession() as session:
        for query in queries:
            if not query.strip():
                continue

            logger.debug(f"Searching with query: {query}")
            try:
                results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
            except Exception:
                logger.exception("Error searching for organization LinkedIn URL.")
                continue

            if not isinstance(results, list) or len(results) == 0:
                logger.debug("No results for this query, moving to next.")
                continue

            try:
                result_json = json.loads(results[0])
            except (json.JSONDecodeError, IndexError):
                logger.debug("Failed to parse JSON from the search result.")
                continue

            link = result_json.get('link', '')
            if not link:
                logger.debug("No link found in the first result.")
                continue

            parsed_url = urlparse(link)
            if 'linkedin.com/company' in (parsed_url.netloc + parsed_url.path):
                link = extract_company_page(link)
                logger.info(f"Found LinkedIn company page: {link}")
                return link

    logger.info("No matching LinkedIn company page found.")
    return ""
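
# --- Editor's illustrative sketch (not part of the original file) ----------
# Strict mode pins the company search to a known domain; values are placeholders.
async def _example_find_org_linkedin() -> None:
    url = await find_organization_linkedin_url_with_google_search(
        "Acme Corp", company_domain="acme.com", tool_config=None
    )
    print(url or "not found")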
async def get_external_links(url: str) -> List[str]:
    """
    Fetch external links from a given URL by parsing its HTML content.
    """
    logger.debug(f"Entering get_external_links for URL: {url}")
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

    try:
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url, allow_redirects=True) as response:
                logger.debug(f"Received status for external links: {response.status}")
                if response.status == 200:
                    content = await response.text()
                    soup = BeautifulSoup(content, "html.parser")
                    external_links = []
                    for link in soup.find_all('a', href=True):
                        href = link['href']
                        if href.startswith('http') and not href.startswith(url):
                            external_links.append(href)
                    logger.debug(f"Found {len(external_links)} external links.")
                    return external_links
                else:
                    logger.warning(f"Non-200 status ({response.status}) while fetching external links.")
                    return []
    except Exception:
        logger.exception("Exception occurred while fetching external links.")
        return []
async def get_resolved_linkedin_links(url: str) -> List[str]:
    """
    Fetch HTML content from a URL and return any linkedin.com/company links found.
    """
    logger.debug(f"Entering get_resolved_linkedin_links for URL: {url}")
    try:
        content = await fetch_html_content(url)
    except Exception:
        logger.exception("Exception occurred while fetching HTML content.")
        return []

    linkedin_links = re.findall(r'https://www\.linkedin\.com/company/[^\s]+', content)
    unique_links = list(set(linkedin_links))
    logger.debug(f"Found {len(unique_links)} LinkedIn links.")
    return unique_links
@assistant_tool
async def get_company_website_from_linkedin_url(linkedin_url: str) -> str:
    """
    Attempt to extract a company's website from its LinkedIn URL by
    scanning external links that contain "trk=about_website".
    """
    logger.info("Entering get_company_website_from_linkedin_url")

    if not linkedin_url:
        logger.debug("Empty LinkedIn URL provided, returning empty string.")
        return ""

    try:
        links = await get_external_links(linkedin_url)
    except Exception:
        logger.exception("Exception occurred while getting external links for LinkedIn URL.")
        return ""

    for link in links:
        if 'trk=about_website' in link:
            parsed_link = urllib.parse.urlparse(link)
            query_params = urllib.parse.parse_qs(parsed_link.query)
            if 'url' in query_params:
                encoded_url = query_params['url'][0]
                company_website = urllib.parse.unquote(encoded_url)
                logger.info(f"Extracted company website: {company_website}")
                return company_website
    logger.debug("No company website link found with 'trk=about_website'.")
    return ""
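
# --- Editor's illustrative sketch (not part of the original file) ----------
# An end-to-end flow chaining the helpers above: resolve the company page,
# then pull the website from its "trk=about_website" redirect link. The
# company name is a placeholder and a real tool_config is expected.
import asyncio

async def _example_company_page_to_website() -> None:
    page = await find_organization_linkedin_url_with_google_search(
        "Acme Corp", tool_config=None
    )
    if page:
        website = await get_company_website_from_linkedin_url(page)
        print(website or "no website link found")

if __name__ == "__main__":
    asyncio.run(_example_company_page_to_website())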