dhisana 0.0.1.dev85__py3-none-any.whl → 0.0.1.dev236__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dhisana/schemas/common.py +33 -0
- dhisana/schemas/sales.py +224 -23
- dhisana/utils/add_mapping.py +72 -63
- dhisana/utils/apollo_tools.py +739 -109
- dhisana/utils/built_with_api_tools.py +4 -2
- dhisana/utils/cache_output_tools.py +23 -23
- dhisana/utils/check_email_validity_tools.py +456 -458
- dhisana/utils/check_for_intent_signal.py +1 -2
- dhisana/utils/check_linkedin_url_validity.py +34 -8
- dhisana/utils/clay_tools.py +3 -2
- dhisana/utils/clean_properties.py +3 -1
- dhisana/utils/compose_salesnav_query.py +0 -1
- dhisana/utils/compose_search_query.py +7 -3
- dhisana/utils/composite_tools.py +0 -1
- dhisana/utils/dataframe_tools.py +2 -2
- dhisana/utils/email_body_utils.py +72 -0
- dhisana/utils/email_provider.py +375 -0
- dhisana/utils/enrich_lead_information.py +585 -85
- dhisana/utils/fetch_openai_config.py +129 -0
- dhisana/utils/field_validators.py +1 -1
- dhisana/utils/g2_tools.py +0 -1
- dhisana/utils/generate_content.py +0 -1
- dhisana/utils/generate_email.py +69 -16
- dhisana/utils/generate_email_response.py +298 -41
- dhisana/utils/generate_flow.py +0 -1
- dhisana/utils/generate_linkedin_connect_message.py +19 -6
- dhisana/utils/generate_linkedin_response_message.py +156 -65
- dhisana/utils/generate_structured_output_internal.py +351 -131
- dhisana/utils/google_custom_search.py +150 -44
- dhisana/utils/google_oauth_tools.py +721 -0
- dhisana/utils/google_workspace_tools.py +391 -25
- dhisana/utils/hubspot_clearbit.py +3 -1
- dhisana/utils/hubspot_crm_tools.py +771 -167
- dhisana/utils/instantly_tools.py +3 -1
- dhisana/utils/lusha_tools.py +10 -7
- dhisana/utils/mailgun_tools.py +150 -0
- dhisana/utils/microsoft365_tools.py +447 -0
- dhisana/utils/openai_assistant_and_file_utils.py +121 -177
- dhisana/utils/openai_helpers.py +19 -16
- dhisana/utils/parse_linkedin_messages_txt.py +2 -3
- dhisana/utils/profile.py +37 -0
- dhisana/utils/proxy_curl_tools.py +507 -206
- dhisana/utils/proxycurl_search_leads.py +426 -0
- dhisana/utils/research_lead.py +121 -68
- dhisana/utils/sales_navigator_crawler.py +1 -6
- dhisana/utils/salesforce_crm_tools.py +323 -50
- dhisana/utils/search_router.py +131 -0
- dhisana/utils/search_router_jobs.py +51 -0
- dhisana/utils/sendgrid_tools.py +126 -91
- dhisana/utils/serarch_router_local_business.py +75 -0
- dhisana/utils/serpapi_additional_tools.py +290 -0
- dhisana/utils/serpapi_google_jobs.py +117 -0
- dhisana/utils/serpapi_google_search.py +188 -0
- dhisana/utils/serpapi_local_business_search.py +129 -0
- dhisana/utils/serpapi_search_tools.py +363 -432
- dhisana/utils/serperdev_google_jobs.py +125 -0
- dhisana/utils/serperdev_local_business.py +154 -0
- dhisana/utils/serperdev_search.py +233 -0
- dhisana/utils/smtp_email_tools.py +576 -0
- dhisana/utils/test_connect.py +1765 -92
- dhisana/utils/trasform_json.py +95 -16
- dhisana/utils/web_download_parse_tools.py +0 -1
- dhisana/utils/zoominfo_tools.py +2 -3
- dhisana/workflow/test.py +1 -1
- {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/METADATA +5 -2
- dhisana-0.0.1.dev236.dist-info/RECORD +100 -0
- {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/WHEEL +1 -1
- dhisana-0.0.1.dev85.dist-info/RECORD +0 -81
- {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/entry_points.txt +0 -0
- {dhisana-0.0.1.dev85.dist-info → dhisana-0.0.1.dev236.dist-info}/top_level.txt +0 -0
|
@@ -8,61 +8,105 @@ import re
|
|
|
8
8
|
import logging
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urlparse

import mdformat
from pydantic import BaseModel, Field

from dhisana.utils.apollo_tools import enrich_user_info_with_apollo
from dhisana.utils.assistant_tool_tag import assistant_tool
from dhisana.utils.check_email_validity_tools import process_email_properties
from dhisana.utils.company_utils import normalize_company_name
from dhisana.utils.domain_parser import get_domain_from_website, is_excluded_domain
from dhisana.utils.field_validators import (
    normalize_linkedin_url,
    normalize_linkedin_company_url,
    normalize_salesnav_url,
    normalize_linkedin_company_salesnav_url,
    validate_and_clean_email,
    validation_organization_domain,
    validate_website_url,
)
from dhisana.utils.generate_structured_output_internal import get_structured_output_internal
from dhisana.utils.proxy_curl_tools import (
    enrich_job_info_from_proxycurl,
    enrich_organization_info_from_proxycurl,
    enrich_user_info_with_proxy_curl,
)
from dhisana.utils.research_lead import research_company_with_full_info_ai, research_lead_with_full_info_ai
from dhisana.utils.serpapi_search_tools import (
    find_organization_linkedin_url_with_google_search,
    find_user_linkedin_url_by_email_google,
    find_user_linkedin_url_google,
    find_user_linkedin_url_with_serper,
    get_company_website_from_linkedin_url,
)
|
44
|
+
logging.basicConfig(level=logging.INFO)
|
|
45
|
+
logger = logging.getLogger(__name__)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ----------------------------------------------------------------------
|
|
49
|
+
# Allowed Enrichment Tools
|
|
50
|
+
# ----------------------------------------------------------------------
|
|
38
51
|
ALLOWED_ENRICHMENT_TOOLS = ["proxycurl", "apollo", "zoominfo"]
|
|
39
52
|
|
|
40
|
-
# A map from tool name to the corresponding function that will enrich user info.
|
|
41
53
|
USER_LOOKUP_TOOL_NAME_TO_FUNCTION_MAP = {
|
|
42
54
|
"apollo": enrich_user_info_with_apollo,
|
|
43
55
|
"proxycurl": enrich_user_info_with_proxy_curl,
|
|
44
56
|
}
|
|
45
57
|
|
|
46
|
-
import logging
|
|
47
|
-
logging.basicConfig(level=logging.INFO)
|
|
48
|
-
logger = logging.getLogger(__name__)
|
|
49
58
|
|
|
59
|
+
# ----------------------------------------------------------------------
|
|
60
|
+
# BasicLeadInformation model
|
|
61
|
+
# ----------------------------------------------------------------------
|
|
62
|
+
class BasicLeadInformation(BaseModel):
    """Structured lead profile used as the LLM response schema.

    Passed to ``get_structured_output_internal`` (see
    ``get_clean_lead_info_with_llm``) so the model returns a normalized
    lead record; every field is required by the schema.
    """
    full_name: str = Field(..., description="Full name of the lead")
    first_name: str = Field(..., description="First name of the lead")
    last_name: str = Field(..., description="Last name of the lead")
    email: str = Field(..., description="Email address of the lead")
    primary_domain_of_organization: str = Field(..., description="Primary domain of the organization")
    job_title: str = Field(..., description="Job Title of the lead")
    phone: str = Field(..., description="Phone number of the lead")
    headline: str = Field(..., description="Headline of the lead")
    lead_location: str = Field(..., description="Location of the lead")
    organization_name: str = Field(..., description="Current Company where lead works")
    common_connections: int = Field(..., description="Number of common connections with the lead. Default 0")
    followers_count: int = Field(..., description="Number of followers of the lead. Default 0")
    tenure_in_current_role: str = Field(..., description="Tenure in the current role")
    tenure_in_current_company: str = Field(..., description="Tenure in the current company")
    connection_degree: str = Field(..., description="Degree of connection with the lead (1st, 2nd, 3rd)")
    is_premium_account: bool = Field(..., description="Is the lead a premium account. Default is false.")
    country_code: str = Field(..., description="Alpha-2 ISO3166 country code eg. US")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# ----------------------------------------------------------------------
|
|
83
|
+
# Helper: chunkify
|
|
84
|
+
# ----------------------------------------------------------------------
|
|
85
|
+
def chunkify(items: List[Any], chunk_size: int) -> Iterator[List[Any]]:
    """Lazily split *items* into consecutive chunks of at most *chunk_size*.

    The final chunk may be shorter than *chunk_size*. Yields nothing for an
    empty list.

    :param items: The list to split.
    :param chunk_size: Maximum size of each chunk; must be a positive integer.
    :raises ValueError: If *chunk_size* is not positive (the original code
        raised from ``range`` for 0 and silently yielded nothing for
        negatives).
    :return: An iterator of list chunks (this is a generator function; the
        previous ``List[List[Any]]`` annotation was inaccurate).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    for start in range(0, len(items), chunk_size):
        yield items[start : start + chunk_size]
|
|
50
91
|
|
|
92
|
+
|
|
93
|
+
# ----------------------------------------------------------------------
|
|
94
|
+
# Function: cleanup_user_name
|
|
95
|
+
# ----------------------------------------------------------------------
|
|
51
96
|
def cleanup_user_name(cloned_properties: dict) -> dict:
|
|
52
97
|
"""
|
|
53
98
|
Cleans up user name fields: 'full_name', 'first_name', 'last_name'.
|
|
54
99
|
Returns the updated dictionary. If values are invalid or placeholders, sets them to ''.
|
|
55
100
|
"""
|
|
56
|
-
|
|
57
101
|
if not isinstance(cloned_properties, dict):
|
|
58
102
|
return {}
|
|
59
103
|
|
|
60
|
-
def normalize(name) -> str:
|
|
104
|
+
def normalize(name: str) -> str:
|
|
61
105
|
if not name or not isinstance(name, str):
|
|
62
106
|
return ""
|
|
63
107
|
# Common placeholders or invalid tokens
|
|
64
108
|
invalid_tokens = [
|
|
65
|
-
"null", "none", "na", "n.a", "notfound", "error",
|
|
109
|
+
"null", "none", "na", "n.a", "notfound", "error",
|
|
66
110
|
"na.", "na,", "notavilable", "notavailable", ""
|
|
67
111
|
]
|
|
68
112
|
stripped = name.strip().lower()
|
|
@@ -75,33 +119,99 @@ def cleanup_user_name(cloned_properties: dict) -> dict:
|
|
|
75
119
|
stripped = stripped.split("|", 1)[0]
|
|
76
120
|
# Remove extra non-alphanumeric characters (but allow whitespace)
|
|
77
121
|
stripped = re.sub(r"[^a-zA-Z0-9\s]", "", stripped)
|
|
78
|
-
|
|
79
|
-
|
|
122
|
+
|
|
123
|
+
# Capitalize the first letter of each word, and lowercase the rest
|
|
124
|
+
return " ".join(word.capitalize() for word in stripped.strip().split())
|
|
80
125
|
|
|
81
126
|
full_name = normalize(cloned_properties.get("full_name"))
|
|
82
127
|
first_name = normalize(cloned_properties.get("first_name"))
|
|
83
128
|
last_name = normalize(cloned_properties.get("last_name"))
|
|
84
129
|
|
|
85
130
|
# If full_name is empty, build from first_name + last_name
|
|
86
|
-
if first_name and last_name:
|
|
131
|
+
if first_name and last_name and not full_name:
|
|
87
132
|
full_name = (first_name + " " + last_name).strip()
|
|
88
133
|
|
|
89
134
|
cloned_properties["full_name"] = full_name
|
|
90
135
|
cloned_properties["first_name"] = first_name
|
|
91
136
|
cloned_properties["last_name"] = last_name
|
|
137
|
+
|
|
92
138
|
return cloned_properties
|
|
93
139
|
|
|
94
140
|
|
|
95
|
-
|
|
141
|
+
# ----------------------------------------------------------------------
|
|
142
|
+
# LLM-based cleanup for single lead
|
|
143
|
+
# ----------------------------------------------------------------------
|
|
144
|
+
async def get_clean_lead_info_with_llm(lead_info_str: str, tool_config: Optional[dict]) -> Dict[str, Any]:
    """
    Clean up partial lead data via a structured LLM call.

    :param lead_info_str: String rendering (e.g. ``str(dict)``) of the raw
        lead fields to be cleaned.
    :param tool_config: Optional tool configuration forwarded to
        ``get_structured_output_internal``.
    :return: Dict matching ``BasicLeadInformation`` fields, or ``{}`` when
        the structured call reports an error.
    """
    prompt = f"""
    Given the following data about a lead and the organization they work for,
    extract and clean up the lead information.
    - Format 'full_name' properly.
    - Format 'first_name' and 'last_name' so they're capitalized properly if available.
    - Make sure 'organization_name' is properly capitalized if provided.
    - Do not invent data that isn't provided.

    Data:
    {lead_info_str}

    The output format is in JSON. The expected fields match BasicLeadInformation.
    """
    # BasicLeadInformation is the response schema for the structured call.
    lead_info, status = await get_structured_output_internal(
        prompt,
        BasicLeadInformation,
        model="gpt-5.1-chat",
        tool_config=tool_config
    )
    if status == "ERROR":
        return {}
    # NOTE(review): assumes a pydantic model instance is returned whenever
    # status != "ERROR" — confirm against get_structured_output_internal.
    return lead_info.model_dump()
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
# ----------------------------------------------------------------------
|
|
174
|
+
# Helper: is_personal_email_domain
|
|
175
|
+
# ----------------------------------------------------------------------
|
|
176
|
+
def is_personal_email_domain(domain: str) -> bool:
    """Report whether *domain* looks like a free/personal email provider.

    The check is deliberately simple: a fixed set of well-known free
    providers plus any ``.edu`` domain. Comparison is case-insensitive and
    ignores surrounding whitespace. For higher accuracy the set could be
    expanded or replaced with a third-party lookup.
    """
    normalized = domain.strip().lower()
    free_providers = {
        "gmail.com", "yahoo.com", "hotmail.com", "outlook.com",
        "protonmail.com", "icloud.com", "aol.com", "mail.com",
        "pm.me", "yandex.com", "gmx.com",
    }
    if normalized.endswith(".edu"):
        return True
    return normalized in free_providers
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# ----------------------------------------------------------------------
|
|
192
|
+
# Main validation & cleanup function
|
|
193
|
+
# ----------------------------------------------------------------------
|
|
194
|
+
async def validate_and_cleanup(
|
|
195
|
+
cloned_properties: dict,
|
|
196
|
+
tool_config: Optional[dict] = None,
|
|
197
|
+
use_strict_check: bool = False
|
|
198
|
+
) -> dict:
|
|
96
199
|
"""
|
|
97
200
|
Wrapper to validate & normalize various properties in a dictionary.
|
|
98
|
-
|
|
201
|
+
|
|
202
|
+
1) Clean up/validate typical fields.
|
|
203
|
+
2) If name fields appear invalid, fallback to LLM-based name inference.
|
|
204
|
+
3) If 'primary_domain_of_organization' AND 'organization_website' are both empty,
|
|
205
|
+
but there's a valid corporate email, use that as the domain.
|
|
206
|
+
4) (Optional) Enrich the organization info from the name if needed.
|
|
99
207
|
"""
|
|
100
208
|
|
|
101
209
|
if not isinstance(cloned_properties, dict):
|
|
102
210
|
return {}
|
|
103
211
|
|
|
104
|
-
#
|
|
212
|
+
# ------------------------------------------------------------------
|
|
213
|
+
# Step 1: Normalize typical fields
|
|
214
|
+
# ------------------------------------------------------------------
|
|
105
215
|
cloned_properties["user_linkedin_url"] = normalize_linkedin_url(
|
|
106
216
|
cloned_properties.get("user_linkedin_url")
|
|
107
217
|
)
|
|
@@ -127,75 +237,310 @@ def validate_and_cleanup(cloned_properties: dict) -> dict:
|
|
|
127
237
|
cloned_properties.get("organization_name")
|
|
128
238
|
)
|
|
129
239
|
|
|
130
|
-
#
|
|
131
|
-
|
|
132
|
-
|
|
240
|
+
# ------------------------------------------------------------------
|
|
241
|
+
# Step 2: Basic name-check. If invalid => LLM fallback.
|
|
242
|
+
# ------------------------------------------------------------------
|
|
243
|
+
def has_special_characters(val: str) -> bool:
|
|
244
|
+
return bool(re.search(r"[^a-zA-Z0-9\s]", val))
|
|
245
|
+
|
|
246
|
+
def is_invalid_name(val: str) -> bool:
|
|
247
|
+
return (len(val.strip()) < 3) or has_special_characters(val)
|
|
248
|
+
|
|
249
|
+
full_name = cloned_properties.get("full_name", "")
|
|
250
|
+
first_name = cloned_properties.get("first_name", "")
|
|
251
|
+
last_name = cloned_properties.get("last_name", "")
|
|
252
|
+
if (not full_name or full_name.startswith("None")):
|
|
253
|
+
full_name = ""
|
|
254
|
+
if (not first_name or first_name.startswith("None")):
|
|
255
|
+
first_name = ""
|
|
256
|
+
if (not last_name or last_name.startswith("None")):
|
|
257
|
+
last_name = ""
|
|
258
|
+
|
|
259
|
+
if (
|
|
260
|
+
is_invalid_name(full_name)
|
|
261
|
+
or is_invalid_name(first_name)
|
|
262
|
+
or is_invalid_name(last_name)
|
|
263
|
+
):
|
|
264
|
+
# Check if we have a valid LinkedIn URL - if so, skip LLM as ProxyCurl will fill the data
|
|
265
|
+
user_linkedin_url = cloned_properties.get("user_linkedin_url", "").strip()
|
|
266
|
+
if not user_linkedin_url:
|
|
267
|
+
lead_info_str = str(cloned_properties)
|
|
268
|
+
logger.info(
|
|
269
|
+
"Detected invalid name fields. Using LLM to infer/correct name fields."
|
|
270
|
+
)
|
|
271
|
+
# Attempt LLM-based cleanup
|
|
272
|
+
new_lead_info = await get_clean_lead_info_with_llm(lead_info_str, tool_config=tool_config)
|
|
273
|
+
if new_lead_info:
|
|
274
|
+
cloned_properties["full_name"] = new_lead_info.get("full_name", "")
|
|
275
|
+
cloned_properties["first_name"] = new_lead_info.get("first_name", "")
|
|
276
|
+
cloned_properties["last_name"] = new_lead_info.get("last_name", "")
|
|
277
|
+
else:
|
|
278
|
+
logger.info("Valid LinkedIn URL found. Skipping LLM cleanup as ProxyCurl will enrich the data.")
|
|
279
|
+
else:
|
|
280
|
+
# Use the cheaper logic
|
|
281
|
+
cloned_properties = cleanup_user_name(cloned_properties)
|
|
282
|
+
|
|
283
|
+
# ------------------------------------------------------------------
|
|
284
|
+
# Step 3: If domain & website are empty but there's a corporate email
|
|
285
|
+
# ------------------------------------------------------------------
|
|
286
|
+
# - If email is present, check if domain is personal or corporate
|
|
287
|
+
# - If corporate, set primary_domain_of_organization from email domain
|
|
288
|
+
# ------------------------------------------------------------------
|
|
289
|
+
domain_empty = not cloned_properties.get("primary_domain_of_organization")
|
|
290
|
+
website_empty = not cloned_properties.get("organization_website")
|
|
291
|
+
email = cloned_properties.get("email", "")
|
|
292
|
+
|
|
293
|
+
if domain_empty and website_empty and email:
|
|
294
|
+
# parse domain from email
|
|
295
|
+
extracted_domain = email.split("@")[-1].strip().lower()
|
|
296
|
+
if extracted_domain and (not is_personal_email_domain(extracted_domain)):
|
|
297
|
+
# This is a "corporate" email domain, so use it
|
|
298
|
+
cloned_properties["primary_domain_of_organization"] = extracted_domain
|
|
299
|
+
cloned_properties["organization_website"] = f"https://www.{extracted_domain}"
|
|
300
|
+
logger.info("Set primary_domain_of_organization from corporate email domain.")
|
|
301
|
+
|
|
302
|
+
if domain_empty and not website_empty:
|
|
303
|
+
from urllib.parse import urlparse
|
|
304
|
+
parsed_website = urlparse(cloned_properties["organization_website"])
|
|
305
|
+
possible_domain = parsed_website.netloc.replace("www.", "")
|
|
306
|
+
if possible_domain:
|
|
307
|
+
cloned_properties["primary_domain_of_organization"] = possible_domain
|
|
308
|
+
logger.info("Set primary_domain_of_organization from organization_website domain.")
|
|
133
309
|
return cloned_properties
|
|
134
310
|
|
|
135
|
-
|
|
136
311
|
@assistant_tool
async def enrich_lead_information(
    user_properties: Dict[str, Any],
    use_strict_check: bool = True,
    get_valid_email: bool = True,
    company_research_instructions: str = "",
    lead_research_instructions: str = "",
    enrich_company_information: bool = True,
    enrich_lead_information: bool = True,
    tool_config: Optional[List[Dict[str, Any]]] = None,
) -> Dict[str, Any]:
    """
    Enrich lead information including company details and LinkedIn URL.

    Steps performed:
    1) Validate/normalize the incoming properties (``validate_and_cleanup``).
    2) Find/fix the user's LinkedIn URL (``enrich_user_info``); under strict
       checking, bail out early when neither a LinkedIn URL nor an email
       could be established.
    3) Enrich organization info from the name, then provider data
       (``enrich_with_provider``), then organization info again.
    4) Optionally resolve a valid email (``process_email_properties``).
    5) Backfill missing follower count / first+last name via Serper.
    6) Optionally run lead and company research and store the merged,
       mdformat-normalized markdown under ``research_summary``.

    :param user_properties: Dictionary of user/lead details to enrich
        (not mutated; a shallow copy is returned).
    :param use_strict_check: Whether to use strict matching in search steps.
    :param get_valid_email: Whether to run email resolution/validation.
    :param company_research_instructions: Extra instructions for company research.
    :param lead_research_instructions: Extra instructions for lead research.
    :param enrich_company_information: Whether to run company research.
    :param enrich_lead_information: Whether to run lead research.
        NOTE(review): this flag shadows the function's own name; renaming
        would break keyword callers, so it is kept as-is.
    :param tool_config: Optional list of tool configuration dicts
        (e.g. ``[{"name": "apollo"}, ...]``).
    :return: Enriched copy of ``user_properties``.
    """
    logger.debug("Starting enrich_lead_information with user_properties: %s", user_properties)
    cloned_properties = dict(user_properties)

    cloned_properties = await validate_and_cleanup(cloned_properties, tool_config=tool_config, use_strict_check=use_strict_check)

    cloned_properties = await enrich_user_info(
        input_properties=cloned_properties,
        use_strict_check=use_strict_check,
        tool_config=tool_config,
    )
    # Strict mode: without a LinkedIn URL or email there is nothing reliable
    # to enrich against, so return what we have.
    if use_strict_check and not cloned_properties.get("user_linkedin_url") and not cloned_properties.get("email"):
        return cloned_properties

    await enrich_organization_info_from_name(
        row=cloned_properties,
        use_strict_check=use_strict_check,
        tool_config=tool_config,
    )

    cloned_properties = await enrich_with_provider(cloned_properties, tool_config)

    # Run organization enrichment a second time: provider data may have
    # filled in fields (domain/website) the first pass lacked.
    await enrich_organization_info_from_name(
        row=cloned_properties,
        use_strict_check=use_strict_check,
        tool_config=tool_config,
    )

    if get_valid_email:
        await process_email_properties(cloned_properties, tool_config)

    # ------------------------------------------------------------------
    # Supplement missing follower count or name information using Serper
    # ------------------------------------------------------------------
    # NOTE(review): assumes "user_linkedin_url" is always a string here
    # (".strip()" would raise on None) — confirm upstream normalization.
    linkedin_url = cloned_properties.get("user_linkedin_url", "").strip()
    follower_count = cloned_properties.get("linkedin_follower_count")
    first_name = cloned_properties.get("first_name")
    if (
        linkedin_url
        and (follower_count is None or (isinstance(follower_count, str) and not follower_count.strip()) or not first_name)
    ):
        serper_result = await find_user_linkedin_url_with_serper(
            linkedin_url, tool_config=tool_config
        )
        if serper_result:
            if follower_count is None or (
                isinstance(follower_count, str) and not follower_count.strip()
            ):
                cloned_properties["linkedin_follower_count"] = serper_result.get(
                    "linkedin_follower_count", 0
                )
            if not first_name:
                cloned_properties["first_name"] = serper_result.get("first_name", "")
                cloned_properties["last_name"] = serper_result.get("last_name", "")

    # Re-validate after all enrichment passes so newly filled fields are
    # normalized the same way as the originals.
    cloned_properties = await validate_and_cleanup(
        cloned_properties, tool_config=tool_config, use_strict_check=use_strict_check
    )

    research_summary = cloned_properties.get("research_summary", "")

    if enrich_lead_information:
        summary = await research_lead_with_full_info_ai(
            cloned_properties, lead_research_instructions, tool_config=tool_config
        )
        if summary:
            research_summary = summary.get("research_summary", "")

    if enrich_company_information:
        company_company_properties = {
            "organization_name": cloned_properties.get("organization_name", ""),
            "primary_domain_of_organization": cloned_properties.get("primary_domain_of_organization", ""),
            "organization_website": cloned_properties.get("organization_website", ""),
        }
        company_summary = await research_company_with_full_info_ai(
            company_company_properties,
            company_research_instructions,
            tool_config=tool_config,
        )
        if company_summary:
            # Append the company summary under a level-4 heading, normalize
            # with mdformat, then demote every heading to level 5.
            markdown_text = research_summary + "\n\n#### " + company_summary.get(
                "research_summary", ""
            )
            formatted_markdown = mdformat.text(markdown_text)
            research_summary = re.sub(
                r'^(#{1,6})\s+', '##### ', formatted_markdown, flags=re.MULTILINE
            )

    cloned_properties["research_summary"] = research_summary
    return cloned_properties
|
|
179
411
|
|
|
180
412
|
|
|
413
|
+
class UserInfoFromGithubProfileId(BaseModel):
    """Response schema for the GitHub-handle → LinkedIn web-search lookup.

    Field names mirror the JSON structure requested in the prompt of
    ``get_user_linkedin_url_from_github_profile``.
    """
    first_name: str
    last_name: str
    full_name: str
    linkedin_url: str
    github_url: str
    email: str
    twitter_handle: str
    website: str
    location: str
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def extract_id_from_salesnav_url(url_key: str) -> str:
    """Return the Sales Navigator lead ID embedded in *url_key*.

    Expects URLs of the form
    ``https://www.linkedin.com/sales/lead/<ID>?...`` (case-insensitive
    host match). Returns an empty string when the input is falsy or the
    pattern does not match.
    """
    if not url_key:
        return ""
    lead_pattern = re.compile(r"linkedin\.com/sales/lead/([^/?#,]+)", re.IGNORECASE)
    found = lead_pattern.search(url_key)
    if found is None:
        return ""
    raw_id = found.group(1)
    # Keep only word characters and hyphens.
    return re.sub(r"[^\w-]", "", raw_id)
|
|
437
|
+
|
|
438
|
+
def proxy_linkedin_url(user_linkedin_salesnav_url: str) -> str:
    """Build a public-LinkedIn-style URL from a Sales Navigator lead URL.

    :param user_linkedin_salesnav_url: Sales Navigator lead URL.
    :return: ``https://www.linkedin.com/in/<ID>`` built from the extracted ID.
    :raises ValueError: When no lead ID can be extracted from the URL.
    """
    lead_id = extract_id_from_salesnav_url(user_linkedin_salesnav_url)
    if not lead_id:
        raise ValueError("Could not extract ID from Sales Nav URL.")
    return f"https://www.linkedin.com/in/{lead_id}"
|
|
447
|
+
|
|
448
|
+
# -------------------------------------------------------------------
|
|
449
|
+
# (Pseudo) get_structured_output_internal, find_user_linkedin_url_google
|
|
450
|
+
# and other references assumed to exist in your environment.
|
|
451
|
+
# -------------------------------------------------------------------
|
|
452
|
+
|
|
453
|
+
async def get_user_linkedin_url_from_github_profile(
    github_profile_id: str,
    lead_properties: dict,
    instructions: str,
    tool_config: Optional[List[Dict]] = None
) -> Dict[str, Any]:
    """
    Attempt to locate a user's LinkedIn profile URL from their GitHub profile ID via web search.
    Also gather basic user info (first/last name) if possible.

    :param github_profile_id: GitHub handle to search for.
    :param lead_properties: Known lead/company data (name, domain, website)
        interpolated into the prompt as context.
    :param instructions: Extra caller instructions, embedded into the prompt
        (the parameter is then rebound to the full prompt text below).
    :param tool_config: Optional tool configuration forwarded to the LLM helper.
    :return: The structured response on success, ``{}`` otherwise.
        NOTE(review): annotated Dict, but the structured call may return a
        model instance — callers use ``.get(...)`` on it; confirm the helper
        returns a dict-like object.
    """
    instructions = f"""
    Give user information from user GitHub handle; try to locate the LinkedIn profile URL
    for the user using web search.
    ---
    Github profile id:
    {github_profile_id}
    Company Data include name, domain and website:
    {lead_properties}

    Instructions:
    {instructions}
    ---
    Use websearch to locate the LinkedIn profile url for the user if present.

    **Output**:
    Return your final output as valid JSON with the following structure:
    {{
        "first_name": "...",
        "last_name": "...",
        "full_name": "...",
        "linkedin_url": "...",
        "github_url": "...",
        "email": "...",
        "twitter_handle": "...",
        "website": "...",
        "location": "..."
    }}
    """

    # Structured web-search call; UserInfoFromGithubProfileId defines the
    # expected response shape.
    response, status = await get_structured_output_internal(
        instructions,
        UserInfoFromGithubProfileId,
        model="gpt-5.1-chat",
        use_web_search=True,
        tool_config=tool_config
    )
    if status == "SUCCESS":
        return response
    else:
        return {}
|
|
504
|
+
|
|
181
505
|
async def enrich_user_info(
|
|
182
506
|
input_properties: Dict[str, Any],
|
|
183
507
|
use_strict_check: bool,
|
|
184
508
|
tool_config: Optional[List[Dict[str, Any]]] = None,
|
|
185
509
|
) -> Dict[str, Any]:
|
|
186
510
|
"""
|
|
187
|
-
Attempt to find or fix a user's LinkedIn URL using name, title, location,
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
:param tool_config: Optional list of tool configurations dicts.
|
|
192
|
-
:return: Updated dictionary with user LinkedIn URL if found.
|
|
511
|
+
Attempt to find or fix a user's LinkedIn URL using name, title, location,
|
|
512
|
+
company info or GitHub profile handle if present. If still not found,
|
|
513
|
+
but user_linkedin_salesnav_url exists, we fall back to creating a
|
|
514
|
+
proxy URL from the Sales Navigator link.
|
|
193
515
|
"""
|
|
194
516
|
logger.debug("Starting enrich_user_info for: %s", input_properties.get("full_name"))
|
|
195
517
|
user_linkedin_url = (input_properties.get("user_linkedin_url") or "").strip()
|
|
196
518
|
input_properties["linkedin_url_match"] = False
|
|
519
|
+
github_profile_id = (input_properties.get("github_profile_id") or "").strip()
|
|
197
520
|
|
|
521
|
+
# 1) If we do not have a user_linkedin_url, try getting it from GitHub
|
|
198
522
|
if not user_linkedin_url:
|
|
523
|
+
if github_profile_id:
|
|
524
|
+
response = await get_user_linkedin_url_from_github_profile(
|
|
525
|
+
github_profile_id=github_profile_id,
|
|
526
|
+
lead_properties=input_properties,
|
|
527
|
+
instructions="Use web search to find the user's LinkedIn profile from GitHub handle if present.",
|
|
528
|
+
tool_config=tool_config,
|
|
529
|
+
)
|
|
530
|
+
user_linkedin_url = response.get("linkedin_url", "")
|
|
531
|
+
if user_linkedin_url:
|
|
532
|
+
input_properties["user_linkedin_url"] = user_linkedin_url
|
|
533
|
+
if not input_properties.get("first_name"):
|
|
534
|
+
input_properties["first_name"] = response.get("first_name", "")
|
|
535
|
+
if not input_properties.get("last_name"):
|
|
536
|
+
input_properties["last_name"] = response.get("last_name", "")
|
|
537
|
+
if not input_properties.get("email"):
|
|
538
|
+
input_properties["email"] = response.get("email", "")
|
|
539
|
+
if not input_properties.get("lead_location"):
|
|
540
|
+
input_properties["lead_location"] = response.get("location", "")
|
|
541
|
+
return input_properties
|
|
542
|
+
|
|
543
|
+
# 2) If still no LinkedIn URL, try name/title/org searching
|
|
199
544
|
full_name = (input_properties.get("full_name") or "").strip()
|
|
200
545
|
if not full_name:
|
|
201
546
|
first_name = (input_properties.get("first_name", "") or "").strip()
|
|
@@ -205,20 +550,64 @@ async def enrich_user_info(
|
|
|
205
550
|
title = input_properties.get("job_title", "") or ""
|
|
206
551
|
location = input_properties.get("lead_location", "") or ""
|
|
207
552
|
org_name = (input_properties.get("organization_name", "") or "").strip()
|
|
208
|
-
|
|
209
|
-
|
|
553
|
+
org_domain = (input_properties.get("primary_domain_of_organization", "") or "").strip()
|
|
554
|
+
email = (input_properties.get("email") or "").strip()
|
|
555
|
+
|
|
556
|
+
if full_name and (org_name or org_domain or title):
|
|
557
|
+
# This function does a google-based search for the user's LinkedIn
|
|
558
|
+
found_linkedin_url = await find_user_linkedin_url_google(
|
|
210
559
|
user_name=full_name,
|
|
211
560
|
user_title=title,
|
|
212
561
|
user_location=location,
|
|
213
562
|
user_company=org_name,
|
|
563
|
+
user_company_domain=org_domain,
|
|
214
564
|
use_strict_check=use_strict_check,
|
|
215
565
|
tool_config=tool_config,
|
|
216
566
|
)
|
|
217
|
-
|
|
567
|
+
if found_linkedin_url:
|
|
568
|
+
user_linkedin_url = found_linkedin_url
|
|
569
|
+
input_properties["user_linkedin_url"] = user_linkedin_url
|
|
570
|
+
if not user_linkedin_url and email:
|
|
571
|
+
# If we have an email but no name, try searching by email
|
|
572
|
+
email_lookup_result = await find_user_linkedin_url_by_email_google(
|
|
573
|
+
email=email,
|
|
574
|
+
user_name=full_name,
|
|
575
|
+
user_title=title,
|
|
576
|
+
user_location=location,
|
|
577
|
+
user_company=org_name,
|
|
578
|
+
tool_config=tool_config,
|
|
579
|
+
)
|
|
580
|
+
if email_lookup_result and email_lookup_result.get("linkedin_url"):
|
|
581
|
+
user_linkedin_url = email_lookup_result["linkedin_url"]
|
|
582
|
+
input_properties["user_linkedin_url"] = user_linkedin_url
|
|
583
|
+
confidence = email_lookup_result.get("confidence", 0.0)
|
|
584
|
+
reasoning = email_lookup_result.get("reasoning", "")
|
|
585
|
+
input_properties["user_linkedin_url_confidence"] = confidence
|
|
586
|
+
input_properties["user_linkedin_url_reasoning"] = reasoning
|
|
587
|
+
|
|
588
|
+
additional_properties = input_properties.get("additional_properties") or {}
|
|
589
|
+
additional_properties["user_linkedin_url_confidence"] = confidence
|
|
590
|
+
if reasoning:
|
|
591
|
+
additional_properties["user_linkedin_url_reasoning"] = reasoning
|
|
592
|
+
input_properties["additional_properties"] = additional_properties
|
|
593
|
+
|
|
594
|
+
# 3) Final fallback: if STILL no user_linkedin_url,
|
|
595
|
+
# but user_linkedin_salesnav_url is present, use proxy
|
|
596
|
+
if not input_properties.get("user_linkedin_url"):
|
|
597
|
+
salesnav_url = input_properties.get("user_linkedin_salesnav_url", "")
|
|
598
|
+
if salesnav_url:
|
|
599
|
+
try:
|
|
600
|
+
proxy_url = proxy_linkedin_url(salesnav_url)
|
|
601
|
+
input_properties["user_linkedin_url"] = proxy_url
|
|
602
|
+
logger.debug("Falling back to proxy LinkedIn URL from SalesNav: %s", proxy_url)
|
|
603
|
+
except ValueError:
|
|
604
|
+
# If we can't parse an ID from the sales nav URL, skip
|
|
605
|
+
logger.warning("Could not parse ID from user_linkedin_salesnav_url: %s", salesnav_url)
|
|
218
606
|
|
|
219
607
|
return input_properties
|
|
220
608
|
|
|
221
609
|
|
|
610
|
+
|
|
222
611
|
async def enrich_with_provider(
|
|
223
612
|
cloned_properties: Dict[str, Any],
|
|
224
613
|
tool_config: Optional[List[Dict[str, Any]]],
|
|
@@ -260,14 +649,9 @@ async def enrich_organization_info_from_name(
|
|
|
260
649
|
Given a dictionary (treated like a CSV row) containing 'organization_name',
|
|
261
650
|
'organization_linkedin_url', and 'website' keys, enrich the row only if the
|
|
262
651
|
domain and website are currently empty.
|
|
263
|
-
|
|
264
|
-
:param row: Dictionary representing a lead or company record.
|
|
265
|
-
:param use_strict_check: Whether to use strict matching for searches.
|
|
266
|
-
:param tool_config: Optional list of tool configuration dicts.
|
|
267
652
|
"""
|
|
268
653
|
org_name_key = "organization_name"
|
|
269
654
|
org_domain_key = "primary_domain_of_organization"
|
|
270
|
-
linkedin_url_key = "organization_linkedin_url"
|
|
271
655
|
website_key = "organization_website"
|
|
272
656
|
|
|
273
657
|
org_name = (row.get(org_name_key) or "").strip()
|
|
@@ -276,26 +660,14 @@ async def enrich_organization_info_from_name(
|
|
|
276
660
|
row[org_name_key] = ""
|
|
277
661
|
org_name = ""
|
|
278
662
|
|
|
663
|
+
# If there's no organization name, just return
|
|
279
664
|
if not org_name:
|
|
280
665
|
return
|
|
281
666
|
|
|
667
|
+
# If domain or website is already present, we consider it enriched
|
|
282
668
|
if row.get(org_domain_key) or row.get(website_key):
|
|
283
669
|
return
|
|
284
|
-
|
|
285
|
-
linkedin_url = row.get(linkedin_url_key, "").strip()
|
|
286
|
-
if not linkedin_url:
|
|
287
|
-
linkedin_url = await find_organization_linkedin_url_with_google_search(
|
|
288
|
-
org_name,
|
|
289
|
-
company_location="US",
|
|
290
|
-
use_strict_check=use_strict_check,
|
|
291
|
-
tool_config=tool_config,
|
|
292
|
-
)
|
|
293
|
-
|
|
294
|
-
if linkedin_url:
|
|
295
|
-
row[linkedin_url_key] = linkedin_url
|
|
296
|
-
await set_organization_domain(row, use_strict_check, tool_config)
|
|
297
|
-
else:
|
|
298
|
-
row[org_domain_key] = ""
|
|
670
|
+
await set_organization_domain(row, use_strict_check, tool_config)
|
|
299
671
|
|
|
300
672
|
|
|
301
673
|
async def set_organization_domain(
|
|
@@ -306,10 +678,6 @@ async def set_organization_domain(
|
|
|
306
678
|
"""
|
|
307
679
|
Update the row with a 'primary_domain_of_organization' based on 'website' or
|
|
308
680
|
search results if the domain is absent.
|
|
309
|
-
|
|
310
|
-
:param row: Dictionary representing a lead or company record.
|
|
311
|
-
:param use_strict_check: Whether to use strict matching for searches.
|
|
312
|
-
:param tool_config: Optional list of tool configuration dicts.
|
|
313
681
|
"""
|
|
314
682
|
org_name_key = "organization_name"
|
|
315
683
|
org_domain_key = "primary_domain_of_organization"
|
|
@@ -342,40 +710,95 @@ async def set_organization_domain(
|
|
|
342
710
|
|
|
343
711
|
if not extracted_domain and not use_strict_check and org_name:
|
|
344
712
|
logger.debug("Performing Google search to find domain for org_name: %s", org_name)
|
|
345
|
-
|
|
346
|
-
org_name,
|
|
347
|
-
|
|
348
|
-
|
|
713
|
+
company_info = await get_company_domain_from_llm_web_search(
|
|
714
|
+
company_name=org_name,
|
|
715
|
+
lead_info=row,
|
|
716
|
+
location="US",
|
|
717
|
+
tool_config=tool_config
|
|
349
718
|
)
|
|
350
|
-
|
|
719
|
+
if company_info and isinstance(company_info, dict):
|
|
720
|
+
# If the LLM found a domain, set it
|
|
721
|
+
if company_info.get("primary_domain_of_organization") and not row[org_domain_key]:
|
|
722
|
+
row[org_domain_key] = company_info["primary_domain_of_organization"]
|
|
723
|
+
|
|
724
|
+
# If the LLM found an organization website, set it
|
|
725
|
+
if company_info.get("organization_website") and not row[website_key]:
|
|
726
|
+
row[website_key] = company_info["organization_website"]
|
|
727
|
+
|
|
728
|
+
# If there's a LinkedIn URL from LLM, set it
|
|
729
|
+
if company_info.get("organization_linkedin_url") and not row[linkedin_url_key]:
|
|
730
|
+
row[linkedin_url_key] = company_info["organization_linkedin_url"]
|
|
731
|
+
|
|
732
|
+
if company_info.get("organization_name") and not row[org_name_key]:
|
|
733
|
+
row[org_name_key] = company_info["organization_name"]
|
|
351
734
|
|
|
352
735
|
row[org_domain_key] = extracted_domain or ""
|
|
353
736
|
logger.debug("Final domain selected: %s", row[org_domain_key])
|
|
354
737
|
row[website_key] = company_website or ""
|
|
738
|
+
|
|
739
|
+
# If there's still no website but we have a domain, set a default website
|
|
355
740
|
company_website = (row.get(website_key) or "").strip()
|
|
356
741
|
if existing_domain and not company_website:
|
|
357
742
|
row[website_key] = f"https://www.{existing_domain}"
|
|
358
743
|
|
|
359
744
|
|
|
745
|
+
async def get_organization_linkedin_url(lead: Dict[str, Any], tools: Optional[List[Dict[str, Any]]]) -> str:
|
|
746
|
+
"""
|
|
747
|
+
Retrieve the organization's LinkedIn URL using the company name, domain, and search tools.
|
|
748
|
+
Returns an empty string if the organization name is missing.
|
|
749
|
+
"""
|
|
750
|
+
name = lead.get("organization_name", "").strip()
|
|
751
|
+
if not name:
|
|
752
|
+
return ""
|
|
753
|
+
|
|
754
|
+
linkedin_url = await find_organization_linkedin_url_with_google_search(
|
|
755
|
+
name,
|
|
756
|
+
company_location="US",
|
|
757
|
+
company_domain=lead.get("primary_domain_of_organization"),
|
|
758
|
+
use_strict_check=True,
|
|
759
|
+
tool_config=tools,
|
|
760
|
+
)
|
|
761
|
+
return linkedin_url
|
|
762
|
+
|
|
763
|
+
|
|
360
764
|
async def enrich_organization_info_from_company_url(
|
|
361
765
|
organization_linkedin_url: str,
|
|
362
766
|
use_strict_check: bool = True,
|
|
363
767
|
tool_config: Optional[List[Dict[str, Any]]] = None,
|
|
768
|
+
categories: Optional[bool] = None,
|
|
769
|
+
funding_data: Optional[bool] = None,
|
|
770
|
+
exit_data: Optional[bool] = None,
|
|
771
|
+
acquisitions: Optional[bool] = None,
|
|
772
|
+
extra: Optional[bool] = None,
|
|
773
|
+
use_cache: Optional[str] = "if-present",
|
|
774
|
+
fallback_to_cache: Optional[str] = "on-error",
|
|
364
775
|
) -> Dict[str, Any]:
|
|
365
776
|
"""
|
|
366
777
|
Given an organization LinkedIn URL, attempt to enrich its data (e.g. name, website)
|
|
367
|
-
via ProxyCurl.
|
|
778
|
+
via ProxyCurl. Additional Proxycurl Company API boolean flags (categories, funding_data, etc.)
|
|
779
|
+
can be supplied to control the returned payload (True -> "include"). If data is found,
|
|
780
|
+
set domain, then return the dict. Otherwise, return {}.
|
|
368
781
|
"""
|
|
369
782
|
|
|
370
783
|
# Call ProxyCurl to enrich
|
|
371
784
|
company_data = await enrich_organization_info_from_proxycurl(
|
|
372
785
|
organization_linkedin_url=organization_linkedin_url,
|
|
373
|
-
tool_config=tool_config
|
|
786
|
+
tool_config=tool_config,
|
|
787
|
+
categories=categories,
|
|
788
|
+
funding_data=funding_data,
|
|
789
|
+
exit_data=exit_data,
|
|
790
|
+
acquisitions=acquisitions,
|
|
791
|
+
extra=extra,
|
|
792
|
+
use_cache=use_cache,
|
|
793
|
+
fallback_to_cache=fallback_to_cache,
|
|
374
794
|
)
|
|
375
795
|
|
|
376
796
|
# If ProxyCurl returned any data, set domain, then return
|
|
377
797
|
if company_data and isinstance(company_data, dict):
|
|
378
798
|
await set_organization_domain(company_data, use_strict_check, tool_config)
|
|
799
|
+
summary = await research_company_with_full_info_ai(company_data, "", tool_config=tool_config)
|
|
800
|
+
if summary:
|
|
801
|
+
company_data["organization_details"] = summary.get("research_summary", "")
|
|
379
802
|
return company_data
|
|
380
803
|
|
|
381
804
|
return {}
|
|
@@ -389,7 +812,6 @@ async def enrich_organization_info_from_job_url(
|
|
|
389
812
|
"""
|
|
390
813
|
Given a LinkedIn job posting URL, fetch job details using Proxycurl.
|
|
391
814
|
If job details are successfully retrieved, extract organization information
|
|
392
|
-
(organization_name, organization_linkedin_url, primary_domain_of_organization, organization_website)
|
|
393
815
|
and return them in a dictionary. If not found, return {}.
|
|
394
816
|
"""
|
|
395
817
|
# Validate the job URL.
|
|
@@ -406,7 +828,7 @@ async def enrich_organization_info_from_job_url(
|
|
|
406
828
|
job_info = await enrich_job_info_from_proxycurl(
|
|
407
829
|
normalized_job_url, tool_config=tool_config
|
|
408
830
|
)
|
|
409
|
-
except Exception
|
|
831
|
+
except Exception:
|
|
410
832
|
logger.exception("Exception occurred while fetching job info from Proxycurl.")
|
|
411
833
|
return {}
|
|
412
834
|
|
|
@@ -431,3 +853,81 @@ async def enrich_organization_info_from_job_url(
|
|
|
431
853
|
return result
|
|
432
854
|
|
|
433
855
|
return {}
|
|
856
|
+
|
|
857
|
+
|
|
858
|
+
class CompanyInfoFromName(BaseModel):
|
|
859
|
+
organization_name: str
|
|
860
|
+
primary_domain_of_organization: str
|
|
861
|
+
organization_website: str
|
|
862
|
+
organization_linkedin_url: str
|
|
863
|
+
|
|
864
|
+
|
|
865
|
+
@assistant_tool
|
|
866
|
+
async def get_company_domain_from_llm_web_search(
|
|
867
|
+
company_name: str,
|
|
868
|
+
lead_info: dict,
|
|
869
|
+
location: Optional[str] = None,
|
|
870
|
+
tool_config: Optional[List[Dict]] = None
|
|
871
|
+
) -> Dict[str, Any]:
|
|
872
|
+
"""
|
|
873
|
+
Tries to find relevant company info (name, domain, website, LinkedIn URL) from the company name
|
|
874
|
+
using an LLM with web search. Returns a dictionary with keys:
|
|
875
|
+
{
|
|
876
|
+
"organization_name": str,
|
|
877
|
+
"primary_domain_of_organization": str,
|
|
878
|
+
"organization_website": str,
|
|
879
|
+
"organization_linkedin_url": str
|
|
880
|
+
}
|
|
881
|
+
or an empty dict on failure.
|
|
882
|
+
"""
|
|
883
|
+
logger.info("Entering get_company_domain_from_llm_web_search")
|
|
884
|
+
|
|
885
|
+
cleaned_name = company_name.replace(" ", "")
|
|
886
|
+
if not cleaned_name or company_name.lower() in ["none", "freelance"]:
|
|
887
|
+
logger.debug("Invalid or excluded company_name provided.")
|
|
888
|
+
return {}
|
|
889
|
+
|
|
890
|
+
query = f"\"{company_name}\" official website"
|
|
891
|
+
if location:
|
|
892
|
+
query += f", {location}"
|
|
893
|
+
|
|
894
|
+
try:
|
|
895
|
+
logger.debug(f"Performing LLM search with query: {query}")
|
|
896
|
+
# Build instructions for the LLM
|
|
897
|
+
instructions = f"""
|
|
898
|
+
Given the following information, find the company name, website, and domain information.
|
|
899
|
+
---
|
|
900
|
+
Company name:
|
|
901
|
+
{company_name}
|
|
902
|
+
|
|
903
|
+
Additional lead info:
|
|
904
|
+
{lead_info}
|
|
905
|
+
|
|
906
|
+
Search and gather any domain/website info or LinkedIn details.
|
|
907
|
+
DO NOT make up information about company.
|
|
908
|
+
Find based on the domain in the leads email if its a corporate email, company name if sepcified to find the company name, website and domain.
|
|
909
|
+
|
|
910
|
+
**Output**:
|
|
911
|
+
Return your final output as valid JSON with the following structure:
|
|
912
|
+
{{
|
|
913
|
+
"organization_name": "...",
|
|
914
|
+
"primary_domain_of_organization": "...",
|
|
915
|
+
"organization_website": "...",
|
|
916
|
+
"organization_linkedin_url": "..."
|
|
917
|
+
}}
|
|
918
|
+
"""
|
|
919
|
+
response, status = await get_structured_output_internal(
|
|
920
|
+
instructions,
|
|
921
|
+
CompanyInfoFromName,
|
|
922
|
+
model="gpt-5.1-chat",
|
|
923
|
+
use_web_search=True,
|
|
924
|
+
tool_config=tool_config
|
|
925
|
+
)
|
|
926
|
+
if status == "SUCCESS":
|
|
927
|
+
# Return the dictionary form of the model
|
|
928
|
+
return response.model_dump()
|
|
929
|
+
else:
|
|
930
|
+
return {}
|
|
931
|
+
except Exception:
|
|
932
|
+
logger.exception("Exception during get_company_domain_from_llm_web_search.")
|
|
933
|
+
return {}
|