dhisana 0.0.1.dev243__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. dhisana/__init__.py +1 -0
  2. dhisana/cli/__init__.py +1 -0
  3. dhisana/cli/cli.py +20 -0
  4. dhisana/cli/datasets.py +27 -0
  5. dhisana/cli/models.py +26 -0
  6. dhisana/cli/predictions.py +20 -0
  7. dhisana/schemas/__init__.py +1 -0
  8. dhisana/schemas/common.py +399 -0
  9. dhisana/schemas/sales.py +965 -0
  10. dhisana/ui/__init__.py +1 -0
  11. dhisana/ui/components.py +472 -0
  12. dhisana/utils/__init__.py +1 -0
  13. dhisana/utils/add_mapping.py +352 -0
  14. dhisana/utils/agent_tools.py +51 -0
  15. dhisana/utils/apollo_tools.py +1597 -0
  16. dhisana/utils/assistant_tool_tag.py +4 -0
  17. dhisana/utils/built_with_api_tools.py +282 -0
  18. dhisana/utils/cache_output_tools.py +98 -0
  19. dhisana/utils/cache_output_tools_local.py +78 -0
  20. dhisana/utils/check_email_validity_tools.py +717 -0
  21. dhisana/utils/check_for_intent_signal.py +107 -0
  22. dhisana/utils/check_linkedin_url_validity.py +209 -0
  23. dhisana/utils/clay_tools.py +43 -0
  24. dhisana/utils/clean_properties.py +135 -0
  25. dhisana/utils/company_utils.py +60 -0
  26. dhisana/utils/compose_salesnav_query.py +259 -0
  27. dhisana/utils/compose_search_query.py +759 -0
  28. dhisana/utils/compose_three_step_workflow.py +234 -0
  29. dhisana/utils/composite_tools.py +137 -0
  30. dhisana/utils/dataframe_tools.py +237 -0
  31. dhisana/utils/domain_parser.py +45 -0
  32. dhisana/utils/email_body_utils.py +72 -0
  33. dhisana/utils/email_parse_helpers.py +132 -0
  34. dhisana/utils/email_provider.py +375 -0
  35. dhisana/utils/enrich_lead_information.py +933 -0
  36. dhisana/utils/extract_email_content_for_llm.py +101 -0
  37. dhisana/utils/fetch_openai_config.py +129 -0
  38. dhisana/utils/field_validators.py +426 -0
  39. dhisana/utils/g2_tools.py +104 -0
  40. dhisana/utils/generate_content.py +41 -0
  41. dhisana/utils/generate_custom_message.py +271 -0
  42. dhisana/utils/generate_email.py +278 -0
  43. dhisana/utils/generate_email_response.py +465 -0
  44. dhisana/utils/generate_flow.py +102 -0
  45. dhisana/utils/generate_leads_salesnav.py +303 -0
  46. dhisana/utils/generate_linkedin_connect_message.py +224 -0
  47. dhisana/utils/generate_linkedin_response_message.py +317 -0
  48. dhisana/utils/generate_structured_output_internal.py +462 -0
  49. dhisana/utils/google_custom_search.py +267 -0
  50. dhisana/utils/google_oauth_tools.py +727 -0
  51. dhisana/utils/google_workspace_tools.py +1294 -0
  52. dhisana/utils/hubspot_clearbit.py +96 -0
  53. dhisana/utils/hubspot_crm_tools.py +2440 -0
  54. dhisana/utils/instantly_tools.py +149 -0
  55. dhisana/utils/linkedin_crawler.py +168 -0
  56. dhisana/utils/lusha_tools.py +333 -0
  57. dhisana/utils/mailgun_tools.py +156 -0
  58. dhisana/utils/mailreach_tools.py +123 -0
  59. dhisana/utils/microsoft365_tools.py +455 -0
  60. dhisana/utils/openai_assistant_and_file_utils.py +267 -0
  61. dhisana/utils/openai_helpers.py +977 -0
  62. dhisana/utils/openapi_spec_to_tools.py +45 -0
  63. dhisana/utils/openapi_tool/__init__.py +1 -0
  64. dhisana/utils/openapi_tool/api_models.py +633 -0
  65. dhisana/utils/openapi_tool/convert_openai_spec_to_tool.py +271 -0
  66. dhisana/utils/openapi_tool/openapi_tool.py +319 -0
  67. dhisana/utils/parse_linkedin_messages_txt.py +100 -0
  68. dhisana/utils/profile.py +37 -0
  69. dhisana/utils/proxy_curl_tools.py +1226 -0
  70. dhisana/utils/proxycurl_search_leads.py +426 -0
  71. dhisana/utils/python_function_to_tools.py +83 -0
  72. dhisana/utils/research_lead.py +176 -0
  73. dhisana/utils/sales_navigator_crawler.py +1103 -0
  74. dhisana/utils/salesforce_crm_tools.py +477 -0
  75. dhisana/utils/search_router.py +131 -0
  76. dhisana/utils/search_router_jobs.py +51 -0
  77. dhisana/utils/sendgrid_tools.py +162 -0
  78. dhisana/utils/serarch_router_local_business.py +75 -0
  79. dhisana/utils/serpapi_additional_tools.py +290 -0
  80. dhisana/utils/serpapi_google_jobs.py +117 -0
  81. dhisana/utils/serpapi_google_search.py +188 -0
  82. dhisana/utils/serpapi_local_business_search.py +129 -0
  83. dhisana/utils/serpapi_search_tools.py +852 -0
  84. dhisana/utils/serperdev_google_jobs.py +125 -0
  85. dhisana/utils/serperdev_local_business.py +154 -0
  86. dhisana/utils/serperdev_search.py +233 -0
  87. dhisana/utils/smtp_email_tools.py +582 -0
  88. dhisana/utils/test_connect.py +2087 -0
  89. dhisana/utils/trasform_json.py +173 -0
  90. dhisana/utils/web_download_parse_tools.py +189 -0
  91. dhisana/utils/workflow_code_model.py +5 -0
  92. dhisana/utils/zoominfo_tools.py +357 -0
  93. dhisana/workflow/__init__.py +1 -0
  94. dhisana/workflow/agent.py +18 -0
  95. dhisana/workflow/flow.py +44 -0
  96. dhisana/workflow/task.py +43 -0
  97. dhisana/workflow/test.py +90 -0
  98. dhisana-0.0.1.dev243.dist-info/METADATA +43 -0
  99. dhisana-0.0.1.dev243.dist-info/RECORD +102 -0
  100. dhisana-0.0.1.dev243.dist-info/WHEEL +5 -0
  101. dhisana-0.0.1.dev243.dist-info/entry_points.txt +2 -0
  102. dhisana-0.0.1.dev243.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1226 @@
+ import asyncio
+ import json
+ import logging
+ import os
+ import re
+ import aiohttp
+ import backoff
+ from typing import Any, Dict, List, Optional
+
+ from dhisana.utils.assistant_tool_tag import assistant_tool
+ from dhisana.utils.cache_output_tools import cache_output, retrieve_output
+ from dhisana.utils.clean_properties import cleanup_properties
+ from dhisana.utils.search_router import search_google_with_tools
+ from urllib.parse import urlparse, urlunparse
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+
+ def get_proxycurl_access_token(tool_config: Optional[List[Dict]] = None) -> str:
+     """
+     Retrieves the PROXY_CURL_API_KEY access token from the provided tool configuration.
+
+     Raises:
+         ValueError: If the Proxycurl integration has not been configured.
+     """
+     PROXY_CURL_API_KEY = None
+
+     if tool_config:
+         logger.debug(f"Tool config provided: {tool_config}")
+         proxy_curl_config = next(
+             (item for item in tool_config if item.get("name") == "proxycurl"), None
+         )
+         if proxy_curl_config:
+             config_map = {
+                 item["name"]: item["value"]
+                 for item in proxy_curl_config.get("configuration", [])
+                 if item
+             }
+             PROXY_CURL_API_KEY = config_map.get("apiKey")
+         else:
+             logger.warning("No 'proxycurl' config item found in tool_config.")
+     else:
+         logger.debug("No tool_config provided or it's None.")
+
+     # Check environment variable if no key found yet
+     PROXY_CURL_API_KEY = PROXY_CURL_API_KEY or os.getenv("PROXY_CURL_API_KEY")
+
+     if not PROXY_CURL_API_KEY:
+         logger.error("Proxycurl integration is not configured.")
+         raise ValueError(
+             "Proxycurl integration is not configured. Please configure the connection to Proxycurl in Integrations."
+         )
+
+     return PROXY_CURL_API_KEY
+
+
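For reference, a minimal sketch of the tool_config shape this helper walks; the key value below is a placeholder, not a real credential:

from dhisana.utils.proxy_curl_tools import get_proxycurl_access_token

tool_config = [
    {
        "name": "proxycurl",
        "configuration": [{"name": "apiKey", "value": "<PROXYCURL_API_KEY>"}],
    }
]
api_key = get_proxycurl_access_token(tool_config)  # falls back to the env var when absent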
+ @assistant_tool
+ @backoff.on_exception(
+     backoff.expo,
+     aiohttp.ClientResponseError,
+     max_tries=3,
+     giveup=lambda e: e.status != 429,
+     factor=10,
+ )
+ async def enrich_person_info_from_proxycurl(
+     linkedin_url: Optional[str] = None,
+     email: Optional[str] = None,
+     phone: Optional[str] = None,
+     tool_config: Optional[List[Dict]] = None
+ ) -> Dict:
+     """
+     Fetch a person's details from Proxycurl. A LinkedIn URL is required;
+     email and phone, when provided, are forwarded as additional lookup parameters.
+
+     Returns:
+         dict: JSON response containing person information or an error.
+     """
+     logger.info("Entering enrich_person_info_from_proxycurl")
+
+     try:
+         API_KEY = get_proxycurl_access_token(tool_config)
+     except ValueError as e:
+         return {"error": str(e)}
+
+     HEADERS = {
+         'Authorization': f'Bearer {API_KEY}',
+         'Content-Type': 'application/json'
+     }
+
+     if not linkedin_url:
+         logger.warning("No linkedin_url provided.")
+         return {'error': "linkedin_url must be provided"}
+
+     # linkedin_url is guaranteed at this point; check the cache first
+     cached_response = retrieve_output("enrich_person_info_from_proxycurl", linkedin_url)
+     if cached_response is not None and cached_response.get('error') is None:
+         logger.info(f"Cache hit for LinkedIn URL: {linkedin_url}")
+         return cached_response
+
+     params = {'url': linkedin_url}
+     if email:
+         params['email'] = email
+     if phone:
+         params['phone'] = phone
+
+     url = 'https://enrichlayer.com/api/v2/profile'
+     logger.debug(f"Making request to Proxycurl with params: {params}")
+
+     async with aiohttp.ClientSession() as session:
+         try:
+             async with session.get(url, headers=HEADERS, params=params) as response:
+                 logger.debug(f"Received response status: {response.status}")
+                 if response.status == 200:
+                     result = await response.json()
+                     cache_output("enrich_person_info_from_proxycurl", linkedin_url, result)
+                     logger.info("Successfully retrieved person info from Proxycurl.")
+                     return result
+                 elif response.status == 404:
+                     msg = "Person not found"
+                     logger.warning(msg)
+                     return {'error': msg}
+                 elif response.status == 429:
+                     msg = "Rate limit exceeded"
+                     logger.warning(msg)
+                     # Sleep and then return an error (no raise, so backoff is not re-triggered)
+                     await asyncio.sleep(30)
+                     return {'error': msg}
+                 else:
+                     error_text = await response.text()
+                     logger.error(f"Error from Proxycurl: {error_text}")
+                     return {'error': error_text}
+         except Exception as e:
+             logger.exception("Exception occurred while fetching person info from Proxycurl.")
+             return {"error": str(e)}
+
+
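A quick usage sketch for the coroutine above, reusing the tool_config sketched earlier; the profile URL is a placeholder:

import asyncio
from dhisana.utils.proxy_curl_tools import enrich_person_info_from_proxycurl

person = asyncio.run(
    enrich_person_info_from_proxycurl(
        linkedin_url="https://www.linkedin.com/in/some-profile/",
        tool_config=tool_config,
    )
)
print(person.get("full_name"), person.get("occupation"))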
+ @assistant_tool
+ @backoff.on_exception(
+     backoff.expo,
+     aiohttp.ClientResponseError,
+     max_tries=3,
+     giveup=lambda e: e.status != 429,
+     factor=10,
+ )
+ async def lookup_person_in_proxy_curl_by_name(
+     first_name: str,
+     last_name: str,
+     company_name: Optional[str] = None,
+     tool_config: Optional[List[Dict]] = None,
+ ) -> Dict:
+     """
+     Look up a person in Proxycurl by first and last name, and optionally a company name.
+
+     Returns:
+         dict: JSON response containing search results or an error.
+     """
+     logger.info("Entering lookup_person_in_proxy_curl_by_name")
+
+     if not first_name or not last_name:
+         logger.warning("First name or last name missing for lookup.")
+         return {'error': "First and last name are required"}
+
+     try:
+         API_KEY = get_proxycurl_access_token(tool_config)
+     except ValueError as e:
+         return {"error": str(e)}
+
+     headers = {'Authorization': f'Bearer {API_KEY}'}
+     params = {
+         'first_name': first_name,
+         'last_name': last_name,
+         'page_size': '1',
+     }
+     if company_name:
+         params['current_company_name'] = company_name
+
+     # Build the cache key only from the parts that are present, so a missing
+     # company name does not end up as a literal "None" in the key.
+     key = " ".join(part for part in (first_name, last_name, company_name) if part)
+     if key:
+         cached_response = retrieve_output("lookup_person_in_proxycurl_by_name", key)
+         if cached_response is not None:
+             logger.info(f"Cache hit for name lookup key: {key}")
+             return cached_response
+
+     url = 'https://enrichlayer.com/api/v2/search/person'
+     logger.debug(f"Making request to Proxycurl with params: {params}")
+
+     async with aiohttp.ClientSession() as session:
+         try:
+             async with session.get(url, headers=headers, params=params) as response:
+                 logger.debug(f"Received response status: {response.status}")
+                 if response.status == 200:
+                     result = await response.json()
+                     cache_output("lookup_person_in_proxycurl_by_name", key, result)
+                     logger.info("Successfully retrieved person search info from Proxycurl.")
+                     return result
+                 elif response.status == 404:
+                     msg = "Person not found"
+                     logger.warning(msg)
+                     if key:
+                         cache_output("lookup_person_in_proxycurl_by_name", key, {'error': msg})
+                     return {'error': msg}
+                 elif response.status == 429:
+                     msg = "Rate limit exceeded"
+                     logger.warning(msg)
+                     await asyncio.sleep(30)
+                     return {'error': msg}
+                 else:
+                     result = await response.json()
+                     logger.warning(f"lookup_person_in_proxycurl_by_name error: {result}")
+                     return {'error': result}
+         except Exception as e:
+             logger.exception("Exception occurred while looking up person by name.")
+             return {"error": str(e)}
+
+
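Name lookups follow the same pattern; a sketch with invented names:

import asyncio
from dhisana.utils.proxy_curl_tools import lookup_person_in_proxy_curl_by_name

hit = asyncio.run(
    lookup_person_in_proxy_curl_by_name(
        first_name="Jane",
        last_name="Doe",
        company_name="Acme Corp",
        tool_config=tool_config,  # as sketched above
    )
)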
+ def transform_company_data(data: dict) -> dict:
+     """
+     Transform the company data by mapping:
+       - 'name' to 'organization_name'
+       - 'website' to 'organization_website'
+       - 'industry' to 'organization_industry'
+       - 'hq' or 'headquarters' to 'organization_hq_location'
+         in the format "city, state, country" (skipping empty parts).
+     Copies over all other properties except the ones that are mapped.
+     If data is empty, returns an empty dictionary.
+     """
+     if not data:
+         return {}
+
+     transformed = {}
+
+     # Map name, website, and industry
+     if "name" in data:
+         transformed["organization_name"] = data["name"]
+     if "website" in data:
+         transformed["organization_website"] = data["website"]
+     if "industry" in data:
+         transformed["organization_industry"] = data["industry"]
+
+     if "company_size" in data:
+         transformed["company_size_list"] = data["company_size"]
+
+     if "company_size_on_linkedin" in data:
+         transformed["organization_size"] = data["company_size_on_linkedin"]
+         transformed["company_size"] = data["company_size_on_linkedin"]
+
+     # Determine headquarters info from "hq" or "headquarters"
+     hq_data = data.get("hq") or data.get("headquarters")
+     if hq_data:
+         if isinstance(hq_data, dict):
+             city = hq_data.get("city", "")
+             state = hq_data.get("geographic_area", "")
+             country = hq_data.get("country", "")
+             # Join non-empty parts with a comma and a space
+             parts = [part for part in (city, state, country) if part]
+             transformed["organization_hq_location"] = ", ".join(parts)
+         else:
+             # If hq_data is not a dict, assume it's already in the desired format
+             transformed["organization_hq_location"] = hq_data
+
+     # Copy all other properties, excluding those already mapped
+     for key, value in data.items():
+         if key not in ("name", "website", "industry", "hq", "headquarters", "company_size"):
+             transformed[key] = value
+
+     return transformed
+
+
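For illustration, one input/output pair under the mapping above (field values invented):

from dhisana.utils.proxy_curl_tools import transform_company_data

raw = {
    "name": "Acme Corp",
    "website": "https://acme.example",
    "industry": "Software",
    "hq": {"city": "Austin", "geographic_area": "Texas", "country": "US"},
    "follower_count": 1200,
}
print(transform_company_data(raw))
# {'organization_name': 'Acme Corp', 'organization_website': 'https://acme.example',
#  'organization_industry': 'Software', 'organization_hq_location': 'Austin, Texas, US',
#  'follower_count': 1200}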
+ def _build_company_profile_params(
+     company_url: str,
+     profile_flags: Dict[str, Optional[str]],
+ ) -> Dict[str, str]:
+     """
+     Build request params for the Enrichlayer company profile endpoint,
+     ensuring we only forward flags that were explicitly provided.
+     """
+     params: Dict[str, str] = {'url': company_url}
+     for key, value in profile_flags.items():
+         if value is not None:
+             params[key] = value
+     return params
+
+
+ def _build_company_cache_key(identifier: str, profile_flags: Dict[str, Optional[str]]) -> str:
+     """
+     Builds a cache key that is unique for the combination of identifier
+     (LinkedIn URL or domain) and the optional enrichment flags.
+     """
+     suffix_bits = [
+         f"{key}={value}"
+         for key, value in sorted(profile_flags.items())
+         if value is not None
+     ]
+     if suffix_bits:
+         return f"{identifier}|{'&'.join(suffix_bits)}"
+     return identifier
+
+
+ def _bool_to_include_exclude(value: Optional[bool]) -> Optional[str]:
+     """
+     Convert a boolean flag into the string literals expected by Proxycurl.
+     True -> "include", False -> "exclude", None -> None (omit parameter).
+     """
+     if value is None:
+         return None
+     return "include" if value else "exclude"
+
+
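These three private helpers compose as follows; a sketch of the tri-state flag convention, not public API:

from dhisana.utils.proxy_curl_tools import (
    _bool_to_include_exclude,
    _build_company_cache_key,
    _build_company_profile_params,
)

flags = {
    "funding_data": _bool_to_include_exclude(True),   # "include"
    "categories": _bool_to_include_exclude(None),     # None -> flag omitted
    "use_cache": "if-present",
}
print(_build_company_profile_params("https://www.linkedin.com/company/acme/", flags))
# {'url': 'https://www.linkedin.com/company/acme/', 'funding_data': 'include', 'use_cache': 'if-present'}
print(_build_company_cache_key("https://www.linkedin.com/company/acme/", flags))
# https://www.linkedin.com/company/acme/|funding_data=include&use_cache=if-present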
+ @backoff.on_exception(
+     backoff.expo,
+     aiohttp.ClientResponseError,
+     max_tries=3,
+     giveup=lambda e: e.status != 429,
+     factor=10,
+ )
+ async def enrich_organization_info_from_proxycurl(
+     organization_domain: Optional[str] = None,
+     organization_linkedin_url: Optional[str] = None,
+     tool_config: Optional[List[Dict]] = None,
+     categories: Optional[bool] = None,
+     funding_data: Optional[bool] = None,
+     exit_data: Optional[bool] = None,
+     acquisitions: Optional[bool] = None,
+     extra: Optional[bool] = None,
+     use_cache: Optional[str] = "if-present",
+     fallback_to_cache: Optional[str] = "on-error",
+ ) -> Dict:
+     """
+     Fetch an organization's details from Proxycurl using either the organization domain or LinkedIn URL.
+     Additional keyword parameters map directly to the Enrichlayer Company Profile endpoint.
+
+     Args:
+         organization_domain: Organization's domain name to resolve via Proxycurl.
+         organization_linkedin_url: LinkedIn company profile URL.
+         tool_config: Optional tool configuration metadata for credential lookup.
+         categories/funding_data/exit_data/acquisitions/extra: Set True to request
+             "include", False for "exclude", or None to omit.
+         use_cache: Controls Proxycurl caching behaviour (e.g. "if-present").
+         fallback_to_cache: Controls Proxycurl cache fallback behaviour (e.g. "on-error").
+
+     Returns:
+         dict: Transformed JSON response containing organization information,
+             {'error': ...} on error, or an empty dict if not found.
+     """
+     logger.info("Entering enrich_organization_info_from_proxycurl")
+
+     try:
+         API_KEY = get_proxycurl_access_token(tool_config)
+     except ValueError as e:
+         return {"error": str(e)}
+
+     HEADERS = {
+         'Authorization': f'Bearer {API_KEY}',
+         'Content-Type': 'application/json'
+     }
+
+     if not organization_domain and not organization_linkedin_url:
+         logger.warning("No organization domain or LinkedIn URL provided.")
+         return {}
+
+     profile_flags: Dict[str, Optional[str]] = {
+         "categories": _bool_to_include_exclude(categories),
+         "funding_data": _bool_to_include_exclude(funding_data),
+         "exit_data": _bool_to_include_exclude(exit_data),
+         "acquisitions": _bool_to_include_exclude(acquisitions),
+         "extra": _bool_to_include_exclude(extra),
+         "use_cache": use_cache,
+         "fallback_to_cache": fallback_to_cache,
+     }
+
+     # If a LinkedIn URL is provided, standardize it and fetch data
+     if organization_linkedin_url:
+         logger.debug(f"Organization LinkedIn URL provided: {organization_linkedin_url}")
+         if "linkedin.com/company" not in organization_linkedin_url:
+             logger.warning(f"Invalid LinkedIn URL provided: {organization_linkedin_url}")
+             return {}
+         parsed_url = urlparse(organization_linkedin_url)
+         if parsed_url.netloc != 'www.linkedin.com':
+             standardized_netloc = 'www.linkedin.com'
+             standardized_path = parsed_url.path
+             if not standardized_path.startswith('/company/'):
+                 standardized_path = '/company' + standardized_path
+             standardized_url = urlunparse(
+                 parsed_url._replace(netloc=standardized_netloc, path=standardized_path)
+             )
+             if standardized_url and not standardized_url.endswith('/'):
+                 standardized_url += '/'
+         else:
+             standardized_url = organization_linkedin_url
+             if standardized_url and not standardized_url.endswith('/'):
+                 standardized_url += '/'
+
+         cache_key = _build_company_cache_key(standardized_url, profile_flags)
+         # Check cache for the standardized LinkedIn URL; cached entries are
+         # stored post-transform, so they can be returned as-is.
+         cached_response = retrieve_output("enrich_organization_info_from_proxycurl", cache_key)
+         if cached_response is not None:
+             logger.info(f"Cache hit for organization LinkedIn URL: {standardized_url}")
+             return cached_response
+
+         # Fetch details using the standardized LinkedIn URL
+         url = 'https://enrichlayer.com/api/v2/company'
+         params = _build_company_profile_params(standardized_url, profile_flags)
+         logger.debug(f"Making request to Proxycurl with params: {params}")
+
+         async with aiohttp.ClientSession() as session:
+             try:
+                 async with session.get(url, headers=HEADERS, params=params) as response:
+                     logger.debug(f"Received response status: {response.status}")
+                     if response.status == 200:
+                         result = await response.json()
+                         transformed_result = transform_company_data(result)
+                         cache_output("enrich_organization_info_from_proxycurl", cache_key, transformed_result)
+                         logger.info("Successfully retrieved and transformed organization info from Proxycurl by LinkedIn URL.")
+                         return transformed_result
+                     elif response.status == 429:
+                         msg = "Rate limit exceeded"
+                         logger.warning(msg)
+                         await asyncio.sleep(30)
+                         return {"error": msg}
+                     elif response.status == 404:
+                         error_text = await response.text()
+                         logger.warning(
+                             f"Proxycurl organization profile not found for LinkedIn URL {standardized_url}: {error_text}"
+                         )
+                         cache_output(
+                             "enrich_organization_info_from_proxycurl", cache_key, {}
+                         )
+                         return {}
+                     else:
+                         error_text = await response.text()
+                         logger.error(
+                             f"Error from Proxycurl organization info fetch by URL: {error_text}"
+                         )
+                         return {}
+             except Exception as e:
+                 logger.exception("Exception occurred while fetching organization info from Proxycurl by LinkedIn URL.")
+                 return {"error": str(e)}
+
+     # If an organization domain is provided, resolve it to a LinkedIn URL and fetch data
+     if organization_domain:
+         logger.debug(f"Organization domain provided: {organization_domain}")
+         domain_cache_key = _build_company_cache_key(organization_domain, profile_flags)
+         cached_response = retrieve_output("enrich_organization_info_from_proxycurl", domain_cache_key)
+         if cached_response is not None:
+             logger.info(f"Cache hit for organization domain: {organization_domain}")
+             return cached_response
+
+         resolve_url = 'https://enrichlayer.com/api/v2/company/resolve'
+         params = {'domain': organization_domain}
+         logger.debug(f"Making request to Proxycurl to resolve domain with params: {params}")
+
+         async with aiohttp.ClientSession() as session:
+             try:
+                 async with session.get(resolve_url, headers=HEADERS, params=params) as response:
+                     logger.debug(f"Received response status: {response.status}")
+                     if response.status == 200:
+                         company_data = await response.json()
+                         company_url = company_data.get('url')
+                         if company_url:
+                             parsed_url = urlparse(company_url)
+                             if parsed_url.netloc != 'www.linkedin.com':
+                                 standardized_netloc = 'www.linkedin.com'
+                                 standardized_path = parsed_url.path
+                                 if not standardized_path.startswith('/company/'):
+                                     standardized_path = '/company' + standardized_path
+                                 standardized_url = urlunparse(
+                                     parsed_url._replace(netloc=standardized_netloc, path=standardized_path)
+                                 )
+                             else:
+                                 standardized_url = company_url
+
+                             profile_url = 'https://enrichlayer.com/api/v2/company'
+                             try:
+                                 profile_params = _build_company_profile_params(standardized_url, profile_flags)
+                                 async with session.get(profile_url, headers=HEADERS, params=profile_params) as profile_response:
+                                     logger.debug(f"Received profile response status: {profile_response.status}")
+                                     if profile_response.status == 200:
+                                         result = await profile_response.json()
+                                         transformed_result = transform_company_data(result)
+                                         cache_output("enrich_organization_info_from_proxycurl", domain_cache_key, transformed_result)
+                                         logger.info("Successfully retrieved and transformed organization info from Proxycurl by domain.")
+                                         return transformed_result
+                                     elif profile_response.status == 429:
+                                         msg = "Rate limit exceeded"
+                                         logger.warning(msg)
+                                         await asyncio.sleep(30)
+                                         return {"error": msg}
+                                     else:
+                                         error_text = await profile_response.text()
+                                         logger.error(f"Error from Proxycurl organization profile fetch by resolved domain: {error_text}")
+                                         return {}
+                             except Exception as e:
+                                 logger.exception("Exception occurred while fetching organization profile data.")
+                                 return {"error": str(e)}
+                         else:
+                             logger.warning("Company URL not found for the provided domain.")
+                             return {}
+                     elif response.status == 429:
+                         msg = "Rate limit exceeded"
+                         logger.warning(msg)
+                         await asyncio.sleep(30)
+                         return {"error": msg}
+                     elif response.status == 404:
+                         msg = "Item not found"
+                         logger.warning(msg)
+                         cache_output("enrich_organization_info_from_proxycurl", domain_cache_key, {})
+                         return {}
+                     else:
+                         error_text = await response.text()
+                         logger.error(f"Error from Proxycurl domain resolve: {error_text}")
+                         return {}
+             except Exception as e:
+                 logger.exception("Exception occurred while resolving organization domain on Proxycurl.")
+                 return {"error": str(e)}
+
+     return {}
+
+
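A usage sketch for the domain path (domain and flag values illustrative):

import asyncio
from dhisana.utils.proxy_curl_tools import enrich_organization_info_from_proxycurl

org = asyncio.run(
    enrich_organization_info_from_proxycurl(
        organization_domain="acme.example",
        funding_data=True,   # forwarded as funding_data=include
        tool_config=tool_config,
    )
)
print(org.get("organization_name"), org.get("organization_hq_location"))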
+ @assistant_tool
+ @backoff.on_exception(
+     backoff.expo,
+     aiohttp.ClientResponseError,
+     max_tries=3,
+     giveup=lambda e: e.status != 429,
+     factor=10,
+ )
+ async def enrich_job_info_from_proxycurl(
+     job_url: Optional[str] = None,
+     tool_config: Optional[List[Dict]] = None
+ ) -> Dict:
+     """
+     Fetch a job's details from Proxycurl using the job URL.
+
+     Returns:
+         dict: JSON response containing job information or an error.
+     """
+     logger.info("Entering enrich_job_info_from_proxycurl")
+
+     try:
+         API_KEY = get_proxycurl_access_token(tool_config)
+     except ValueError as e:
+         return {"error": str(e)}
+
+     HEADERS = {
+         'Authorization': f'Bearer {API_KEY}',
+         'Content-Type': 'application/json'
+     }
+
+     if not job_url:
+         logger.warning("No job URL provided.")
+         return {'error': "Job URL must be provided"}
+
+     # Check cache
+     cached_response = retrieve_output("enrich_job_info_from_proxycurl", job_url)
+     if cached_response is not None:
+         logger.info(f"Cache hit for job URL: {job_url}")
+         return cached_response
+
+     params = {'url': job_url}
+     api_endpoint = 'https://enrichlayer.com/api/v2/job'
+     logger.debug(f"Making request to Proxycurl for job info with params: {params}")
+
+     async with aiohttp.ClientSession() as session:
+         try:
+             async with session.get(api_endpoint, headers=HEADERS, params=params) as response:
+                 logger.debug(f"Received response status: {response.status}")
+                 if response.status == 200:
+                     result = await response.json()
+                     cache_output("enrich_job_info_from_proxycurl", job_url, result)
+                     logger.info("Successfully retrieved job info from Proxycurl.")
+                     return result
+                 elif response.status == 429:
+                     msg = "Rate limit exceeded"
+                     logger.warning(msg)
+                     await asyncio.sleep(30)
+                     return {'error': msg}
+                 elif response.status == 404:
+                     msg = "Job not found"
+                     logger.warning(msg)
+                     cache_output("enrich_job_info_from_proxycurl", job_url, {'error': msg})
+                     return {'error': msg}
+                 else:
+                     error_text = await response.text()
+                     logger.error(f"Error from Proxycurl: {error_text}")
+                     return {'error': error_text}
+         except Exception as e:
+             logger.exception("Exception occurred while fetching job info from Proxycurl.")
+             return {"error": str(e)}
+
+
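And the job endpoint, with a placeholder job URL:

import asyncio
from dhisana.utils.proxy_curl_tools import enrich_job_info_from_proxycurl

job = asyncio.run(
    enrich_job_info_from_proxycurl(
        job_url="https://www.linkedin.com/jobs/view/1234567890/",
        tool_config=tool_config,
    )
)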
+ @assistant_tool
+ @backoff.on_exception(
+     backoff.expo,
+     aiohttp.ClientResponseError,
+     max_tries=3,
+     giveup=lambda e: e.status != 429,
+     factor=10,
+ )
+ async def search_recent_job_changes(
+     job_titles: List[str],
+     locations: List[str],
+     max_items_to_return: int = 100,
+     tool_config: Optional[List[Dict]] = None
+ ) -> List[dict]:
+     """
+     Search for individuals with the specified job titles and locations who have recently changed jobs.
+
+     Returns:
+         List[dict]: Individuals matching the criteria, or an empty list on failure.
+     """
+     logger.info("Entering search_recent_job_changes")
+
+     try:
+         API_KEY = get_proxycurl_access_token(tool_config)
+     except ValueError as e:
+         logger.error(str(e))
+         return []
+
+     HEADERS = {
+         'Authorization': f'Bearer {API_KEY}',
+         'Content-Type': 'application/json'
+     }
+
+     url = 'https://enrichlayer.com/api/v2/search/person'
+     results = []
+     page = 1
+     per_page = min(max_items_to_return, 100)
+
+     logger.debug(f"Starting search with job_titles={job_titles}, locations={locations}, max_items={max_items_to_return}")
+
+     async with aiohttp.ClientSession() as session:
+         while len(results) < max_items_to_return:
+             params = {
+                 'job_title': ','.join(job_titles),
+                 'location': ','.join(locations),
+                 'page': page,
+                 'num_records': per_page
+             }
+             logger.debug(f"Request params: {params}")
+
+             try:
+                 async with session.get(url, headers=HEADERS, params=params) as response:
+                     logger.debug(f"Received response status: {response.status}")
+                     if response.status == 200:
+                         data = await response.json()
+                         people = data.get('persons', [])
+                         if not people:
+                             logger.info("No more people found, ending search.")
+                             break
+                         results.extend(people)
+                         logger.info(f"Fetched {len(people)} results on page {page}. Total so far: {len(results)}")
+                         page += 1
+                         if len(results) >= max_items_to_return:
+                             logger.info("Reached max items limit.")
+                             break
+                     elif response.status == 429:
+                         msg = "Rate limit exceeded"
+                         logger.warning(msg)
+                         await asyncio.sleep(30)
+                         # Not raising here means backoff will not retry;
+                         # retry the same page on the next loop iteration instead.
+                         continue
+                     else:
+                         error_text = await response.text()
+                         logger.error(f"Error while searching recent job changes: {error_text}")
+                         break
+             except Exception:
+                 logger.exception("Exception occurred while searching recent job changes.")
+                 break
+
+     return results[:max_items_to_return]
+
+
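A paging sketch for the search above; titles and locations are illustrative, and results are capped client-side at max_items_to_return:

import asyncio
from dhisana.utils.proxy_curl_tools import search_recent_job_changes

people = asyncio.run(
    search_recent_job_changes(
        job_titles=["Head of Sales"],
        locations=["United States"],
        max_items_to_return=25,
        tool_config=tool_config,
    )
)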
+ @assistant_tool
+ async def find_matching_job_posting_proxy_curl(
+     company_name: str,
+     keywords_check: List[str],
+     optional_keywords: List[str],
+     organization_linkedin_url: Optional[str] = None,
+     tool_config: Optional[List[Dict]] = None
+ ) -> List[str]:
+     """
+     Find job postings on LinkedIn for a given company using Google Custom Search,
+     then validate those links with Proxycurl. A link is only kept when the posting's
+     company matches organization_linkedin_url and at least one keyword is found.
+
+     Returns:
+         List[str]: A list of matching job posting links.
+     """
+     logger.info("Entering find_matching_job_posting_proxy_curl")
+
+     if not company_name:
+         logger.warning("No company name provided.")
+         return []
+
+     if not keywords_check:
+         logger.warning("No keywords_check provided, defaulting to an empty list.")
+         keywords_check = []
+
+     if not optional_keywords:
+         logger.warning("No optional_keywords provided, defaulting to an empty list.")
+         optional_keywords = []
+
+     keywords_list = [kw.strip().lower() for kw in keywords_check]
+     job_posting_links = []
+
+     # Build the search query
+     keywords_str = ' '.join(f'"{kw}"' for kw in keywords_check)
+     optional_keywords_str = ' '.join(optional_keywords)
+     query = f'site:*linkedin.com/jobs/view/ "{company_name}" {keywords_str} {optional_keywords_str}'
+     logger.debug(f"Google search query: {query}")
+
+     # First Google search attempt
+     results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
+     if not isinstance(results, list) or len(results) == 0:
+         logger.info("No results found. Attempting fallback query without optional keywords.")
+         query = f'site:*linkedin.com/jobs/view/ "{company_name}" {keywords_str}'
+         results = await search_google_with_tools(query.strip(), 1, tool_config=tool_config)
+         if not isinstance(results, list) or len(results) == 0:
+             logger.info("No job postings found in fallback search either.")
+             return job_posting_links
+
+     # Process each search result
+     for result_item in results:
+         try:
+             result_json = json.loads(result_item)
+         except json.JSONDecodeError:
+             logger.debug("Skipping invalid JSON result.")
+             continue
+
+         link = result_json.get('link', '')
+         if not link:
+             logger.debug("No link in result; skipping.")
+             continue
+
+         if "linkedin.com/jobs/view/" not in link:
+             logger.debug("Link is not a LinkedIn job posting; skipping.")
+             continue
+
+         # Normalize the LinkedIn domain to www.linkedin.com
+         parsed = urlparse(link)
+         link = parsed._replace(netloc="www.linkedin.com").geturl()
+
+         # Use Proxycurl to enrich job info
+         logger.debug(f"Fetching job info from Proxycurl for link: {link}")
+         json_result = await enrich_job_info_from_proxycurl(link, tool_config=tool_config)
+         if not json_result or 'error' in json_result:
+             logger.debug("No valid job info returned; skipping.")
+             continue
+
+         text = json.dumps(json_result).lower()
+
+         # Compare the posting's company path against organization_linkedin_url;
+         # without an organization_linkedin_url there is nothing to match against.
+         company_match = False
+         if organization_linkedin_url and json_result.get('company', {}):
+             result_url = json_result.get('company', {}).get('url', '').lower()
+             result_path = urlparse(result_url).path
+             company_path = urlparse(organization_linkedin_url.lower()).path
+             company_match = (result_path == company_path)
+
+         keywords_found = any(kw in text for kw in keywords_list)
+
+         # If the company matches and keywords are found, add to results
+         if company_match and keywords_found:
+             job_posting_links.append(link)
+
+     logger.info(f"Found {len(job_posting_links)} matching job postings.")
+     return job_posting_links
+
+
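A sketch of a call to the finder above; note it also needs a Google search integration in tool_config, since candidate links come from search_google_with_tools:

import asyncio
from dhisana.utils.proxy_curl_tools import find_matching_job_posting_proxy_curl

links = asyncio.run(
    find_matching_job_posting_proxy_curl(
        company_name="Acme Corp",
        keywords_check=["python"],
        optional_keywords=["remote"],
        organization_linkedin_url="https://www.linkedin.com/company/acme/",
        tool_config=tool_config,
    )
)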
+ def fill_in_missing_properties(input_user_properties: dict, person_data: dict) -> dict:
+     """
+     Merge person_data into input_user_properties. Name, title, and headline
+     fields are refreshed from person_data whenever it has them; the remaining
+     fields are only filled in when the existing input value is empty.
+     """
+
+     def is_empty(value):
+         # Checks for None, empty string, or string with only whitespace
+         return value is None or (isinstance(value, str) and not value.strip())
+
+     # Email - use first personal email if input is empty
+     if is_empty(input_user_properties.get("email")):
+         personal_emails = person_data.get("personal_emails")
+         if isinstance(personal_emails, list) and personal_emails:
+             input_user_properties["email"] = personal_emails[0]
+
+     # Phone
+     if is_empty(input_user_properties.get("phone")):
+         input_user_properties["phone"] = person_data.get("contact", {}).get("sanitized_phone", "")
+
+     # Full name
+     if person_data.get("full_name"):
+         input_user_properties["full_name"] = person_data["full_name"]
+
+     # First name
+     if person_data.get("first_name"):
+         input_user_properties["first_name"] = person_data["first_name"]
+
+     # Last name
+     if person_data.get("last_name"):
+         input_user_properties["last_name"] = person_data["last_name"]
+
+     # Occupation -> job_title
+     if person_data.get("occupation"):
+         input_user_properties["job_title"] = person_data["occupation"]
+
+     # Headline
+     if person_data.get("headline"):
+         input_user_properties["headline"] = person_data["headline"]
+
+     # Summary
+     if is_empty(input_user_properties.get("summary_about_lead")) and person_data.get("summary"):
+         input_user_properties["summary_about_lead"] = person_data["summary"]
+
+     # Experiences
+     experiences = person_data.get("experiences", [])
+     if experiences:
+         # Current role data
+         input_user_properties["organization_name"] = experiences[0].get("company", "")
+
+         org_url = experiences[0].get("company_linkedin_profile_url", "")
+         if org_url and is_empty(input_user_properties.get("organization_linkedin_url")):
+             input_user_properties["organization_linkedin_url"] = org_url
+
+         # If there's a second experience, track it as previous
+         if len(experiences) > 1:
+             previous_org = experiences[1]
+             prev_org_url = previous_org.get("company_linkedin_profile_url", "")
+
+             if prev_org_url and is_empty(input_user_properties.get("previous_organization_linkedin_url")):
+                 input_user_properties["previous_organization_linkedin_url"] = prev_org_url
+
+             if is_empty(input_user_properties.get("previous_organization_name")):
+                 input_user_properties["previous_organization_name"] = previous_org.get("company", "")
+
+     # Combine city/state if available (and if lead_location is empty); avoid literal "None"
+     if is_empty(input_user_properties.get("lead_location")):
+         city = person_data.get("city")
+         state = person_data.get("state")
+         parts = []
+         for value in (city, state):
+             if value is None:
+                 continue
+             s = str(value).strip()
+             if not s or s.lower() == "none":
+                 continue
+             parts.append(s)
+         if parts:
+             input_user_properties["lead_location"] = ", ".join(parts)
+
+     # LinkedIn followers count
+     if is_empty(input_user_properties.get("linkedin_follower_count")):
+         input_user_properties["linkedin_follower_count"] = person_data.get("follower_count", 0)
+
+     return input_user_properties
+
+
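A small illustration of the precedence rules above (values invented):

from dhisana.utils.proxy_curl_tools import fill_in_missing_properties

props = {"email": "lead@acme.example", "job_title": "VP Sales"}
person = {"occupation": "Head of Sales", "personal_emails": ["jane@mail.example"],
          "city": "Austin", "state": None}
fill_in_missing_properties(props, person)
# email is kept (it was non-empty), job_title is refreshed to "Head of Sales",
# lead_location becomes "Austin" (the None state is skipped), and phone /
# follower count fall back to empty defaults.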
+ async def enrich_user_info_with_proxy_curl(input_user_properties: dict, tool_config: Optional[List[Dict]] = None) -> dict:
+     """
+     Enriches the user info (input_user_properties) with data from Proxycurl.
+     If the user_linkedin_url is determined to be a proxy (acw* and length > 10),
+     we skip calling enrich_person_info_from_proxycurl, keep the input as-is,
+     and only perform the organization enrichment logic.
+
+     Returns:
+         dict: Updated input_user_properties with enriched data, or
+             with an error field if something goes wrong.
+     """
+     logger.info("Entering enrich_user_info_with_proxy_curl")
+
+     if not input_user_properties:
+         logger.warning("No input_user_properties provided; returning empty dict.")
+         return {}
+
+     linkedin_url = input_user_properties.get("user_linkedin_url", "")
+     email = input_user_properties.get("email", "")
+     user_data_from_proxycurl = None
+
+     logger.debug(f"Attempting to enrich data for LinkedIn URL='{linkedin_url}', Email='{email}'")
+
+     # ---------------------------------------------------------------
+     # 1) Detect if the LinkedIn URL is a "proxy" URL (acw + length > 10)
+     # ---------------------------------------------------------------
+     def is_proxy_linkedin_url(url: str) -> bool:
+         """
+         Checks if the LinkedIn URL has an /in/<profile_id> path
+         that starts with 'acw' and has length > 10, indicating a proxy.
+         """
+         match = re.search(r"linkedin\.com/in/([^/]+)", url, re.IGNORECASE)
+         if match:
+             profile_id = match.group(1)
+             if profile_id.startswith("acw") and len(profile_id) > 10:
+                 return True
+         return False
+
+     if is_proxy_linkedin_url(linkedin_url):
+         logger.info("The LinkedIn URL appears to be a proxy URL. Skipping user data enrichment from Proxycurl.")
+         # We do NOT call enrich_person_info_from_proxycurl for user data.
+         # We just set linkedin_url_match = False and enrich organization info if possible:
+         input_user_properties["linkedin_url_match"] = False
+
+         # Attempt organization enrichment if we have an organization_linkedin_url:
+         company_data = {}
+         if input_user_properties.get("organization_linkedin_url"):
+             company_data = await enrich_organization_info_from_proxycurl(
+                 organization_linkedin_url=input_user_properties["organization_linkedin_url"],
+                 tool_config=tool_config
+             )
+             if company_data and not company_data.get("error"):
+                 if company_data.get("organization_linkedin_url"):
+                     input_user_properties["organization_linkedin_url"] = company_data.get("organization_linkedin_url", "")
+                 if company_data.get("organization_name"):
+                     input_user_properties["organization_name"] = company_data.get("organization_name", "")
+                 input_user_properties["organization_size"] = str(
+                     company_data.get("company_size_on_linkedin", "")
+                 )
+                 input_user_properties["company_size"] = str(
+                     company_data.get("company_size_on_linkedin", "")
+                 )
+                 input_user_properties["organization_industry"] = company_data.get("organization_industry", "")
+                 input_user_properties["industry"] = company_data.get("organization_industry", "")
+                 input_user_properties["organization_revenue"] = ""
+
+         # Always clean & store any returned org info:
+         additional_props = input_user_properties.get("additional_properties") or {}
+         company_data = cleanup_properties(company_data)
+         additional_props["pc_company_data"] = json.dumps(company_data)
+         input_user_properties["additional_properties"] = additional_props
+
+         logger.info("Returning after skipping user enrichment for proxy URL.")
+         return input_user_properties
+
+     # ----------------------------------------------------------------
+     # 2) If not a proxy, proceed with normal user enrichment logic
+     # ----------------------------------------------------------------
+     if linkedin_url or email:
+         user_data = await enrich_person_info_from_proxycurl(
+             linkedin_url=linkedin_url,
+             email=email,
+             tool_config=tool_config
+         )
+         if not user_data or 'error' in user_data:
+             logger.warning("No valid person data found by LinkedIn or email.")
+         else:
+             user_data_from_proxycurl = user_data
+             if linkedin_url:
+                 logger.info(f"User data found for LinkedIn URL: {linkedin_url}")
+                 input_user_properties["user_linkedin_url"] = linkedin_url
+     else:
+         # Otherwise, fall back to name-based lookup
+         first_name = input_user_properties.get("first_name", "")
+         last_name = input_user_properties.get("last_name", "")
+         full_name = input_user_properties.get("full_name", "")
+
+         if not first_name or not last_name:
+             if full_name:
+                 name_parts = full_name.split(" ", 1)
+                 first_name = first_name or name_parts[0]
+                 if len(name_parts) > 1:
+                     last_name = last_name or name_parts[1]
+
+         if not full_name:
+             full_name = f"{first_name} {last_name}".strip()
+
+         company = input_user_properties.get("organization_name", "")
+         logger.debug(f"Looking up person by name: {first_name} {last_name}, company: {company}")
+
+         if first_name and last_name:
+             lookup_result = await lookup_person_in_proxy_curl_by_name(
+                 first_name=first_name,
+                 last_name=last_name,
+                 company_name=company,
+                 tool_config=tool_config
+             )
+             # Expecting a dict (search result)
+             if lookup_result and not lookup_result.get('error'):
+                 results = lookup_result.get("results", [])
+                 person_company = ""
+                 for person in results:
+                     linkedin_profile_url = person.get("linkedin_profile_url", "")
+                     if linkedin_profile_url:
+                         data_from_proxycurl = await enrich_person_info_from_proxycurl(
+                             linkedin_url=linkedin_profile_url,
+                             tool_config=tool_config
+                         )
+                         if data_from_proxycurl and not data_from_proxycurl.get('error'):
+                             person_name = data_from_proxycurl.get("name", "").lower()
+                             person_first_name = data_from_proxycurl.get("first_name", "").lower()
+                             person_last_name = data_from_proxycurl.get("last_name", "").lower()
+                             experiences = data_from_proxycurl.get('experiences', [])
+                             for exp in experiences:
+                                 exp_company = exp.get("company", "").lower()
+                                 if exp_company == company.lower():
+                                     person_company = exp_company
+                                     break
+
+                             if (
+                                 (person_name == full_name.lower() or
+                                  (person_first_name == first_name.lower() and person_last_name == last_name.lower()))
+                                 and (not company or person_company == company.lower())
+                             ):
+                                 logger.info(f"User data found for name: {full_name}")
+                                 input_user_properties["user_linkedin_url"] = linkedin_profile_url
+                                 user_data_from_proxycurl = data_from_proxycurl
+                                 break
+
+     if not user_data_from_proxycurl:
+         logger.debug("No user data returned from Proxycurl.")
+         input_user_properties["linkedin_url_match"] = False
+         return input_user_properties
+
+     # ------------------------------------------------------------------
+     # 3) If user data was found, sanitize & fill user properties
+     # ------------------------------------------------------------------
+     url_pattern = re.compile(r'(https?://[^\s]+)', re.IGNORECASE)
+
+     def sanitize_urls_in_data(data):
+         """
+         Recursively walk through 'data' and remove any URL that is not under the linkedin.com domain.
+         """
+         if isinstance(data, dict):
+             sanitized = {}
+             for k, v in data.items():
+                 sanitized[k] = sanitize_urls_in_data(v)
+             return sanitized
+         elif isinstance(data, list):
+             return [sanitize_urls_in_data(item) for item in data]
+         elif isinstance(data, str):
+             def replace_non_linkedin(match):
+                 link = match.group(1)
+                 if "linkedin.com" not in (urlparse(link).netloc or ""):
+                     return ""
+                 return link
+             return re.sub(url_pattern, replace_non_linkedin, data)
+         return data
+
+     person_data = sanitize_urls_in_data(user_data_from_proxycurl)
+     additional_props = input_user_properties.get("additional_properties") or {}
+
+     # Check if there's a match on first/last name
+     first_matched = (
+         input_user_properties.get("first_name")
+         and person_data.get("first_name") == input_user_properties["first_name"]
+     )
+     last_matched = (
+         input_user_properties.get("last_name")
+         and person_data.get("last_name") == input_user_properties["last_name"]
+     )
+
+     if first_matched and last_matched:
+         input_user_properties["linkedin_url_match"] = True
+         input_user_properties["linkedin_validation_status"] = "valid"
+
+     input_user_properties = fill_in_missing_properties(input_user_properties, person_data)
+
+     # ------------------------------------------------------------------
+     # 4) Attempt organization enrichment if we have an org LinkedIn URL
+     # ------------------------------------------------------------------
+     company_data = {}
+     if input_user_properties.get("organization_linkedin_url"):
+         company_data = await enrich_organization_info_from_proxycurl(
+             organization_linkedin_url=input_user_properties["organization_linkedin_url"],
+             tool_config=tool_config
+         )
+         if company_data and not company_data.get("error"):
+             if company_data.get("organization_linkedin_url"):
+                 input_user_properties["organization_linkedin_url"] = company_data.get("organization_linkedin_url", "")
+             if company_data.get("organization_name"):
+                 input_user_properties["organization_name"] = company_data.get("organization_name", "")
+             input_user_properties["organization_size"] = str(
+                 company_data.get("company_size_on_linkedin", "")
+             )
+             input_user_properties["company_size"] = str(
+                 company_data.get("company_size_on_linkedin", "")
+             )
+             input_user_properties["company_size_list"] = company_data.get("company_size", "")
+             input_user_properties["organization_industry"] = company_data.get("organization_industry", "")
+             input_user_properties["industry"] = company_data.get("organization_industry", "")
+             input_user_properties["organization_revenue"] = ""
+
+     person_data = cleanup_properties(person_data)
+     additional_props["pc_person_data"] = json.dumps(person_data)
+
+     company_data = cleanup_properties(company_data)
+     additional_props["pc_company_data"] = json.dumps(company_data)
+     input_user_properties["additional_properties"] = additional_props
+
+     logger.info("Enrichment of user info with Proxycurl complete.")
+     return input_user_properties
+
+
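End-to-end, the orchestration above can be driven like this (all values placeholders):

import asyncio
from dhisana.utils.proxy_curl_tools import enrich_user_info_with_proxy_curl

lead = {
    "user_linkedin_url": "https://www.linkedin.com/in/some-profile/",
    "first_name": "Jane",
    "last_name": "Doe",
}
enriched = asyncio.run(enrich_user_info_with_proxy_curl(lead, tool_config=tool_config))
print(enriched.get("linkedin_url_match"), enriched.get("organization_name"))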
+ @assistant_tool
+ async def find_leads_by_job_openings_proxy_curl(
+     query_params: Dict[str, Any],
+     hiring_manager_roles: List[str],
+     tool_config: Optional[List[Dict]] = None,
+ ) -> List[Dict]:
+     """Search LinkedIn job postings using Proxycurl and find hiring manager leads.
+
+     Args:
+         query_params: Dictionary of parameters for the Proxycurl job search API. The
+             key ``job_title`` is required. Other keys like ``location`` may also
+             be supplied.
+         hiring_manager_roles: List of job titles to look up at the company for
+             potential hiring managers.
+         tool_config: Optional configuration containing Proxycurl credentials.
+
+     Returns:
+         A list of lead dictionaries with normalized keys such as
+         ``first_name``, ``last_name``, ``user_linkedin_url``,
+         ``organization_name``, and ``organization_linkedin_url``.
+     """
+     logger.info("Entering find_leads_by_job_openings_proxy_curl")
+
+     if not isinstance(query_params, dict) or not query_params.get("job_title"):
+         logger.warning("query_params must include 'job_title'")
+         return []
+
+     try:
+         API_KEY = get_proxycurl_access_token(tool_config)
+     except ValueError as e:
+         logger.error(str(e))
+         return []
+
+     headers = {
+         "Authorization": f"Bearer {API_KEY}",
+         "Content-Type": "application/json",
+     }
+
+     job_search_url = "https://enrichlayer.com/api/v2/company/job"
+     leads: List[Dict] = []
+
+     # ------------------------------------------------------------------
+     # 1) Look up job openings
+     # ------------------------------------------------------------------
+     try:
+         async with aiohttp.ClientSession() as session:
+             async with session.get(job_search_url, headers=headers, params=query_params) as resp:
+                 if resp.status == 200:
+                     job_result = await resp.json()
+                     jobs = job_result.get("results") or job_result.get("jobs") or []
+                 elif resp.status == 429:
+                     logger.warning("Rate limit exceeded on job search")
+                     await asyncio.sleep(30)
+                     return []
+                 else:
+                     error_text = await resp.text()
+                     logger.error("Job search error %s: %s", resp.status, error_text)
+                     return []
+     except Exception:
+         logger.exception("Exception while searching jobs on Proxycurl")
+         return []
+
+     # ------------------------------------------------------------------
+     # 2) For each job, find leads for the specified hiring manager roles
+     # ------------------------------------------------------------------
+     for job in jobs:
+         company = job.get("company", {}) if isinstance(job, dict) else {}
+         company_name = company.get("name", "")
+         company_url = company.get("url", "")
+         if not company_name:
+             continue
+
+         for role in hiring_manager_roles:
+             employee_params = {
+                 "url": company_url,
+                 "role_search": role,
+                 "employment_status": "current",
+                 "page_size": 1,
+             }
+             employees = []
+             try:
+                 async with aiohttp.ClientSession() as session:
+                     async with session.get(
+                         "https://enrichlayer.com/api/v2/company/employees",
+                         headers=headers,
+                         params=employee_params,
+                     ) as e_resp:
+                         if e_resp.status == 200:
+                             data = await e_resp.json()
+                             employees = data.get("employees") or data.get("profiles") or []
+                         elif e_resp.status == 429:
+                             logger.warning("Rate limit exceeded while fetching employees")
+                             await asyncio.sleep(30)
+                             continue
+             except Exception:
+                 logger.exception("Exception while fetching employees from Proxycurl")
+                 continue
+
+             for emp in employees:
+                 profile_url = emp.get("linkedin_profile_url") or emp.get("profile_url")
+                 if not profile_url:
+                     continue
+                 person = await enrich_person_info_from_proxycurl(
+                     linkedin_url=profile_url, tool_config=tool_config
+                 )
+                 if not person or person.get("error"):
+                     continue
+                 lead = {
+                     "first_name": person.get("first_name", ""),
+                     "last_name": person.get("last_name", ""),
+                     "full_name": person.get("full_name", ""),
+                     "user_linkedin_url": profile_url,
+                     "job_title": person.get("occupation", role),
+                     "organization_name": company_name,
+                     "organization_linkedin_url": company_url,
+                 }
+                 cleaned = cleanup_properties(lead)
+                 if cleaned:
+                     leads.append(cleaned)
+
+     logger.info("Returning %d leads from Proxycurl job search", len(leads))
+     return leads
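
Finally, a sketch of invoking the job-openings lead finder (parameter values illustrative):

import asyncio
from dhisana.utils.proxy_curl_tools import find_leads_by_job_openings_proxy_curl

leads = asyncio.run(
    find_leads_by_job_openings_proxy_curl(
        query_params={"job_title": "Data Engineer", "location": "United States"},
        hiring_manager_roles=["Engineering Manager", "Head of Data"],
        tool_config=tool_config,
    )
)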