dhisana 0.0.1.dev243__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. dhisana/__init__.py +1 -0
  2. dhisana/cli/__init__.py +1 -0
  3. dhisana/cli/cli.py +20 -0
  4. dhisana/cli/datasets.py +27 -0
  5. dhisana/cli/models.py +26 -0
  6. dhisana/cli/predictions.py +20 -0
  7. dhisana/schemas/__init__.py +1 -0
  8. dhisana/schemas/common.py +399 -0
  9. dhisana/schemas/sales.py +965 -0
  10. dhisana/ui/__init__.py +1 -0
  11. dhisana/ui/components.py +472 -0
  12. dhisana/utils/__init__.py +1 -0
  13. dhisana/utils/add_mapping.py +352 -0
  14. dhisana/utils/agent_tools.py +51 -0
  15. dhisana/utils/apollo_tools.py +1597 -0
  16. dhisana/utils/assistant_tool_tag.py +4 -0
  17. dhisana/utils/built_with_api_tools.py +282 -0
  18. dhisana/utils/cache_output_tools.py +98 -0
  19. dhisana/utils/cache_output_tools_local.py +78 -0
  20. dhisana/utils/check_email_validity_tools.py +717 -0
  21. dhisana/utils/check_for_intent_signal.py +107 -0
  22. dhisana/utils/check_linkedin_url_validity.py +209 -0
  23. dhisana/utils/clay_tools.py +43 -0
  24. dhisana/utils/clean_properties.py +135 -0
  25. dhisana/utils/company_utils.py +60 -0
  26. dhisana/utils/compose_salesnav_query.py +259 -0
  27. dhisana/utils/compose_search_query.py +759 -0
  28. dhisana/utils/compose_three_step_workflow.py +234 -0
  29. dhisana/utils/composite_tools.py +137 -0
  30. dhisana/utils/dataframe_tools.py +237 -0
  31. dhisana/utils/domain_parser.py +45 -0
  32. dhisana/utils/email_body_utils.py +72 -0
  33. dhisana/utils/email_parse_helpers.py +132 -0
  34. dhisana/utils/email_provider.py +375 -0
  35. dhisana/utils/enrich_lead_information.py +933 -0
  36. dhisana/utils/extract_email_content_for_llm.py +101 -0
  37. dhisana/utils/fetch_openai_config.py +129 -0
  38. dhisana/utils/field_validators.py +426 -0
  39. dhisana/utils/g2_tools.py +104 -0
  40. dhisana/utils/generate_content.py +41 -0
  41. dhisana/utils/generate_custom_message.py +271 -0
  42. dhisana/utils/generate_email.py +278 -0
  43. dhisana/utils/generate_email_response.py +465 -0
  44. dhisana/utils/generate_flow.py +102 -0
  45. dhisana/utils/generate_leads_salesnav.py +303 -0
  46. dhisana/utils/generate_linkedin_connect_message.py +224 -0
  47. dhisana/utils/generate_linkedin_response_message.py +317 -0
  48. dhisana/utils/generate_structured_output_internal.py +462 -0
  49. dhisana/utils/google_custom_search.py +267 -0
  50. dhisana/utils/google_oauth_tools.py +727 -0
  51. dhisana/utils/google_workspace_tools.py +1294 -0
  52. dhisana/utils/hubspot_clearbit.py +96 -0
  53. dhisana/utils/hubspot_crm_tools.py +2440 -0
  54. dhisana/utils/instantly_tools.py +149 -0
  55. dhisana/utils/linkedin_crawler.py +168 -0
  56. dhisana/utils/lusha_tools.py +333 -0
  57. dhisana/utils/mailgun_tools.py +156 -0
  58. dhisana/utils/mailreach_tools.py +123 -0
  59. dhisana/utils/microsoft365_tools.py +455 -0
  60. dhisana/utils/openai_assistant_and_file_utils.py +267 -0
  61. dhisana/utils/openai_helpers.py +977 -0
  62. dhisana/utils/openapi_spec_to_tools.py +45 -0
  63. dhisana/utils/openapi_tool/__init__.py +1 -0
  64. dhisana/utils/openapi_tool/api_models.py +633 -0
  65. dhisana/utils/openapi_tool/convert_openai_spec_to_tool.py +271 -0
  66. dhisana/utils/openapi_tool/openapi_tool.py +319 -0
  67. dhisana/utils/parse_linkedin_messages_txt.py +100 -0
  68. dhisana/utils/profile.py +37 -0
  69. dhisana/utils/proxy_curl_tools.py +1226 -0
  70. dhisana/utils/proxycurl_search_leads.py +426 -0
  71. dhisana/utils/python_function_to_tools.py +83 -0
  72. dhisana/utils/research_lead.py +176 -0
  73. dhisana/utils/sales_navigator_crawler.py +1103 -0
  74. dhisana/utils/salesforce_crm_tools.py +477 -0
  75. dhisana/utils/search_router.py +131 -0
  76. dhisana/utils/search_router_jobs.py +51 -0
  77. dhisana/utils/sendgrid_tools.py +162 -0
  78. dhisana/utils/serarch_router_local_business.py +75 -0
  79. dhisana/utils/serpapi_additional_tools.py +290 -0
  80. dhisana/utils/serpapi_google_jobs.py +117 -0
  81. dhisana/utils/serpapi_google_search.py +188 -0
  82. dhisana/utils/serpapi_local_business_search.py +129 -0
  83. dhisana/utils/serpapi_search_tools.py +852 -0
  84. dhisana/utils/serperdev_google_jobs.py +125 -0
  85. dhisana/utils/serperdev_local_business.py +154 -0
  86. dhisana/utils/serperdev_search.py +233 -0
  87. dhisana/utils/smtp_email_tools.py +582 -0
  88. dhisana/utils/test_connect.py +2087 -0
  89. dhisana/utils/trasform_json.py +173 -0
  90. dhisana/utils/web_download_parse_tools.py +189 -0
  91. dhisana/utils/workflow_code_model.py +5 -0
  92. dhisana/utils/zoominfo_tools.py +357 -0
  93. dhisana/workflow/__init__.py +1 -0
  94. dhisana/workflow/agent.py +18 -0
  95. dhisana/workflow/flow.py +44 -0
  96. dhisana/workflow/task.py +43 -0
  97. dhisana/workflow/test.py +90 -0
  98. dhisana-0.0.1.dev243.dist-info/METADATA +43 -0
  99. dhisana-0.0.1.dev243.dist-info/RECORD +102 -0
  100. dhisana-0.0.1.dev243.dist-info/WHEEL +5 -0
  101. dhisana-0.0.1.dev243.dist-info/entry_points.txt +2 -0
  102. dhisana-0.0.1.dev243.dist-info/top_level.txt +1 -0
dhisana/utils/extract_email_content_for_llm.py (new file)
@@ -0,0 +1,101 @@
+from typing import Any, Dict
+import base64
+import re
+
+def decode_base64(data: str) -> str:
+    """
+    Decode a base64- or web-safe-base64-encoded string.
+    """
+    if not data:
+        return ""
+    missing_padding = len(data) % 4
+    if missing_padding:
+        data += '=' * (4 - missing_padding)
+    return base64.urlsafe_b64decode(data).decode('utf-8', errors='ignore')
+
+def html_to_text(html: str) -> str:
+    """
+    (Optional) Convert HTML to plain text using a simple regex.
+    This is not bulletproof for all HTML, but often fine for short email bodies.
+    """
+    text = re.sub(r'<br\s*/?>', '\n', html, flags=re.IGNORECASE)  # <br> to newline
+    text = re.sub(r'<.*?>', '', text)  # remove remaining tags
+    return text.strip()
+
+def get_text_content(payload: Dict[str, Any]) -> str:
+    """
+    Recursively extract 'text/plain' content from the Gmail message payload.
+    If no text/plain is found, fall back to 'text/html' (converted to plain text).
+    """
+    # If there's a direct 'parts' list, we may need to walk each part.
+    if 'parts' in payload:
+        extracted = []
+        for part in payload['parts']:
+            extracted.append(get_text_content(part))
+        return "\n".join(filter(None, extracted))
+
+    # If this part has a mimeType and a body, try to decode.
+    mime_type = payload.get('mimeType', '')
+    body_data = payload.get('body', {}).get('data', '')
+
+    # If it's text/plain, decode base64 and return.
+    if mime_type == 'text/plain' and body_data:
+        return decode_base64(body_data)
+
+    # If it's text/html (and we haven't returned text/plain yet), fall back to it.
+    if mime_type == 'text/html' and body_data:
+        html_content = decode_base64(body_data)
+        return html_to_text(html_content)
+
+    return ""
+
+def trim_repeated_quoted_lines(text: str) -> str:
+    """
+    (Optional) Try to remove repeated or quoted content from replies.
+    This is a naive approach; real-world heuristics can get quite complicated.
+    """
+    # Common patterns: lines starting with ">"
+    # or lines starting with "On <date>, <someone> wrote:"
+    lines = text.splitlines()
+    filtered_lines = []
+    for line in lines:
+        if line.startswith(">"):
+            continue
+        # You can add more heuristics for removing signature blocks or repeated disclaimers.
+        filtered_lines.append(line)
+    return "\n".join(filtered_lines).strip()
+
+def extract_email_content_for_llm(email_details: Dict[str, Any]) -> str:
+    """
+    Cleans up, extracts, and formats the relevant text content from a single Gmail message.
+    If you want the entire thread, call the Gmail API for all messages in the thread and
+    combine them. This function handles one message in detail, recursively extracting
+    text from multiple MIME parts.
+    """
+    if not email_details or 'payload' not in email_details:
+        return "No valid email details found."
+
+    # Extract basic headers
+    headers_map = {h['name']: h['value'] for h in email_details['payload'].get('headers', [])}
+
+    sender = headers_map.get('From', 'Unknown Sender')
+    receiver = headers_map.get('To', 'Unknown Receiver')
+    date = headers_map.get('Date', 'Unknown Date')
+    subject = headers_map.get('Subject', 'No Subject')
+
+    # Recursively get text from payload
+    body = get_text_content(email_details['payload'])
+
+    # Optionally remove some repeated lines
+    body = trim_repeated_quoted_lines(body)
+
+    # Format final string
+    formatted_content = (
+        f"From: {sender}\n"
+        f"To: {receiver}\n"
+        f"Date: {date}\n"
+        f"Subject: {subject}\n\n"
+        f"{body}"
+    )
+
+    return formatted_content
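A minimal usage sketch for the module above, assuming a message dict shaped like a Gmail API users.messages.get(format="full") response. The addresses and body text are made up, and _b64 is a local helper added here only to build the web-safe base64 bodies the way Gmail returns them:

import base64

from dhisana.utils.extract_email_content_for_llm import extract_email_content_for_llm

def _b64(text: str) -> str:
    # Gmail serves web-safe base64, often without padding; decode_base64 re-adds the padding.
    return base64.urlsafe_b64encode(text.encode("utf-8")).decode("ascii").rstrip("=")

# Hypothetical single-message payload mirroring the shape the functions above expect.
message = {
    "payload": {
        "mimeType": "multipart/alternative",
        "headers": [
            {"name": "From", "value": "Alice <alice@example.com>"},
            {"name": "To", "value": "bob@example.com"},
            {"name": "Date", "value": "Mon, 1 Jan 2024 09:00:00 -0800"},
            {"name": "Subject", "value": "Quick question"},
        ],
        "parts": [
            {"mimeType": "text/plain",
             "body": {"data": _b64("Hi Bob,\n\nAre we still on for Thursday?\n> old quoted reply")}},
            {"mimeType": "text/html",
             "body": {"data": _b64("<p>Hi Bob,<br/>Are we still on for Thursday?</p>")}},
        ],
    }
}

print(extract_email_content_for_llm(message))
# Prints the From/To/Date/Subject header block, then the decoded bodies of both MIME parts
# (get_text_content joins every non-empty part), with the quoted ">" line stripped.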
dhisana/utils/fetch_openai_config.py (new file)
@@ -0,0 +1,129 @@
+"""
+Unified OpenAI / Azure OpenAI helper (no env-fallback for secrets)
+=================================================================
+
+Resolution order
+----------------
+1. If `tool_config` has an **"openai"** block      → public OpenAI
+2. Else if it has an **"azure_openai"** block      → Azure OpenAI
+3. Otherwise                                       → raise ValueError
+
+`api_key` **and** `endpoint` (for Azure) must therefore be supplied in
+`tool_config`. They will never be read from the host environment.
+
+Optional:
+• `AZURE_OPENAI_API_VERSION` – defaults to 2025-03-01-preview
+"""
+
+from __future__ import annotations
+
+import os
+from typing import Dict, List, Optional, Tuple, Union
+
+from openai import AsyncOpenAI, OpenAI, AzureOpenAI, AsyncAzureOpenAI
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# 1. Helpers: config parsing
+# ─────────────────────────────────────────────────────────────────────────────
+
+def _extract_config(
+    tool_config: Optional[List[Dict]], provider_name: str
+) -> Dict[str, str]:
+    """Return the config map for the requested provider name, else {}."""
+    if not tool_config:
+        return {}
+    block = next((b for b in tool_config if b.get("name") == provider_name), {})
+    return {entry["name"]: entry["value"] for entry in block.get("configuration", []) if entry}
+
+
+def _discover_credentials(
+    tool_config: Optional[List[Dict]] = None,
+) -> Tuple[str, str, Optional[str]]:
+    """
+    Return (provider, api_key, endpoint_or_None).
+
+    provider ∈ {"public", "azure"}
+    """
+    # 1️⃣ Public OpenAI
+    openai_cfg = _extract_config(tool_config, "openai")
+    if openai_cfg:
+        key = openai_cfg.get("apiKey")
+        if not key:
+            raise ValueError(
+                "OpenAI integration is not configured. Please configure the connection to OpenAI in Integrations."
+            )
+        return "public", key, None
+
+    # 2️⃣ Azure OpenAI
+    azure_cfg = _extract_config(tool_config, "azure_openai")
+    if azure_cfg:
+        key = azure_cfg.get("apiKey")
+        endpoint = azure_cfg.get("endpoint")
+        if not key or not endpoint:
+            raise ValueError(
+                "Azure OpenAI integration is not configured. Please configure the connection to Azure OpenAI in Integrations."
+            )
+        return "azure", key, endpoint
+
+    # 3️⃣ Neither block present → error
+    raise ValueError(
+        "OpenAI integration is not configured. Please configure the connection to OpenAI in Integrations."
+    )
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# 2. Client factories
+# ─────────────────────────────────────────────────────────────────────────────
+
+def _api_version() -> str:
+    """Return the Azure API version (env-controlled, no secret)."""
+    return os.getenv("AZURE_OPENAI_API_VERSION", "2025-03-01-preview")
+
+
+def create_openai_client(
+    tool_config: Optional[List[Dict]] = None,
+) -> Union[OpenAI, AzureOpenAI]:
+    """
+    Return a *synchronous* client:
+      • openai.OpenAI      – public service
+      • openai.AzureOpenAI – Azure
+    """
+    provider, key, endpoint = _discover_credentials(tool_config)
+
+    if provider == "public":
+        return OpenAI(api_key=key)
+
+    # Azure
+    return AzureOpenAI(api_key=key, azure_endpoint=endpoint, api_version=_api_version())
+
+
+def create_async_openai_client(
+    tool_config: Optional[List[Dict]] = None,
+) -> Union[AsyncOpenAI, AsyncAzureOpenAI]:
+    """
+    Return an *async* client (AsyncOpenAI or AsyncAzureOpenAI).
+
+    For Azure we pass both `azure_endpoint` and `api_version`.
+    """
+    provider, key, endpoint = _discover_credentials(tool_config)
+
+    if provider == "public":
+        return AsyncOpenAI(api_key=key)
+
+    return AsyncAzureOpenAI(
+        api_key=key,
+        azure_endpoint=endpoint,
+        api_version=_api_version(),
+    )
+
+
+
+# ─────────────────────────────────────────────────────────────────────────────
+# 3. Convenience helper (legacy)
+# ─────────────────────────────────────────────────────────────────────────────
+
+def get_openai_access_token(tool_config: Optional[List[Dict]] = None) -> str:
+    """Return just the API key (legacy helper)."""
+    _, key, _ = _discover_credentials(tool_config)
+    return key
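The expected `tool_config` shape is implied by `_extract_config` above: a list of provider blocks, each carrying a `name` plus a `configuration` list of name/value entries. A short sketch under that assumption; the keys and the endpoint below are placeholders, not working credentials:

from dhisana.utils.fetch_openai_config import (
    create_openai_client,
    create_async_openai_client,
)

# Hypothetical provider blocks; only the layout (name + configuration list) follows _extract_config.
openai_tool_config = [
    {
        "name": "openai",
        "configuration": [
            {"name": "apiKey", "value": "sk-placeholder"},  # placeholder secret
        ],
    }
]

azure_tool_config = [
    {
        "name": "azure_openai",
        "configuration": [
            {"name": "apiKey", "value": "azure-placeholder-key"},                    # placeholder secret
            {"name": "endpoint", "value": "https://example-resource.openai.azure.com"},  # placeholder endpoint
        ],
    }
]

client = create_openai_client(openai_tool_config)         # openai.OpenAI
azure_client = create_openai_client(azure_tool_config)    # openai.AzureOpenAI
aclient = create_async_openai_client(openai_tool_config)  # AsyncOpenAI

# Passing an empty or unrelated tool_config raises ValueError
# ("OpenAI integration is not configured...").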
dhisana/utils/field_validators.py (new file)
@@ -0,0 +1,426 @@
+import logging
+from urllib.parse import urlparse
+import urllib.parse
+import re
+
+from email_validator import validate_email, EmailNotValidError
+from fqdn import FQDN
+
+logger = logging.getLogger(__name__)
+
+# -------------------------------------------------------------------------------------
+# Utility sets and patterns
+# -------------------------------------------------------------------------------------
+PLACEHOLDER_EMAILS = {
+    "test@example.com",
+    "test@test.com",
+    "test@domain.com",
+    "user@domain.com",
+    "user@example.com",
+    "user@yourdomain.com",
+    "no-reply@example.com",
+    "no-reply@domain.com",
+    "no-reply@yourdomain.com",
+    "admin@domain.com",
+    "contact@domain.com",
+    "info@domain.com",
+    "none@none.com",
+    "none@domain.com",
+    "noemail@noemail.com",
+    "test@fake.com",
+    "test@demo.com",
+    "test@testing.com",
+    "test@local.com",
+    "fake@fake.com",
+    "email@email.com",
+    "asdf@asdf.com",
+    "qwerty@qwerty.com",
+    "xxx@xxx.com",
+    "aaaa@aaaa.com",
+    "nomail@nomail.com",
+    "dontreply@dontreply.com",
+    "asdasd@asdasd.com",
+    "abcdefg@abcdefg.com",
+    "123@123.com",
+    "test123@test.com",
+}
+
+DISPOSABLE_DOMAINS = {
+    "mailinator.com",
+    "10minutemail.com",
+    "yopmail.com",
+    "guerrillamail.com",
+    "tempmail.com",
+    "fakemailgenerator.com",
+    "mytrashmail.com",
+    "getnada.com",
+    "throwawaymail.com",
+    "sharklasers.com",
+    "maildrop.cc",
+    "discard.email",
+    "temporaryemail.com",
+    "trashmail.com",
+    "mohmal.com",
+    "mail.tm",
+    "mailsac.com",
+    "mailcatch.com",
+    "temp-mail.org",
+    "emailondeck.com",
+    "mailinabox.email",
+    "spambog.com",
+    "mintemail.com",
+    "spam4.me",
+    "spambox.us",
+    "edumail.rocks",
+    "getairmail.com",
+    "mailnesia.com",
+    "spoofmail.de",
+    "dropmail.me",
+    "tempmailaddress.com",
+    "33mail.com",
+    "incognitomail.com",
+    "tempemail.co",
+    "trbvm.com",
+    "online.ms",
+    "20mail.in",
+    "wavee.net",
+    "ephemeral.email",
+    "bccto.me",
+    "cuvox.de",
+    "dispostable.com",
+    "easytrashmail.com",
+    "email-fake.org",
+    "emailtemporario.com.br",
+    "fleckens.hu",
+    "lroid.com",
+    "mail-temporaire.fr",
+    "mailate.com",
+    "mailfever.com",
+    "mailforspam.com",
+    "mailfreeonline.com",
+    "mailhazard.com",
+    "mailimate.com",
+    "mailin8r.com",
+    "mailincubator.com",
+    "mailmoat.com",
+    "mailzilla.org",
+    "notsharingmy.info",
+    "objectmail.com",
+    "proxymail.eu",
+    "spamdecoy.net",
+    "spamfree24.org",
+    "spamgourmet.com",
+    "spamify.com",
+    "spamomatic.com",
+    "spamspot.com",
+    "superrito.com",
+    "teleworm.us",
+    "trash-amil.com",
+    "trashmail.me",
+    "trashmail.net",
+    "wegwerfemail.de",
+    "wh4f.org",
+    "zmail.ru",
+    # Add more as needed
+}
+
+SPAMMY_PATTERN = re.compile(r"(.)\1{3,}")  # e.g., 4+ repeated characters
+
+BLOCKED_DOMAINS = {
+    # URL shorteners and link forwarders
+    "bit.ly",
+    "tinyurl.com",
+    "t.co",
+    "ow.ly",
+    "is.gd",
+    "cutt.ly",
+    "bit.do",
+    "buff.ly",
+    "rebrand.ly",
+    "rebrandly.com",
+    "snip.ly",
+    "shorte.st",
+    "soo.gd",
+    "shorturl.at",
+    "adf.ly",
+    # Bio / profile link aggregators
+    "linktr.ee",
+    "linktree.com",
+    "linkin.bio",
+    "campsite.bio",
+    "bio.link",
+    "bio.site",
+    "bio.fm",
+    "milkshake.app",
+    "lnk.bio",
+    "withkoji.com",
+    "about.me",
+    "carrd.co",
+    # Large social platforms (block if used as “organization domain”)
+    "facebook.com",
+    "instagram.com",
+    "linkedin.com",
+    "youtube.com",
+    "yelp.com",
+    "twitter.com",
+    "tiktok.com",
+    "pinterest.com",
+    "reddit.com",
+    "snapchat.com",
+    "tumblr.com",
+    "vimeo.com",
+    "flickr.com",
+    "wechat.com",
+    "qq.com",
+}
+
+
+# -------------------------------------------------------------------------------------
+# Helper: Check FQDN validity
+# -------------------------------------------------------------------------------------
+def is_valid_fqdn(domain: str) -> bool:
+    """
+    Returns True if `domain` is a syntactically valid Fully Qualified Domain Name.
+    """
+    try:
+        if not domain or not isinstance(domain, str):
+            return False
+        fqdn_obj = FQDN(domain)
+        return fqdn_obj.is_valid
+    except Exception:
+        return False
+
+
+# -------------------------------------------------------------------------------------
+# Domain validation for organizations
+# -------------------------------------------------------------------------------------
+def validation_organization_domain(domain: str) -> str:
+    """
+    1. Lowercases/strips the input domain string.
+    2. Checks if the domain is in (or a subdomain of) a blocked set.
+    3. Checks if the domain is a valid FQDN.
+    4. Returns '' if blocked or invalid, otherwise returns the normalized domain.
+    """
+    if not domain or not isinstance(domain, str):
+        return ""
+
+    domain = domain.strip().lower()
+    # If domain exactly matches OR is a subdomain of any blocked domain
+    if any(domain == blocked or domain.endswith(f".{blocked}") for blocked in BLOCKED_DOMAINS):
+        return ""
+
+    # Otherwise, confirm valid FQDN
+    return domain if is_valid_fqdn(domain) else ""
+
+
+# -------------------------------------------------------------------------------------
+# Email validation & cleaning
+# -------------------------------------------------------------------------------------
+def validate_and_clean_email(email: str) -> str:
+    """
+    Return a validated, normalized email string or '' if invalid/unwanted.
+    Checks:
+      1. Syntax / deliverability via email_validator
+      2. Against placeholder/fake emails
+      3. Against disposable email domains
+      4. Spammy repeated character patterns
+    """
+    if not email or not isinstance(email, str):
+        return ""
+
+    try:
+        v = validate_email(email, check_deliverability=True)
+        # Canonical form via attribute access; dict-style v["email"] is not supported by email_validator 2.x.
+        normalized_email = v.normalized if hasattr(v, "normalized") else v.email
+        local_part, domain_part = normalized_email.rsplit("@", 1)
+
+        # 1. Check entire address in placeholder set
+        if normalized_email.lower() in PLACEHOLDER_EMAILS:
+            return ""
+
+        # 2. Check domain in disposable set
+        if domain_part.lower() in DISPOSABLE_DOMAINS:
+            return ""
+
+        # 3. Check repeated/spammy pattern
+        if SPAMMY_PATTERN.search(normalized_email):
+            return ""
+
+        return normalized_email
+    except EmailNotValidError:
+        return ""
+
+
+# -------------------------------------------------------------------------------------
+# Website URL validation
+# -------------------------------------------------------------------------------------
+def validate_website_url(raw_url: str) -> str:
+    """
+    Validate a website URL (must be http/https or a raw domain).
+    If no scheme is provided but the input is a valid FQDN-like string
+    (e.g. www.google.com), automatically prefix https://.
+    Return the normalized URL without query/fragment, or '' if invalid.
+    """
+    if not raw_url or not isinstance(raw_url, str):
+        return ""
+
+    # Clean input
+    raw_url = raw_url.strip().lower()
+
+    try:
+        parsed = urllib.parse.urlparse(raw_url)
+
+        # If there's no scheme, try prefixing https://
+        if not parsed.scheme:
+            # Example: "www.google.com" => "https://www.google.com"
+            potential_url = f"https://{raw_url}"
+            test_parsed = urllib.parse.urlparse(potential_url)
+
+            # If that yields a valid scheme and netloc, use it
+            if test_parsed.scheme in ["http", "https"] and test_parsed.netloc:
+                parsed = test_parsed
+            else:
+                return ""
+
+        # Check we now have a valid scheme and netloc
+        if parsed.scheme not in ["http", "https"] or not parsed.netloc:
+            return ""
+
+        # Normalize by removing query and fragment parts
+        normalized = urllib.parse.urlunparse(
+            (parsed.scheme, parsed.netloc, parsed.path, "", "", "")
+        )
+        return normalized
+
+    except Exception:
+        return ""
+
+
+# -------------------------------------------------------------------------------------
+# LinkedIn URL Normalizers
+# -------------------------------------------------------------------------------------
+def normalize_linkedin_url(raw_url: str) -> str:
+    """
+    Normalize a personal LinkedIn URL to the form: https://www.linkedin.com/in/<something>
+    Must contain '/in/'. Otherwise, return ''.
+    """
+    if not raw_url or not isinstance(raw_url, str):
+        return ""
+
+    try:
+        raw_url = raw_url.strip()
+        parsed = urlparse(raw_url)
+
+        if not parsed.scheme or not parsed.netloc:
+            return ""
+
+        if "linkedin.com" not in parsed.netloc.lower():
+            return ""
+
+        url = raw_url.rstrip("/")
+        parsed = urlparse(url)
+
+        if "/in/" not in parsed.path.lower():
+            return ""
+
+        path = parsed.path.lstrip("/")
+        return f"https://www.linkedin.com/{path}".rstrip("/")
+    except Exception:
+        return ""
+
+
+def normalize_linkedin_company_url(raw_url: str) -> str:
+    """
+    Normalize a company LinkedIn URL to the form: https://www.linkedin.com/company/<something>
+    Must contain '/company/'. Otherwise, return ''.
+    """
+    if not raw_url or not isinstance(raw_url, str):
+        return ""
+
+    try:
+        raw_url = raw_url.strip()
+        parsed = urlparse(raw_url)
+
+        if not parsed.scheme or not parsed.netloc:
+            return ""
+
+        if "linkedin.com" not in parsed.netloc.lower():
+            return ""
+
+        url = raw_url.rstrip("/")
+        parsed = urlparse(url)
+
+        if "/company/" not in parsed.path.lower():
+            return ""
+
+        path = parsed.path.lstrip("/")
+        return f"https://www.linkedin.com/{path}".rstrip("/")
+    except Exception:
+        return ""
+
+
+def normalize_linkedin_company_salesnav_url(raw_url: str) -> str:
+    """
+    Normalize a company Sales Navigator URL to: https://www.linkedin.com/sales/company/<something>
+    Must contain '/sales/company/'. Otherwise, return ''.
+    """
+    if not raw_url or not isinstance(raw_url, str):
+        return ""
+
+    try:
+        raw_url = raw_url.strip()
+        parsed = urlparse(raw_url)
+
+        if not parsed.scheme or not parsed.netloc:
+            return ""
+
+        if "linkedin.com" not in parsed.netloc.lower():
+            return ""
+
+        url = raw_url.rstrip("/")
+        parsed = urlparse(url)
+
+        if "/sales/company/" not in parsed.path.lower():
+            return ""
+
+        path = parsed.path.lstrip("/")
+        return f"https://www.linkedin.com/{path}".rstrip("/")
+    except Exception:
+        return ""
+
+
+def normalize_salesnav_url(raw_url: str) -> str:
+    """
+    Normalize a Sales Navigator URL to: https://www.linkedin.com/sales/lead/<something>
+    Must contain '/sales/lead/'. Otherwise, return ''.
+    Strips anything after a comma in the URL.
+    """
+    if not raw_url or not isinstance(raw_url, str):
+        return ""
+
+    try:
+        raw_url = raw_url.strip()
+        parsed_initial = urlparse(raw_url)
+
+        if not parsed_initial.scheme or not parsed_initial.netloc:
+            return ""
+
+        if "linkedin.com" not in parsed_initial.netloc.lower():
+            return ""
+
+        # Remove trailing slash
+        url = raw_url.rstrip("/")
+
+        # Strip anything after the first comma
+        comma_idx = url.find(",")
+        if comma_idx != -1:
+            url = url[:comma_idx]
+
+        parsed = urlparse(url)
+
+        if "/sales/lead/" not in parsed.path.lower():
+            return ""
+
+        path = parsed.path.lstrip("/")
+        return f"https://www.linkedin.com/{path}".rstrip("/")
+    except Exception:
+        return ""