dhisana 0.0.1.dev243__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. dhisana/__init__.py +1 -0
  2. dhisana/cli/__init__.py +1 -0
  3. dhisana/cli/cli.py +20 -0
  4. dhisana/cli/datasets.py +27 -0
  5. dhisana/cli/models.py +26 -0
  6. dhisana/cli/predictions.py +20 -0
  7. dhisana/schemas/__init__.py +1 -0
  8. dhisana/schemas/common.py +399 -0
  9. dhisana/schemas/sales.py +965 -0
  10. dhisana/ui/__init__.py +1 -0
  11. dhisana/ui/components.py +472 -0
  12. dhisana/utils/__init__.py +1 -0
  13. dhisana/utils/add_mapping.py +352 -0
  14. dhisana/utils/agent_tools.py +51 -0
  15. dhisana/utils/apollo_tools.py +1597 -0
  16. dhisana/utils/assistant_tool_tag.py +4 -0
  17. dhisana/utils/built_with_api_tools.py +282 -0
  18. dhisana/utils/cache_output_tools.py +98 -0
  19. dhisana/utils/cache_output_tools_local.py +78 -0
  20. dhisana/utils/check_email_validity_tools.py +717 -0
  21. dhisana/utils/check_for_intent_signal.py +107 -0
  22. dhisana/utils/check_linkedin_url_validity.py +209 -0
  23. dhisana/utils/clay_tools.py +43 -0
  24. dhisana/utils/clean_properties.py +135 -0
  25. dhisana/utils/company_utils.py +60 -0
  26. dhisana/utils/compose_salesnav_query.py +259 -0
  27. dhisana/utils/compose_search_query.py +759 -0
  28. dhisana/utils/compose_three_step_workflow.py +234 -0
  29. dhisana/utils/composite_tools.py +137 -0
  30. dhisana/utils/dataframe_tools.py +237 -0
  31. dhisana/utils/domain_parser.py +45 -0
  32. dhisana/utils/email_body_utils.py +72 -0
  33. dhisana/utils/email_parse_helpers.py +132 -0
  34. dhisana/utils/email_provider.py +375 -0
  35. dhisana/utils/enrich_lead_information.py +933 -0
  36. dhisana/utils/extract_email_content_for_llm.py +101 -0
  37. dhisana/utils/fetch_openai_config.py +129 -0
  38. dhisana/utils/field_validators.py +426 -0
  39. dhisana/utils/g2_tools.py +104 -0
  40. dhisana/utils/generate_content.py +41 -0
  41. dhisana/utils/generate_custom_message.py +271 -0
  42. dhisana/utils/generate_email.py +278 -0
  43. dhisana/utils/generate_email_response.py +465 -0
  44. dhisana/utils/generate_flow.py +102 -0
  45. dhisana/utils/generate_leads_salesnav.py +303 -0
  46. dhisana/utils/generate_linkedin_connect_message.py +224 -0
  47. dhisana/utils/generate_linkedin_response_message.py +317 -0
  48. dhisana/utils/generate_structured_output_internal.py +462 -0
  49. dhisana/utils/google_custom_search.py +267 -0
  50. dhisana/utils/google_oauth_tools.py +727 -0
  51. dhisana/utils/google_workspace_tools.py +1294 -0
  52. dhisana/utils/hubspot_clearbit.py +96 -0
  53. dhisana/utils/hubspot_crm_tools.py +2440 -0
  54. dhisana/utils/instantly_tools.py +149 -0
  55. dhisana/utils/linkedin_crawler.py +168 -0
  56. dhisana/utils/lusha_tools.py +333 -0
  57. dhisana/utils/mailgun_tools.py +156 -0
  58. dhisana/utils/mailreach_tools.py +123 -0
  59. dhisana/utils/microsoft365_tools.py +455 -0
  60. dhisana/utils/openai_assistant_and_file_utils.py +267 -0
  61. dhisana/utils/openai_helpers.py +977 -0
  62. dhisana/utils/openapi_spec_to_tools.py +45 -0
  63. dhisana/utils/openapi_tool/__init__.py +1 -0
  64. dhisana/utils/openapi_tool/api_models.py +633 -0
  65. dhisana/utils/openapi_tool/convert_openai_spec_to_tool.py +271 -0
  66. dhisana/utils/openapi_tool/openapi_tool.py +319 -0
  67. dhisana/utils/parse_linkedin_messages_txt.py +100 -0
  68. dhisana/utils/profile.py +37 -0
  69. dhisana/utils/proxy_curl_tools.py +1226 -0
  70. dhisana/utils/proxycurl_search_leads.py +426 -0
  71. dhisana/utils/python_function_to_tools.py +83 -0
  72. dhisana/utils/research_lead.py +176 -0
  73. dhisana/utils/sales_navigator_crawler.py +1103 -0
  74. dhisana/utils/salesforce_crm_tools.py +477 -0
  75. dhisana/utils/search_router.py +131 -0
  76. dhisana/utils/search_router_jobs.py +51 -0
  77. dhisana/utils/sendgrid_tools.py +162 -0
  78. dhisana/utils/serarch_router_local_business.py +75 -0
  79. dhisana/utils/serpapi_additional_tools.py +290 -0
  80. dhisana/utils/serpapi_google_jobs.py +117 -0
  81. dhisana/utils/serpapi_google_search.py +188 -0
  82. dhisana/utils/serpapi_local_business_search.py +129 -0
  83. dhisana/utils/serpapi_search_tools.py +852 -0
  84. dhisana/utils/serperdev_google_jobs.py +125 -0
  85. dhisana/utils/serperdev_local_business.py +154 -0
  86. dhisana/utils/serperdev_search.py +233 -0
  87. dhisana/utils/smtp_email_tools.py +582 -0
  88. dhisana/utils/test_connect.py +2087 -0
  89. dhisana/utils/trasform_json.py +173 -0
  90. dhisana/utils/web_download_parse_tools.py +189 -0
  91. dhisana/utils/workflow_code_model.py +5 -0
  92. dhisana/utils/zoominfo_tools.py +357 -0
  93. dhisana/workflow/__init__.py +1 -0
  94. dhisana/workflow/agent.py +18 -0
  95. dhisana/workflow/flow.py +44 -0
  96. dhisana/workflow/task.py +43 -0
  97. dhisana/workflow/test.py +90 -0
  98. dhisana-0.0.1.dev243.dist-info/METADATA +43 -0
  99. dhisana-0.0.1.dev243.dist-info/RECORD +102 -0
  100. dhisana-0.0.1.dev243.dist-info/WHEEL +5 -0
  101. dhisana-0.0.1.dev243.dist-info/entry_points.txt +2 -0
  102. dhisana-0.0.1.dev243.dist-info/top_level.txt +1 -0
@@ -0,0 +1,107 @@
1
+ import datetime
2
+ import logging
3
+ from typing import Any, Dict, List, Optional, cast
4
+
5
+ from pydantic import BaseModel
6
+ from dhisana.utils.generate_structured_output_internal import get_structured_output_internal
7
+ from dhisana.utils.compose_search_query import (
8
+ get_search_results_for_insights
9
+ )
10
+
11
+ logger = logging.getLogger(__name__)
12
+ logging.basicConfig(level=logging.INFO)
13
+
14
+
15
class IntentSignalScoring(BaseModel):
    """Structured response schema the LLM must return when scoring a lead for an intent signal."""
    # Confidence score between 0 and 5 for the requested intent signal.
    score_based_on_intent_signal: int
    # Short justification, mainly populated when a high score is given.
    reasoning_for_score_being_high: str
    # Summary of the lead (role, experience, tenure, etc.) and their current company.
    summary_of_lead_and_company: str
19
+
20
+
21
async def check_for_intent_signal(
    lead: Dict[str, Any],
    signal_to_look_for_in_plan_english: str,
    intent_signal_type: str,
    add_search_results: Optional[bool] = False,
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> int:
    """
    Evaluate a 'lead' for a specific intent signal and return an integer score from 0–5.

    Args:
        lead: Lead record (e.g. with 'full_name'). Mutated in place: the keys
            'qualification_score', 'qualification_reason' and
            'summary_about_lead' are set from the LLM response.
        signal_to_look_for_in_plan_english: Plain-English description of the
            signal the user wants to qualify on.
        intent_signal_type: Type/category label of the intent signal.
        add_search_results: When True, fetch online search results for the lead
            and include them in the scoring prompt.
        tool_config: Optional tool configuration forwarded to helper calls.

    Returns:
        The integer score (0–5) produced by the LLM.

    Raises:
        Exception: If the structured-output LLM call does not succeed.
    """
    logger.info("check_for_intent_signal called with lead=%s, intent_signal_type=%s", lead.get("full_name"), intent_signal_type)

    search_results_text = ""
    if add_search_results:
        logger.info("Fetching search results for lead='%s' with signal='%s'", lead.get("full_name"), intent_signal_type)
        search_results = await get_search_results_for_insights(
            lead=lead,
            english_description=signal_to_look_for_in_plan_english,
            intent_signal_type=intent_signal_type,
            tool_config=tool_config
        )
        logger.info("Received search results count: %d", len(search_results))

        for item in search_results:
            query_str = item.get("query", "")
            results_str = item.get("results", "")
            logger.info("Search query: %s", query_str)
            logger.info("Search results snippet: %s", results_str[:100])  # Show partial snippet
            search_results_text += f"Query: {query_str}\nResults: {results_str}\n\n"

    # Fix: removed a stray `datetime.datetime.now().isoformat()` call whose
    # result was discarded — it had no effect.
    # Fix: prompt previously referenced 'reasoning_for_score_bing_high', which
    # mismatched the IntentSignalScoring field name and could mislead the LLM;
    # also corrected prompt typos (qualifying/Think/location).
    user_prompt = f"""
    Hi AI Assistant,
    You are an expert in scoring leads based on intent signals.
    You have the following lead and user requirements to provide a qualifying lead score between 0 and 5
    based on the intent signal the user is looking for.
    Do the following step by step:
    1. Think about the summary of the lead and the company lead is working for.
    2. Create a summary of the search results obtained.
    3. Think about the signal user is looking for to qualify and score the lead.
    4. Use the lead information, summary of search results and signal user is looking for to score the lead.
    5. Go back and check if the score makes sense. Score between 0-5 based on the confidence of the signal.

    Lead Data:
    {lead}

    Description of the signal user is looking for:
    {signal_to_look_for_in_plan_english}

    Following is some search results I found online. Use them if they are relevant for scoring:
    {search_results_text}


    Return your answer in valid JSON with the key 'score_based_on_intent_signal'.
    Make sure it is an integer between 0 and 5.
    Add small reasoning_for_score_being_high describing why you gave the score score_based_on_intent_signal as high if you are giving high score.
    in summary_of_lead_and_company field provide a summary of the lead (like role, experience, tenure, location) and details about the company lead is working for currently.
    """
    logger.info("Constructed user prompt for LLM.")

    response_any, status = await get_structured_output_internal(
        user_prompt,
        IntentSignalScoring,
        effort="low",
        tool_config=tool_config
    )
    logger.info("Intent signal scoring call completed with status=%s", status)

    if status != "SUCCESS" or response_any is None:
        logger.error("Failed to generate an intent signal score from the LLM.")
        raise Exception("Failed to generate an intent signal score from the LLM.")

    response = cast(IntentSignalScoring, response_any)
    score = response.score_based_on_intent_signal
    reasoning = response.reasoning_for_score_being_high[:100]  # Show partial if very long
    lead["qualification_score"] = score
    lead["qualification_reason"] = response.reasoning_for_score_being_high
    lead["summary_about_lead"] = response.summary_of_lead_and_company

    logger.info(
        "Lead '%s' scored %d for intent signal '%s'. Reason partial: %s",
        lead.get("full_name", "Unknown"),
        score,
        intent_signal_type,
        reasoning
    )
    return score
@@ -0,0 +1,209 @@
1
+ import re
2
+ from typing import Dict, List, Optional, Any
3
+ from pydantic import BaseModel
4
+ from dhisana.utils.apollo_tools import enrich_person_info_from_apollo
5
+ from dhisana.utils.assistant_tool_tag import assistant_tool
6
+ from dhisana.utils.proxy_curl_tools import enrich_person_info_from_proxycurl
7
+
8
+ # --------------------------------------------------------------------------------
9
+ # 1. Data Model
10
+ # --------------------------------------------------------------------------------
11
+
12
class LeadLinkedInMatch(BaseModel):
    """Flags describing how well a lead's fields match an enriched LinkedIn profile."""
    first_name_matched: bool = False
    last_name_matched: bool = False
    # True when the enrichment provider returned any data for the URL.
    linkedin_url_valid: bool = False
    title_matched: bool = False
    location_matched: bool = False
18
+
19
+ # --------------------------------------------------------------------------------
20
+ # 2. Helper: Compare Single Field
21
+ # --------------------------------------------------------------------------------
22
+
23
def compare_field(
    lead_properties: Dict[str, Any],
    person_data: Dict[str, Any],
    lead_key: str,
    person_key: str
) -> bool:
    """Compare one lead field against the corresponding enriched-person field.

    A missing or falsy value on the lead side counts as a match by default.
    Two strings are compared case-insensitively after trimming whitespace;
    any other types are compared with plain equality.
    """
    expected = lead_properties.get(lead_key, "")
    if not expected:
        # Lead has no value to compare against; treat as matched.
        return True

    actual = person_data.get(person_key, "")

    both_strings = isinstance(expected, str) and isinstance(actual, str)
    if both_strings:
        return expected.strip().lower() == actual.strip().lower()

    return actual == expected
40
+
41
+ # --------------------------------------------------------------------------------
42
+ # 3. Apollo Validation Function
43
+ # --------------------------------------------------------------------------------
44
+
45
@assistant_tool
async def validate_linkedin_url_with_apollo(
    lead_properties: Dict[str, Any],
    tool_config: Optional[List[Dict]] = None
) -> Dict[str, bool]:
    """
    Validates the LinkedIn URL and user information using the Apollo API.

    Args:
        lead_properties (dict): Contains keys like:
            'first_name', 'last_name', 'job_title', 'lead_location', 'user_linkedin_url'.
        tool_config (Optional[List[Dict]]): Contains configuration for the Apollo tool.

    Returns:
        Dict[str, bool]: A dictionary with matching status:
            {
                "first_name_matched": bool,
                "last_name_matched": bool,
                "linkedin_url_valid": bool,
                "title_matched": bool,
                "location_matched": bool
            }
    """
    result = LeadLinkedInMatch()
    url = lead_properties.get("user_linkedin_url", "")

    enriched = await enrich_person_info_from_apollo(
        linkedin_url=url,
        tool_config=tool_config
    )
    if not enriched:
        # Apollo returned nothing: every flag stays at its default (False).
        return result.model_dump()

    person = enriched.get("person", {})

    # (result attribute, lead-side key, Apollo person key)
    field_pairs = [
        ("first_name_matched", "first_name", "first_name"),
        ("last_name_matched", "last_name", "last_name"),
        ("title_matched", "job_title", "title"),
        ("location_matched", "lead_location", "location"),
    ]
    for attr, lead_key, person_key in field_pairs:
        setattr(result, attr, compare_field(lead_properties, person, lead_key, person_key))

    # Receiving any payload at all means the URL resolved to a real profile.
    result.linkedin_url_valid = True

    return result.model_dump()
91
+
92
@assistant_tool
async def validate_linkedin_url_with_proxy_curl(
    lead_properties: Dict[str, Any],
    tool_config: Optional[List[Dict]] = None
) -> Dict[str, bool]:
    """
    Validates the LinkedIn URL and user information using the Proxy Curl API.

    Args:
        lead_properties (dict): Contains keys like:
            'first_name', 'last_name', 'job_title', 'lead_location', 'user_linkedin_url'.
        tool_config (Optional[List[Dict]]): Contains configuration for the Apollo tool.

    Returns:
        Dict[str, bool]: A dictionary with matching status:
            {
                "first_name_matched": bool,
                "last_name_matched": bool,
                "linkedin_url_valid": bool,
                "title_matched": bool,
                "location_matched": bool
            }
    """
    result = LeadLinkedInMatch()
    url = lead_properties.get("user_linkedin_url", "")

    enriched = await enrich_person_info_from_proxycurl(
        linkedin_url=url,
        tool_config=tool_config
    )
    if not enriched:
        # Proxycurl returned nothing: every flag stays at its default (False).
        return result.model_dump()

    # Proxycurl returns the person record at the top level (no 'person' wrapper).
    person = enriched

    # (result attribute, lead-side key, Proxycurl key)
    # NOTE: location comparison is intentionally skipped for Proxycurl, so
    # 'location_matched' keeps its default of False.
    field_pairs = [
        ("first_name_matched", "first_name", "first_name"),
        ("last_name_matched", "last_name", "last_name"),
        ("title_matched", "job_title", "occupation"),
    ]
    for attr, lead_key, person_key in field_pairs:
        setattr(result, attr, compare_field(lead_properties, person, lead_key, person_key))

    # Receiving any payload at all means the URL resolved to a real profile.
    result.linkedin_url_valid = True

    return result.model_dump()
138
+
139
+ # --------------------------------------------------------------------------------
140
+ # 4. High-Level Validation Router
141
+ # --------------------------------------------------------------------------------
142
+
143
# Tool names permitted for LinkedIn URL validation.
# NOTE(review): "zoominfo" is allowed here but has no entry in the function map
# below, so it can never actually be selected — confirm whether a zoominfo
# validator is planned.
ALLOWED_CHECK_LINKEDIN_TOOLS = ["apollo", "proxycurl", "zoominfo"]
# Maps a tool name found in tool_config to its validation coroutine.
LINKEDIN_VALIDATE_TOOL_NAME_TO_FUNCTION_MAP = {
    "apollo": validate_linkedin_url_with_apollo,
    "proxycurl": validate_linkedin_url_with_proxy_curl
}
148
+
149
def is_proxy_linkedin_url(url: str) -> bool:
    """Return True when a LinkedIn URL looks "proxy-like".

    A URL qualifies when its /in/<profile_id> segment starts with 'acw'
    and the profile id is longer than 10 characters.
    """
    found = re.search(r"linkedin\.com/in/([^/]+)", url, re.IGNORECASE)
    if not found:
        return False

    profile_id = found.group(1).strip()
    return profile_id.startswith("acw") and len(profile_id) > 10
160
+
161
@assistant_tool
async def check_linkedin_url_validity(
    lead_properties: Dict[str, Any],
    tool_config: Optional[List[Dict]] = None
) -> Dict[str, bool]:
    """
    Validates LinkedIn URL (and related fields) by choosing the appropriate tool
    from the tool_config. If the LinkedIn URL is detected as a "proxy" URL,
    we skip calling any external tool and directly return 'linkedin_url_valid' = True.

    Args:
        lead_properties (dict): Lead info (e.g. first_name, last_name, job_title,
                                lead_location, user_linkedin_url).
        tool_config (Optional[List[Dict]]): Configuration to identify which tool is available.

    Returns:
        Dict[str, bool]: Standardized response from the chosen validation function.

    Raises:
        ValueError: If no tool configuration or no suitable validation tool is found.
    """
    if not tool_config:
        raise ValueError("No tool configuration found.")

    # Proxy-style URLs skip external validation entirely: report the URL as
    # valid and leave every other match flag at its default of False.
    if is_proxy_linkedin_url(lead_properties.get("user_linkedin_url", "")):
        proxy_result = LeadLinkedInMatch()
        proxy_result.linkedin_url_valid = True
        return proxy_result.model_dump()

    # Otherwise pick the first configured tool that is both allowed and has
    # an implementation in the function map.
    validator = None
    for entry in tool_config:
        entry_name = entry.get("name")
        is_known = entry_name in LINKEDIN_VALIDATE_TOOL_NAME_TO_FUNCTION_MAP
        if is_known and entry_name in ALLOWED_CHECK_LINKEDIN_TOOLS:
            validator = LINKEDIN_VALIDATE_TOOL_NAME_TO_FUNCTION_MAP[entry_name]
            break

    if not validator:
        raise ValueError("No suitable LinkedIn validation tool found in tool_config.")

    return await validator(lead_properties, tool_config)
@@ -0,0 +1,43 @@
1
+ import aiohttp
2
+ import logging
3
+ from typing import Optional
4
+ from dhisana.utils.assistant_tool_tag import assistant_tool
5
+
6
@assistant_tool
async def push_to_clay_table(
    data: dict,
    webhook: Optional[str] = None,
    api_key: Optional[str] = None,
):
    """
    Push data to the Clay webhook.

    Parameters:
    - **data** (*dict*): Data to send to the webhook.
    - **webhook** (*str*, optional): The webhook URL.
    - **api_key** (*str*, optional): The authentication token.

    Returns:
    - **dict**: Response message or error.
    """
    # Guard clauses: both the auth token and the target URL are required.
    if not api_key:
        return {
            'error': "Clay integration is not configured. Please configure the connection to Clay in Integrations."
        }
    if not webhook:
        return {'error': "Webhook URL not provided"}

    request_headers = {
        "Content-Type": "application/json",
        # Clay authenticates webhook posts via this custom header.
        "x-clay-webhook-auth": api_key,
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(webhook, headers=request_headers, json=data) as response:
            body = await response.text()
            if response.status != 200:
                logging.warning(f"push_to_clay_table failed: {body}")
                return {'error': body}
            return {'message': body}
@@ -0,0 +1,135 @@
1
+ from typing import Any, Dict, List
2
+ import copy
3
+ from typing import Any, Dict, List, Optional
4
+
5
+
6
+
7
def remove_empty(data: Any) -> Any:
    """
    Recursively strip null-like values from JSON/dict data.

    Drops None, empty/whitespace-only strings, the string 'null' (any case),
    and empty lists/dicts. Collapses to None when nothing survives, so callers
    can tell "fully empty" apart from "has content".
    """
    if isinstance(data, dict):
        pruned: Dict[str, Any] = {}
        for key, value in data.items():
            kept = remove_empty(value)
            if kept is not None:
                pruned[key] = kept
        # An all-empty mapping collapses to None.
        return pruned or None

    if isinstance(data, list):
        pruned_items: List[Any] = [
            kept for kept in map(remove_empty, data) if kept is not None
        ]
        # An all-empty sequence collapses to None.
        return pruned_items or None

    # Scalar case: filter None and null-ish strings; pass everything else
    # through untouched (including 0 and False, which are meaningful values).
    if data is None:
        return None
    if isinstance(data, str) and (not data.strip() or data.lower() == "null"):
        return None
    return data
45
+
46
+
47
def cleanup_properties(properties: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a cleaned copy of *properties* with null-like values removed.

    Always yields a dict: when everything is pruned away an empty dict is
    returned rather than None. The input mapping is not mutated.
    """
    result = remove_empty(properties)
    if result is None:
        return {}
    return result
53
+
54
+
55
+
56
+
57
def cleanup_email_context(user_properties: Dict[str, Any]) -> Dict[str, Any]:
    """
    Produce a sanitized deep copy of *user_properties* for email composition.

    - Recursively drops null/empty values and ID/GUID-like keys.
    - Blanks external_known_data.external_openai_vector_store_id when present,
      so the vector-store reference never leaks into the prompt context.
    The original mapping is left untouched.
    """
    sanitized = copy.deepcopy(user_properties)

    known_data = sanitized.get('external_known_data')
    if isinstance(known_data, dict) and 'external_openai_vector_store_id' in known_data:
        known_data['external_openai_vector_store_id'] = None

    pruned = _remove_empty_and_ids(sanitized)
    return pruned if pruned is not None else {}
72
+
73
def _remove_empty_and_ids(data: Any) -> Optional[Any]:
    """
    Recursive cleaner backing cleanup_email_context.

    Removes None, empty/'null' strings, empty containers, and any dict key
    that _is_id_key flags as identifier-like. Collapses to None when nothing
    remains at a given level.
    """
    if isinstance(data, dict):
        kept_dict: Dict[str, Any] = {}
        for key, value in data.items():
            if _is_id_key(key):
                # Identifier-like keys are stripped wholesale.
                continue
            candidate = _remove_empty_and_ids(value)
            if not _is_empty_value(candidate):
                kept_dict[key] = candidate
        return kept_dict or None

    if isinstance(data, list):
        kept_list: List[Any] = [
            candidate
            for candidate in (_remove_empty_and_ids(item) for item in data)
            if not _is_empty_value(candidate)
        ]
        return kept_list or None

    # Scalar leaf: keep unless it is considered empty.
    return None if _is_empty_value(data) else data
104
+
105
+ def _is_id_key(key: str) -> bool:
106
+ """
107
+ Identify if a key is ID-like by checking if 'id' or 'guid' appears in its name (case-insensitive),
108
+ or if it ends with _id, _ids, or _by.
109
+ """
110
+ key_lower = key.lower()
111
+ return (
112
+ 'id' in key_lower
113
+ or 'guid' in key_lower
114
+ or key_lower.endswith('_id')
115
+ or key_lower.endswith('_ids')
116
+ or key_lower.endswith('_by')
117
+ )
118
+
119
+ def _is_empty_value(value: Any) -> bool:
120
+ """
121
+ Determine if a value is considered "empty" for removal.
122
+ This includes:
123
+ - None
124
+ - Empty string
125
+ - String "null" (case-insensitive)
126
+ - Empty list or dict
127
+ """
128
+ if value is None:
129
+ return True
130
+ if isinstance(value, str):
131
+ if not value.strip() or value.lower() == "null":
132
+ return True
133
+ if isinstance(value, (list, dict)) and len(value) == 0:
134
+ return True
135
+ return False
@@ -0,0 +1,60 @@
1
+ import re
2
def normalize_company_name(name: str) -> str:
    """
    Normalize a company name while preserving the letter case of surviving
    characters.

    Returns '' when the input is invalid, is a common placeholder (e.g.
    'none', 'na'), or contains a disallowed keyword such as 'freelance' or
    'startup'. Otherwise strips parenthesized text, drops everything after
    the first '|', removes non-alphanumeric characters (whitespace kept),
    and trims the result.
    """
    # Reject non-strings and empty input early.
    if not isinstance(name, str) or not name:
        return ""

    lowered = name.strip().lower()

    # Placeholder values that effectively mean "no company".
    placeholders = {
        "null", "none", "na", "n.a", "notfound", "error",
        "notavilable", "notavailable", ""
    }
    if lowered in placeholders:
        return ""

    # Any of these substrings anywhere disqualifies the whole name
    # (list deliberately includes common misspellings like 'sealth startup').
    blocked = (
        "freelance",
        "freelancer",
        "consulting",
        "not working",
        "taking break",
        "startup",
        "stealth startup",
        "sealth startup",
    )
    if any(term in lowered for term in blocked):
        return ""

    # Drop parenthesized segments, then everything after the first pipe.
    stripped = re.sub(r"\(.*?\)", "", name)
    stripped = stripped.split("|", 1)[0]

    # Keep only letters, digits and whitespace; original case is preserved.
    cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", stripped)

    return cleaned.strip()