dhisana 0.0.1.dev243__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. dhisana/__init__.py +1 -0
  2. dhisana/cli/__init__.py +1 -0
  3. dhisana/cli/cli.py +20 -0
  4. dhisana/cli/datasets.py +27 -0
  5. dhisana/cli/models.py +26 -0
  6. dhisana/cli/predictions.py +20 -0
  7. dhisana/schemas/__init__.py +1 -0
  8. dhisana/schemas/common.py +399 -0
  9. dhisana/schemas/sales.py +965 -0
  10. dhisana/ui/__init__.py +1 -0
  11. dhisana/ui/components.py +472 -0
  12. dhisana/utils/__init__.py +1 -0
  13. dhisana/utils/add_mapping.py +352 -0
  14. dhisana/utils/agent_tools.py +51 -0
  15. dhisana/utils/apollo_tools.py +1597 -0
  16. dhisana/utils/assistant_tool_tag.py +4 -0
  17. dhisana/utils/built_with_api_tools.py +282 -0
  18. dhisana/utils/cache_output_tools.py +98 -0
  19. dhisana/utils/cache_output_tools_local.py +78 -0
  20. dhisana/utils/check_email_validity_tools.py +717 -0
  21. dhisana/utils/check_for_intent_signal.py +107 -0
  22. dhisana/utils/check_linkedin_url_validity.py +209 -0
  23. dhisana/utils/clay_tools.py +43 -0
  24. dhisana/utils/clean_properties.py +135 -0
  25. dhisana/utils/company_utils.py +60 -0
  26. dhisana/utils/compose_salesnav_query.py +259 -0
  27. dhisana/utils/compose_search_query.py +759 -0
  28. dhisana/utils/compose_three_step_workflow.py +234 -0
  29. dhisana/utils/composite_tools.py +137 -0
  30. dhisana/utils/dataframe_tools.py +237 -0
  31. dhisana/utils/domain_parser.py +45 -0
  32. dhisana/utils/email_body_utils.py +72 -0
  33. dhisana/utils/email_parse_helpers.py +132 -0
  34. dhisana/utils/email_provider.py +375 -0
  35. dhisana/utils/enrich_lead_information.py +933 -0
  36. dhisana/utils/extract_email_content_for_llm.py +101 -0
  37. dhisana/utils/fetch_openai_config.py +129 -0
  38. dhisana/utils/field_validators.py +426 -0
  39. dhisana/utils/g2_tools.py +104 -0
  40. dhisana/utils/generate_content.py +41 -0
  41. dhisana/utils/generate_custom_message.py +271 -0
  42. dhisana/utils/generate_email.py +278 -0
  43. dhisana/utils/generate_email_response.py +465 -0
  44. dhisana/utils/generate_flow.py +102 -0
  45. dhisana/utils/generate_leads_salesnav.py +303 -0
  46. dhisana/utils/generate_linkedin_connect_message.py +224 -0
  47. dhisana/utils/generate_linkedin_response_message.py +317 -0
  48. dhisana/utils/generate_structured_output_internal.py +462 -0
  49. dhisana/utils/google_custom_search.py +267 -0
  50. dhisana/utils/google_oauth_tools.py +727 -0
  51. dhisana/utils/google_workspace_tools.py +1294 -0
  52. dhisana/utils/hubspot_clearbit.py +96 -0
  53. dhisana/utils/hubspot_crm_tools.py +2440 -0
  54. dhisana/utils/instantly_tools.py +149 -0
  55. dhisana/utils/linkedin_crawler.py +168 -0
  56. dhisana/utils/lusha_tools.py +333 -0
  57. dhisana/utils/mailgun_tools.py +156 -0
  58. dhisana/utils/mailreach_tools.py +123 -0
  59. dhisana/utils/microsoft365_tools.py +455 -0
  60. dhisana/utils/openai_assistant_and_file_utils.py +267 -0
  61. dhisana/utils/openai_helpers.py +977 -0
  62. dhisana/utils/openapi_spec_to_tools.py +45 -0
  63. dhisana/utils/openapi_tool/__init__.py +1 -0
  64. dhisana/utils/openapi_tool/api_models.py +633 -0
  65. dhisana/utils/openapi_tool/convert_openai_spec_to_tool.py +271 -0
  66. dhisana/utils/openapi_tool/openapi_tool.py +319 -0
  67. dhisana/utils/parse_linkedin_messages_txt.py +100 -0
  68. dhisana/utils/profile.py +37 -0
  69. dhisana/utils/proxy_curl_tools.py +1226 -0
  70. dhisana/utils/proxycurl_search_leads.py +426 -0
  71. dhisana/utils/python_function_to_tools.py +83 -0
  72. dhisana/utils/research_lead.py +176 -0
  73. dhisana/utils/sales_navigator_crawler.py +1103 -0
  74. dhisana/utils/salesforce_crm_tools.py +477 -0
  75. dhisana/utils/search_router.py +131 -0
  76. dhisana/utils/search_router_jobs.py +51 -0
  77. dhisana/utils/sendgrid_tools.py +162 -0
  78. dhisana/utils/serarch_router_local_business.py +75 -0
  79. dhisana/utils/serpapi_additional_tools.py +290 -0
  80. dhisana/utils/serpapi_google_jobs.py +117 -0
  81. dhisana/utils/serpapi_google_search.py +188 -0
  82. dhisana/utils/serpapi_local_business_search.py +129 -0
  83. dhisana/utils/serpapi_search_tools.py +852 -0
  84. dhisana/utils/serperdev_google_jobs.py +125 -0
  85. dhisana/utils/serperdev_local_business.py +154 -0
  86. dhisana/utils/serperdev_search.py +233 -0
  87. dhisana/utils/smtp_email_tools.py +582 -0
  88. dhisana/utils/test_connect.py +2087 -0
  89. dhisana/utils/trasform_json.py +173 -0
  90. dhisana/utils/web_download_parse_tools.py +189 -0
  91. dhisana/utils/workflow_code_model.py +5 -0
  92. dhisana/utils/zoominfo_tools.py +357 -0
  93. dhisana/workflow/__init__.py +1 -0
  94. dhisana/workflow/agent.py +18 -0
  95. dhisana/workflow/flow.py +44 -0
  96. dhisana/workflow/task.py +43 -0
  97. dhisana/workflow/test.py +90 -0
  98. dhisana-0.0.1.dev243.dist-info/METADATA +43 -0
  99. dhisana-0.0.1.dev243.dist-info/RECORD +102 -0
  100. dhisana-0.0.1.dev243.dist-info/WHEEL +5 -0
  101. dhisana-0.0.1.dev243.dist-info/entry_points.txt +2 -0
  102. dhisana-0.0.1.dev243.dist-info/top_level.txt +1 -0
@@ -0,0 +1,759 @@
1
+ import logging
2
+ import os
3
+ import json
4
+ import re
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ import aiohttp
8
+ import asyncio
9
+ from bs4 import BeautifulSoup
10
+ from pydantic import BaseModel
11
+
12
+ # If these are your local imports, leave them as is. Otherwise adjust paths as needed.
13
+ from dhisana.utils.company_utils import normalize_company_name
14
+ from dhisana.utils.generate_structured_output_internal import get_structured_output_internal
15
+ from dhisana.utils.cache_output_tools import cache_output, retrieve_output
16
+
17
+ logger = logging.getLogger(__name__)
18
+ logging.basicConfig(level=logging.INFO)
19
+
20
+
21
class GoogleSearchQuery(BaseModel):
    """
    Structured-output schema for Google search query generation.

    Used as the response format for get_structured_output_internal() in
    generate_google_search_queries(); the prompt asks the LLM for exactly
    three query strings.
    """
    # The generated Google search query strings (the prompt requests three).
    google_search_queries: List[str]
27
+
28
+
29
async def generate_google_search_queries(
    lead: Dict[str, Any],
    english_description: str,
    intent_signal_type: str,
    example_query: str = "",
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> Dict[str, Any]:
    """
    Generate three Google search queries based on a plain-English description,
    incorporating the following logic:
      1. First consider searching LinkedIn and the organization's own website for relevant info.
      2. Then consider searching Instagram, Twitter, Github, Yelp, Crunchbase, Bloomberg,
         or reputable news/financial sites for relevant qualification info.
      3. If lead["primary_domain_of_organization"] is not empty, ALWAYS include one query
         that searches the domain (site:<primary_domain_of_organization>).
      4. Make sure lead["organization_name"] is part of every query.

    Args:
        lead: Dictionary containing information about the lead, including 'organization_name'.
        english_description: The user's plain-English description.
        intent_signal_type: A string indicating the intent signal type.
        example_query: Optional user-provided example.
        tool_config: Optional list of dictionaries containing tool configuration.

    Returns:
        A dictionary with a single key: "google_search_queries", mapping to a list of
        search query strings (three from the LLM, plus an appended domain query
        when a primary domain is known and not already covered).

    Raises:
        Exception: If the structured-output call fails or returns no response.
    """
    # Pull out relevant values. Use `or ""` so an explicit None value in the
    # lead dict does not crash the .strip() call.
    org_name = (lead.get("organization_name") or "").strip()
    org_name = normalize_company_name(org_name)
    primary_domain = (lead.get("primary_domain_of_organization") or "").strip()

    system_message = (
        "You are a helpful AI Assistant that converts an English description of search requirements "
        "into valid Google search queries.\n\n"
        "Important instructions:\n"
        "1. Always include the organization name in every query.\n"
        "2. First consider ways to use LinkedIn or the company's own website to gather info.\n"
        "3. Then consider how Google can leverage Instagram, Twitter, Github, Yelp, Crunchbase, Bloomberg, "
        " or reputable news/financial sites to figure out relevant info for qualification.\n"
        "4. You MUST generate exactly three Google search queries. No extra commentary.\n"
        "5. If you're unsure about a filter, make your best guess or omit it.\n"
        f"6. Primary domain of organization is: {primary_domain}\n\n"
        f"7. Organization name is: {org_name}\n"
        "8. In any site:linkedin.com search, make sure intitle:<organization_name> is present.\n\n"
        "Output must be valid JSON with the structure:\n"
        "{\n"
        '  "google_search_queries": ["search query1", "search query2", "search query3"]\n'
        "}"
    )

    # Few-shot examples to steer the LLM towards well-formed Google operators.
    # (Typo fix: "ssearch" -> "search".)
    few_shot_example_queries_lines = [
        'Examples (like Neo4j used in company):',
        f'- site:linkedin.com/in "{org_name}" "Neo4j" intitle:"{org_name}" -intitle:Neo4j -intitle:"profiles" ',
        'Other examples to search by title, news etc',
        f'- site:linkedin.com/in "{org_name}" "Data Engineer" intitle:"{org_name}" -intitle:"profiles" ',
        f'- site:linkedin.com/jobs/view/ "{org_name}" "hiring" "angular developer" intitle:"{org_name}"',
        f'- site:news.google.com "{org_name}" "funding" OR "acquisition" OR "partnership"',
        f'- site:crunchbase.com "{org_name}" "funding"',
        f'- site:bloomberg.com "{org_name}" "financial news"'
    ]
    if primary_domain:
        few_shot_example_queries_lines.append(f'- site:{primary_domain} Job Openings')
        few_shot_example_queries_lines.append(f'- site:{primary_domain} Case Studies')
    few_shot_example_queries_lines.append(f'- "{org_name}" "competitors" OR "versus" OR "vs" "market share" "compare"')

    few_shot_example_queries = "\n".join(few_shot_example_queries_lines)

    user_prompt = f"""
{system_message}

The user wants to build Google search queries for:
"{english_description}"

Some example queries:
{few_shot_example_queries}

Lead info:
{json.dumps(lead, indent=2)}

Example query (if provided):
{example_query}

Intent signal type:
{intent_signal_type}

Please generate exactly three queries in JSON format as:
{{
    "google_search_queries": ["query1", "query2", "query3"]
}}
Remember to include "{org_name}" in each query.
"""

    logger.info("Generating Google search queries from description: %s", english_description)

    # Call the structured-output helper to get a GoogleSearchQuery instance.
    response, status = await get_structured_output_internal(
        user_prompt,
        GoogleSearchQuery,
        tool_config=tool_config
    )

    if status != "SUCCESS" or not response:
        raise Exception("Error generating the Google search queries.")

    queries_dict = response.model_dump()

    # Ensure that each query includes org_name. Queries scoped to the
    # organization's own domain are exempt -- but only when a primary domain
    # is actually known; otherwise the startswith('site:') prefix would
    # incorrectly exempt EVERY site: query from the org-name rule.
    # The domain prefix comparison is case-insensitive to match q.lower().
    domain_prefix = f'site:{primary_domain}'.lower()
    fixed_queries = []
    for q in queries_dict["google_search_queries"]:
        is_domain_scoped = bool(primary_domain) and q.lower().startswith(domain_prefix)
        if org_name and org_name.lower() not in q.lower() and not is_domain_scoped:
            q = f'{q} "{org_name}"'
        fixed_queries.append(q.strip())

    queries_dict["google_search_queries"] = fixed_queries

    # Ensure the domain-based query is included if primary_domain is present.
    if primary_domain:
        domain_query = f'site:{primary_domain}'
        if all(domain_query.lower() not in x.lower() for x in queries_dict["google_search_queries"]):
            queries_dict["google_search_queries"].append(domain_query)

    logger.info("Search queries to be returned: %s", queries_dict["google_search_queries"])
    return queries_dict
155
+
156
+
157
async def get_search_results_for_insights(
    lead: Dict[str, Any],
    english_description: str,
    intent_signal_type: str,
    example_query: str = "",
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> List[Dict[str, Any]]:
    """
    Fetch Google search results that provide insight signals for a lead.

    For recognized intent signal types, specialized queries are composed
    directly (e.g., searching LinkedIn for job postings with specific
    roles/technologies). Otherwise, generate_google_search_queries() is used
    to get up to four Google queries, and search_google() is called for each
    query in parallel to fetch results.

    Args:
        lead: Dictionary containing information about the lead.
        english_description: The user's plain-English description.
        intent_signal_type: A string indicating the intent signal type.
        example_query: Optional user-provided example.
        tool_config: Optional list of dictionaries containing tool configuration.

    Returns:
        A list of dictionaries, where each dictionary contains:
            {
                "query": <the google query used>,
                "results": <a JSON string of search results array>
            }
    """
    results_of_queries: List[Dict[str, Any]] = []

    # ---------------------------------------------------------
    # Specialized approach for recognized intent signal types
    # ---------------------------------------------------------
    if intent_signal_type == "intent_find_tech_usage_in_leads_current_company":
        # Look for references to specific technologies at the lead's CURRENT company.
        company_name = lead.get("organization_name", "")
        company_name = normalize_company_name(company_name)
        organization_linkedin_url = lead.get("organization_linkedin_url", "")
        if company_name:
            google_queries = await get_google_queries_for_technology_used(
                english_description,
                company_name=company_name,
                tool_config=tool_config
            )
            if google_queries:
                job_posting_links = await find_tech_reference_by_google_search(
                    company_name,
                    google_queries,
                    organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find tech usage references by {company_name} using {google_queries} in Google search",
                    "results": json.dumps(job_posting_links)
                })

    elif intent_signal_type == "intent_find_tech_usage_in_leads_previous_company":
        # Same as above, but against the lead's PREVIOUS company.
        previous_company_name = lead.get("previous_organization_name", "")
        previous_company_name = normalize_company_name(previous_company_name)
        previous_organization_linkedin_url = lead.get("previous_organization_linkedin_url", "")
        if previous_company_name:
            google_queries = await get_google_queries_for_technology_used(
                english_description,
                company_name=previous_company_name,
                tool_config=tool_config
            )
            if google_queries:
                job_posting_links = await find_tech_reference_by_google_search(
                    previous_company_name,
                    google_queries,
                    previous_organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find tech usage references by previous {previous_company_name} using {google_queries} in Google search",
                    "results": json.dumps(job_posting_links)
                })

    elif intent_signal_type == "intent_find_champion_changed_job":
        # Champion job-change signal: run the tech-usage search against BOTH
        # the current and the previous company, appending a result entry for each.
        # For current
        company_name = normalize_company_name(lead.get("organization_name", ""))
        organization_linkedin_url = lead.get("organization_linkedin_url", "")
        if company_name:
            google_queries = await get_google_queries_for_technology_used(
                english_description,
                company_name=company_name,
                tool_config=tool_config
            )
            if google_queries:
                current_company_job_posting_links = await find_tech_reference_by_google_search(
                    company_name,
                    google_queries,
                    organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find tech usage references by current company {company_name} using {google_queries} in Google search",
                    "results": json.dumps(current_company_job_posting_links)
                })

        # For previous
        previous_company_name = normalize_company_name(lead.get("previous_organization_name", ""))
        previous_organization_linkedin_url = lead.get("previous_organization_linkedin_url", "")
        if previous_company_name:
            google_queries = await get_google_queries_for_technology_used(
                english_description,
                company_name=previous_company_name,
                tool_config=tool_config
            )
            if google_queries:
                prev_company_job_posting_links = await find_tech_reference_by_google_search(
                    previous_company_name,
                    google_queries,
                    previous_organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find tech usage references by previous company {previous_company_name} using {google_queries} in Google search",
                    "results": json.dumps(prev_company_job_posting_links)
                })

    elif intent_signal_type == "intent_find_job_opening_with_role_in_company":
        # Search LinkedIn job postings for a specific role at the company.
        # Requires both the company name and its LinkedIn URL (the URL is used
        # downstream to verify that postings belong to this company).
        company_name = normalize_company_name(lead.get("organization_name", ""))
        organization_linkedin_url = lead.get("organization_linkedin_url", "")
        if company_name and organization_linkedin_url:
            google_query = await get_google_query_for_specific_role(
                english_description,
                company_name=company_name,
                tool_config=tool_config
            )
            if google_query.strip():
                job_posting_links = await find_job_postings_google_search(
                    company_name,
                    google_query,
                    organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find job by role in {company_name} using {google_query} in Google search",
                    "results": json.dumps(job_posting_links)
                })

    elif intent_signal_type == "intent_find_person_with_title_in_company":
        # Search LinkedIn profiles for people with specific titles at the company.
        company_name = normalize_company_name(lead.get("organization_name", ""))
        organization_linkedin_url = lead.get("organization_linkedin_url", "")
        if company_name and organization_linkedin_url:
            google_query = await get_google_query_for_specific_title(
                english_description,
                company_name=company_name,
                tool_config=tool_config
            )
            if google_query.strip():
                job_posting_links = await find_job_postings_google_search(
                    company_name,
                    google_query,
                    organization_linkedin_url,
                    tool_config
                )
                results_of_queries.append({
                    "query": f"Find people with specific title in {company_name} using {google_query} in Google search",
                    "results": json.dumps(job_posting_links)
                })

    else:
        # ---------------------------------------------------------
        # Generic approach for unknown or general intent signals
        # ---------------------------------------------------------
        response_dict = await generate_google_search_queries(
            lead=lead,
            english_description=english_description,
            intent_signal_type=intent_signal_type,
            example_query=example_query,
            tool_config=tool_config
        )

        # Extract and limit the queries to a maximum of four
        queries = response_dict.get("google_search_queries", [])
        queries = queries[:4]

        # Execute searches in parallel
        coroutines = [
            search_google(query, number_of_results=3, tool_config=tool_config)
            for query in queries
        ]
        results = await asyncio.gather(*coroutines)

        # Pair each query with its (JSON-serialized) result list, preserving order.
        for query, query_results in zip(queries, results):
            results_of_queries.append({
                "query": query,
                "results": json.dumps(query_results)
            })

    # Return the compiled list of search results
    return results_of_queries
350
+
351
+
352
def get_serp_api_access_token(tool_config: Optional[List[Dict]] = None) -> str:
    """
    Resolve the SerpAPI key from tool configuration or the environment.

    Lookup order:
      1. The "apiKey" entry inside the "serpapi" item of *tool_config*.
      2. The SERPAPI_KEY environment variable.

    Args:
        tool_config: Optional list of tool-configuration dictionaries.

    Returns:
        The SerpAPI key string.

    Raises:
        ValueError: If the SerpAPI integration has not been configured.
    """
    key: Optional[str] = None
    for entry in tool_config or []:
        if entry.get("name") != "serpapi":
            continue
        # Flatten the configuration items into a name -> value map,
        # skipping any falsy entries.
        settings = {
            cfg["name"]: cfg["value"]
            for cfg in entry.get("configuration", [])
            if cfg
        }
        key = settings.get("apiKey")
        break

    # Fallback to environment variable if not found in tool_config
    key = key or os.getenv("SERPAPI_KEY")
    if not key:
        raise ValueError(
            "SerpAPI integration is not configured. Please configure the connection to SerpAPI in Integrations."
        )
    return key
381
+
382
+
383
async def search_google(
    query: str,
    number_of_results: int = 3,
    tool_config: Optional[List[Dict]] = None
) -> List[str]:
    """
    Search Google using SERP API and return the results as a list of JSON strings.

    Successful responses are cached per query via cache_output/retrieve_output,
    so repeated calls with the same query do not consume additional SerpAPI quota.

    Args:
        query: The search query.
        number_of_results: Number of organic results to return.
        tool_config: Optional list of dictionaries containing tool configuration.

    Returns:
        A list of JSON strings, each representing one organic search result.
        If any error occurs, returns a list with a single JSON-encoded error dict.

    Raises:
        ValueError: Propagated from get_serp_api_access_token() when no
            SerpAPI key is configured.
    """
    serpapi_key = get_serp_api_access_token(tool_config)

    # Check cache first. NOTE(review): the cache key is the query alone, so a
    # cached entry may have been fetched with a different number_of_results.
    cached_response = retrieve_output("search_google_serp", query)
    if cached_response is not None:
        return cached_response

    params = {
        "q": query,
        "num": number_of_results,
        "api_key": serpapi_key
    }

    url = "https://serpapi.com/search"
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url, params=params) as response:
                if response.status != 200:
                    # Non-200: return the raw body as an in-band error (not cached).
                    error_data = await response.text()
                    return [json.dumps({"error": error_data})]

                result = await response.json()
                # Serialize each result to a JSON string
                serialized_results = [
                    json.dumps(item) for item in result.get('organic_results', [])
                ]
                # Cache results (only successful responses are cached)
                cache_output("search_google_serp", query, serialized_results)
                return serialized_results
    except Exception as exc:
        # Network/parse failures are reported in-band rather than raised so
        # callers gathering many queries in parallel are not interrupted.
        return [json.dumps({"error": str(exc)})]
431
+
432
+
433
class TechnologyUsedCheck(BaseModel):
    """
    Structured-output schema for extracting technology keywords from an
    English description.
    """
    # Technology keywords the user wants to verify (e.g. ["Neo4j", "Kafka"]).
    technologies_used: List[str]
    # Location to scope the search by; the prompts ask the LLM to default
    # this to "United States" when none is specified.
    location_to_filter_by: str
440
+
441
+
442
async def get_google_queries_for_technology_used(
    english_description: str,
    company_name: str,
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> List[str]:
    """
    Extract technology keywords from the English description and build Google
    queries that look for references to those technologies at the company.

    Args:
        english_description: The user's plain-English description.
        company_name: Name of the company to search around.
        tool_config: Optional tool configuration for structured-output or SERP.

    Returns:
        A list of Google queries (one against linkedin.com, one against x.com)
        that include the company name and discovered technology keywords, or
        an empty list if extraction failed or found no technologies.
    """
    prompt = f"""
Given the English description, list any technologies that the user is trying to verify for {company_name}.
Find if there is a location to filter search by and fill location_to_filter_by. If none specified, default is United States.

User input:
{english_description}

Output must be valid JSON, e.g.:
{{
    "technologies_used": ["someTech", "anotherTech"],
    "location_to_filter_by": "United States"
}}
"""
    # Ask the LLM to extract technologies/location into a TechnologyUsedCheck.
    response, status = await get_structured_output_internal(
        prompt=prompt,
        response_format=TechnologyUsedCheck,
        effort="high",
        model="gpt-5.1-chat",
        tool_config=tool_config
    )

    # Build up to two queries if we have technologies
    if status == "SUCCESS" and response and response.technologies_used:
        queries = []
        # OR together the quoted technology terms, e.g. "Neo4j" OR "Kafka".
        tech_used_quoted = " OR ".join([f'"{tech}"' for tech in response.technologies_used])
        queries.append(
            f'site:linkedin.com (({tech_used_quoted}) AND ("{company_name}") AND ("{response.location_to_filter_by}"))'
        )
        # The x.com (Twitter) query intentionally omits the location filter.
        queries.append(
            f'site:x.com (({tech_used_quoted}) AND ("{company_name}"))'
        )
        return queries
    else:
        return []
492
+
493
+
494
class TechnologyAndRoleCheck(BaseModel):
    """
    Structured-output schema for extracting technology keywords and job
    role(s) from an English description.
    """
    # Technology keywords to look for (e.g. ["Angular", "Python"]).
    technologies_used: List[str]
    # Job roles the user is looking for (e.g. ["Developer", "Team Lead"]).
    roles_looking_for: List[str]
    # Location to scope the search by; the prompt defaults it to "United States".
    location_to_filter_by: str
501
+
502
+
503
async def get_google_query_for_specific_role(
    english_description: str,
    company_name: str,
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> str:
    """
    Extract role and technology keywords from the English description to build
    a LinkedIn profile (linkedin.com/in) query.

    Args:
        english_description: The user's plain-English description.
        company_name: Name of the company to search around.
        tool_config: Optional tool configuration.

    Returns:
        A single Google query string with role(s) and technology keywords,
        or an empty string if extraction failed.
    """
    prompt = f"""
Given the English description, identify any specific roles and technologies for {company_name}.
Find if there is a location to filter search by and fill location_to_filter_by. If none specified, default is United States.

User input:
{english_description}

Output must be valid JSON, e.g.:
{{
    "technologies_used": ["Angular", "Python"],
    "roles_looking_for": ["Developer", "Team Lead"],
    "location_to_filter_by": "United States"
}}
"""
    # Extract roles/technologies/location into a TechnologyAndRoleCheck.
    response, status = await get_structured_output_internal(
        prompt=prompt,
        response_format=TechnologyAndRoleCheck,
        effort="high",
        model="gpt-5.1-chat",
        tool_config=tool_config
    )

    if status == "SUCCESS" and response:
        # OR together quoted terms; either part may be empty, in which case
        # the query contains an empty () group.
        tech_used_part = " OR ".join([f'"{tech}"' for tech in response.technologies_used]) if response.technologies_used else ""
        roles_part = " OR ".join([f'"{role}"' for role in response.roles_looking_for]) if response.roles_looking_for else ""
        return (
            f'site:linkedin.com/in ({tech_used_part}) AND ({roles_part}) '
            f'AND ("{company_name}") AND ("{response.location_to_filter_by}") -intitle:"profiles" '
        ).strip()
    else:
        return ""
550
+
551
+
552
class CheckForPeopleWithTitle(BaseModel):
    """
    Structured-output schema for extracting job titles from an English
    description.
    """
    # Job titles the user wants to find (e.g. ["CTO", "Head of Engineering"]).
    job_titles: List[str]
    # Location to scope the search by; defaults to "United States".
    location_to_filter_by: str = "United States"
558
+
559
+
560
async def get_google_query_for_specific_title(
    english_description: str,
    company_name: str,
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> str:
    """
    Build a LinkedIn profile (linkedin.com/in) query for specific job titles.

    The job titles (and an optional location) are extracted from the
    plain-English description via the structured-output helper.

    Args:
        english_description: The user's plain-English description.
        company_name: Name of the company to search around.
        tool_config: Optional tool configuration.

    Returns:
        A single Google query string with job titles and the company name,
        or an empty string when extraction fails.
    """
    prompt = f"""
Given the English description, identify any specific job titles that the user wants to find at {company_name}.
Find if there is location to filter search by and fill location_to_filter_by. If none specified default is United States.

User input:
{english_description}

Output must be valid JSON, e.g.:
{{
    "job_titles": ["CTO", "Head of Engineering"],
    "location_to_filter_by": "United States"
}}
"""
    response, status = await get_structured_output_internal(
        prompt=prompt,
        response_format=CheckForPeopleWithTitle,
        effort="high",
        tool_config=tool_config
    )

    # Guard clause: bail out with an empty query on any extraction failure.
    if status != "SUCCESS" or not response:
        return ""

    # Quote each title and OR them together, e.g. "CTO" OR "Head of Engineering".
    quoted_titles = [f'"{title}"' for title in response.job_titles]
    titles_part = " OR ".join(quoted_titles) if quoted_titles else ""

    query = (
        f'site:linkedin.com/in ({titles_part}) AND ("{company_name}") '
        f'AND ("{response.location_to_filter_by}") -intitle:"profiles" '
    )
    return query.strip()
604
+
605
+
606
# TODO: fix with playwright implementation.
# Browser-like request headers used when fetching result pages directly with
# aiohttp (see _get_html_content_from_url); intended to look like a regular
# Firefox visit arriving from Google.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://www.google.com/"
}
613
+
614
+
615
async def _get_html_content_from_url(url: str) -> str:
    """Fetch *url* with the module-level browser-like headers and return the response body as text."""
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            return await response.text()
619
+
620
+
621
async def _clean_html_content(html_content: str) -> BeautifulSoup:
    """
    Parse *html_content* and strip non-content elements.

    Removes script, style, meta, code and svg tags so downstream link
    extraction operates on rendered content only. Empty or falsy input
    yields an empty parsed document.
    """
    parsed = BeautifulSoup(html_content or "", 'html.parser')
    for tag in parsed(['script', 'style', 'meta', 'code', 'svg']):
        tag.decompose()
    return parsed
628
+
629
+
630
async def find_job_postings_google_search(
    company_name: str,
    google_query: str,
    organization_linkedin_url: Optional[str] = None,
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> List[str]:
    """
    Find job postings on LinkedIn for a given company using a Google Search query.

    Each search hit is fetched and its outbound links are scanned; a hit is
    kept only when it links back to the company's LinkedIn page through the
    job card's organization anchor ('public_jobs_topcard-org-name').

    Args:
        company_name (str): The name of the company.
            NOTE(review): currently unused by the matching logic below;
            verification relies solely on organization_linkedin_url.
        google_query (str): The Google query to run.
        organization_linkedin_url (Optional[str]): The LinkedIn URL of the company.
            If not provided, no result can be verified and the return is empty.
        tool_config: Optional list of dictionaries containing tool configuration.

    Returns:
        A list of discovered job posting links.
    """
    logger.info("Entering find_job_postings_google_search with query: %s", google_query)
    if not google_query.strip():
        return []

    job_posting_links = []

    try:
        results = await search_google(google_query.strip(), number_of_results=10, tool_config=tool_config)
    except Exception:
        logger.exception("Error searching for job postings via Google.")
        return []

    if not isinstance(results, list) or len(results) == 0:
        logger.debug("No results returned for this query.")
        return []

    for result_item in results:
        # Each result item is a JSON-encoded string produced by search_google().
        try:
            result_json = json.loads(result_item)
        except json.JSONDecodeError:
            logger.debug("Failed to parse JSON from the search result.")
            continue

        link = result_json.get('link', '')
        if not link:
            logger.debug("No link found in result JSON.")
            continue

        # Fetch and parse the landing page so its links can be inspected.
        try:
            page_content = await _get_html_content_from_url(link)
            soup = await _clean_html_content(page_content)
        except Exception:
            logger.exception("Error fetching or parsing the job posting page.")
            continue

        page_links = [a.get('href') for a in soup.find_all('a', href=True)]

        # Verify the posting belongs to the organization: look for an anchor
        # containing both the org's LinkedIn URL (scheme/www stripped) and the
        # job-card org-name marker.
        company_match = False
        if organization_linkedin_url:
            partial_url = re.sub(r'^https?:\/\/(www\.)?', '', organization_linkedin_url).rstrip('/')
            for page_link in page_links:
                if (
                    page_link
                    and partial_url in page_link
                    and 'public_jobs_topcard-org-name' in page_link
                ):
                    company_match = True
                    break

        if company_match:
            job_posting_links.append(link)

    logger.info("Found %d job posting links for query '%s'.", len(job_posting_links), google_query)
    return job_posting_links
702
+
703
+
704
async def find_tech_reference_by_google_search(
    company_name: str,
    google_queries: List[str],
    organization_linkedin_url: Optional[str] = None,
    tool_config: Optional[List[Dict[str, Any]]] = None
) -> List[str]:
    """
    Find pages referencing certain technologies or job postings on LinkedIn for
    a given company using a list of Google queries.

    Unlike find_job_postings_google_search(), every result link is collected
    without fetching or verifying the page.

    Args:
        company_name (str): The name of the company.
            NOTE(review): currently unused in the body; links are collected
            purely from the search results of google_queries.
        google_queries (List[str]): The Google queries to run (sequentially).
        organization_linkedin_url (Optional[str]): The LinkedIn URL of the company.
            NOTE(review): currently unused in the body.
        tool_config (Optional[List[Dict[str, Any]]]): Optional list of dictionaries containing tool configuration.

    Returns:
        List[str]: A list of discovered links referencing the technologies/job postings.
    """
    linkedin_reference_links = []
    for google_query in google_queries:
        logger.info("Entering find_tech_reference_by_google_search with query: %s", google_query)
        if not google_query.strip():
            continue

        try:
            results = await search_google(google_query.strip(), number_of_results=10, tool_config=tool_config)
        except Exception:
            # A failing query is skipped; remaining queries still run.
            logger.exception("Error searching for job postings via Google.")
            continue

        if not isinstance(results, list) or len(results) == 0:
            logger.debug("No results returned for this query.")
            continue

        for result_item in results:
            # Each result item is a JSON-encoded string produced by search_google().
            try:
                result_json = json.loads(result_item)
            except json.JSONDecodeError:
                logger.debug("Failed to parse JSON from the search result.")
                continue

            link = result_json.get('link', '')
            if not link:
                logger.debug("No link found in result JSON.")
                continue

            linkedin_reference_links.append(link)

        logger.info(
            "Accumulated %d links so far for query '%s'.",
            len(linkedin_reference_links),
            google_query
        )

    return linkedin_reference_links