leads-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. company_discovery/__init__.py +4 -0
  2. company_discovery/adapters/__init__.py +5 -0
  3. company_discovery/adapters/apollo.py +189 -0
  4. company_discovery/adapters/exa.py +112 -0
  5. company_discovery/adapters/llm.py +118 -0
  6. company_discovery/adapters/protocols.py +58 -0
  7. company_discovery/adapters/website.py +154 -0
  8. company_discovery/bundled_skills/__init__.py +1 -0
  9. company_discovery/bundled_skills/company-discovery-operator/SKILL.md +72 -0
  10. company_discovery/bundled_skills/company-discovery-operator/agents/openai.yaml +4 -0
  11. company_discovery/bundled_skills/company-enrichment-operator/SKILL.md +94 -0
  12. company_discovery/bundled_skills/company-enrichment-operator/agents/openai.yaml +4 -0
  13. company_discovery/bundled_skills/company-search-spec-writer/SKILL.md +109 -0
  14. company_discovery/bundled_skills/company-search-spec-writer/agents/openai.yaml +4 -0
  15. company_discovery/bundled_skills/contact-discovery-operator/SKILL.md +80 -0
  16. company_discovery/bundled_skills/contact-discovery-operator/agents/openai.yaml +4 -0
  17. company_discovery/bundled_skills/contact-enrichment-operator/SKILL.md +86 -0
  18. company_discovery/bundled_skills/contact-enrichment-operator/agents/openai.yaml +4 -0
  19. company_discovery/bundled_skills/contact-search-spec-writer/SKILL.md +86 -0
  20. company_discovery/bundled_skills/contact-search-spec-writer/agents/openai.yaml +4 -0
  21. company_discovery/bundled_skills/leads-update-operator/SKILL.md +60 -0
  22. company_discovery/bundled_skills/leads-update-operator/agents/openai.yaml +4 -0
  23. company_discovery/cli.py +1789 -0
  24. company_discovery/db/__init__.py +5 -0
  25. company_discovery/db/contact_enrichment_repository.py +268 -0
  26. company_discovery/db/contact_repository.py +366 -0
  27. company_discovery/db/enrichment_repository.py +207 -0
  28. company_discovery/db/models.py +324 -0
  29. company_discovery/db/repository.py +363 -0
  30. company_discovery/db/session.py +48 -0
  31. company_discovery/domain/__init__.py +24 -0
  32. company_discovery/domain/contact_models.py +178 -0
  33. company_discovery/domain/contact_spec.py +86 -0
  34. company_discovery/domain/models.py +287 -0
  35. company_discovery/domain/spec.py +263 -0
  36. company_discovery/migrations.py +190 -0
  37. company_discovery/prompts/__init__.py +8 -0
  38. company_discovery/prompts/candidate_evaluation/system.md +13 -0
  39. company_discovery/prompts/company_enrichment/system.md +42 -0
  40. company_discovery/prompts/contact_evaluation/system.md +18 -0
  41. company_discovery/prompts/query_generation/system.md +10 -0
  42. company_discovery/release_manifest.json +7 -0
  43. company_discovery/reports/__init__.py +4 -0
  44. company_discovery/reports/contact_enrichment_exporter.py +108 -0
  45. company_discovery/reports/contact_exporter.py +132 -0
  46. company_discovery/reports/enrichment_exporter.py +125 -0
  47. company_discovery/reports/exporter.py +135 -0
  48. company_discovery/runtime.py +336 -0
  49. company_discovery/services/__init__.py +4 -0
  50. company_discovery/services/contact_enrichment_pipeline.py +344 -0
  51. company_discovery/services/contact_enrichment_progress.py +37 -0
  52. company_discovery/services/contact_evaluator.py +110 -0
  53. company_discovery/services/contact_pipeline.py +295 -0
  54. company_discovery/services/contact_progress.py +38 -0
  55. company_discovery/services/enrichment_extractor.py +61 -0
  56. company_discovery/services/enrichment_pipeline.py +526 -0
  57. company_discovery/services/enrichment_progress.py +20 -0
  58. company_discovery/services/enrichment_resolver.py +148 -0
  59. company_discovery/services/evaluator.py +40 -0
  60. company_discovery/services/hygiene.py +51 -0
  61. company_discovery/services/memory.py +150 -0
  62. company_discovery/services/normalization.py +98 -0
  63. company_discovery/services/pipeline.py +628 -0
  64. company_discovery/services/progress.py +48 -0
  65. company_discovery/services/query_planner.py +47 -0
  66. company_discovery/settings.py +152 -0
  67. company_discovery/skill_installer.py +197 -0
  68. company_discovery/update_plan.py +79 -0
  69. leads_cli-0.1.0.dist-info/METADATA +277 -0
  70. leads_cli-0.1.0.dist-info/RECORD +72 -0
  71. leads_cli-0.1.0.dist-info/WHEEL +4 -0
  72. leads_cli-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,4 @@
1
+ """Memory-first company discovery."""
2
+
3
# Name of the installable distribution (the import package is company_discovery).
__distribution_name__ = "leads-cli"
# Version string of this release of the distribution.
__version__ = "0.1.0"
@@ -0,0 +1,5 @@
1
+ from company_discovery.adapters.exa import ExaClient
2
+ from company_discovery.adapters.llm import OpenAICompatibleLLM
3
+
4
# Names re-exported by the adapters package; only the Exa and LLM clients are imported here.
__all__ = ["ExaClient", "OpenAICompatibleLLM"]
5
+
@@ -0,0 +1,189 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from typing import Any
5
+
6
+ import httpx
7
+
8
+ from company_discovery.domain.contact_models import (
9
+ ApolloBatchResult,
10
+ ApolloPersonMatch,
11
+ ApolloPersonRequest,
12
+ )
13
+ from company_discovery.services.normalization import canonical_domain
14
+ from company_discovery.settings import Settings
15
+
16
+
17
class ApolloClient:
    """Apollo people-enrichment adapter with bounded retry and async-result support.

    ``enrich_people`` submits one bulk-match request. When the response carries
    a request id, the submitted people are remembered in memory so that
    ``poll`` can map the later result records back to candidate ids.
    """

    MAX_BATCH_SIZE = 10

    def __init__(self, settings: Settings, client: httpx.Client | None = None) -> None:
        """Create the adapter.

        Args:
            settings: Source of the Apollo API key, base URL, timeout and webhook URL.
            client: Optional pre-built HTTP client. When omitted, a client is
                created here and closed again by ``close``.

        Raises:
            ValueError: If ``settings.apollo_api_key`` is empty.
        """
        if not settings.apollo_api_key:
            raise ValueError("APOLLO_API_KEY is required for contact enrichment")
        self._settings = settings
        # Only close the HTTP client in ``close`` when this instance created it.
        self._owns_client = client is None
        self._client = client or httpx.Client(
            base_url=settings.apollo_base_url.rstrip("/"),
            headers={
                "X-Api-Key": settings.apollo_api_key,
                "Content-Type": "application/json",
                "Cache-Control": "no-cache",
            },
            timeout=settings.apollo_timeout_seconds,
        )
        # request id -> people submitted under that id, kept until the result is final.
        self._pending_people: dict[str, list[ApolloPersonRequest]] = {}

    def enrich_people(
        self,
        people: list[ApolloPersonRequest],
        *,
        reveal_email: bool,
        reveal_phone: bool,
    ) -> ApolloBatchResult:
        """Submit one bulk-match request for ``people``.

        Args:
            people: At most ``MAX_BATCH_SIZE`` people to enrich.
            reveal_email: Enables waterfall email lookup, but only when a
                webhook URL is configured.
            reveal_phone: Enables phone lookup; requires a configured webhook URL.

        Returns:
            The parsed batch result. An empty ``people`` list returns an empty
            result without any HTTP request.

        Raises:
            ValueError: If the batch is too large, or phone reveal is requested
                without ``apollo_webhook_url``.
            httpx.HTTPStatusError: If Apollo answers with an error status.
        """
        if not people:
            return ApolloBatchResult()
        if len(people) > self.MAX_BATCH_SIZE:
            raise ValueError(f"Apollo bulk enrichment accepts at most {self.MAX_BATCH_SIZE} people")
        if reveal_phone and not self._settings.apollo_webhook_url:
            raise ValueError(
                "APOLLO_WEBHOOK_URL is required when phone enrichment is enabled; "
                "use --no-phone for synchronous email-only enrichment"
            )

        payload: dict[str, Any] = {
            "details": [self._person_payload(person) for person in people],
            "reveal_personal_emails": False,
            "reveal_phone_number": reveal_phone,
            # Standard work email is synchronous. Waterfall email becomes useful only when
            # Apollo has a webhook destination for its asynchronous completion payload.
            "run_waterfall_email": reveal_email and bool(self._settings.apollo_webhook_url),
            "run_waterfall_phone": reveal_phone,
        }
        if self._settings.apollo_webhook_url:
            payload["webhook_url"] = self._settings.apollo_webhook_url
        response = self._request("POST", "/api/v1/people/bulk_match", json=payload)
        result = self._parse(response.json(), people)
        if result.request_id:
            # Remember who was asked about so ``poll`` can rebuild the matches.
            self._pending_people[result.request_id] = people
        return result

    def poll(self, request_id: str) -> ApolloBatchResult:
        """Fetch the current result for a previously returned ``request_id``.

        Matches are rebuilt from the people remembered by ``enrich_people``.
        If this instance never submitted ``request_id``, the returned result
        contains no matches. The remembered people are dropped once the
        result is no longer pending.
        """
        response = self._request("GET", f"/api/v1/webhook_result/{request_id}")
        result = self._parse(response.json(), self._pending_people.get(request_id, []))
        if not result.request_id:
            # The poll payload may omit the id; keep the one the caller asked for.
            result = result.model_copy(update={"request_id": request_id})
        if not result.pending:
            self._pending_people.pop(request_id, None)
        return result

    def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response:
        """Send a request, retrying HTTP 429 and 5xx responses up to three attempts.

        Any other error status raises immediately via ``raise_for_status``.
        After the third 429/5xx the status error is raised as well.
        """
        for attempt in range(3):
            response = self._client.request(method, path, **kwargs)
            if response.status_code != 429 and response.status_code < 500:
                response.raise_for_status()
                return response
            if attempt == 2:
                response.raise_for_status()
            time.sleep(self._retry_delay(response.headers.get("retry-after"), attempt))
        raise RuntimeError("Apollo request retry loop exited unexpectedly")

    @staticmethod
    def _retry_delay(retry_after: str | None, attempt: int) -> float:
        """Return the seconds to wait before the next attempt.

        A ``Retry-After`` header made only of digits is honoured. Anything
        else (missing header, HTTP-date form, garbage) falls back to
        exponential backoff instead of raising while converting the value.
        """
        if retry_after and retry_after.isdigit():
            return float(retry_after)
        return float(2**attempt)

    @staticmethod
    def _person_payload(person: ApolloPersonRequest) -> dict[str, str]:
        """Build the per-person entry of the bulk-match ``details`` list."""
        payload = {
            "first_name": person.first_name,
            "last_name": person.last_name,
            "name": person.full_name,
            "organization_name": person.company_name,
            "domain": person.company_domain,
        }
        if person.linkedin_url:
            payload["linkedin_url"] = person.linkedin_url
        return payload

    @classmethod
    def _parse(
        cls, payload: dict[str, Any], requested: list[ApolloPersonRequest]
    ) -> ApolloBatchResult:
        """Convert a bulk-match or poll payload into an ``ApolloBatchResult``.

        One match is produced per requested person. Records are paired with
        requests by list position (assumes Apollo returns records in request
        order - TODO confirm). A missing or non-dict record yields a match
        with ``person_found=False``.
        """
        request_id = cls._request_id(payload)
        status = str(payload.get("status") or payload.get("state") or "").lower()
        pending = status in {"pending", "processing", "queued", "running"}
        # Records may sit at the top level or under "data", under several possible keys.
        source = payload.get("data") if isinstance(payload.get("data"), dict) else payload
        records = source.get("matches") or source.get("people") or source.get("results") or []
        if isinstance(records, dict):
            records = records.get("matches") or records.get("people") or records.get("results") or []
        if not isinstance(records, list):
            records = []
        # A request id with no records and no explicit completion status is treated as in flight.
        if request_id and not records and status not in {"complete", "completed", "success", "succeeded"}:
            pending = True

        matches: list[ApolloPersonMatch] = []
        for index, requested_person in enumerate(requested):
            raw = records[index] if index < len(records) and isinstance(records[index], dict) else {}
            # The person fields may be nested under "person" or be the record itself.
            person = raw.get("person") if isinstance(raw.get("person"), dict) else raw
            found = bool(person) and not bool(raw.get("error"))
            organization = person.get("organization") if isinstance(person.get("organization"), dict) else {}
            phones = cls._phones(person)
            raw_domain = (
                organization.get("primary_domain")
                or organization.get("website_url")
                or person.get("organization_domain")
            )
            matches.append(
                ApolloPersonMatch(
                    candidate_id=requested_person.candidate_id,
                    person_found=found,
                    full_name=person.get("name") or cls._joined_name(person),
                    linkedin_url=person.get("linkedin_url"),
                    title=person.get("title"),
                    organization_name=organization.get("name") or person.get("organization_name"),
                    organization_domain=canonical_domain(str(raw_domain)) if raw_domain else None,
                    email=person.get("email"),
                    email_status=person.get("email_status"),
                    phones=phones,
                    apollo_person_id=person.get("id"),
                    raw=raw,
                )
            )
        return ApolloBatchResult(matches=matches, request_id=request_id, pending=pending)

    @staticmethod
    def _request_id(payload: dict[str, Any]) -> str | None:
        """Return the request id from the payload or its ``data`` object, if any."""
        direct = payload.get("request_id") or payload.get("requestId")
        if direct:
            return str(direct)
        data = payload.get("data")
        if isinstance(data, dict):
            nested = data.get("request_id") or data.get("requestId")
            return str(nested) if nested else None
        return None

    @staticmethod
    def _joined_name(person: dict[str, Any]) -> str | None:
        """Join the non-empty first and last name, or return ``None`` if both are missing."""
        name = " ".join(
            part for part in (person.get("first_name"), person.get("last_name")) if part
        )
        return name or None

    @staticmethod
    def _phones(person: dict[str, Any]) -> list[str]:
        """Collect distinct phone numbers as strings, preserving first-seen order.

        Entries of ``phone_numbers`` that are not dicts are ignored. The
        top-level ``phone`` (or, failing that, ``mobile_phone``) is added last.
        """
        values: list[str] = []
        for phone in person.get("phone_numbers") or []:
            if not isinstance(phone, dict):
                continue
            value = phone.get("sanitized_number") or phone.get("raw_number") or phone.get("number")
            # Compare the string form: the list stores strings, so a numeric
            # value would otherwise never be recognised as a duplicate.
            if value and str(value) not in values:
                values.append(str(value))
        direct = person.get("phone") or person.get("mobile_phone")
        if direct and str(direct) not in values:
            values.append(str(direct))
        return values

    def close(self) -> None:
        """Close the HTTP client if this instance created it."""
        if self._owns_client:
            self._client.close()
@@ -0,0 +1,112 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from typing import Any
5
+
6
+ import httpx
7
+
8
+ from company_discovery.domain.models import ExaSearchResult
9
+ from company_discovery.settings import Settings
10
+
11
+
12
class ExaClient:
    """Minimal Exa company-search adapter that preserves each provider payload."""

    def __init__(self, settings: Settings, client: httpx.Client | None = None) -> None:
        """Create the adapter.

        Args:
            settings: Source of the Exa API key, base URL and timeout.
            client: Optional pre-built HTTP client. When omitted, a client is
                created here and closed again by ``close``.

        Raises:
            ValueError: If ``settings.exa_api_key`` is empty.
        """
        if not settings.exa_api_key:
            raise ValueError("EXA_API_KEY is required for external discovery")
        self._settings = settings
        # Only close the HTTP client in ``close`` when this instance created it.
        self._owns_client = client is None
        self._client = client or httpx.Client(
            base_url=settings.exa_base_url.rstrip("/"),
            headers={"x-api-key": settings.exa_api_key, "content-type": "application/json"},
            timeout=settings.exa_timeout_seconds,
        )
        self._last_cost_dollars = 0.0
        # Monotonic timestamp of the most recent request, used for pacing.
        self._last_request_at: float | None = None

    @property
    def last_cost_dollars(self) -> float:
        """Cost reported for the most recent search; 0.0 if it failed or reported none."""
        return self._last_cost_dollars

    def search(self, query: str, *, country: str, num_results: int) -> list[ExaSearchResult]:
        """Search with the ``company`` category."""
        return self._search(query, country=country, num_results=num_results, category="company")

    def search_people(
        self, query: str, *, country: str, num_results: int
    ) -> list[ExaSearchResult]:
        """Search with the ``people`` category."""
        return self._search(query, country=country, num_results=num_results, category="people")

    def search_contact_evidence(
        self, query: str, *, country: str, num_results: int
    ) -> list[ExaSearchResult]:
        """Search without a category restriction."""
        return self._search(query, country=country, num_results=num_results, category=None)

    def _search(
        self, query: str, *, country: str, num_results: int, category: str | None
    ) -> list[ExaSearchResult]:
        """Run one search request and wrap each result item.

        ``num_results`` is clamped to the range 1..100. Result positions are
        1-based indexes into the provider's result list. A missing, null or
        non-list ``results`` value yields an empty list, and items that are
        not JSON objects are skipped.
        """
        payload: dict[str, Any] = {
            "query": query,
            "numResults": max(1, min(num_results, 100)),
            "type": "auto",
            "userLocation": country.upper(),
            "contents": {"text": {"maxCharacters": 3000}},
        }
        if category is not None:
            payload["category"] = category
        if category == "company":
            payload["systemPrompt"] = (
                "Return official operating-company websites. Avoid directories, associations, "
                "marketplaces, news pages, and duplicate companies."
            )
        # Reset first so a failed request never leaves the previous call's cost visible.
        self._last_cost_dollars = 0.0
        response = self._post_with_retry(payload)
        data = response.json()
        if not isinstance(data, dict):
            data = {}
        self._last_cost_dollars = self._read_cost(data)
        items = data.get("results")
        if not isinstance(items, list):
            # Covers both a missing key and an explicit JSON null.
            items = []
        return [
            ExaSearchResult(
                query=query,
                position=index,
                title=item.get("title") or "",
                url=item.get("url") or "",
                text=item.get("text"),
                published_date=item.get("publishedDate"),
                exa_id=item.get("id"),
                raw=item,
            )
            for index, item in enumerate(items, start=1)
            if isinstance(item, dict)
        ]

    def _post_with_retry(self, payload: dict[str, Any]) -> httpx.Response:
        """POST to ``/search``, retrying HTTP 429 and 5xx up to three attempts.

        Any other error status raises immediately via ``raise_for_status``.
        A digits-only ``Retry-After`` header sets the wait; otherwise the wait
        is ``2**attempt`` seconds.
        """
        for attempt in range(3):
            self._pace_request()
            response = self._client.post("/search", json=payload)
            self._last_request_at = time.monotonic()
            if response.status_code != 429 and response.status_code < 500:
                response.raise_for_status()
                return response
            if attempt == 2:
                response.raise_for_status()
            retry_after = response.headers.get("retry-after")
            delay = float(retry_after) if retry_after and retry_after.isdigit() else 2**attempt
            time.sleep(delay)
        raise RuntimeError("Exa request retry loop exited unexpectedly")

    def _pace_request(self) -> None:
        """Sleep so that at least 0.21 seconds separate consecutive requests."""
        if self._last_request_at is None:
            return
        remaining = 0.21 - (time.monotonic() - self._last_request_at)
        if remaining > 0:
            time.sleep(remaining)

    @staticmethod
    def _read_cost(payload: dict[str, Any]) -> float:
        """Read ``costDollars`` as a number or as an object with a numeric ``total``."""
        cost = payload.get("costDollars")
        if isinstance(cost, dict) and isinstance(cost.get("total"), (int, float)):
            return float(cost["total"])
        if isinstance(cost, (int, float)):
            return float(cost)
        return 0.0

    def close(self) -> None:
        """Close the HTTP client if this instance created it."""
        if self._owns_client:
            self._client.close()
@@ -0,0 +1,118 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import time
5
+
6
+ import httpx
7
+ from pydantic import BaseModel, ValidationError
8
+
9
+ from company_discovery.settings import Settings
10
+
11
+
12
class OpenAICompatibleLLM:
    """Structured-output client for OpenAI-compatible chat completion APIs."""

    def __init__(self, settings: Settings, client: httpx.Client | None = None) -> None:
        """Create the client.

        Args:
            settings: Source of the LLM API key, base URL, model, token limit,
                timeout and response-format mode.
            client: Optional pre-built HTTP client. When omitted, a client is
                created here and closed again by ``close``.

        Raises:
            ValueError: If ``settings.llm_api_key`` is empty.
        """
        if not settings.llm_api_key:
            raise ValueError("LLM_API_KEY is required for query generation and evaluation")
        self._settings = settings
        # Only close the HTTP client in ``close`` when this instance created it.
        self._owns_client = client is None
        self._client = client or httpx.Client(
            base_url=settings.llm_base_url.rstrip("/"),
            headers={
                "Authorization": f"Bearer {settings.llm_api_key}",
                "Content-Type": "application/json",
            },
            timeout=settings.llm_timeout_seconds,
        )

    def generate(
        self,
        *,
        system_prompt: str,
        user_prompt: str,
        response_model: type[BaseModel],
    ) -> BaseModel:
        """Request a completion and validate it against ``response_model``.

        If the first answer fails validation, the answer and the validation
        error are appended to the conversation and the request is sent once
        more.

        Raises:
            ValueError: If the model refuses, the response has no completion
                message, the second answer is still invalid, or the API
                returns a non-retryable error status.
        """
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        schema = response_model.model_json_schema()
        response_format = self._response_format(response_model, schema)
        if self._settings.resolved_llm_response_format == "json_object":
            # json_object mode carries no schema in the request, so embed it in the prompt.
            messages[0]["content"] = (
                f"{system_prompt}\n\n"
                "Return JSON only. The JSON must match this exact JSON Schema:\n"
                f"{json.dumps(schema, ensure_ascii=True)}"
            )
        for attempt in range(2):
            payload = {
                "model": self._settings.llm_model,
                "messages": messages,
                "max_tokens": self._settings.llm_max_tokens,
                "response_format": response_format,
            }
            response = self._post_with_retry(payload)
            message = self._extract_message(response.json())
            if message.get("refusal"):
                raise ValueError(f"LLM refused structured generation: {message['refusal']}")
            content = message.get("content") or ""
            if not isinstance(content, str):
                content = ""
            try:
                return response_model.model_validate_json(content)
            except (ValidationError, json.JSONDecodeError) as exc:
                if attempt == 1:
                    raise ValueError(f"LLM returned invalid {response_model.__name__}: {exc}") from exc
                # Feed the failed answer and the error back for one corrective retry.
                messages.extend(
                    [
                        {"role": "assistant", "content": content},
                        {
                            "role": "user",
                            "content": (
                                "Correct the response to satisfy the supplied JSON Schema. "
                                f"Validation error: {exc}. Return JSON only."
                            ),
                        },
                    ]
                )
        raise RuntimeError("structured generation exhausted retries")

    @staticmethod
    def _extract_message(body: object) -> dict:
        """Return ``body["choices"][0]["message"]`` or raise a descriptive ``ValueError``.

        Guards against responses with no choices or an unexpected shape, so
        they fail as ``ValueError`` like every other LLM failure rather than
        as a bare ``KeyError``, ``IndexError`` or ``TypeError``.
        """
        try:
            message = body["choices"][0]["message"]
        except (KeyError, IndexError, TypeError) as exc:
            raise ValueError("LLM response did not contain a chat completion message") from exc
        if not isinstance(message, dict):
            raise ValueError("LLM response did not contain a chat completion message")
        return message

    def _response_format(self, response_model: type[BaseModel], schema: dict) -> dict:
        """Build the ``response_format`` request field for the configured mode."""
        if self._settings.resolved_llm_response_format == "json_object":
            return {"type": "json_object"}
        return {
            "type": "json_schema",
            "json_schema": {
                "name": response_model.__name__.lower(),
                "strict": True,
                "schema": schema,
            },
        }

    def _post_with_retry(self, payload: dict) -> httpx.Response:
        """POST to ``/chat/completions`` with up to three attempts.

        Transport errors and HTTP 429/5xx responses are retried. Any other
        error status raises ``ValueError`` with up to 1000 characters of the
        response body. After the third 429/5xx the status error from
        ``raise_for_status`` is raised instead.
        """
        for attempt in range(3):
            try:
                response = self._client.post("/chat/completions", json=payload)
            except httpx.TransportError:
                if attempt == 2:
                    raise
                time.sleep(2**attempt)
                continue
            if response.status_code != 429 and response.status_code < 500:
                if response.is_error:
                    detail = response.text.strip()[:1000]
                    raise ValueError(
                        f"LLM API returned HTTP {response.status_code}: {detail or 'no error body'}"
                    )
                return response
            if attempt == 2:
                response.raise_for_status()
            retry_after = response.headers.get("retry-after")
            delay = float(retry_after) if retry_after and retry_after.isdigit() else 2**attempt
            time.sleep(delay)
        raise RuntimeError("LLM request retry loop exited unexpectedly")

    def close(self) -> None:
        """Close the HTTP client if this instance created it."""
        if self._owns_client:
            self._client.close()
@@ -0,0 +1,58 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Protocol
4
+
5
+ from pydantic import BaseModel
6
+
7
+ from company_discovery.domain.contact_models import ApolloBatchResult, ApolloPersonRequest
8
+ from company_discovery.domain.models import ExaSearchResult
9
+
10
+
11
class StructuredLLM(Protocol):
    """Structural interface for clients that return a validated model instance."""

    def generate(
        self,
        *,
        system_prompt: str,
        user_prompt: str,
        response_model: type[BaseModel],
    ) -> BaseModel:
        """Produce an instance of ``response_model`` from the two prompts."""
        ...

    def close(self) -> None:
        """Release any resources held by the client."""
        ...
21
+
22
+
23
class CompanySearchProvider(Protocol):
    """Structural interface for company search backends."""

    @property
    def last_cost_dollars(self) -> float:
        """Cost in dollars attributed to the most recent search."""
        ...

    def search(self, query: str, *, country: str, num_results: int) -> list[ExaSearchResult]:
        """Return search results for ``query``."""
        ...

    def close(self) -> None:
        """Release any resources held by the provider."""
        ...
30
+
31
+
32
class ContactSearchProvider(Protocol):
    """Structural interface for people and contact-evidence search backends."""

    @property
    def last_cost_dollars(self) -> float:
        """Cost in dollars attributed to the most recent search."""
        ...

    def search_people(
        self, query: str, *, country: str, num_results: int
    ) -> list[ExaSearchResult]:
        """Return people-oriented search results for ``query``."""
        ...

    def search_contact_evidence(
        self, query: str, *, country: str, num_results: int
    ) -> list[ExaSearchResult]:
        """Return contact-evidence search results for ``query``."""
        ...

    def close(self) -> None:
        """Release any resources held by the provider."""
        ...
45
+
46
+
47
class ContactEnrichmentProvider(Protocol):
    """Structural interface for contact enrichment backends with pollable results."""

    def enrich_people(
        self,
        people: list[ApolloPersonRequest],
        *,
        reveal_email: bool,
        reveal_phone: bool,
    ) -> ApolloBatchResult:
        """Submit ``people`` for enrichment and return the batch result."""
        ...

    def poll(self, request_id: str) -> ApolloBatchResult:
        """Return the current batch result for ``request_id``."""
        ...

    def close(self) -> None:
        """Release any resources held by the provider."""
        ...
@@ -0,0 +1,154 @@
1
+ from __future__ import annotations
2
+
3
+ from html.parser import HTMLParser
4
+ from urllib.parse import urljoin, urlparse
5
+
6
+ import httpx
7
+
8
+ from company_discovery.domain.models import WebsitePage
9
+ from company_discovery.services.normalization import canonical_domain
10
+ from company_discovery.services.enrichment_resolver import normalize_linkedin_company_url
11
+
12
+
13
class _PageParser(HTMLParser):
    """Collect visible text, the page title and ``(href, label)`` link pairs.

    Text inside ``script``, ``style``, ``noscript`` and ``svg`` elements is
    left out of ``text``. Whitespace in every text node is collapsed to
    single spaces.
    """

    _HIDDEN_TAGS = frozenset({"script", "style", "noscript", "svg"})

    def __init__(self) -> None:
        super().__init__()
        self.text: list[str] = []
        self.links: list[tuple[str, str]] = []
        self.title = ""
        # Nesting depth of currently open hidden elements.
        self._hidden = 0
        self._in_title = False
        # href of the anchor currently being read; None when outside an anchor.
        self._anchor_href: str | None = None
        self._anchor_text: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if tag in self._HIDDEN_TAGS:
            self._hidden += 1
        if tag == "title":
            self._in_title = True
        if tag == "a":
            # A new anchor ends any anchor that was never closed, so the
            # earlier link is recorded instead of being silently dropped.
            self._flush_anchor()
            self._anchor_href = dict(attrs).get("href")

    def handle_endtag(self, tag: str) -> None:
        if tag in self._HIDDEN_TAGS and self._hidden:
            self._hidden -= 1
        if tag == "title":
            self._in_title = False
        if tag == "a":
            self._flush_anchor()

    def handle_data(self, data: str) -> None:
        value = " ".join(data.split())
        if not value:
            return
        if self._in_title:
            self.title = f"{self.title} {value}".strip()
        if self._anchor_href is not None:
            self._anchor_text.append(value)
        if not self._hidden:
            self.text.append(value)

    def _flush_anchor(self) -> None:
        """Record the open anchor if it has a non-empty href, then reset anchor state.

        State is reset even when the href is empty, so text after such an
        anchor is no longer collected as anchor text.
        """
        if self._anchor_href:
            self.links.append((self._anchor_href, " ".join(self._anchor_text)))
        self._anchor_href = None
        self._anchor_text = []
53
+
54
+
55
class WebsiteClient:
    """Fetch a small official-site page pack anchored to a known root domain."""

    # Page type -> substrings looked for in a link's path and label.
    # Dict order is the fetch priority: contact pages before about pages.
    PAGE_TERMS = {
        "contact": ("contact", "locations", "location", "offices"),
        "about": ("about", "company", "who-we-are", "our-story", "ownership"),
    }

    def __init__(
        self,
        *,
        timeout_seconds: float = 20.0,
        max_pages: int = 4,
        max_characters: int = 16000,
        client: httpx.Client | None = None,
    ) -> None:
        """Create the client.

        Args:
            timeout_seconds: Timeout for the HTTP client created here.
            max_pages: Maximum number of pages returned by ``fetch``,
                homepage included.
            max_characters: Maximum length of the text kept per page.
            client: Optional pre-built HTTP client. When omitted, a client is
                created here and closed again by ``close``.
        """
        # Only close the HTTP client in ``close`` when this instance created it.
        self._owns_client = client is None
        self._client = client or httpx.Client(
            follow_redirects=True,
            timeout=timeout_seconds,
            headers={"User-Agent": "CompanyEnrichmentBot/1.0 (+business-information-research)"},
        )
        self._max_pages = max_pages
        self._max_characters = max_characters

    def fetch(self, domain: str) -> list[WebsitePage]:
        """Return the homepage plus same-domain contact/about pages.

        Returns an empty list when the homepage cannot be fetched. Pages whose
        final URL was already collected are not added twice.
        """
        homepage = self._fetch_homepage(domain)
        if homepage is None:
            return []
        page, links = homepage
        pages = [page]
        seen_urls = {page.url}
        for url, page_type in self._rank_links(page.url, domain, links):
            if len(pages) >= self._max_pages:
                break
            fetched = self._fetch_page(url, page_type)
            if fetched is not None and fetched.url not in seen_urls:
                seen_urls.add(fetched.url)
                pages.append(fetched)
        return pages

    def _fetch_homepage(self, domain: str) -> tuple[WebsitePage, list[tuple[str, str]]] | None:
        """Try ``https`` first, then ``http``, and return the first page that loads."""
        for scheme in ("https", "http"):
            result = self._request(f"{scheme}://{domain}", "homepage")
            if result is not None:
                return result
        return None

    def _fetch_page(self, url: str, page_type: str) -> WebsitePage | None:
        """Fetch one page and discard its link list."""
        result = self._request(url, page_type)
        return result[0] if result else None

    def _request(self, url: str, page_type: str) -> tuple[WebsitePage, list[tuple[str, str]]] | None:
        """Fetch ``url`` and parse it into a page and its raw ``(href, label)`` links.

        Returns ``None`` when the request fails, the status is an error, or
        the content type does not mention HTML.
        """
        try:
            response = self._client.get(url)
            response.raise_for_status()
        except (httpx.HTTPError, ValueError):
            return None
        content_type = response.headers.get("content-type", "")
        if "html" not in content_type.lower():
            return None
        parser = _PageParser()
        parser.feed(response.text)
        # Flush any data the parser is still buffering at the end of the document.
        parser.close()
        text = "\n".join(parser.text)[: self._max_characters]
        final_url = str(response.url)
        linkedin_urls: list[str] = []
        for href, _ in parser.links:
            try:
                absolute = urljoin(final_url, href)
            except ValueError:
                # Malformed href (e.g. an unbalanced "[" in the host): skip this link only.
                continue
            normalized = normalize_linkedin_company_url(absolute)
            if normalized and normalized not in linkedin_urls:
                linkedin_urls.append(normalized)
        return (
            WebsitePage(
                url=final_url,
                title=parser.title,
                text=text,
                page_type=page_type,
                linkedin_urls=linkedin_urls,
            ),
            parser.links,
        )

    def _rank_links(
        self, base_url: str, domain: str, links: list[tuple[str, str]]
    ) -> list[tuple[str, str]]:
        """Pick same-domain links that look like contact or about pages.

        Each link is resolved against ``base_url`` with its fragment removed.
        A link is kept when its path or label contains a ``PAGE_TERMS`` term.
        The result is ``(url, page_type)`` pairs sorted by page-type priority,
        then by URL. Links that cannot be parsed as URLs are skipped.
        """
        ranked: list[tuple[int, str, str]] = []
        seen: set[str] = set()
        for href, label in links:
            try:
                url = urljoin(base_url, href).split("#", 1)[0]
                parsed = urlparse(url)
            except ValueError:
                # Malformed href: ignore it rather than abort the whole ranking.
                continue
            if url in seen or canonical_domain(parsed.hostname or "") != domain:
                continue
            haystack = f"{parsed.path} {label}".lower()
            for priority, (page_type, terms) in enumerate(self.PAGE_TERMS.items()):
                if any(term in haystack for term in terms):
                    seen.add(url)
                    ranked.append((priority, url, page_type))
                    break
        return [(url, page_type) for _, url, page_type in sorted(ranked)]

    def close(self) -> None:
        """Close the HTTP client if this instance created it."""
        if self._owns_client:
            self._client.close()
@@ -0,0 +1 @@
1
+ """Bundled agent skills installed by the leads CLI."""