leads-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- company_discovery/__init__.py +4 -0
- company_discovery/adapters/__init__.py +5 -0
- company_discovery/adapters/apollo.py +189 -0
- company_discovery/adapters/exa.py +112 -0
- company_discovery/adapters/llm.py +118 -0
- company_discovery/adapters/protocols.py +58 -0
- company_discovery/adapters/website.py +154 -0
- company_discovery/bundled_skills/__init__.py +1 -0
- company_discovery/bundled_skills/company-discovery-operator/SKILL.md +72 -0
- company_discovery/bundled_skills/company-discovery-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/company-enrichment-operator/SKILL.md +94 -0
- company_discovery/bundled_skills/company-enrichment-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/company-search-spec-writer/SKILL.md +109 -0
- company_discovery/bundled_skills/company-search-spec-writer/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-discovery-operator/SKILL.md +80 -0
- company_discovery/bundled_skills/contact-discovery-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-enrichment-operator/SKILL.md +86 -0
- company_discovery/bundled_skills/contact-enrichment-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-search-spec-writer/SKILL.md +86 -0
- company_discovery/bundled_skills/contact-search-spec-writer/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/leads-update-operator/SKILL.md +60 -0
- company_discovery/bundled_skills/leads-update-operator/agents/openai.yaml +4 -0
- company_discovery/cli.py +1789 -0
- company_discovery/db/__init__.py +5 -0
- company_discovery/db/contact_enrichment_repository.py +268 -0
- company_discovery/db/contact_repository.py +366 -0
- company_discovery/db/enrichment_repository.py +207 -0
- company_discovery/db/models.py +324 -0
- company_discovery/db/repository.py +363 -0
- company_discovery/db/session.py +48 -0
- company_discovery/domain/__init__.py +24 -0
- company_discovery/domain/contact_models.py +178 -0
- company_discovery/domain/contact_spec.py +86 -0
- company_discovery/domain/models.py +287 -0
- company_discovery/domain/spec.py +263 -0
- company_discovery/migrations.py +190 -0
- company_discovery/prompts/__init__.py +8 -0
- company_discovery/prompts/candidate_evaluation/system.md +13 -0
- company_discovery/prompts/company_enrichment/system.md +42 -0
- company_discovery/prompts/contact_evaluation/system.md +18 -0
- company_discovery/prompts/query_generation/system.md +10 -0
- company_discovery/release_manifest.json +7 -0
- company_discovery/reports/__init__.py +4 -0
- company_discovery/reports/contact_enrichment_exporter.py +108 -0
- company_discovery/reports/contact_exporter.py +132 -0
- company_discovery/reports/enrichment_exporter.py +125 -0
- company_discovery/reports/exporter.py +135 -0
- company_discovery/runtime.py +336 -0
- company_discovery/services/__init__.py +4 -0
- company_discovery/services/contact_enrichment_pipeline.py +344 -0
- company_discovery/services/contact_enrichment_progress.py +37 -0
- company_discovery/services/contact_evaluator.py +110 -0
- company_discovery/services/contact_pipeline.py +295 -0
- company_discovery/services/contact_progress.py +38 -0
- company_discovery/services/enrichment_extractor.py +61 -0
- company_discovery/services/enrichment_pipeline.py +526 -0
- company_discovery/services/enrichment_progress.py +20 -0
- company_discovery/services/enrichment_resolver.py +148 -0
- company_discovery/services/evaluator.py +40 -0
- company_discovery/services/hygiene.py +51 -0
- company_discovery/services/memory.py +150 -0
- company_discovery/services/normalization.py +98 -0
- company_discovery/services/pipeline.py +628 -0
- company_discovery/services/progress.py +48 -0
- company_discovery/services/query_planner.py +47 -0
- company_discovery/settings.py +152 -0
- company_discovery/skill_installer.py +197 -0
- company_discovery/update_plan.py +79 -0
- leads_cli-0.1.0.dist-info/METADATA +277 -0
- leads_cli-0.1.0.dist-info/RECORD +72 -0
- leads_cli-0.1.0.dist-info/WHEEL +4 -0
- leads_cli-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from company_discovery.domain.contact_models import (
|
|
9
|
+
ApolloBatchResult,
|
|
10
|
+
ApolloPersonMatch,
|
|
11
|
+
ApolloPersonRequest,
|
|
12
|
+
)
|
|
13
|
+
from company_discovery.services.normalization import canonical_domain
|
|
14
|
+
from company_discovery.settings import Settings
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ApolloClient:
    """Apollo people-enrichment adapter with bounded retry and async-result support.

    Batches submitted through :meth:`enrich_people` that come back with a
    request id are remembered in memory so that :meth:`poll` can map the
    later result back onto the originally requested people.
    """

    # Upper bound enforced by enrich_people before any request is sent.
    MAX_BATCH_SIZE = 10

    def __init__(self, settings: Settings, client: httpx.Client | None = None) -> None:
        """Create the adapter.

        Args:
            settings: Supplies the Apollo API key, base URL, timeout and the
                optional webhook URL.
            client: Optional pre-built HTTP client. When omitted, a client is
                created here and closed again by :meth:`close`.

        Raises:
            ValueError: If ``settings.apollo_api_key`` is empty.
        """
        if not settings.apollo_api_key:
            raise ValueError("APOLLO_API_KEY is required for contact enrichment")
        self._settings = settings
        # Only close clients we created ourselves; injected ones belong to the caller.
        self._owns_client = client is None
        self._client = client or httpx.Client(
            base_url=settings.apollo_base_url.rstrip("/"),
            headers={
                "X-Api-Key": settings.apollo_api_key,
                "Content-Type": "application/json",
                "Cache-Control": "no-cache",
            },
            timeout=settings.apollo_timeout_seconds,
        )
        # request_id -> people submitted in that batch, kept until poll() sees a
        # non-pending result.
        self._pending_people: dict[str, list[ApolloPersonRequest]] = {}

    def enrich_people(
        self,
        people: list[ApolloPersonRequest],
        *,
        reveal_email: bool,
        reveal_phone: bool,
    ) -> ApolloBatchResult:
        """Submit one bulk-match request for ``people``.

        Args:
            people: People to enrich; at most ``MAX_BATCH_SIZE`` entries.
            reveal_email: Enables waterfall email, but only when a webhook URL
                is configured.
            reveal_phone: Enables phone reveal and waterfall phone; requires a
                configured webhook URL.

        Returns:
            The parsed batch result. An empty ``people`` list short-circuits to
            an empty ``ApolloBatchResult`` without any HTTP call.

        Raises:
            ValueError: If the batch is too large, or phone reveal is requested
                without ``apollo_webhook_url``.
        """
        if not people:
            return ApolloBatchResult()
        if len(people) > self.MAX_BATCH_SIZE:
            raise ValueError(f"Apollo bulk enrichment accepts at most {self.MAX_BATCH_SIZE} people")
        if reveal_phone and not self._settings.apollo_webhook_url:
            raise ValueError(
                "APOLLO_WEBHOOK_URL is required when phone enrichment is enabled; "
                "use --no-phone for synchronous email-only enrichment"
            )

        payload: dict[str, Any] = {
            "details": [self._person_payload(person) for person in people],
            "reveal_personal_emails": False,
            "reveal_phone_number": reveal_phone,
            # Standard work email is synchronous. Waterfall email becomes useful only when
            # Apollo has a webhook destination for its asynchronous completion payload.
            "run_waterfall_email": reveal_email and bool(self._settings.apollo_webhook_url),
            "run_waterfall_phone": reveal_phone,
        }
        if self._settings.apollo_webhook_url:
            payload["webhook_url"] = self._settings.apollo_webhook_url
        response = self._request("POST", "/api/v1/people/bulk_match", json=payload)
        data = response.json()
        result = self._parse(data, people)
        if result.request_id:
            # Remember the batch so poll() can line results up with candidates.
            self._pending_people[result.request_id] = people
        return result

    def poll(self, request_id: str) -> ApolloBatchResult:
        """Fetch the webhook result for ``request_id`` and parse it.

        The result is matched against the people remembered by
        :meth:`enrich_people`. If this instance never submitted that request
        id, the requested list is empty and the returned result carries no
        matches.
        """
        response = self._request("GET", f"/api/v1/webhook_result/{request_id}")
        data = response.json()
        result = self._parse(data, self._pending_people.get(request_id, []))
        if not result.request_id:
            # The poll payload may omit the id; keep the one we asked for.
            result = result.model_copy(update={"request_id": request_id})
        if not result.pending:
            self._pending_people.pop(request_id, None)
        return result

    def _request(self, method: str, path: str, **kwargs: Any) -> httpx.Response:
        """Send a request, retrying HTTP 429 and 5xx responses up to three attempts.

        Other error statuses raise immediately via ``raise_for_status``. The
        wait between attempts honours ``Retry-After`` when it is usable and
        otherwise backs off exponentially (1s, then 2s).
        """
        for attempt in range(3):
            response = self._client.request(method, path, **kwargs)
            if response.status_code != 429 and response.status_code < 500:
                response.raise_for_status()
                return response
            if attempt == 2:
                # Out of attempts: surface the 429/5xx as an HTTP error.
                response.raise_for_status()
            time.sleep(self._retry_delay(response.headers.get("retry-after"), attempt))
        raise RuntimeError("Apollo request retry loop exited unexpectedly")

    @staticmethod
    def _retry_delay(retry_after: str | None, attempt: int) -> float:
        """Return how many seconds to wait before the next attempt.

        ``Retry-After`` may be a number of seconds or an HTTP-date. A bare
        ``float(retry_after)`` raises ``ValueError`` on the date form (and on
        garbage), which used to abort the retry loop. Unusable values now fall
        back to ``2**attempt`` seconds.
        """
        import math
        from datetime import datetime, timezone
        from email.utils import parsedate_to_datetime

        fallback = float(2**attempt)
        if not retry_after:
            return fallback
        value = retry_after.strip()
        try:
            seconds = float(value)
        except ValueError:
            try:
                when = parsedate_to_datetime(value)
            except (TypeError, ValueError):
                return fallback
            if when.tzinfo is None:
                when = when.replace(tzinfo=timezone.utc)
            # A date in the past means "retry now".
            return max(0.0, (when - datetime.now(timezone.utc)).total_seconds())
        if not math.isfinite(seconds) or seconds < 0:
            # time.sleep rejects negative, NaN and infinite values.
            return fallback
        return seconds

    @staticmethod
    def _person_payload(person: ApolloPersonRequest) -> dict[str, str]:
        """Build one ``details`` entry; ``linkedin_url`` is included only when set."""
        payload = {
            "first_name": person.first_name,
            "last_name": person.last_name,
            "name": person.full_name,
            "organization_name": person.company_name,
            "domain": person.company_domain,
        }
        if person.linkedin_url:
            payload["linkedin_url"] = person.linkedin_url
        return payload

    @classmethod
    def _parse(
        cls, payload: dict[str, Any], requested: list[ApolloPersonRequest]
    ) -> ApolloBatchResult:
        """Turn a response payload into an ``ApolloBatchResult``.

        Records are read from ``matches``, ``people`` or ``results``, either at
        the top level or under a ``data`` dict, and are paired with
        ``requested`` strictly by position. A requested person without a usable
        record yields a match with ``person_found=False``.
        """
        request_id = cls._request_id(payload)
        status = str(payload.get("status") or payload.get("state") or "").lower()
        pending = status in {"pending", "processing", "queued", "running"}
        source = payload.get("data") if isinstance(payload.get("data"), dict) else payload
        records = source.get("matches") or source.get("people") or source.get("results") or []
        if isinstance(records, dict):
            records = records.get("matches") or records.get("people") or records.get("results") or []
        if not isinstance(records, list):
            records = []
        # A request id with no records and no terminal status is treated as still running.
        if request_id and not records and status not in {"complete", "completed", "success", "succeeded"}:
            pending = True

        matches: list[ApolloPersonMatch] = []
        for index, requested_person in enumerate(requested):
            raw = records[index] if index < len(records) and isinstance(records[index], dict) else {}
            # Records may wrap the person under a "person" key or be the person itself.
            person = raw.get("person") if isinstance(raw.get("person"), dict) else raw
            found = bool(person) and not bool(raw.get("error"))
            organization = person.get("organization") if isinstance(person.get("organization"), dict) else {}
            phones = cls._phones(person)
            raw_domain = (
                organization.get("primary_domain")
                or organization.get("website_url")
                or person.get("organization_domain")
            )
            matches.append(
                ApolloPersonMatch(
                    candidate_id=requested_person.candidate_id,
                    person_found=found,
                    full_name=person.get("name") or cls._joined_name(person),
                    linkedin_url=person.get("linkedin_url"),
                    title=person.get("title"),
                    organization_name=organization.get("name") or person.get("organization_name"),
                    organization_domain=canonical_domain(str(raw_domain)) if raw_domain else None,
                    email=person.get("email"),
                    email_status=person.get("email_status"),
                    phones=phones,
                    apollo_person_id=person.get("id"),
                    raw=raw,
                )
            )
        return ApolloBatchResult(matches=matches, request_id=request_id, pending=pending)

    @staticmethod
    def _request_id(payload: dict[str, Any]) -> str | None:
        """Return the request id from the payload or its ``data`` dict, if any."""
        direct = payload.get("request_id") or payload.get("requestId")
        if direct:
            return str(direct)
        data = payload.get("data")
        if isinstance(data, dict):
            nested = data.get("request_id") or data.get("requestId")
            return str(nested) if nested else None
        return None

    @staticmethod
    def _joined_name(person: dict[str, Any]) -> str | None:
        """Join the non-empty first and last name; ``None`` when both are missing."""
        name = " ".join(
            part for part in (person.get("first_name"), person.get("last_name")) if part
        )
        return name or None

    @staticmethod
    def _phones(person: dict[str, Any]) -> list[str]:
        """Collect unique phone numbers as strings, preserving first-seen order.

        Values are stringified *before* the duplicate check. Comparing the raw
        value against the list of strings let a non-string number be appended
        twice.
        """
        values: list[str] = []
        entries = person.get("phone_numbers")
        if not isinstance(entries, list):
            entries = []
        for phone in entries:
            if not isinstance(phone, dict):
                continue
            value = phone.get("sanitized_number") or phone.get("raw_number") or phone.get("number")
            if not value:
                continue
            text = str(value)
            if text not in values:
                values.append(text)
        direct = person.get("phone") or person.get("mobile_phone")
        if direct and str(direct) not in values:
            values.append(str(direct))
        return values

    def close(self) -> None:
        """Close the HTTP client if this adapter created it."""
        if self._owns_client:
            self._client.close()
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from company_discovery.domain.models import ExaSearchResult
|
|
9
|
+
from company_discovery.settings import Settings
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ExaClient:
    """Minimal Exa company-search adapter that preserves each provider payload.

    Every search records the cost reported by the response (see
    ``last_cost_dollars``) and keeps the raw result item on the returned model.
    """

    def __init__(self, settings: Settings, client: httpx.Client | None = None) -> None:
        """Store settings and build an HTTP client unless one is injected.

        Raises:
            ValueError: If ``settings.exa_api_key`` is empty.
        """
        if not settings.exa_api_key:
            raise ValueError("EXA_API_KEY is required for external discovery")
        self._settings = settings
        self._owns_client = client is None
        if client:
            self._client = client
        else:
            default_headers = {
                "x-api-key": settings.exa_api_key,
                "content-type": "application/json",
            }
            self._client = httpx.Client(
                base_url=settings.exa_base_url.rstrip("/"),
                headers=default_headers,
                timeout=settings.exa_timeout_seconds,
            )
        self._last_cost_dollars = 0.0
        # Monotonic timestamp of the most recent POST; None until the first one.
        self._last_request_at: float | None = None

    @property
    def last_cost_dollars(self) -> float:
        """Cost read from the most recent search response (0.0 before any search)."""
        return self._last_cost_dollars

    def search(self, query: str, *, country: str, num_results: int) -> list[ExaSearchResult]:
        """Run a search restricted to the ``company`` category."""
        return self._search(query, country=country, num_results=num_results, category="company")

    def search_people(
        self, query: str, *, country: str, num_results: int
    ) -> list[ExaSearchResult]:
        """Run a search restricted to the ``people`` category."""
        return self._search(query, country=country, num_results=num_results, category="people")

    def search_contact_evidence(
        self, query: str, *, country: str, num_results: int
    ) -> list[ExaSearchResult]:
        """Run a search without any category filter."""
        return self._search(query, country=country, num_results=num_results, category=None)

    def _search(
        self, query: str, *, country: str, num_results: int, category: str | None
    ) -> list[ExaSearchResult]:
        """POST one search and convert each result item into ``ExaSearchResult``.

        ``num_results`` is clamped to the range 1..100, and result positions
        are numbered from 1.
        """
        clamped = max(1, min(num_results, 100))
        body = {
            "query": query,
            "numResults": clamped,
            "type": "auto",
            "userLocation": country.upper(),
            "contents": {"text": {"maxCharacters": 3000}},
        }
        if category is not None:
            body["category"] = category
        if category == "company":
            body["systemPrompt"] = (
                "Return official operating-company websites. Avoid directories, associations, "
                "marketplaces, news pages, and duplicate companies."
            )
        data = self._post_with_retry(body).json()
        self._last_cost_dollars = self._read_cost(data)

        results: list[ExaSearchResult] = []
        for position, item in enumerate(data.get("results", []), start=1):
            results.append(
                ExaSearchResult(
                    query=query,
                    position=position,
                    title=item.get("title") or "",
                    url=item.get("url") or "",
                    text=item.get("text"),
                    published_date=item.get("publishedDate"),
                    exa_id=item.get("id"),
                    raw=item,
                )
            )
        return results

    def _post_with_retry(self, payload: dict[str, Any]) -> httpx.Response:
        """POST to ``/search``, retrying HTTP 429 and 5xx up to three attempts.

        Each attempt is paced first. The wait between attempts uses an
        all-digit ``Retry-After`` header when present, otherwise
        ``2**attempt`` seconds.
        """
        for attempt in range(3):
            self._pace_request()
            response = self._client.post("/search", json=payload)
            self._last_request_at = time.monotonic()
            code = response.status_code
            retryable = code == 429 or code >= 500
            if not retryable:
                response.raise_for_status()
                return response
            if attempt == 2:
                response.raise_for_status()
            header = response.headers.get("retry-after")
            if header and header.isdigit():
                delay = float(header)
            else:
                delay = 2**attempt
            time.sleep(delay)
        raise RuntimeError("Exa request retry loop exited unexpectedly")

    def _pace_request(self) -> None:
        """Sleep so that at least 0.21 seconds separate consecutive POSTs."""
        previous = self._last_request_at
        if previous is None:
            return
        remaining = 0.21 - (time.monotonic() - previous)
        if remaining > 0:
            time.sleep(remaining)

    @staticmethod
    def _read_cost(payload: dict[str, Any]) -> float:
        """Read ``costDollars`` as a float.

        Accepts either a dict with a numeric ``total`` or a bare number;
        anything else yields 0.0.
        """
        cost = payload.get("costDollars")
        if isinstance(cost, dict):
            total = cost.get("total")
            return float(total) if isinstance(total, (int, float)) else 0.0
        if isinstance(cost, (int, float)):
            return float(cost)
        return 0.0

    def close(self) -> None:
        """Close the HTTP client if this adapter created it."""
        if self._owns_client:
            self._client.close()
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
from pydantic import BaseModel, ValidationError
|
|
8
|
+
|
|
9
|
+
from company_discovery.settings import Settings
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class OpenAICompatibleLLM:
    """Structured-output client for OpenAI-compatible chat completion APIs.

    Responses are validated against a pydantic model. One corrective
    follow-up request is made when the first reply fails validation.
    """

    def __init__(self, settings: Settings, client: httpx.Client | None = None) -> None:
        """Store settings and build an HTTP client unless one is injected.

        Raises:
            ValueError: If ``settings.llm_api_key`` is empty.
        """
        if not settings.llm_api_key:
            raise ValueError("LLM_API_KEY is required for query generation and evaluation")
        self._settings = settings
        self._owns_client = client is None
        if client:
            self._client = client
        else:
            auth_headers = {
                "Authorization": f"Bearer {settings.llm_api_key}",
                "Content-Type": "application/json",
            }
            self._client = httpx.Client(
                base_url=settings.llm_base_url.rstrip("/"),
                headers=auth_headers,
                timeout=settings.llm_timeout_seconds,
            )

    def generate(
        self,
        *,
        system_prompt: str,
        user_prompt: str,
        response_model: type[BaseModel],
    ) -> BaseModel:
        """Ask the model for JSON and validate it as ``response_model``.

        In ``json_object`` mode the JSON Schema is appended to the system
        prompt. If the first reply fails validation, the reply and the
        validation error are added to the conversation and the request is sent
        once more.

        Raises:
            ValueError: If the model refuses, or the second reply is still
                invalid.
        """
        schema = response_model.model_json_schema()
        response_format = self._response_format(response_model, schema)

        system_content = system_prompt
        if self._settings.resolved_llm_response_format == "json_object":
            schema_text = json.dumps(schema, ensure_ascii=True)
            system_content = (
                f"{system_prompt}\n\n"
                "Return JSON only. The JSON must match this exact JSON Schema:\n"
                f"{schema_text}"
            )
        conversation = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_prompt},
        ]

        for attempt in range(2):
            request_body = {
                "model": self._settings.llm_model,
                "messages": conversation,
                "max_tokens": self._settings.llm_max_tokens,
                "response_format": response_format,
            }
            reply = self._post_with_retry(request_body).json()["choices"][0]["message"]
            refusal = reply.get("refusal")
            if refusal:
                raise ValueError(f"LLM refused structured generation: {refusal}")
            raw_content = reply.get("content")
            # Anything that is not a string is treated as an empty reply.
            content = raw_content if isinstance(raw_content, str) else ""
            try:
                return response_model.model_validate_json(content)
            except (ValidationError, json.JSONDecodeError) as exc:
                if attempt == 1:
                    raise ValueError(f"LLM returned invalid {response_model.__name__}: {exc}") from exc
                correction = (
                    "Correct the response to satisfy the supplied JSON Schema. "
                    f"Validation error: {exc}. Return JSON only."
                )
                conversation.append({"role": "assistant", "content": content})
                conversation.append({"role": "user", "content": correction})
        raise RuntimeError("structured generation exhausted retries")

    def _response_format(self, response_model: type[BaseModel], schema: dict) -> dict:
        """Build the ``response_format`` field for the configured mode."""
        if self._settings.resolved_llm_response_format == "json_object":
            return {"type": "json_object"}
        json_schema = {
            "name": response_model.__name__.lower(),
            "strict": True,
            "schema": schema,
        }
        return {"type": "json_schema", "json_schema": json_schema}

    def _post_with_retry(self, payload: dict) -> httpx.Response:
        """POST to ``/chat/completions`` with up to three attempts.

        Transport errors, HTTP 429 and 5xx are retried. Any other error status
        raises ``ValueError`` carrying up to 1000 characters of the response
        body.
        """
        for attempt in range(3):
            final_attempt = attempt == 2
            try:
                response = self._client.post("/chat/completions", json=payload)
            except httpx.TransportError:
                if final_attempt:
                    raise
                time.sleep(2**attempt)
                continue
            code = response.status_code
            retryable = code == 429 or code >= 500
            if not retryable:
                if response.is_error:
                    detail = response.text.strip()[:1000]
                    raise ValueError(
                        f"LLM API returned HTTP {response.status_code}: {detail or 'no error body'}"
                    )
                return response
            if final_attempt:
                response.raise_for_status()
            header = response.headers.get("retry-after")
            if header and header.isdigit():
                delay = float(header)
            else:
                delay = 2**attempt
            time.sleep(delay)
        raise RuntimeError("LLM request retry loop exited unexpectedly")

    def close(self) -> None:
        """Close the HTTP client if this adapter created it."""
        if self._owns_client:
            self._client.close()
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Protocol
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
from company_discovery.domain.contact_models import ApolloBatchResult, ApolloPersonRequest
|
|
8
|
+
from company_discovery.domain.models import ExaSearchResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class StructuredLLM(Protocol):
    """Structural interface for a client that returns validated model instances."""

    def generate(
        self,
        *,
        system_prompt: str,
        user_prompt: str,
        response_model: type[BaseModel],
    ) -> BaseModel:
        """Produce an instance of ``response_model`` from the two prompts."""

    def close(self) -> None:
        """Release any resources held by the client."""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class CompanySearchProvider(Protocol):
    """Structural interface for a company search backend."""

    @property
    def last_cost_dollars(self) -> float:
        """Cost attributed to the most recent search."""

    def search(self, query: str, *, country: str, num_results: int) -> list[ExaSearchResult]:
        """Return search results for ``query``."""

    def close(self) -> None:
        """Release any resources held by the provider."""
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ContactSearchProvider(Protocol):
    """Structural interface for a people / contact-evidence search backend."""

    @property
    def last_cost_dollars(self) -> float:
        """Cost attributed to the most recent search."""

    def search_people(
        self, query: str, *, country: str, num_results: int
    ) -> list[ExaSearchResult]:
        """Return people-oriented results for ``query``."""

    def search_contact_evidence(
        self, query: str, *, country: str, num_results: int
    ) -> list[ExaSearchResult]:
        """Return contact-evidence results for ``query``."""

    def close(self) -> None:
        """Release any resources held by the provider."""
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ContactEnrichmentProvider(Protocol):
    """Structural interface for a contact-enrichment backend with result polling."""

    def enrich_people(
        self,
        people: list[ApolloPersonRequest],
        *,
        reveal_email: bool,
        reveal_phone: bool,
    ) -> ApolloBatchResult:
        """Submit ``people`` for enrichment and return the batch result."""

    def poll(self, request_id: str) -> ApolloBatchResult:
        """Return the current result for a previously submitted request."""

    def close(self) -> None:
        """Release any resources held by the provider."""
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from html.parser import HTMLParser
|
|
4
|
+
from urllib.parse import urljoin, urlparse
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from company_discovery.domain.models import WebsitePage
|
|
9
|
+
from company_discovery.services.normalization import canonical_domain
|
|
10
|
+
from company_discovery.services.enrichment_resolver import normalize_linkedin_company_url
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class _PageParser(HTMLParser):
    """Collect the title, visible text and anchors of one HTML document.

    After ``feed()``:
      * ``title`` holds the whitespace-normalised text of ``<title>`` elements
        that are not nested in a hidden element,
      * ``text`` holds the text chunks found outside hidden elements,
      * ``links`` holds ``(href, label)`` pairs for closed anchors with a
        non-empty ``href``.
    """

    # Elements whose text content is not treated as visible page text.
    _HIDDEN_TAGS = frozenset({"script", "style", "noscript", "svg"})

    def __init__(self) -> None:
        super().__init__()
        self.text: list[str] = []
        self.links: list[tuple[str, str]] = []
        self.title = ""
        # Nesting depth of hidden elements currently open.
        self._hidden = 0
        self._in_title = False
        # href of the anchor currently open, or None when outside an anchor.
        self._anchor_href: str | None = None
        self._anchor_text: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Track entry into hidden elements, the title and anchors."""
        if tag in self._HIDDEN_TAGS:
            self._hidden += 1
        # <svg> may contain its own <title> (an icon tooltip). Without the
        # hidden check that text was appended to the page title.
        if tag == "title" and not self._hidden:
            self._in_title = True
        if tag == "a":
            self._anchor_href = dict(attrs).get("href")
            self._anchor_text = []

    def handle_endtag(self, tag: str) -> None:
        """Track exit from hidden elements, the title and anchors."""
        if tag in self._HIDDEN_TAGS and self._hidden:
            self._hidden -= 1
        if tag == "title":
            self._in_title = False
        if tag == "a":
            if self._anchor_href:
                self.links.append((self._anchor_href, " ".join(self._anchor_text)))
            # Always leave anchor state on </a>. An empty href used to stay
            # "open" and keep collecting text until the next <a>.
            self._anchor_href = None

    def handle_data(self, data: str) -> None:
        """Route one whitespace-normalised text chunk to title, anchor label and text."""
        value = " ".join(data.split())
        if not value:
            return
        if self._in_title:
            self.title = f"{self.title} {value}".strip()
        if self._anchor_href is not None:
            self._anchor_text.append(value)
        if not self._hidden:
            self.text.append(value)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class WebsiteClient:
    """Fetch a small official-site page pack anchored to a known root domain.

    The homepage is fetched first. Its same-domain links are then ranked by
    ``PAGE_TERMS`` and fetched until ``max_pages`` pages have been collected.
    """

    # page_type -> substrings looked for in a link's path and label.
    # Dict order is the fetch priority (earlier entries first).
    PAGE_TERMS = {
        "contact": ("contact", "locations", "location", "offices"),
        "about": ("about", "company", "who-we-are", "our-story", "ownership"),
    }

    def __init__(
        self,
        *,
        timeout_seconds: float = 20.0,
        max_pages: int = 4,
        max_characters: int = 16000,
        client: httpx.Client | None = None,
    ) -> None:
        """Create the client.

        Args:
            timeout_seconds: Timeout for the internally created HTTP client.
            max_pages: Maximum number of pages returned by :meth:`fetch`,
                homepage included.
            max_characters: Maximum length of the extracted text per page.
            client: Optional pre-built HTTP client. When omitted, a client is
                created here and closed again by :meth:`close`.
        """
        self._owns_client = client is None
        self._client = client or httpx.Client(
            follow_redirects=True,
            timeout=timeout_seconds,
            headers={"User-Agent": "CompanyEnrichmentBot/1.0 (+business-information-research)"},
        )
        self._max_pages = max_pages
        self._max_characters = max_characters

    def fetch(self, domain: str) -> list[WebsitePage]:
        """Return the homepage plus ranked contact/about pages for ``domain``.

        Returns an empty list when the homepage cannot be fetched. Pages whose
        final URL was already collected are skipped.
        """
        homepage = self._fetch_homepage(domain)
        if homepage is None:
            return []
        page, links = homepage
        pages = [page]
        collected_urls = {page.url}
        for url, page_type in self._rank_links(page.url, domain, links):
            if len(pages) >= self._max_pages:
                break
            fetched = self._fetch_page(url, page_type)
            if fetched is not None and fetched.url not in collected_urls:
                pages.append(fetched)
                collected_urls.add(fetched.url)
        return pages

    def _fetch_homepage(self, domain: str) -> tuple[WebsitePage, list[tuple[str, str]]] | None:
        """Try ``https://`` and then ``http://``; return the first page that loads."""
        for scheme in ("https", "http"):
            result = self._request(f"{scheme}://{domain}", "homepage")
            if result is not None:
                return result
        return None

    def _fetch_page(self, url: str, page_type: str) -> WebsitePage | None:
        """Fetch one page and drop its link list."""
        result = self._request(url, page_type)
        return result[0] if result else None

    def _request(self, url: str, page_type: str) -> tuple[WebsitePage, list[tuple[str, str]]] | None:
        """GET ``url`` and parse it into a ``WebsitePage`` plus its raw anchors.

        Returns ``None`` on request failure, an error status, an unusable URL
        or a non-HTML content type.
        """
        try:
            response = self._client.get(url)
            response.raise_for_status()
        except (httpx.HTTPError, httpx.InvalidURL, ValueError):
            # httpx.InvalidURL is not an HTTPError subclass, so it is named
            # explicitly; a malformed URL is just another unfetchable page.
            return None
        content_type = response.headers.get("content-type", "")
        if "html" not in content_type.lower():
            return None
        parser = _PageParser()
        parser.feed(response.text)
        text = "\n".join(parser.text)[: self._max_characters]

        page_url = str(response.url)
        # A dict keeps first-seen order while removing duplicates.
        linkedin_seen: dict[str, None] = {}
        for href, _ in parser.links:
            absolute = self._resolve(page_url, href)
            if absolute is None:
                continue
            normalized = normalize_linkedin_company_url(absolute)
            if normalized:
                linkedin_seen[normalized] = None
        return (
            WebsitePage(
                url=page_url,
                title=parser.title,
                text=text,
                page_type=page_type,
                linkedin_urls=list(linkedin_seen),
            ),
            parser.links,
        )

    @staticmethod
    def _resolve(base_url: str, href: str) -> str | None:
        """Join ``href`` onto ``base_url``; ``None`` when the href is malformed.

        ``urljoin`` raises ``ValueError`` for hrefs such as ``http://[bad``.
        Pages are untrusted input, so one broken link must not abort the fetch.
        """
        try:
            return urljoin(base_url, href)
        except ValueError:
            return None

    def _rank_links(
        self, base_url: str, domain: str, links: list[tuple[str, str]]
    ) -> list[tuple[str, str]]:
        """Pick same-domain links that look like contact/about pages.

        Args:
            base_url: URL the hrefs are resolved against.
            domain: Domain the link host must canonicalise to.
            links: ``(href, label)`` pairs from the page.

        Returns:
            ``(url, page_type)`` pairs without fragments or duplicates, ordered
            by ``PAGE_TERMS`` priority and then by URL.
        """
        ranked: list[tuple[int, str, str]] = []
        seen: set[str] = set()
        for href, label in links:
            resolved = self._resolve(base_url, href)
            if resolved is None:
                continue
            url = resolved.split("#", 1)[0]
            try:
                parsed = urlparse(url)
                host = parsed.hostname or ""
            except ValueError:
                continue
            if url in seen or canonical_domain(host) != domain:
                continue
            haystack = f"{parsed.path} {label}".lower()
            for priority, (page_type, terms) in enumerate(self.PAGE_TERMS.items()):
                if any(term in haystack for term in terms):
                    seen.add(url)
                    ranked.append((priority, url, page_type))
                    break
        return [(url, page_type) for _, url, page_type in sorted(ranked)]

    def close(self) -> None:
        """Close the HTTP client if this instance created it."""
        if self._owns_client:
            self._client.close()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Bundled agent skills installed by the leads CLI."""
|