leads-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- company_discovery/__init__.py +4 -0
- company_discovery/adapters/__init__.py +5 -0
- company_discovery/adapters/apollo.py +189 -0
- company_discovery/adapters/exa.py +112 -0
- company_discovery/adapters/llm.py +118 -0
- company_discovery/adapters/protocols.py +58 -0
- company_discovery/adapters/website.py +154 -0
- company_discovery/bundled_skills/__init__.py +1 -0
- company_discovery/bundled_skills/company-discovery-operator/SKILL.md +72 -0
- company_discovery/bundled_skills/company-discovery-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/company-enrichment-operator/SKILL.md +94 -0
- company_discovery/bundled_skills/company-enrichment-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/company-search-spec-writer/SKILL.md +109 -0
- company_discovery/bundled_skills/company-search-spec-writer/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-discovery-operator/SKILL.md +80 -0
- company_discovery/bundled_skills/contact-discovery-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-enrichment-operator/SKILL.md +86 -0
- company_discovery/bundled_skills/contact-enrichment-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-search-spec-writer/SKILL.md +86 -0
- company_discovery/bundled_skills/contact-search-spec-writer/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/leads-update-operator/SKILL.md +60 -0
- company_discovery/bundled_skills/leads-update-operator/agents/openai.yaml +4 -0
- company_discovery/cli.py +1789 -0
- company_discovery/db/__init__.py +5 -0
- company_discovery/db/contact_enrichment_repository.py +268 -0
- company_discovery/db/contact_repository.py +366 -0
- company_discovery/db/enrichment_repository.py +207 -0
- company_discovery/db/models.py +324 -0
- company_discovery/db/repository.py +363 -0
- company_discovery/db/session.py +48 -0
- company_discovery/domain/__init__.py +24 -0
- company_discovery/domain/contact_models.py +178 -0
- company_discovery/domain/contact_spec.py +86 -0
- company_discovery/domain/models.py +287 -0
- company_discovery/domain/spec.py +263 -0
- company_discovery/migrations.py +190 -0
- company_discovery/prompts/__init__.py +8 -0
- company_discovery/prompts/candidate_evaluation/system.md +13 -0
- company_discovery/prompts/company_enrichment/system.md +42 -0
- company_discovery/prompts/contact_evaluation/system.md +18 -0
- company_discovery/prompts/query_generation/system.md +10 -0
- company_discovery/release_manifest.json +7 -0
- company_discovery/reports/__init__.py +4 -0
- company_discovery/reports/contact_enrichment_exporter.py +108 -0
- company_discovery/reports/contact_exporter.py +132 -0
- company_discovery/reports/enrichment_exporter.py +125 -0
- company_discovery/reports/exporter.py +135 -0
- company_discovery/runtime.py +336 -0
- company_discovery/services/__init__.py +4 -0
- company_discovery/services/contact_enrichment_pipeline.py +344 -0
- company_discovery/services/contact_enrichment_progress.py +37 -0
- company_discovery/services/contact_evaluator.py +110 -0
- company_discovery/services/contact_pipeline.py +295 -0
- company_discovery/services/contact_progress.py +38 -0
- company_discovery/services/enrichment_extractor.py +61 -0
- company_discovery/services/enrichment_pipeline.py +526 -0
- company_discovery/services/enrichment_progress.py +20 -0
- company_discovery/services/enrichment_resolver.py +148 -0
- company_discovery/services/evaluator.py +40 -0
- company_discovery/services/hygiene.py +51 -0
- company_discovery/services/memory.py +150 -0
- company_discovery/services/normalization.py +98 -0
- company_discovery/services/pipeline.py +628 -0
- company_discovery/services/progress.py +48 -0
- company_discovery/services/query_planner.py +47 -0
- company_discovery/settings.py +152 -0
- company_discovery/skill_installer.py +197 -0
- company_discovery/update_plan.py +79 -0
- leads_cli-0.1.0.dist-info/METADATA +277 -0
- leads_cli-0.1.0.dist-info/RECORD +72 -0
- leads_cli-0.1.0.dist-info/WHEEL +4 -0
- leads_cli-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from urllib.parse import urlsplit
|
|
7
|
+
|
|
8
|
+
from company_discovery.adapters.protocols import ContactEnrichmentProvider
|
|
9
|
+
from company_discovery.db.contact_enrichment_repository import ContactEnrichmentRepository
|
|
10
|
+
from company_discovery.domain.contact_models import (
|
|
11
|
+
ApolloBatchResult,
|
|
12
|
+
ApolloPersonMatch,
|
|
13
|
+
ApolloPersonRequest,
|
|
14
|
+
ContactChannelProfile,
|
|
15
|
+
ContactEnrichmentItem,
|
|
16
|
+
ContactEnrichmentOutcome,
|
|
17
|
+
ContactEnrichmentResult,
|
|
18
|
+
ContactEnrichmentSummary,
|
|
19
|
+
)
|
|
20
|
+
from company_discovery.reports.contact_enrichment_exporter import (
|
|
21
|
+
ContactEnrichmentArtifactExporter,
|
|
22
|
+
)
|
|
23
|
+
from company_discovery.services.contact_enrichment_progress import (
|
|
24
|
+
ContactEnrichmentProgressReporter,
|
|
25
|
+
NullContactEnrichmentProgressReporter,
|
|
26
|
+
)
|
|
27
|
+
from company_discovery.services.normalization import canonical_domain
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Consumer mailbox providers. An email whose domain is in this set is flagged
# "personal_email" during resolution and keeps the contact out of READY.
PERSONAL_EMAIL_DOMAINS = {
    "aol.com",
    "gmail.com",
    "googlemail.com",
    "hotmail.com",
    "icloud.com",
    "outlook.com",
    "proton.me",
    "protonmail.com",
    "yahoo.com",
}

# Provider email statuses (compared lower-cased) that cause the address to be
# discarded and an "email_<status>" review flag to be recorded instead.
INVALID_EMAIL_STATUSES = {
    "bounced",
    "do_not_mail",
    "invalid",
    "unavailable",
}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(frozen=True)
class ContactEnrichmentOptions:
    """Immutable switches selecting what a contact-enrichment run asks for."""

    # Ask the provider for an email address and expose it on the result.
    reveal_email: bool = True
    # Ask the provider for phone numbers and expose the first one on the result.
    reveal_phone: bool = True
    # When true, previously stored enrichment items are not looked up for reuse.
    refresh: bool = False

    def as_dict(self) -> dict[str, bool]:
        """Return the options as a plain ``{name: flag}`` mapping."""
        field_names = ("reveal_email", "reveal_phone", "refresh")
        return {name: getattr(self, name) for name in field_names}
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class ContactEnrichmentPipeline:
    """Enrich the accepted contacts of a contact run with email/phone channels.

    For every accepted contact the pipeline either reuses a stored enrichment
    item or sends the contact to the enrichment provider in batches. Each
    provider match is then classified as READY, REVIEW or BLOCKED, persisted
    through the repository and exported through the artifact exporter.
    """

    # Number of contacts sent to the provider in one ``enrich_people`` call.
    BATCH_SIZE = 10

    def __init__(
        self,
        *,
        repository: ContactEnrichmentRepository,
        exporter: ContactEnrichmentArtifactExporter,
        provider: ContactEnrichmentProvider,
        freshness_days: int = 14,
        poll_interval_seconds: float = 2,
        poll_timeout_seconds: float = 120,
    ) -> None:
        """Store the collaborators and tuning knobs.

        Args:
            repository: Persistence for runs and enrichment items.
            exporter: Writes the run payload and summary to artifacts.
            provider: Enrichment provider offering ``enrich_people`` and ``poll``.
            freshness_days: Passed to ``repository.fresh_item`` when looking
                up a reusable stored item.
            poll_interval_seconds: Pause between polls of a pending request.
            poll_timeout_seconds: Time budget for a pending request before
                ``TimeoutError`` is raised.
        """
        self._repository = repository
        self._exporter = exporter
        self._provider = provider
        self._freshness_days = freshness_days
        self._poll_interval_seconds = poll_interval_seconds
        self._poll_timeout_seconds = poll_timeout_seconds

    def enrich(
        self,
        contact_run_id: str,
        *,
        options: ContactEnrichmentOptions | None = None,
        progress: ContactEnrichmentProgressReporter | None = None,
    ) -> ContactEnrichmentResult:
        """Run enrichment for all accepted contacts of ``contact_run_id``.

        Args:
            contact_run_id: Identifier of the source contact run.
            options: Enrichment switches; defaults to ``ContactEnrichmentOptions()``.
            progress: Progress sink; defaults to a no-op reporter.

        Returns:
            The result holding the new run id, summary, items and artifact paths.

        Raises:
            Exception: Any error raised while running is recorded on the run
                via ``repository.fail_run`` and then re-raised unchanged.
        """
        selected_options = options or ContactEnrichmentOptions()
        reporter = progress or NullContactEnrichmentProgressReporter()
        contacts = self._repository.accepted_contacts(contact_run_id)
        run_id = self._repository.create_run(contact_run_id, selected_options.as_dict())
        try:
            return self._run(
                run_id, contact_run_id, contacts, selected_options, reporter
            )
        except Exception as exc:
            # Mark the run as failed before propagating so it is not left open.
            self._repository.fail_run(run_id, exc)
            raise

    def _run(
        self,
        run_id: str,
        contact_run_id: str,
        contacts: list[dict[str, object]],
        options: ContactEnrichmentOptions,
        reporter: ContactEnrichmentProgressReporter,
    ) -> ContactEnrichmentResult:
        """Execute one enrichment run: reuse memory, call the provider, persist.

        Items are saved only after every batch has been resolved, so an error
        in any batch propagates before anything is written for this run.
        """
        summary = ContactEnrichmentSummary(contacts_loaded=len(contacts))
        items: list[ContactEnrichmentItem] = []
        pending: list[dict[str, object]] = []
        reporter.start(contact_run_id, len(contacts))

        # Phase 1: split contacts into "answered from memory" and "needs provider".
        for contact in contacts:
            remembered = None
            if not options.refresh:
                remembered = self._repository.fresh_item(
                    int(contact["candidate_id"]), self._freshness_days
                )
            if remembered is not None and self._memory_satisfies(remembered, options):
                # Hide channels that were stored earlier but not requested now.
                channels = remembered.channels.model_copy(
                    update={
                        "email": remembered.channels.email if options.reveal_email else None,
                        "email_status": (
                            remembered.channels.email_status if options.reveal_email else None
                        ),
                        "phone": remembered.channels.phone if options.reveal_phone else None,
                    }
                )
                item = remembered.model_copy(
                    update={"discovery": self._discovery(contact), "channels": channels}
                )
                items.append(item)
                summary.memory_reused += 1
            else:
                pending.append(contact)
        reporter.memory(summary.memory_reused, len(pending))

        # Phase 2: send the remaining contacts to the provider in fixed-size batches.
        batches = [
            pending[index : index + self.BATCH_SIZE]
            for index in range(0, len(pending), self.BATCH_SIZE)
        ]
        for batch_index, batch in enumerate(batches, start=1):
            reporter.batch(batch_index, len(batches), len(batch))
            requests = [self._request(contact) for contact in batch]
            summary.apollo_batches += 1
            summary.apollo_requests += len(requests)
            result = self._provider.enrich_people(
                requests,
                reveal_email=options.reveal_email,
                reveal_phone=options.reveal_phone,
            )
            if result.pending:
                if not result.request_id:
                    raise RuntimeError("Apollo returned a pending result without a request_id")
                result, polls = self._wait_for_result(result.request_id, reporter)
                summary.async_polls += polls
            by_id = {match.candidate_id: match for match in result.matches}
            for contact in batch:
                candidate_id = int(contact["candidate_id"])
                # A contact missing from the response is treated as "not found".
                match = by_id.get(candidate_id) or ApolloPersonMatch(
                    candidate_id=candidate_id, person_found=False
                )
                item = self._resolve(contact, match, options)
                items.append(item)
                reporter.outcome(
                    str(contact["full_name"]), item.outcome.value, item.review_flags
                )

        # Phase 3: persist every item and tally outcomes.
        for item in items:
            self._repository.save_item(run_id, item)
            if item.outcome == ContactEnrichmentOutcome.READY:
                summary.ready += 1
            elif item.outcome == ContactEnrichmentOutcome.REVIEW:
                summary.review += 1
            else:
                summary.blocked += 1

        lineage = self._repository.get_run(run_id)
        payload = {
            "run_id": run_id,
            "source_contact_run_id": contact_run_id,
            "source_enrichment_run_id": lineage["source_enrichment_run_id"],
            "source_discovery_run_id": lineage["source_discovery_run_id"],
            "options": options.as_dict(),
            "status": "completed",
            "items": [item.model_dump(mode="json") for item in items],
        }
        paths = self._exporter.export(payload, summary)
        self._repository.complete_run(run_id, summary, paths)
        reporter.save(run_id)
        return ContactEnrichmentResult(
            run_id=run_id,
            source_contact_run_id=contact_run_id,
            summary=summary,
            items=items,
            artifact_paths=paths,
        )

    def _wait_for_result(
        self, request_id: str, reporter: ContactEnrichmentProgressReporter
    ) -> tuple[ApolloBatchResult, int]:
        """Poll the provider until ``request_id`` is no longer pending.

        The request is polled at least once, even with a zero timeout. The
        pause between polls is capped at the remaining time budget, so the
        final poll happens at the deadline rather than being skipped.

        Returns:
            The finished batch result and the number of polls performed.

        Raises:
            TimeoutError: The request was still pending once the time budget
                of ``poll_timeout_seconds`` was used up.
        """
        deadline = time.monotonic() + self._poll_timeout_seconds
        attempts = 0
        while True:
            attempts += 1
            reporter.poll(request_id, attempts)
            result = self._provider.poll(request_id)
            if not result.pending:
                return result, attempts
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(self._poll_interval_seconds, remaining))
        raise TimeoutError(
            f"Apollo enrichment {request_id} was still pending after "
            f"{self._poll_timeout_seconds:g} seconds"
        )

    @staticmethod
    def _request(contact: dict[str, object]) -> ApolloPersonRequest:
        """Build the provider request for one contact row.

        The first whitespace-separated token of ``full_name`` becomes the
        first name and the rest the last name. A single-token name is used
        for both. A blank name yields empty first/last names instead of
        raising ``IndexError``.
        """
        full_name = str(contact["full_name"])
        parts = full_name.split()
        first_name = parts[0] if parts else ""
        last_name = " ".join(parts[1:]) if len(parts) > 1 else first_name
        return ApolloPersonRequest(
            candidate_id=int(contact["candidate_id"]),
            first_name=first_name,
            last_name=last_name,
            full_name=full_name,
            company_name=str(contact["company_name"]),
            company_domain=str(contact["company_domain"]),
            linkedin_url=str(contact["linkedin_url"]) if contact.get("linkedin_url") else None,
        )

    @staticmethod
    def _discovery(contact: dict[str, object]) -> dict[str, object]:
        """Return the discovery-stage facts copied onto an enrichment item.

        ``company_name``, ``company_domain``, ``full_name`` and ``title`` are
        required keys; the remaining fields fall back to ``None`` or ``[]``.
        """
        return {
            "company_name": contact["company_name"],
            "company_domain": contact["company_domain"],
            "contact_name": contact["full_name"],
            "title": contact["title"],
            "linkedin_url": contact.get("linkedin_url"),
            "role_keys": contact.get("role_keys", []),
            "source_urls": contact.get("source_urls", []),
            "discovery_reason": contact.get("discovery_reason"),
        }

    @classmethod
    def _resolve(
        cls,
        contact: dict[str, object],
        match: ApolloPersonMatch,
        options: ContactEnrichmentOptions,
    ) -> ContactEnrichmentItem:
        """Classify one provider match against the discovered contact.

        Outcome ladder, first matching rule wins:
            1. identity not confirmed            -> BLOCKED
            2. no email and no phone available   -> BLOCKED
            3. company not supported by the
               provider org domain or the email  -> REVIEW
            4. personal or off-domain email      -> REVIEW
            5. otherwise                         -> READY

        Review flags are collected along the way and de-duplicated in order
        on the returned item.
        """
        discovery = cls._discovery(contact)
        flags: list[str] = []
        target_domain = str(contact["company_domain"]).lower()
        target_linkedin = cls._normalize_linkedin(contact.get("linkedin_url"))
        apollo_linkedin = cls._normalize_linkedin(match.linkedin_url)
        target_name = cls._normalize_text(str(contact["full_name"]))
        apollo_name = cls._normalize_text(match.full_name or "")

        # Identity: conflicting LinkedIn profiles take precedence; otherwise a
        # differing normalized name also rejects the match.
        identity_match = match.person_found
        if target_linkedin and apollo_linkedin and target_linkedin != apollo_linkedin:
            identity_match = False
            flags.append("linkedin_identity_mismatch")
        elif apollo_name and apollo_name != target_name:
            identity_match = False
            flags.append("person_name_mismatch")
        if not match.person_found:
            flags.append("no_apollo_match")

        # Email: drop addresses the provider marks unusable, keep the reason as a flag.
        email = match.email if options.reveal_email else None
        if email and match.email_status and match.email_status.lower() in INVALID_EMAIL_STATUSES:
            email = None
            flags.append(f"email_{match.email_status.lower()}")
        email_domain = cls._email_domain(email)

        # Compare domains case-insensitively: the target domain is lower-cased
        # above, so the provider's domain has to be lower-cased as well.
        organization_domain = (
            match.organization_domain.lower() if match.organization_domain else None
        )
        company_match = organization_domain == target_domain
        company_supported = company_match or email_domain == target_domain
        if match.organization_domain and not company_match:
            flags.append("apollo_company_mismatch")
        if email_domain in PERSONAL_EMAIL_DOMAINS:
            flags.append("personal_email")
        elif email_domain and email_domain != target_domain:
            flags.append("email_domain_mismatch")

        phone = match.phones[0] if options.reveal_phone and match.phones else None
        has_channel = bool(email or phone)
        if not has_channel:
            flags.append("no_contact_channels")

        if not identity_match:
            outcome = ContactEnrichmentOutcome.BLOCKED
        elif not has_channel:
            outcome = ContactEnrichmentOutcome.BLOCKED
        elif not company_supported:
            flags.append("company_not_confirmed_by_apollo_channel")
            outcome = ContactEnrichmentOutcome.REVIEW
        elif "personal_email" in flags or "email_domain_mismatch" in flags:
            outcome = ContactEnrichmentOutcome.REVIEW
        else:
            outcome = ContactEnrichmentOutcome.READY

        channels = ContactChannelProfile(
            email_requested=options.reveal_email,
            phone_requested=options.reveal_phone,
            email=email,
            email_status=match.email_status if options.reveal_email else None,
            phone=phone,
            apollo_person_id=match.apollo_person_id,
            apollo_linkedin_url=match.linkedin_url,
            apollo_company_name=match.organization_name,
            apollo_company_domain=match.organization_domain,
            apollo_title=match.title,
        )
        trace = [
            {
                "stage": "apollo_match",
                "identity_match": identity_match,
                "company_match": company_match,
                "email_domain": email_domain,
                "company_supported": company_supported,
                "provider_record": match.raw,
            },
            {"stage": "outcome", "value": outcome.value, "flags": flags},
        ]
        return ContactEnrichmentItem(
            candidate_id=int(contact["candidate_id"]),
            discovery=discovery,
            channels=channels,
            outcome=outcome,
            # dict.fromkeys de-duplicates while keeping first-seen order.
            review_flags=list(dict.fromkeys(flags)),
            trace=trace,
        )

    @staticmethod
    def _memory_satisfies(
        item: ContactEnrichmentItem, options: ContactEnrichmentOptions
    ) -> bool:
        """Return whether a stored item already covers the requested channels.

        A stored item is insufficient when a channel is wanted now but was not
        requested when the item was produced.
        """
        if options.reveal_email and not item.channels.email_requested:
            return False
        if options.reveal_phone and not item.channels.phone_requested:
            return False
        return True

    @staticmethod
    def _normalize_text(value: str) -> str:
        """Lower-case ``value``, turn non-alphanumerics into spaces, collapse runs."""
        return " ".join(
            "".join(char.lower() if char.isalnum() else " " for char in value).split()
        )

    @staticmethod
    def _normalize_linkedin(value: object) -> str | None:
        """Reduce a LinkedIn URL to a comparable ``host/path`` key.

        The scheme, a leading ``www.``, the query string, the fragment and
        trailing slashes are dropped, and the result is lower-cased. URLs
        written without a scheme (``www.linkedin.com/in/x``) are parsed as
        host + path, so they compare equal to their ``https://`` form.

        Returns:
            The normalized key, or ``None`` when ``value`` is falsy.
        """
        if not value:
            return None
        text = str(value).strip()
        # urlsplit only recognizes a host after "//"; without it the whole
        # string lands in .path and the "www." prefix would never be stripped.
        if "://" not in text and not text.startswith("/"):
            text = f"//{text}"
        parsed = urlsplit(text)
        host = parsed.netloc.lower().removeprefix("www.")
        return f"{host}{parsed.path.rstrip('/').lower()}"

    @staticmethod
    def _email_domain(email: str | None) -> str | None:
        """Return the canonical domain of ``email``, or ``None`` if there is none."""
        if not email or "@" not in email:
            return None
        return canonical_domain(email.rsplit("@", 1)[1])
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Protocol
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ContactEnrichmentProgressReporter(Protocol):
    """Structural interface for sinks that receive contact-enrichment progress."""

    def start(self, source_run_id: str, contacts: int) -> None:
        """Signal that a run over ``contacts`` contacts of ``source_run_id`` begins."""

    def memory(self, reused: int, pending: int) -> None:
        """Report how many contacts were reused and how many are still pending."""

    def batch(self, current: int, total: int, size: int) -> None:
        """Report that batch ``current`` of ``total`` with ``size`` contacts starts."""

    def poll(self, request_id: str, attempt: int) -> None:
        """Report poll number ``attempt`` for the request ``request_id``."""

    def outcome(self, name: str, outcome: str, flags: list[str]) -> None:
        """Report the outcome and review flags resolved for the contact ``name``."""

    def save(self, run_id: str) -> None:
        """Report that the run ``run_id`` has been saved."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class NullContactEnrichmentProgressReporter:
    """Progress reporter that silently discards every notification."""

    def start(self, source_run_id: str, contacts: int) -> None:
        """Ignore the run-start notification."""
        return None

    def memory(self, reused: int, pending: int) -> None:
        """Ignore the reused/pending counts."""
        return None

    def batch(self, current: int, total: int, size: int) -> None:
        """Ignore the batch-start notification."""
        return None

    def poll(self, request_id: str, attempt: int) -> None:
        """Ignore the poll notification."""
        return None

    def outcome(self, name: str, outcome: str, flags: list[str]) -> None:
        """Ignore the per-contact outcome notification."""
        return None

    def save(self, run_id: str) -> None:
        """Ignore the run-saved notification."""
        return None
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from company_discovery.adapters.protocols import StructuredLLM
|
|
7
|
+
from company_discovery.domain.contact_models import (
|
|
8
|
+
ContactAssessment,
|
|
9
|
+
ContactAssessmentBatch,
|
|
10
|
+
ContactSearchBatch,
|
|
11
|
+
ContactVerdict,
|
|
12
|
+
EvidenceVerdict,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# System prompt for contact evaluation, located relative to the package
# directory one level above this module's own directory.
PROMPT_PATH = Path(__file__).parents[1].joinpath(
    "prompts", "contact_evaluation", "system.md"
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ContactEvaluator:
    """Ask a structured LLM to assess search results, then guard its answers."""

    def __init__(self, llm: StructuredLLM) -> None:
        """Keep the LLM client and load the system prompt from ``PROMPT_PATH``."""
        self._llm = llm
        self._system_prompt = PROMPT_PATH.read_text(encoding="utf-8")

    def evaluate(
        self,
        batch: ContactSearchBatch,
        *,
        current_only: bool,
        require_role_match: bool,
    ) -> list[ContactAssessment]:
        """Assess the search results of ``batch`` and return vetted candidates.

        The LLM output is not trusted as-is:
            * source URLs not present in the batch results are removed, and a
              candidate left without any source is dropped;
            * the LinkedIn URL is kept only if it is one of the batch result
              URLs and looks like a profile link, otherwise the first
              profile-looking valid source is used (or ``None``);
            * the verdict is recomputed by ``_guard_verdict``.
        """
        target = {
            "company_name": batch.company_name,
            "company_domain": batch.company_domain,
            "role_key": batch.role_key,
            "role_labels": batch.role_labels,
            "current_only": current_only,
            "require_role_match": require_role_match,
        }
        results = []
        for result in batch.results:
            results.append(
                {
                    "title": result.title,
                    "url": result.url,
                    "text": result.text,
                    "published_date": result.published_date,
                }
            )
        payload = {"target": target, "results": results}
        generated = self._llm.generate(
            system_prompt=self._system_prompt,
            user_prompt=json.dumps(payload, ensure_ascii=True),
            response_model=ContactAssessmentBatch,
        )
        assert isinstance(generated, ContactAssessmentBatch)

        profile_marker = "linkedin.com/in/"
        allowed_urls = {result.url for result in batch.results}
        vetted: list[ContactAssessment] = []
        for candidate in generated.candidates:
            valid_sources = [url for url in candidate.source_urls if url in allowed_urls]
            if not valid_sources:
                continue

            linkedin_url = candidate.linkedin_url
            trusted = linkedin_url in allowed_urls and profile_marker in linkedin_url.lower()
            if not trusted:
                # Fall back to the first valid source that looks like a profile.
                linkedin_url = None
                for url in valid_sources:
                    if profile_marker in url.lower():
                        linkedin_url = url
                        break

            verdict = self._guard_verdict(
                candidate,
                current_only=current_only,
                require_role_match=require_role_match,
            )
            changes = {
                "source_urls": valid_sources,
                "linkedin_url": linkedin_url,
                "verdict": verdict,
            }
            vetted.append(candidate.model_copy(update=changes))
        return vetted

    @staticmethod
    def _guard_verdict(
        candidate: ContactAssessment,
        *,
        current_only: bool,
        require_role_match: bool,
    ) -> ContactVerdict:
        """Derive the verdict from the candidate's evidence fields.

        REJECTED when company or role evidence is NO or the identity is not
        clear. ACCEPTED when both company and role are YES, where LIKELY also
        counts for the company unless ``current_only`` and for the role unless
        ``require_role_match``. REVIEW in every other case.
        """
        if candidate.current_company_match == EvidenceVerdict.NO:
            return ContactVerdict.REJECTED
        if candidate.role_match == EvidenceVerdict.NO:
            return ContactVerdict.REJECTED
        if not candidate.identity_clear:
            return ContactVerdict.REJECTED

        if candidate.current_company_match == EvidenceVerdict.YES:
            company_ok = True
        else:
            company_ok = (
                not current_only
                and candidate.current_company_match == EvidenceVerdict.LIKELY
            )

        if candidate.role_match == EvidenceVerdict.YES:
            role_ok = True
        else:
            role_ok = (
                not require_role_match
                and candidate.role_match == EvidenceVerdict.LIKELY
            )

        if not (company_ok and role_ok):
            return ContactVerdict.REVIEW
        return ContactVerdict.ACCEPTED
|
|
110
|
+
|