leads-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. company_discovery/__init__.py +4 -0
  2. company_discovery/adapters/__init__.py +5 -0
  3. company_discovery/adapters/apollo.py +189 -0
  4. company_discovery/adapters/exa.py +112 -0
  5. company_discovery/adapters/llm.py +118 -0
  6. company_discovery/adapters/protocols.py +58 -0
  7. company_discovery/adapters/website.py +154 -0
  8. company_discovery/bundled_skills/__init__.py +1 -0
  9. company_discovery/bundled_skills/company-discovery-operator/SKILL.md +72 -0
  10. company_discovery/bundled_skills/company-discovery-operator/agents/openai.yaml +4 -0
  11. company_discovery/bundled_skills/company-enrichment-operator/SKILL.md +94 -0
  12. company_discovery/bundled_skills/company-enrichment-operator/agents/openai.yaml +4 -0
  13. company_discovery/bundled_skills/company-search-spec-writer/SKILL.md +109 -0
  14. company_discovery/bundled_skills/company-search-spec-writer/agents/openai.yaml +4 -0
  15. company_discovery/bundled_skills/contact-discovery-operator/SKILL.md +80 -0
  16. company_discovery/bundled_skills/contact-discovery-operator/agents/openai.yaml +4 -0
  17. company_discovery/bundled_skills/contact-enrichment-operator/SKILL.md +86 -0
  18. company_discovery/bundled_skills/contact-enrichment-operator/agents/openai.yaml +4 -0
  19. company_discovery/bundled_skills/contact-search-spec-writer/SKILL.md +86 -0
  20. company_discovery/bundled_skills/contact-search-spec-writer/agents/openai.yaml +4 -0
  21. company_discovery/bundled_skills/leads-update-operator/SKILL.md +60 -0
  22. company_discovery/bundled_skills/leads-update-operator/agents/openai.yaml +4 -0
  23. company_discovery/cli.py +1789 -0
  24. company_discovery/db/__init__.py +5 -0
  25. company_discovery/db/contact_enrichment_repository.py +268 -0
  26. company_discovery/db/contact_repository.py +366 -0
  27. company_discovery/db/enrichment_repository.py +207 -0
  28. company_discovery/db/models.py +324 -0
  29. company_discovery/db/repository.py +363 -0
  30. company_discovery/db/session.py +48 -0
  31. company_discovery/domain/__init__.py +24 -0
  32. company_discovery/domain/contact_models.py +178 -0
  33. company_discovery/domain/contact_spec.py +86 -0
  34. company_discovery/domain/models.py +287 -0
  35. company_discovery/domain/spec.py +263 -0
  36. company_discovery/migrations.py +190 -0
  37. company_discovery/prompts/__init__.py +8 -0
  38. company_discovery/prompts/candidate_evaluation/system.md +13 -0
  39. company_discovery/prompts/company_enrichment/system.md +42 -0
  40. company_discovery/prompts/contact_evaluation/system.md +18 -0
  41. company_discovery/prompts/query_generation/system.md +10 -0
  42. company_discovery/release_manifest.json +7 -0
  43. company_discovery/reports/__init__.py +4 -0
  44. company_discovery/reports/contact_enrichment_exporter.py +108 -0
  45. company_discovery/reports/contact_exporter.py +132 -0
  46. company_discovery/reports/enrichment_exporter.py +125 -0
  47. company_discovery/reports/exporter.py +135 -0
  48. company_discovery/runtime.py +336 -0
  49. company_discovery/services/__init__.py +4 -0
  50. company_discovery/services/contact_enrichment_pipeline.py +344 -0
  51. company_discovery/services/contact_enrichment_progress.py +37 -0
  52. company_discovery/services/contact_evaluator.py +110 -0
  53. company_discovery/services/contact_pipeline.py +295 -0
  54. company_discovery/services/contact_progress.py +38 -0
  55. company_discovery/services/enrichment_extractor.py +61 -0
  56. company_discovery/services/enrichment_pipeline.py +526 -0
  57. company_discovery/services/enrichment_progress.py +20 -0
  58. company_discovery/services/enrichment_resolver.py +148 -0
  59. company_discovery/services/evaluator.py +40 -0
  60. company_discovery/services/hygiene.py +51 -0
  61. company_discovery/services/memory.py +150 -0
  62. company_discovery/services/normalization.py +98 -0
  63. company_discovery/services/pipeline.py +628 -0
  64. company_discovery/services/progress.py +48 -0
  65. company_discovery/services/query_planner.py +47 -0
  66. company_discovery/settings.py +152 -0
  67. company_discovery/skill_installer.py +197 -0
  68. company_discovery/update_plan.py +79 -0
  69. leads_cli-0.1.0.dist-info/METADATA +277 -0
  70. leads_cli-0.1.0.dist-info/RECORD +72 -0
  71. leads_cli-0.1.0.dist-info/WHEEL +4 -0
  72. leads_cli-0.1.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,344 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from urllib.parse import urlsplit
7
+
8
+ from company_discovery.adapters.protocols import ContactEnrichmentProvider
9
+ from company_discovery.db.contact_enrichment_repository import ContactEnrichmentRepository
10
+ from company_discovery.domain.contact_models import (
11
+ ApolloBatchResult,
12
+ ApolloPersonMatch,
13
+ ApolloPersonRequest,
14
+ ContactChannelProfile,
15
+ ContactEnrichmentItem,
16
+ ContactEnrichmentOutcome,
17
+ ContactEnrichmentResult,
18
+ ContactEnrichmentSummary,
19
+ )
20
+ from company_discovery.reports.contact_enrichment_exporter import (
21
+ ContactEnrichmentArtifactExporter,
22
+ )
23
+ from company_discovery.services.contact_enrichment_progress import (
24
+ ContactEnrichmentProgressReporter,
25
+ NullContactEnrichmentProgressReporter,
26
+ )
27
+ from company_discovery.services.normalization import canonical_domain
28
+
29
+
30
# Consumer mailbox domains. An email whose domain is in this set is flagged
# "personal_email" when a provider match is resolved.
PERSONAL_EMAIL_DOMAINS = {
    "gmail.com",
    "googlemail.com",
    "yahoo.com",
    "hotmail.com",
    "outlook.com",
    "icloud.com",
    "aol.com",
    "proton.me",
    "protonmail.com",
}

# Provider email statuses (compared lower-cased) that cause the email to be
# dropped and an "email_<status>" flag to be recorded instead.
INVALID_EMAIL_STATUSES = {
    "invalid",
    "bounced",
    "unavailable",
    "do_not_mail",
}
35
+
36
+
37
@dataclass(frozen=True)
class ContactEnrichmentOptions:
    """Immutable switches that control a contact-enrichment run."""

    # Forwarded to the provider as ``reveal_email``; when False, emails are
    # also withheld from resolved and remembered items.
    reveal_email: bool = True
    # Forwarded to the provider as ``reveal_phone``; when False, phones are
    # also withheld from resolved and remembered items.
    reveal_phone: bool = True
    # When True, the pipeline skips the remembered-item lookup entirely.
    refresh: bool = False

    def as_dict(self) -> dict[str, bool]:
        """Return the options as a plain ``{field_name: value}`` mapping."""
        return dict(
            reveal_email=self.reveal_email,
            reveal_phone=self.reveal_phone,
            refresh=self.refresh,
        )
49
+
50
+
51
class ContactEnrichmentPipeline:
    """Enrich the accepted contacts of a contact run with email/phone channels.

    For each accepted contact the pipeline either reuses a remembered item
    from the repository or sends the contact to the enrichment provider in
    batches, resolves every provider match into a READY / REVIEW / BLOCKED
    item, persists the items, and exports the run artifacts.
    """

    # Maximum number of contacts sent to the provider in one request.
    BATCH_SIZE = 10

    def __init__(
        self,
        *,
        repository: ContactEnrichmentRepository,
        exporter: ContactEnrichmentArtifactExporter,
        provider: ContactEnrichmentProvider,
        freshness_days: int = 14,
        poll_interval_seconds: float = 2,
        poll_timeout_seconds: float = 120,
    ) -> None:
        """Store the collaborators and tuning knobs.

        Args:
            repository: Persistence for runs and enrichment items.
            exporter: Writes the run payload and summary to artifacts.
            provider: Enrichment provider used for batch lookups and polling.
            freshness_days: Passed to ``repository.fresh_item`` when looking
                up a remembered item for a contact.
            poll_interval_seconds: Pause between polls of a pending request.
            poll_timeout_seconds: Total time budget for polling one request.
        """
        self._repository = repository
        self._exporter = exporter
        self._provider = provider
        self._freshness_days = freshness_days
        self._poll_interval_seconds = poll_interval_seconds
        self._poll_timeout_seconds = poll_timeout_seconds

    def enrich(
        self,
        contact_run_id: str,
        *,
        options: ContactEnrichmentOptions | None = None,
        progress: ContactEnrichmentProgressReporter | None = None,
    ) -> ContactEnrichmentResult:
        """Run enrichment for every accepted contact of ``contact_run_id``.

        Args:
            contact_run_id: Identifier of the source contact run.
            options: Run switches; defaults to ``ContactEnrichmentOptions()``.
            progress: Progress callbacks; defaults to a no-op reporter.

        Returns:
            The result holding the new run id, summary, items and artifact
            paths.

        Raises:
            Exception: Any error raised while running is recorded on the run
                via ``repository.fail_run`` and then re-raised unchanged.
        """
        selected_options = options or ContactEnrichmentOptions()
        reporter = progress or NullContactEnrichmentProgressReporter()
        contacts = self._repository.accepted_contacts(contact_run_id)
        run_id = self._repository.create_run(contact_run_id, selected_options.as_dict())
        try:
            return self._run(
                run_id, contact_run_id, contacts, selected_options, reporter
            )
        except Exception as exc:
            # Mark the run as failed before propagating, so it is not left open.
            self._repository.fail_run(run_id, exc)
            raise

    def _run(
        self,
        run_id: str,
        contact_run_id: str,
        contacts: list[dict[str, object]],
        options: ContactEnrichmentOptions,
        reporter: ContactEnrichmentProgressReporter,
    ) -> ContactEnrichmentResult:
        """Execute one enrichment run: memory reuse, provider batches, save, export."""
        summary = ContactEnrichmentSummary(contacts_loaded=len(contacts))
        items: list[ContactEnrichmentItem] = []
        pending: list[dict[str, object]] = []
        reporter.start(contact_run_id, len(contacts))

        # Phase 1: reuse remembered items where allowed; queue the rest.
        for contact in contacts:
            remembered = None
            if not options.refresh:
                remembered = self._repository.fresh_item(
                    int(contact["candidate_id"]), self._freshness_days
                )
            if remembered is not None and self._memory_satisfies(remembered, options):
                # Hide channels that were stored earlier but are not requested now.
                channels = remembered.channels.model_copy(
                    update={
                        "email": remembered.channels.email if options.reveal_email else None,
                        "email_status": (
                            remembered.channels.email_status if options.reveal_email else None
                        ),
                        "phone": remembered.channels.phone if options.reveal_phone else None,
                    }
                )
                item = remembered.model_copy(
                    update={"discovery": self._discovery(contact), "channels": channels}
                )
                items.append(item)
                summary.memory_reused += 1
            else:
                pending.append(contact)
        reporter.memory(summary.memory_reused, len(pending))

        # Phase 2: send the remaining contacts to the provider in fixed-size batches.
        batches = [
            pending[index : index + self.BATCH_SIZE]
            for index in range(0, len(pending), self.BATCH_SIZE)
        ]
        for batch_index, batch in enumerate(batches, start=1):
            reporter.batch(batch_index, len(batches), len(batch))
            requests = [self._request(contact) for contact in batch]
            summary.apollo_batches += 1
            summary.apollo_requests += len(requests)
            result = self._provider.enrich_people(
                requests,
                reveal_email=options.reveal_email,
                reveal_phone=options.reveal_phone,
            )
            if result.pending:
                if not result.request_id:
                    raise RuntimeError("Apollo returned a pending result without a request_id")
                result, polls = self._wait_for_result(result.request_id, reporter)
                summary.async_polls += polls
            by_id = {match.candidate_id: match for match in result.matches}
            for contact in batch:
                candidate_id = int(contact["candidate_id"])
                match = by_id.get(candidate_id)
                if match is None:
                    # The provider returned nothing for this contact.
                    match = ApolloPersonMatch(
                        candidate_id=candidate_id, person_found=False
                    )
                item = self._resolve(contact, match, options)
                items.append(item)
                reporter.outcome(
                    str(contact["full_name"]), item.outcome.value, item.review_flags
                )

        # Phase 3: persist every item and tally outcomes.
        for item in items:
            self._repository.save_item(run_id, item)
            if item.outcome == ContactEnrichmentOutcome.READY:
                summary.ready += 1
            elif item.outcome == ContactEnrichmentOutcome.REVIEW:
                summary.review += 1
            else:
                summary.blocked += 1

        # Phase 4: export artifacts and close the run.
        lineage = self._repository.get_run(run_id)
        payload = {
            "run_id": run_id,
            "source_contact_run_id": contact_run_id,
            "source_enrichment_run_id": lineage["source_enrichment_run_id"],
            "source_discovery_run_id": lineage["source_discovery_run_id"],
            "options": options.as_dict(),
            "status": "completed",
            "items": [item.model_dump(mode="json") for item in items],
        }
        paths = self._exporter.export(payload, summary)
        self._repository.complete_run(run_id, summary, paths)
        reporter.save(run_id)
        return ContactEnrichmentResult(
            run_id=run_id,
            source_contact_run_id=contact_run_id,
            summary=summary,
            items=items,
            artifact_paths=paths,
        )

    def _wait_for_result(
        self, request_id: str, reporter: ContactEnrichmentProgressReporter
    ) -> tuple[ApolloBatchResult, int]:
        """Poll the provider until ``request_id`` is no longer pending.

        The provider is always polled at least once, even with a zero or
        negative timeout, and the pause between polls never extends past
        the deadline.

        Returns:
            The finished batch result and the number of polls performed.

        Raises:
            TimeoutError: If the request is still pending once the time
                budget is used up.
        """
        deadline = time.monotonic() + self._poll_timeout_seconds
        attempts = 0
        while True:
            attempts += 1
            reporter.poll(request_id, attempts)
            result = self._provider.poll(request_id)
            if not result.pending:
                return result, attempts
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Cap the pause so the final poll lands at the deadline, not after it.
            time.sleep(min(self._poll_interval_seconds, remaining))
        raise TimeoutError(
            f"Apollo enrichment {request_id} was still pending after "
            f"{self._poll_timeout_seconds:g} seconds"
        )

    @staticmethod
    def _request(contact: dict[str, object]) -> ApolloPersonRequest:
        """Build the provider request for one contact row.

        The first whitespace-separated token of ``full_name`` becomes the
        first name and the rest the last name; a single-token name is used
        for both.

        Raises:
            ValueError: If ``full_name`` is empty or only whitespace.
        """
        full_name = str(contact["full_name"])
        parts = full_name.split()
        if not parts:
            raise ValueError(
                f"Contact {contact['candidate_id']} has an empty full_name; "
                "cannot build an Apollo request"
            )
        return ApolloPersonRequest(
            candidate_id=int(contact["candidate_id"]),
            first_name=parts[0],
            last_name=" ".join(parts[1:]) if len(parts) > 1 else parts[0],
            full_name=full_name,
            company_name=str(contact["company_name"]),
            company_domain=str(contact["company_domain"]),
            linkedin_url=str(contact["linkedin_url"]) if contact.get("linkedin_url") else None,
        )

    @staticmethod
    def _discovery(contact: dict[str, object]) -> dict[str, object]:
        """Project a contact row onto the discovery fields stored with each item."""
        return {
            "company_name": contact["company_name"],
            "company_domain": contact["company_domain"],
            "contact_name": contact["full_name"],
            "title": contact["title"],
            "linkedin_url": contact.get("linkedin_url"),
            "role_keys": contact.get("role_keys", []),
            "source_urls": contact.get("source_urls", []),
            "discovery_reason": contact.get("discovery_reason"),
        }

    @classmethod
    def _resolve(
        cls,
        contact: dict[str, object],
        match: ApolloPersonMatch,
        options: ContactEnrichmentOptions,
    ) -> ContactEnrichmentItem:
        """Turn a provider match into an enrichment item with outcome and flags.

        Outcome rules, in order:
            * BLOCKED when the identity does not match or no channel is left.
            * REVIEW when neither the provider's company domain nor the email
              domain equals the target domain, or when the email is personal
              or on a different domain.
            * READY otherwise.
        """
        discovery = cls._discovery(contact)
        flags: list[str] = []
        # NOTE(review): the target domain is only lower-cased here, while the
        # email domain goes through canonical_domain and the provider's
        # organization domain is compared as-is. Confirm upstream values are
        # already canonical.
        target_domain = str(contact["company_domain"]).lower()
        target_linkedin = cls._normalize_linkedin(contact.get("linkedin_url"))
        apollo_linkedin = cls._normalize_linkedin(match.linkedin_url)
        target_name = cls._normalize_text(str(contact["full_name"]))
        apollo_name = cls._normalize_text(match.full_name or "")

        # Identity: a LinkedIn mismatch takes precedence over a name mismatch.
        identity_match = match.person_found
        if target_linkedin and apollo_linkedin and target_linkedin != apollo_linkedin:
            identity_match = False
            flags.append("linkedin_identity_mismatch")
        elif apollo_name and apollo_name != target_name:
            identity_match = False
            flags.append("person_name_mismatch")
        if not match.person_found:
            flags.append("no_apollo_match")

        # Email: drop addresses with an invalid status, then classify the domain.
        email = match.email if options.reveal_email else None
        if email and match.email_status and match.email_status.lower() in INVALID_EMAIL_STATUSES:
            email = None
            flags.append(f"email_{match.email_status.lower()}")
        email_domain = cls._email_domain(email)
        company_match = match.organization_domain == target_domain
        company_supported = company_match or email_domain == target_domain
        if match.organization_domain and not company_match:
            flags.append("apollo_company_mismatch")
        if email_domain in PERSONAL_EMAIL_DOMAINS:
            flags.append("personal_email")
        elif email_domain and email_domain != target_domain:
            flags.append("email_domain_mismatch")

        # Phone: only the first number returned by the provider is kept.
        phone = match.phones[0] if options.reveal_phone and match.phones else None
        has_channel = bool(email or phone)
        if not has_channel:
            flags.append("no_contact_channels")

        if not identity_match:
            outcome = ContactEnrichmentOutcome.BLOCKED
        elif not has_channel:
            outcome = ContactEnrichmentOutcome.BLOCKED
        elif not company_supported:
            flags.append("company_not_confirmed_by_apollo_channel")
            outcome = ContactEnrichmentOutcome.REVIEW
        elif "personal_email" in flags or "email_domain_mismatch" in flags:
            outcome = ContactEnrichmentOutcome.REVIEW
        else:
            outcome = ContactEnrichmentOutcome.READY

        channels = ContactChannelProfile(
            email_requested=options.reveal_email,
            phone_requested=options.reveal_phone,
            email=email,
            email_status=match.email_status if options.reveal_email else None,
            phone=phone,
            apollo_person_id=match.apollo_person_id,
            apollo_linkedin_url=match.linkedin_url,
            apollo_company_name=match.organization_name,
            apollo_company_domain=match.organization_domain,
            apollo_title=match.title,
        )
        trace = [
            {
                "stage": "apollo_match",
                "identity_match": identity_match,
                "company_match": company_match,
                "email_domain": email_domain,
                "company_supported": company_supported,
                "provider_record": match.raw,
            },
            {"stage": "outcome", "value": outcome.value, "flags": flags},
        ]
        return ContactEnrichmentItem(
            candidate_id=int(contact["candidate_id"]),
            discovery=discovery,
            channels=channels,
            outcome=outcome,
            # Order-preserving de-duplication of the collected flags.
            review_flags=list(dict.fromkeys(flags)),
            trace=trace,
        )

    @staticmethod
    def _memory_satisfies(
        item: ContactEnrichmentItem, options: ContactEnrichmentOptions
    ) -> bool:
        """Return whether a remembered item covers every channel requested now.

        A remembered item cannot be reused when the current run asks for an
        email or phone that was not requested when the item was produced.
        """
        if options.reveal_email and not item.channels.email_requested:
            return False
        if options.reveal_phone and not item.channels.phone_requested:
            return False
        return True

    @staticmethod
    def _normalize_text(value: str) -> str:
        """Lower-case ``value``, turn non-alphanumerics into spaces, collapse spaces."""
        return " ".join(
            "".join(char.lower() if char.isalnum() else " " for char in value).split()
        )

    @staticmethod
    def _normalize_linkedin(value: object) -> str | None:
        """Reduce a LinkedIn URL to ``host/path`` for comparison.

        The result is lower-cased, has no scheme, no leading ``www.``, no
        query or fragment, and no trailing slash. Scheme-less inputs such as
        ``www.linkedin.com/in/jane`` normalise to the same value as their
        ``https://`` form.

        Returns:
            The normalised string, or ``None`` when ``value`` is falsy.
        """
        if not value:
            return None
        text = str(value).strip()
        # urlsplit only recognises a host after "//"; without it the host
        # would land in ``path`` and keep its "www." prefix.
        if "://" not in text and not text.startswith("//"):
            text = f"//{text}"
        parsed = urlsplit(text)
        host = parsed.netloc.lower().removeprefix("www.")
        return f"{host}{parsed.path.rstrip('/').lower()}"

    @staticmethod
    def _email_domain(email: str | None) -> str | None:
        """Return the canonical domain of ``email``, or ``None`` without an ``@``."""
        if not email or "@" not in email:
            return None
        return canonical_domain(email.rsplit("@", 1)[1])
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Protocol
4
+
5
+
6
class ContactEnrichmentProgressReporter(Protocol):
    """Structural interface for receiving progress events of an enrichment run."""

    def start(self, source_run_id: str, contacts: int) -> None:
        """Report the source run id and the number of contacts loaded."""
        ...

    def memory(self, reused: int, pending: int) -> None:
        """Report how many items were reused and how many remain pending."""
        ...

    def batch(self, current: int, total: int, size: int) -> None:
        """Report the batch being processed, the batch count, and its size."""
        ...

    def poll(self, request_id: str, attempt: int) -> None:
        """Report a poll attempt for a pending provider request."""
        ...

    def outcome(self, name: str, outcome: str, flags: list[str]) -> None:
        """Report the outcome and review flags resolved for one contact."""
        ...

    def save(self, run_id: str) -> None:
        """Report that the run identified by ``run_id`` has been saved."""
        ...
18
+
19
+
20
class NullContactEnrichmentProgressReporter:
    """Progress reporter whose callbacks all do nothing."""

    def start(self, source_run_id: str, contacts: int) -> None:
        """Ignore the run-start event."""
        return None

    def memory(self, reused: int, pending: int) -> None:
        """Ignore the memory-reuse event."""
        return None

    def batch(self, current: int, total: int, size: int) -> None:
        """Ignore the batch event."""
        return None

    def poll(self, request_id: str, attempt: int) -> None:
        """Ignore the poll event."""
        return None

    def outcome(self, name: str, outcome: str, flags: list[str]) -> None:
        """Ignore the per-contact outcome event."""
        return None

    def save(self, run_id: str) -> None:
        """Ignore the run-saved event."""
        return None
@@ -0,0 +1,110 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ from company_discovery.adapters.protocols import StructuredLLM
7
+ from company_discovery.domain.contact_models import (
8
+ ContactAssessment,
9
+ ContactAssessmentBatch,
10
+ ContactSearchBatch,
11
+ ContactVerdict,
12
+ EvidenceVerdict,
13
+ )
14
+
15
+
16
# System prompt file: prompts/contact_evaluation/system.md, located one
# directory above the directory containing this module.
PROMPT_PATH = Path(__file__).parents[1].joinpath(
    "prompts", "contact_evaluation", "system.md"
)
17
+
18
+
19
class ContactEvaluator:
    """Ask a structured LLM to assess contact search results, then guard its output.

    The LLM's answer is not trusted as-is: candidates must cite source URLs
    that were actually part of the search batch, the LinkedIn URL must be a
    cited profile URL, and the verdict is recomputed from the evidence fields.
    """

    def __init__(self, llm: StructuredLLM) -> None:
        """Store the LLM and read the system prompt from ``PROMPT_PATH``."""
        self._llm = llm
        self._system_prompt = PROMPT_PATH.read_text(encoding="utf-8")

    def evaluate(
        self,
        batch: ContactSearchBatch,
        *,
        current_only: bool,
        require_role_match: bool,
    ) -> list[ContactAssessment]:
        """Assess the search results of ``batch`` and return guarded candidates.

        Args:
            batch: Search results for one company / role combination.
            current_only: Sent to the LLM and used by the verdict guard; when
                True a LIKELY company match is not enough for ACCEPTED.
            require_role_match: Sent to the LLM and used by the verdict guard;
                when True a LIKELY role match is not enough for ACCEPTED.

        Returns:
            The candidates with at least one source URL from the batch, each
            with filtered ``source_urls``, a validated ``linkedin_url`` and a
            recomputed ``verdict``.

        Raises:
            TypeError: If the LLM returns something other than a
                ``ContactAssessmentBatch``.
        """
        payload = {
            "target": {
                "company_name": batch.company_name,
                "company_domain": batch.company_domain,
                "role_key": batch.role_key,
                "role_labels": batch.role_labels,
                "current_only": current_only,
                "require_role_match": require_role_match,
            },
            "results": [
                {
                    "title": result.title,
                    "url": result.url,
                    "text": result.text,
                    "published_date": result.published_date,
                }
                for result in batch.results
            ],
        }
        generated = self._llm.generate(
            system_prompt=self._system_prompt,
            user_prompt=json.dumps(payload, ensure_ascii=True),
            response_model=ContactAssessmentBatch,
        )
        # Explicit check instead of ``assert`` so it still runs under ``python -O``.
        if not isinstance(generated, ContactAssessmentBatch):
            raise TypeError(
                "Expected ContactAssessmentBatch from the LLM, got "
                f"{type(generated).__name__}"
            )

        allowed_urls = {result.url for result in batch.results}
        candidates: list[ContactAssessment] = []
        for candidate in generated.candidates:
            # Keep only sources that were really in the batch; drop candidates
            # that are left with none.
            valid_sources = [url for url in candidate.source_urls if url in allowed_urls]
            if not valid_sources:
                continue
            linkedin_url = candidate.linkedin_url
            if linkedin_url not in allowed_urls or not self._is_profile_url(linkedin_url):
                # Fall back to the first cited source that is a profile URL.
                linkedin_url = next(
                    (url for url in valid_sources if self._is_profile_url(url)),
                    None,
                )
            verdict = self._guard_verdict(
                candidate,
                current_only=current_only,
                require_role_match=require_role_match,
            )
            candidates.append(
                candidate.model_copy(
                    update={
                        "source_urls": valid_sources,
                        "linkedin_url": linkedin_url,
                        "verdict": verdict,
                    }
                )
            )
        return candidates

    @staticmethod
    def _is_profile_url(url: str) -> bool:
        """Return whether ``url`` contains ``linkedin.com/in/`` (case-insensitive)."""
        return "linkedin.com/in/" in url.lower()

    @staticmethod
    def _guard_verdict(
        candidate: ContactAssessment,
        *,
        current_only: bool,
        require_role_match: bool,
    ) -> ContactVerdict:
        """Derive the verdict from the candidate's evidence fields.

        Returns:
            REJECTED when the company match or role match is NO, or the
            identity is not clear. ACCEPTED when both company and role are
            YES, where LIKELY also counts for the company if ``current_only``
            is False and for the role if ``require_role_match`` is False.
            REVIEW otherwise.
        """
        if (
            candidate.current_company_match == EvidenceVerdict.NO
            or candidate.role_match == EvidenceVerdict.NO
            or not candidate.identity_clear
        ):
            return ContactVerdict.REJECTED
        company_ok = candidate.current_company_match == EvidenceVerdict.YES or (
            not current_only and candidate.current_company_match == EvidenceVerdict.LIKELY
        )
        role_ok = candidate.role_match == EvidenceVerdict.YES or (
            not require_role_match and candidate.role_match == EvidenceVerdict.LIKELY
        )
        if company_ok and role_ok:
            return ContactVerdict.ACCEPTED
        return ContactVerdict.REVIEW
110
+