leads-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- company_discovery/__init__.py +4 -0
- company_discovery/adapters/__init__.py +5 -0
- company_discovery/adapters/apollo.py +189 -0
- company_discovery/adapters/exa.py +112 -0
- company_discovery/adapters/llm.py +118 -0
- company_discovery/adapters/protocols.py +58 -0
- company_discovery/adapters/website.py +154 -0
- company_discovery/bundled_skills/__init__.py +1 -0
- company_discovery/bundled_skills/company-discovery-operator/SKILL.md +72 -0
- company_discovery/bundled_skills/company-discovery-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/company-enrichment-operator/SKILL.md +94 -0
- company_discovery/bundled_skills/company-enrichment-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/company-search-spec-writer/SKILL.md +109 -0
- company_discovery/bundled_skills/company-search-spec-writer/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-discovery-operator/SKILL.md +80 -0
- company_discovery/bundled_skills/contact-discovery-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-enrichment-operator/SKILL.md +86 -0
- company_discovery/bundled_skills/contact-enrichment-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-search-spec-writer/SKILL.md +86 -0
- company_discovery/bundled_skills/contact-search-spec-writer/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/leads-update-operator/SKILL.md +60 -0
- company_discovery/bundled_skills/leads-update-operator/agents/openai.yaml +4 -0
- company_discovery/cli.py +1789 -0
- company_discovery/db/__init__.py +5 -0
- company_discovery/db/contact_enrichment_repository.py +268 -0
- company_discovery/db/contact_repository.py +366 -0
- company_discovery/db/enrichment_repository.py +207 -0
- company_discovery/db/models.py +324 -0
- company_discovery/db/repository.py +363 -0
- company_discovery/db/session.py +48 -0
- company_discovery/domain/__init__.py +24 -0
- company_discovery/domain/contact_models.py +178 -0
- company_discovery/domain/contact_spec.py +86 -0
- company_discovery/domain/models.py +287 -0
- company_discovery/domain/spec.py +263 -0
- company_discovery/migrations.py +190 -0
- company_discovery/prompts/__init__.py +8 -0
- company_discovery/prompts/candidate_evaluation/system.md +13 -0
- company_discovery/prompts/company_enrichment/system.md +42 -0
- company_discovery/prompts/contact_evaluation/system.md +18 -0
- company_discovery/prompts/query_generation/system.md +10 -0
- company_discovery/release_manifest.json +7 -0
- company_discovery/reports/__init__.py +4 -0
- company_discovery/reports/contact_enrichment_exporter.py +108 -0
- company_discovery/reports/contact_exporter.py +132 -0
- company_discovery/reports/enrichment_exporter.py +125 -0
- company_discovery/reports/exporter.py +135 -0
- company_discovery/runtime.py +336 -0
- company_discovery/services/__init__.py +4 -0
- company_discovery/services/contact_enrichment_pipeline.py +344 -0
- company_discovery/services/contact_enrichment_progress.py +37 -0
- company_discovery/services/contact_evaluator.py +110 -0
- company_discovery/services/contact_pipeline.py +295 -0
- company_discovery/services/contact_progress.py +38 -0
- company_discovery/services/enrichment_extractor.py +61 -0
- company_discovery/services/enrichment_pipeline.py +526 -0
- company_discovery/services/enrichment_progress.py +20 -0
- company_discovery/services/enrichment_resolver.py +148 -0
- company_discovery/services/evaluator.py +40 -0
- company_discovery/services/hygiene.py +51 -0
- company_discovery/services/memory.py +150 -0
- company_discovery/services/normalization.py +98 -0
- company_discovery/services/pipeline.py +628 -0
- company_discovery/services/progress.py +48 -0
- company_discovery/services/query_planner.py +47 -0
- company_discovery/settings.py +152 -0
- company_discovery/skill_installer.py +197 -0
- company_discovery/update_plan.py +79 -0
- leads_cli-0.1.0.dist-info/METADATA +277 -0
- leads_cli-0.1.0.dist-info/RECORD +72 -0
- leads_cli-0.1.0.dist-info/WHEEL +4 -0
- leads_cli-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,363 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from datetime import UTC, datetime
|
|
5
|
+
from typing import Any
|
|
6
|
+
from uuid import uuid4
|
|
7
|
+
|
|
8
|
+
from sqlalchemy import Select, func, select
|
|
9
|
+
from sqlalchemy.exc import IntegrityError
|
|
10
|
+
from sqlalchemy.orm import joinedload
|
|
11
|
+
|
|
12
|
+
from company_discovery.db.models import (
|
|
13
|
+
CandidateEvaluationRow,
|
|
14
|
+
CompanyCandidateRow,
|
|
15
|
+
DiscoveryQueryRow,
|
|
16
|
+
DiscoveryRunRow,
|
|
17
|
+
RawResultRow,
|
|
18
|
+
)
|
|
19
|
+
from company_discovery.db.session import Database
|
|
20
|
+
from company_discovery.domain.models import (
|
|
21
|
+
CandidateBucket,
|
|
22
|
+
CandidateEvaluation,
|
|
23
|
+
ExaSearchResult,
|
|
24
|
+
NormalizedCandidate,
|
|
25
|
+
RunSummary,
|
|
26
|
+
)
|
|
27
|
+
from company_discovery.domain.spec import CompanySearchSpec
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
class MemoryRecord:
    """Immutable view of one stored candidate and its most recent evaluation.

    Instances are produced by ``DiscoveryRepository.memory_records``. The
    ``latest_*`` fields are ``None`` (or an empty tuple for the reason codes)
    when the candidate has never been evaluated.
    """

    # Primary key of the stored candidate row.
    candidate_id: int
    # Candidate rebuilt from the row's normalized payload.
    candidate: NormalizedCandidate
    # Fit outcome stored on the latest evaluation row.
    latest_fit: str | None
    # Bucket stored on the latest evaluation row.
    latest_bucket: str | None
    # Reason text stored on the latest evaluation row.
    latest_reason: str | None
    # Reason codes of the latest evaluation, frozen into a tuple.
    latest_reason_codes: tuple[str, ...]
    # Full evaluation rebuilt from the latest evaluation payload.
    latest_evaluation: CandidateEvaluation | None
    # True when any evaluation of this candidate landed in the selected bucket.
    ever_selected: bool
    # Spec of the run that produced the latest evaluation, when available.
    latest_spec: CompanySearchSpec | None = None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class RunNotFoundError(LookupError):
    """Raised when no stored discovery run matches the requested run id."""
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class CandidateNotFoundError(LookupError):
    """Raised when a candidate id or domain cannot be resolved to stored data."""
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class DiscoveryRepository:
|
|
52
|
+
# Prefix that _new_run_id puts in front of the random hexadecimal suffix.
RUN_ID_PREFIX = "company-discover-"
|
|
53
|
+
# How many fresh run ids create_run tries before it raises RuntimeError.
CREATE_RUN_ATTEMPTS = 5
|
|
54
|
+
|
|
55
|
+
def __init__(self, database: Database) -> None:
|
|
56
|
+
self.database = database
|
|
57
|
+
|
|
58
|
+
def create_run(self, spec: CompanySearchSpec, source_spec_path: str | None = None) -> str:
    """Insert a new run row in the ``running`` state and return its id.

    A fresh id is generated for every attempt; an ``IntegrityError`` from the
    insert triggers another attempt with a new id.

    Raises:
        RuntimeError: if every one of ``CREATE_RUN_ATTEMPTS`` attempts fails.
    """
    attempts_left = self.CREATE_RUN_ATTEMPTS
    while attempts_left > 0:
        attempts_left -= 1
        try:
            new_id = self._new_run_id()
            with self.database.session() as db_session:
                run_row = DiscoveryRunRow(
                    id=new_id,
                    spec_payload=spec.model_dump(mode="json"),
                    source_spec_path=source_spec_path,
                    status="running",
                )
                db_session.add(run_row)
        except IntegrityError:
            # Try again with a different id.
            continue
        else:
            return new_id
    raise RuntimeError("unable to allocate a unique company discovery run id")
|
|
75
|
+
|
|
76
|
+
@classmethod
|
|
77
|
+
def _new_run_id(cls) -> str:
|
|
78
|
+
return f"{cls.RUN_ID_PREFIX}{uuid4().hex[:12]}"
|
|
79
|
+
|
|
80
|
+
def complete_run(
    self,
    run_id: str,
    summary: RunSummary,
    artifact_paths: dict[str, str],
) -> None:
    """Mark a run as completed and store its summary and artifact paths.

    Raises:
        RunNotFoundError: if ``run_id`` does not exist.
    """
    with self.database.session() as db_session:
        run_row = self._require_run(db_session, run_id)
        run_row.status = "completed"
        summary_json = summary.model_dump(mode="json")
        run_row.summary_payload = summary_json
        run_row.artifact_paths = artifact_paths
        finished_at = datetime.now(UTC)
        run_row.completed_at = finished_at
|
|
92
|
+
|
|
93
|
+
def fail_run(self, run_id: str, error: Exception) -> None:
    """Mark a run as failed and record why.

    Args:
        run_id: Identifier of the run to update.
        error: The exception that ended the run.

    Raises:
        RunNotFoundError: if ``run_id`` does not exist.
    """
    # Exceptions raised without arguments (e.g. ``TimeoutError()``) stringify
    # to "", which would leave the failed run without any diagnostic. Fall
    # back to the repr so at least the exception type is recorded.
    message = str(error) or repr(error)
    with self.database.session() as session:
        row = self._require_run(session, run_id)
        row.status = "failed"
        row.error_message = message
        row.completed_at = datetime.now(UTC)
|
|
99
|
+
|
|
100
|
+
def set_artifacts(self, run_id: str, artifact_paths: dict[str, str]) -> None:
    """Replace the artifact path mapping stored on a run.

    Raises:
        RunNotFoundError: if ``run_id`` does not exist.
    """
    with self.database.session() as db_session:
        run_row = self._require_run(db_session, run_id)
        run_row.artifact_paths = artifact_paths
|
|
104
|
+
|
|
105
|
+
def add_query(self, run_id: str, order: int, text: str, rationale: str = "") -> int:
    """Store one query for a run and return the id assigned to the new row."""
    with self.database.session() as db_session:
        query_row = DiscoveryQueryRow(
            run_id=run_id,
            query_order=order,
            query_text=text,
            rationale=rationale,
        )
        db_session.add(query_row)
        # Flush so the id is populated before the session commits on exit.
        db_session.flush()
        return query_row.id
|
|
116
|
+
|
|
117
|
+
def save_query_results(
    self,
    run_id: str,
    query_id: int,
    results: list[ExaSearchResult],
    cost_dollars: float,
) -> None:
    """Record the result count, cost and raw hits of one executed query.

    Raises:
        LookupError: if the query does not exist or belongs to another run.
    """
    with self.database.session() as db_session:
        query_row = db_session.get(DiscoveryQueryRow, query_id)
        if query_row is None or query_row.run_id != run_id:
            raise LookupError(f"query {query_id} does not belong to run {run_id}")

        query_row.result_count = len(results)
        query_row.cost_dollars = cost_dollars

        # One raw row per search hit, keeping the full hit as JSON.
        raw_rows = [
            RawResultRow(
                run_id=run_id,
                query_id=query_id,
                result_position=hit.position,
                observed_url=hit.url,
                observed_title=hit.title,
                raw_payload=hit.model_dump(mode="json"),
            )
            for hit in results
        ]
        db_session.add_all(raw_rows)
|
|
141
|
+
|
|
142
|
+
def upsert_candidate(self, candidate: NormalizedCandidate) -> int:
    """Insert the candidate, or refresh the stored row with the same domain.

    For a new domain, a row is created from the candidate's fields. For a
    known domain, the stored JSON payload is merged with the incoming one,
    ``last_seen_at`` is advanced, the name is replaced when the incoming
    one is non-empty, and ``excluded`` stays set once it has been set.

    Args:
        candidate: The normalized candidate observed in the current run.

    Returns:
        The id of the inserted or updated candidate row.
    """
    payload = candidate.model_dump(mode="json")
    with self.database.session() as session:
        row = session.scalar(
            select(CompanyCandidateRow).where(CompanyCandidateRow.domain == candidate.domain)
        )
        if row is None:
            row = CompanyCandidateRow(
                canonical_name=candidate.company_name,
                domain=candidate.domain,
                dedupe_key=candidate.dedupe_key,
                normalized_payload=payload,
                vertical=candidate.vertical,
                country=candidate.country,
                state=candidate.state,
                employee_min=candidate.employee_min,
                employee_max=candidate.employee_max,
                ownership_type=candidate.ownership_type,
                excluded=candidate.excluded,
                first_seen_at=candidate.first_seen_at,
                last_seen_at=candidate.last_seen_at,
            )
            session.add(row)
            # Flush so the generated id is available to return.
            session.flush()
            return row.id

        stored = row.normalized_payload
        merged = self._merge_candidate_payload(stored, payload)

        row.excluded = row.excluded or candidate.excluded
        # The generic merge lets an incoming ``False`` overwrite a stored
        # ``True``; keep the JSON payload in step with the sticky column so
        # candidates rebuilt from the payload do not lose the flag.
        merged["excluded"] = row.excluded

        # The first_seen_at column is never touched on refresh; keep the
        # payload's copy as well instead of taking the newer sighting's value.
        if stored.get("first_seen_at") is not None:
            merged["first_seen_at"] = stored["first_seen_at"]

        row.canonical_name = candidate.company_name or row.canonical_name
        row.normalized_payload = merged
        row.last_seen_at = candidate.last_seen_at
        return row.id
|
|
173
|
+
|
|
174
|
+
def record_evaluation(
    self,
    run_id: str,
    candidate_id: int,
    evaluation: CandidateEvaluation,
    bucket: CandidateBucket,
    source: str,
) -> None:
    """Store one evaluation of a candidate within a run.

    Besides inserting the evaluation row, this copies the evaluation's
    inferred attributes onto the candidate and updates the candidate's
    ``prior_bucket``, ``prior_reason`` and ``last_evaluated_at``.

    Raises:
        CandidateNotFoundError: if ``candidate_id`` does not exist.
        ValueError: if the candidate already has an evaluation in this run.
    """
    recorded_at = datetime.now(UTC)
    with self.database.session() as db_session:
        candidate_row = db_session.get(CompanyCandidateRow, candidate_id)
        if candidate_row is None:
            raise CandidateNotFoundError(f"candidate not found: {candidate_id}")

        duplicate_lookup = select(CandidateEvaluationRow).where(
            CandidateEvaluationRow.run_id == run_id,
            CandidateEvaluationRow.candidate_id == candidate_id,
        )
        already_recorded = db_session.scalar(duplicate_lookup)
        if already_recorded is not None:
            raise ValueError(f"candidate {candidate_row.domain} already evaluated in run {run_id}")

        evaluation_row = CandidateEvaluationRow(
            run_id=run_id,
            candidate_id=candidate_id,
            evaluation_payload=evaluation.model_dump(mode="json"),
            fit_outcome=evaluation.fit.value,
            bucket=bucket.value,
            reason=evaluation.reason,
            reason_codes=evaluation.reason_codes,
            source=source,
            created_at=recorded_at,
        )
        db_session.add(evaluation_row)

        self._apply_inferences(candidate_row, evaluation)
        candidate_row.prior_bucket = bucket.value
        candidate_row.prior_reason = evaluation.reason
        candidate_row.last_evaluated_at = recorded_at
|
|
212
|
+
|
|
213
|
+
def memory_records(self) -> list[MemoryRecord]:
    """Return every stored candidate paired with its latest evaluation.

    Candidates are ordered by ``last_seen_at`` descending. The latest
    evaluation of a candidate is the one with the highest evaluation id;
    candidates without any evaluation are still returned, with the
    ``latest_*`` fields left empty.

    Returns:
        One ``MemoryRecord`` per stored candidate.
    """
    # Highest evaluation id per candidate.
    latest = (
        select(
            CandidateEvaluationRow.candidate_id,
            func.max(CandidateEvaluationRow.id).label("latest_id"),
        )
        .group_by(CandidateEvaluationRow.candidate_id)
        .subquery()
    )
    # Outer joins keep candidates that have never been evaluated.
    latest_evaluations: Select[tuple[CompanyCandidateRow, CandidateEvaluationRow | None]] = (
        select(CompanyCandidateRow, CandidateEvaluationRow)
        .outerjoin(latest, latest.c.candidate_id == CompanyCandidateRow.id)
        .outerjoin(
            CandidateEvaluationRow,
            CandidateEvaluationRow.id == latest.c.latest_id,
        )
        .order_by(CompanyCandidateRow.last_seen_at.desc())
    )
    with self.database.session() as session:
        rows = session.execute(latest_evaluations).all()
        selected_ids = set(
            session.scalars(
                select(CandidateEvaluationRow.candidate_id)
                .where(CandidateEvaluationRow.bucket == CandidateBucket.SELECTED.value)
                .distinct()
            ).all()
        )
        # Only specs of runs that produced a latest evaluation are needed.
        # Skipping the others avoids wasted validation and keeps an
        # unrelated, no-longer-valid spec payload from aborting the load.
        referenced_run_ids = {
            evaluation.run_id for _, evaluation in rows if evaluation is not None
        }
        run_specs = {
            run.id: CompanySearchSpec.model_validate(run.spec_payload)
            for run in session.scalars(select(DiscoveryRunRow)).all()
            if run.id in referenced_run_ids
        }
        return [
            MemoryRecord(
                candidate_id=candidate_row.id,
                candidate=NormalizedCandidate.model_validate(candidate_row.normalized_payload),
                latest_fit=evaluation.fit_outcome if evaluation else None,
                latest_bucket=evaluation.bucket if evaluation else None,
                latest_reason=evaluation.reason if evaluation else None,
                latest_reason_codes=tuple(evaluation.reason_codes) if evaluation else (),
                latest_evaluation=(
                    CandidateEvaluation.model_validate(evaluation.evaluation_payload)
                    if evaluation
                    else None
                ),
                ever_selected=candidate_row.id in selected_ids,
                latest_spec=run_specs.get(evaluation.run_id) if evaluation else None,
            )
            for candidate_row, evaluation in rows
        ]
|
|
262
|
+
|
|
263
|
+
def get_run(self, run_id: str) -> dict[str, Any]:
    """Return a plain-dict report of a run, its queries and its evaluations.

    Evaluated candidates are listed in evaluation-id order.

    Raises:
        RunNotFoundError: if ``run_id`` does not exist.
    """
    run_lookup = (
        select(DiscoveryRunRow)
        .options(joinedload(DiscoveryRunRow.queries))
        .where(DiscoveryRunRow.id == run_id)
    )
    evaluation_lookup = (
        select(CandidateEvaluationRow, CompanyCandidateRow)
        .join(CompanyCandidateRow)
        .where(CandidateEvaluationRow.run_id == run_id)
        .order_by(CandidateEvaluationRow.id)
    )
    with self.database.session() as db_session:
        run_row = db_session.scalar(run_lookup)
        if run_row is None:
            raise RunNotFoundError(f"run not found: {run_id}")

        evaluated_pairs = db_session.execute(evaluation_lookup).all()

        finished_at = run_row.completed_at
        report: dict[str, Any] = {
            "run_id": run_row.id,
            "status": run_row.status,
            "spec": run_row.spec_payload,
            "summary": run_row.summary_payload,
            "artifacts": run_row.artifact_paths,
            "error": run_row.error_message,
            "created_at": run_row.created_at.isoformat(),
            "completed_at": finished_at.isoformat() if finished_at else None,
            "queries": [query_row.query_text for query_row in run_row.queries],
        }
        candidate_entries = []
        for evaluation_row, candidate_row in evaluated_pairs:
            candidate_entries.append(
                {
                    "candidate_id": candidate_row.id,
                    "company": candidate_row.normalized_payload,
                    "evaluation": evaluation_row.evaluation_payload,
                    "bucket": evaluation_row.bucket,
                    "source": evaluation_row.source,
                }
            )
        report["candidates"] = candidate_entries
        return report
|
|
299
|
+
|
|
300
|
+
def inspect_candidate(self, run_id: str, domain: str) -> dict[str, Any]:
    """Return the evaluation of one domain in a run plus matching raw hits.

    Args:
        run_id: Run whose evaluation is inspected.
        domain: Candidate domain, compared for equality against the stored
            candidate domain.

    Returns:
        A dict with the candidate payload, the evaluation payload, bucket,
        source, and the raw payload of every raw result of the run whose
        observed URL contains ``domain``.

    Raises:
        CandidateNotFoundError: if the domain has no evaluation in the run.
    """
    with self.database.session() as session:
        result = session.execute(
            select(CandidateEvaluationRow, CompanyCandidateRow)
            .join(CompanyCandidateRow)
            .where(
                CandidateEvaluationRow.run_id == run_id,
                CompanyCandidateRow.domain == domain,
            )
        ).first()
        if result is None:
            raise CandidateNotFoundError(f"domain {domain} was not evaluated in run {run_id}")
        evaluation, candidate = result
        raw_hits = session.scalars(
            select(RawResultRow).where(
                RawResultRow.run_id == run_id,
                # ``contains`` renders a LIKE pattern; autoescape makes "_"
                # and "%" in the domain match literally instead of acting as
                # wildcards. This is still a substring match, so URLs of
                # hosts that merely embed the domain text are included too.
                RawResultRow.observed_url.contains(domain, autoescape=True),
            )
        ).all()
        return {
            "company": candidate.normalized_payload,
            "evaluation": evaluation.evaluation_payload,
            "bucket": evaluation.bucket,
            "source": evaluation.source,
            "raw_hits": [hit.raw_payload for hit in raw_hits],
        }
|
|
326
|
+
|
|
327
|
+
@staticmethod
def _require_run(session: Any, run_id: str) -> DiscoveryRunRow:
    """Load the run row by primary key, raising ``RunNotFoundError`` if absent."""
    found = session.get(DiscoveryRunRow, run_id)
    if found is not None:
        return found
    raise RunNotFoundError(f"run not found: {run_id}")
|
|
333
|
+
|
|
334
|
+
@staticmethod
|
|
335
|
+
def _merge_candidate_payload(current: dict[str, Any], incoming: dict[str, Any]) -> dict[str, Any]:
|
|
336
|
+
merged = dict(current)
|
|
337
|
+
for key, value in incoming.items():
|
|
338
|
+
if key == "sightings":
|
|
339
|
+
existing = {item["url"]: item for item in merged.get("sightings", [])}
|
|
340
|
+
existing.update({item["url"]: item for item in value})
|
|
341
|
+
merged[key] = list(existing.values())
|
|
342
|
+
elif value is not None and value != []:
|
|
343
|
+
merged[key] = value
|
|
344
|
+
return merged
|
|
345
|
+
|
|
346
|
+
@staticmethod
|
|
347
|
+
def _apply_inferences(candidate: CompanyCandidateRow, evaluation: CandidateEvaluation) -> None:
|
|
348
|
+
updates = {
|
|
349
|
+
"vertical": evaluation.inferred_vertical,
|
|
350
|
+
"country": evaluation.inferred_country,
|
|
351
|
+
"state": evaluation.inferred_state,
|
|
352
|
+
"employee_min": evaluation.inferred_employee_min,
|
|
353
|
+
"employee_max": evaluation.inferred_employee_max,
|
|
354
|
+
"ownership_type": evaluation.inferred_ownership_type,
|
|
355
|
+
}
|
|
356
|
+
payload = dict(candidate.normalized_payload)
|
|
357
|
+
for field, value in updates.items():
|
|
358
|
+
if value is not None:
|
|
359
|
+
setattr(candidate, field, value)
|
|
360
|
+
payload[field] = value
|
|
361
|
+
candidate.excluded = evaluation.excluded.value == "yes"
|
|
362
|
+
payload["excluded"] = candidate.excluded
|
|
363
|
+
candidate.normalized_payload = payload
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from contextlib import contextmanager
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Iterator
|
|
6
|
+
|
|
7
|
+
from sqlalchemy import create_engine, event
|
|
8
|
+
from sqlalchemy.orm import Session, sessionmaker
|
|
9
|
+
|
|
10
|
+
from company_discovery.db.models import Base
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Database:
    """Owns the SQLAlchemy engine and hands out transactional sessions."""

    def __init__(self, url: str) -> None:
        """Create the engine for ``url``.

        For file-backed SQLite databases the parent directory is created
        first, and foreign-key enforcement is switched on for every new
        connection.

        Args:
            url: SQLAlchemy database URL.
        """
        # Imported locally so the module's import block stays untouched.
        from sqlalchemy.engine import make_url

        parsed = make_url(url)
        # Recognize driver-qualified URLs (e.g. ``sqlite+pysqlite:///...``)
        # too, so directory creation and the PRAGMA hook agree on what
        # counts as SQLite.
        is_sqlite = parsed.get_backend_name() == "sqlite"
        database_path = parsed.database
        if is_sqlite and database_path and database_path != ":memory:":
            Path(database_path).parent.mkdir(parents=True, exist_ok=True)
        self.engine = create_engine(url, future=True)
        if is_sqlite:
            event.listen(self.engine, "connect", self._enable_sqlite_foreign_keys)
        self._session_factory = sessionmaker(
            bind=self.engine,
            class_=Session,
            # Objects stay readable after the session that loaded them commits.
            expire_on_commit=False,
        )

    @staticmethod
    def _enable_sqlite_foreign_keys(dbapi_connection: object, _: object) -> None:
        """Run ``PRAGMA foreign_keys=ON`` on a newly opened DBAPI connection."""
        cursor = dbapi_connection.cursor()  # type: ignore[attr-defined]
        try:
            cursor.execute("PRAGMA foreign_keys=ON")
        finally:
            # Close the cursor even when the PRAGMA fails.
            cursor.close()

    def create_schema(self) -> None:
        """Create every table declared on ``Base.metadata`` that is missing."""
        Base.metadata.create_all(self.engine)

    @contextmanager
    def session(self) -> Iterator[Session]:
        """Yield a session that commits on success and rolls back on error.

        The session is always closed afterwards; an exception raised inside
        the ``with`` body is re-raised after the rollback.
        """
        session = self._session_factory()
        try:
            yield session
            session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()

    def dispose(self) -> None:
        """Dispose of the engine's connection pool."""
        self.engine.dispose()
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from company_discovery.domain.models import (
|
|
2
|
+
CandidateBucket,
|
|
3
|
+
CandidateEvaluation,
|
|
4
|
+
ExaSearchResult,
|
|
5
|
+
FitVerdict,
|
|
6
|
+
MatchVerdict,
|
|
7
|
+
NormalizedCandidate,
|
|
8
|
+
QueryPlan,
|
|
9
|
+
RunResult,
|
|
10
|
+
)
|
|
11
|
+
from company_discovery.domain.spec import CompanySearchSpec
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"CandidateBucket",
|
|
15
|
+
"CandidateEvaluation",
|
|
16
|
+
"CompanySearchSpec",
|
|
17
|
+
"ExaSearchResult",
|
|
18
|
+
"FitVerdict",
|
|
19
|
+
"MatchVerdict",
|
|
20
|
+
"NormalizedCandidate",
|
|
21
|
+
"QueryPlan",
|
|
22
|
+
"RunResult",
|
|
23
|
+
]
|
|
24
|
+
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import UTC, datetime
|
|
4
|
+
from enum import StrEnum
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from pydantic import Field, field_validator
|
|
8
|
+
|
|
9
|
+
from company_discovery.domain.models import DomainModel, ExaSearchResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ContactVerdict(StrEnum):
|
|
13
|
+
ACCEPTED = "accepted"
|
|
14
|
+
REVIEW = "review"
|
|
15
|
+
REJECTED = "rejected"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class EvidenceVerdict(StrEnum):
|
|
19
|
+
YES = "yes"
|
|
20
|
+
LIKELY = "likely"
|
|
21
|
+
UNKNOWN = "unknown"
|
|
22
|
+
NO = "no"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ContactAssessment(DomainModel):
    """Assessment of one person: identity, evidence and the resulting verdict.

    ``full_name`` and ``title`` have their whitespace collapsed before the
    length constraints are checked. At least one source URL and one piece
    of evidence are required.
    """

    full_name: str = Field(min_length=3)
    title: str = Field(min_length=2)
    linkedin_url: str | None = None
    source_urls: list[str] = Field(min_length=1)
    evidence: list[str] = Field(min_length=1)
    current_company_match: EvidenceVerdict
    role_match: EvidenceVerdict
    identity_clear: bool
    verdict: ContactVerdict
    reason: str = Field(min_length=3)

    # Runs in "before" mode so ``min_length`` is applied to the collapsed
    # text. In the default "after" mode a whitespace-only value satisfies
    # the length check and is then normalized to an empty string.
    @field_validator("full_name", "title", mode="before")
    @classmethod
    def normalize_text(cls, value: Any) -> Any:
        """Collapse whitespace runs to single spaces and trim the ends.

        Non-string input is returned unchanged so that the regular field
        validation reports the type error.
        """
        if isinstance(value, str):
            return " ".join(value.split())
        return value
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ContactAssessmentBatch(DomainModel):
    """A list of contact assessments, capped at 30 entries and empty by default."""

    candidates: list[ContactAssessment] = Field(default_factory=list, max_length=30)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class ContactCandidate(DomainModel):
    """A person tied to a company, with identity keys, sources and timestamps.

    ``first_seen_at`` and ``last_seen_at`` default to the current UTC time
    at construction.
    """

    # Company the person is attached to.
    company_id: int
    company_name: str
    company_domain: str

    # Person identity.
    full_name: str
    normalized_name: str
    identity_key: str
    title: str
    linkedin_url: str | None = None

    # Supporting material.
    source_urls: list[str] = Field(default_factory=list)
    evidence: list[str] = Field(default_factory=list)

    # Observation timestamps.
    first_seen_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
    last_seen_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class ContactDiscoveryItem(DomainModel):
    """One contact candidate together with its verdict for a role key."""

    candidate_id: int
    candidate: ContactCandidate
    role_key: str

    # Verdict and the checks recorded with it.
    verdict: ContactVerdict
    reason: str
    current_company_match: EvidenceVerdict
    role_match: EvidenceVerdict
    identity_clear: bool

    source: str
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class ContactDiscoverySummary(DomainModel):
    """Integer counters for a contact discovery run; every counter starts at 0."""

    companies_loaded: int = 0
    memory_reused: int = 0
    role_gaps: int = 0
    queries_run: int = 0
    raw_results: int = 0
    unique_people: int = 0

    # Counters named after the ContactVerdict values.
    accepted: int = 0
    review: int = 0
    rejected: int = 0
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
class ContactDiscoveryResult(DomainModel):
    """Result of a contact discovery run: ids, summary, items and artifact paths."""

    run_id: str
    source_enrichment_run_id: str
    summary: ContactDiscoverySummary
    items: list[ContactDiscoveryItem]
    # Artifact name -> path mapping; empty unless provided.
    artifact_paths: dict[str, str] = Field(default_factory=dict)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class ContactSearchBatch(DomainModel):
    """Search results grouped under one company and one role."""

    company_name: str
    company_domain: str
    role_key: str
    role_labels: list[str]
    results: list[ExaSearchResult]
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class ContactEnrichmentOutcome(StrEnum):
|
|
103
|
+
READY = "ready"
|
|
104
|
+
REVIEW = "review"
|
|
105
|
+
BLOCKED = "blocked"
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class ApolloPersonRequest(DomainModel):
    """Person and company details for one Apollo lookup, keyed by candidate id."""

    candidate_id: int

    # Person.
    first_name: str
    last_name: str
    full_name: str

    # Company.
    company_name: str
    company_domain: str

    linkedin_url: str | None = None
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class ApolloPersonMatch(DomainModel):
    """Apollo's answer for one candidate; only the id and found flag are required."""

    candidate_id: int
    person_found: bool

    # Person and organization details; all optional.
    full_name: str | None = None
    linkedin_url: str | None = None
    title: str | None = None
    organization_name: str | None = None
    organization_domain: str | None = None

    # Contact channels.
    email: str | None = None
    email_status: str | None = None
    phones: list[str] = Field(default_factory=list)

    apollo_person_id: str | None = None
    # Free-form dict, empty by default.
    raw: dict[str, Any] = Field(default_factory=dict)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class ApolloBatchResult(DomainModel):
    """Matches of one Apollo batch plus an optional request id and pending flag."""

    matches: list[ApolloPersonMatch] = Field(default_factory=list)
    request_id: str | None = None
    pending: bool = False
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class ContactChannelProfile(DomainModel):
    """Email and phone details for a contact plus Apollo-prefixed attributes.

    ``observed_at`` defaults to the current UTC time at construction.
    """

    # Which channels were asked for.
    email_requested: bool = False
    phone_requested: bool = False

    # Channel values.
    email: str | None = None
    email_status: str | None = None
    phone: str | None = None

    # Apollo-prefixed attributes.
    apollo_person_id: str | None = None
    apollo_linkedin_url: str | None = None
    apollo_company_name: str | None = None
    apollo_company_domain: str | None = None
    apollo_title: str | None = None

    observed_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
class ContactEnrichmentItem(DomainModel):
    """Enrichment record for one contact: discovery data, channels and outcome."""

    candidate_id: int
    # Free-form discovery data for the contact.
    discovery: dict[str, Any]
    channels: ContactChannelProfile
    outcome: ContactEnrichmentOutcome
    review_flags: list[str] = Field(default_factory=list)
    trace: list[dict[str, Any]] = Field(default_factory=list)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class ContactEnrichmentSummary(DomainModel):
    """Integer counters for a contact enrichment run; every counter starts at 0."""

    contacts_loaded: int = 0
    memory_reused: int = 0
    apollo_requests: int = 0
    apollo_batches: int = 0
    async_polls: int = 0

    # Counters named after the ContactEnrichmentOutcome values.
    ready: int = 0
    review: int = 0
    blocked: int = 0
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
class ContactEnrichmentResult(DomainModel):
    """Result of a contact enrichment run: ids, summary, items and artifact paths."""

    run_id: str
    source_contact_run_id: str
    summary: ContactEnrichmentSummary
    items: list[ContactEnrichmentItem]
    # Artifact name -> path mapping; empty unless provided.
    artifact_paths: dict[str, str] = Field(default_factory=dict)
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from pydantic import Field, field_validator, model_validator
|
|
9
|
+
|
|
10
|
+
from company_discovery.domain.models import DomainModel
|
|
11
|
+
from company_discovery.services.normalization import canonical_domain
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Accepted role-key shape: one lowercase letter followed by 1-63 lowercase
# letters, digits or underscores (2-64 characters in total).
ROLE_KEY_PATTERN = re.compile(r"^[a-z][a-z0-9_]{1,63}$")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ContactCompanySource(DomainModel):
    """Selects companies from an enrichment run: run id, bucket and domain list."""

    enrichment_run_id: str = Field(min_length=1)
    bucket: Literal["ready", "review", "all"] = "ready"
    domains: list[str] = Field(default_factory=list)

    @field_validator("domains")
    @classmethod
    def normalize_domains(cls, values: list[str]) -> list[str]:
        """Canonicalize every domain and drop duplicates, keeping first-seen order.

        Raises:
            ValueError: if ``canonical_domain`` returns ``None`` for an entry.
        """
        # A dict doubles as an insertion-ordered set.
        canonical: dict[str, None] = {}
        for raw in values:
            resolved = canonical_domain(raw)
            if resolved is None:
                raise ValueError(f"invalid company domain: {raw}")
            canonical.setdefault(resolved, None)
        return list(canonical)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ContactRoleTarget(DomainModel):
    """One role to look for: a key, 1-12 labels and a per-company cap of 1-10."""

    key: str = Field(min_length=2, max_length=64)
    labels: list[str] = Field(min_length=1, max_length=12)
    max_per_company: int = Field(default=1, ge=1, le=10)

    @field_validator("key")
    @classmethod
    def validate_key(cls, value: str) -> str:
        """Normalize the key and check it against ``ROLE_KEY_PATTERN``.

        The key is trimmed and lowercased, and hyphens and spaces become
        underscores, before the pattern is applied.

        Raises:
            ValueError: if the normalized key does not match the pattern.
        """
        separators_to_underscore = str.maketrans("- ", "__")
        candidate_key = value.strip().lower().translate(separators_to_underscore)
        if ROLE_KEY_PATTERN.fullmatch(candidate_key):
            return candidate_key
        raise ValueError("role key must use lowercase letters, numbers, and underscores")

    @field_validator("labels")
    @classmethod
    def normalize_labels(cls, values: list[str]) -> list[str]:
        """Lowercase labels, collapse whitespace and drop duplicates in order.

        Raises:
            ValueError: if a normalized label is shorter than two characters.
        """
        # A dict doubles as an insertion-ordered set.
        unique: dict[str, None] = {}
        for raw in values:
            cleaned = " ".join(raw.lower().split())
            if len(cleaned) < 2:
                raise ValueError("role labels cannot be empty")
            unique.setdefault(cleaned, None)
        return list(unique)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class ContactSearchSpec(DomainModel):
    """Version 1 contact search spec: company source, role targets and limits."""

    version: Literal[1] = 1
    company_source: ContactCompanySource
    roles: list[ContactRoleTarget] = Field(min_length=1, max_length=20)
    company_limit: int | None = Field(default=None, ge=1, le=1000)
    contact_limit: int | None = Field(default=None, ge=1, le=10000)
    current_only: bool = True
    require_role_match: bool = True
    memory_freshness_days: int = Field(default=30, ge=1, le=365)

    @model_validator(mode="after")
    def validate_unique_roles(self) -> "ContactSearchSpec":
        """Reject specs in which two roles share the same key.

        Raises:
            ValueError: if any role key occurs more than once.
        """
        seen_keys: set[str] = set()
        for role in self.roles:
            if role.key in seen_keys:
                raise ValueError("role keys must be unique")
            seen_keys.add(role.key)
        return self

    @classmethod
    def from_file(cls, path: Path) -> "ContactSearchSpec":
        """Load and validate a spec from a UTF-8 JSON file.

        Raises:
            ValueError: if the file does not exist or is not valid JSON;
                model validation errors propagate from ``model_validate``.
        """
        try:
            raw_text = path.read_text(encoding="utf-8")
        except FileNotFoundError as exc:
            raise ValueError(f"spec file does not exist: {path}") from exc

        try:
            document = json.loads(raw_text)
        except json.JSONDecodeError as exc:
            raise ValueError(f"invalid JSON: {exc}") from exc

        return cls.model_validate(document)
|