leads-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- company_discovery/__init__.py +4 -0
- company_discovery/adapters/__init__.py +5 -0
- company_discovery/adapters/apollo.py +189 -0
- company_discovery/adapters/exa.py +112 -0
- company_discovery/adapters/llm.py +118 -0
- company_discovery/adapters/protocols.py +58 -0
- company_discovery/adapters/website.py +154 -0
- company_discovery/bundled_skills/__init__.py +1 -0
- company_discovery/bundled_skills/company-discovery-operator/SKILL.md +72 -0
- company_discovery/bundled_skills/company-discovery-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/company-enrichment-operator/SKILL.md +94 -0
- company_discovery/bundled_skills/company-enrichment-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/company-search-spec-writer/SKILL.md +109 -0
- company_discovery/bundled_skills/company-search-spec-writer/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-discovery-operator/SKILL.md +80 -0
- company_discovery/bundled_skills/contact-discovery-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-enrichment-operator/SKILL.md +86 -0
- company_discovery/bundled_skills/contact-enrichment-operator/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/contact-search-spec-writer/SKILL.md +86 -0
- company_discovery/bundled_skills/contact-search-spec-writer/agents/openai.yaml +4 -0
- company_discovery/bundled_skills/leads-update-operator/SKILL.md +60 -0
- company_discovery/bundled_skills/leads-update-operator/agents/openai.yaml +4 -0
- company_discovery/cli.py +1789 -0
- company_discovery/db/__init__.py +5 -0
- company_discovery/db/contact_enrichment_repository.py +268 -0
- company_discovery/db/contact_repository.py +366 -0
- company_discovery/db/enrichment_repository.py +207 -0
- company_discovery/db/models.py +324 -0
- company_discovery/db/repository.py +363 -0
- company_discovery/db/session.py +48 -0
- company_discovery/domain/__init__.py +24 -0
- company_discovery/domain/contact_models.py +178 -0
- company_discovery/domain/contact_spec.py +86 -0
- company_discovery/domain/models.py +287 -0
- company_discovery/domain/spec.py +263 -0
- company_discovery/migrations.py +190 -0
- company_discovery/prompts/__init__.py +8 -0
- company_discovery/prompts/candidate_evaluation/system.md +13 -0
- company_discovery/prompts/company_enrichment/system.md +42 -0
- company_discovery/prompts/contact_evaluation/system.md +18 -0
- company_discovery/prompts/query_generation/system.md +10 -0
- company_discovery/release_manifest.json +7 -0
- company_discovery/reports/__init__.py +4 -0
- company_discovery/reports/contact_enrichment_exporter.py +108 -0
- company_discovery/reports/contact_exporter.py +132 -0
- company_discovery/reports/enrichment_exporter.py +125 -0
- company_discovery/reports/exporter.py +135 -0
- company_discovery/runtime.py +336 -0
- company_discovery/services/__init__.py +4 -0
- company_discovery/services/contact_enrichment_pipeline.py +344 -0
- company_discovery/services/contact_enrichment_progress.py +37 -0
- company_discovery/services/contact_evaluator.py +110 -0
- company_discovery/services/contact_pipeline.py +295 -0
- company_discovery/services/contact_progress.py +38 -0
- company_discovery/services/enrichment_extractor.py +61 -0
- company_discovery/services/enrichment_pipeline.py +526 -0
- company_discovery/services/enrichment_progress.py +20 -0
- company_discovery/services/enrichment_resolver.py +148 -0
- company_discovery/services/evaluator.py +40 -0
- company_discovery/services/hygiene.py +51 -0
- company_discovery/services/memory.py +150 -0
- company_discovery/services/normalization.py +98 -0
- company_discovery/services/pipeline.py +628 -0
- company_discovery/services/progress.py +48 -0
- company_discovery/services/query_planner.py +47 -0
- company_discovery/settings.py +152 -0
- company_discovery/skill_installer.py +197 -0
- company_discovery/update_plan.py +79 -0
- leads_cli-0.1.0.dist-info/METADATA +277 -0
- leads_cli-0.1.0.dist-info/RECORD +72 -0
- leads_cli-0.1.0.dist-info/WHEEL +4 -0
- leads_cli-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
from company_discovery.adapters.protocols import StructuredLLM
|
|
6
|
+
from company_discovery.domain.models import CandidateEvaluation, NormalizedCandidate
|
|
7
|
+
from company_discovery.domain.spec import CompanySearchSpec
|
|
8
|
+
from company_discovery.prompts import load_prompt
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class CandidateEvaluator:
    """Ask a structured LLM to evaluate one candidate against a search spec."""

    def __init__(self, llm: StructuredLLM) -> None:
        """Keep the LLM client and load the ``candidate_evaluation`` prompt."""
        self._llm = llm
        self._system_prompt = load_prompt("candidate_evaluation")

    def evaluate(
        self,
        spec: CompanySearchSpec,
        candidate: NormalizedCandidate,
    ) -> CandidateEvaluation:
        """Return the LLM's evaluation of *candidate* for *spec*.

        The user prompt is a JSON document holding the dumped spec and the
        dumped candidate. The returned evaluation always carries the
        candidate's own ``company_name`` and ``domain``, replacing whatever
        the LLM put in those fields.

        Raises:
            TypeError: if the LLM client returns something other than a
                ``CandidateEvaluation``.
        """
        payload = {
            "search_spec": spec.model_dump(mode="json"),
            "candidate": candidate.model_dump(mode="json"),
        }
        user_prompt = json.dumps(payload, indent=2)
        response = self._llm.generate(
            system_prompt=self._system_prompt,
            user_prompt=user_prompt,
            response_model=CandidateEvaluation,
        )
        if isinstance(response, CandidateEvaluation):
            # Identity fields come from the candidate, not from the model output.
            identity = {"company_name": candidate.company_name, "domain": candidate.domain}
            return response.model_copy(update=identity)
        raise TypeError(
            f"LLM returned {type(response).__name__}, expected CandidateEvaluation"
        )
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from company_discovery.domain.models import NormalizedCandidate
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Domains that are never treated as a company's own website; candidates whose
# domain appears here are rejected by filter_hygiene.
KNOWN_NON_COMPANY_DOMAINS = frozenset(
    """
    bbb.org
    bloomberg.com
    chamberofcommerce.com
    crunchbase.com
    facebook.com
    glassdoor.com
    instagram.com
    linkedin.com
    mapquest.com
    manta.com
    opencorporates.com
    pitchbook.com
    wikipedia.org
    yelp.com
    yellowpages.com
    youtube.com
    """.split()
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass(frozen=True)
class HygieneResult:
    """Outcome of a hygiene pass over a list of candidates."""

    # Candidates that passed every hygiene check, in the order they were seen.
    accepted: list[NormalizedCandidate]
    # Rejected candidates, each paired with the reason code string explaining
    # why (e.g. "duplicate_domain", "known_non_company_domain",
    # "missing_company_name" as produced by filter_hygiene).
    rejected: list[tuple[NormalizedCandidate, str]]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def filter_hygiene(candidates: list[NormalizedCandidate]) -> HygieneResult:
    """Split *candidates* into accepted and rejected lists.

    Checks are applied in this order, and the first one that fires decides
    the rejection reason:

    1. ``duplicate_domain`` - the domain was already accepted earlier.
    2. ``known_non_company_domain`` - the domain is in
       ``KNOWN_NON_COMPANY_DOMAINS``.
    3. ``missing_company_name`` - the company name is empty or whitespace.

    Only accepted candidates register their domain for duplicate detection.
    """
    kept: list[NormalizedCandidate] = []
    dropped: list[tuple[NormalizedCandidate, str]] = []
    accepted_domains: set[str] = set()
    for item in candidates:
        if item.domain in accepted_domains:
            reason = "duplicate_domain"
        elif item.domain in KNOWN_NON_COMPANY_DOMAINS:
            reason = "known_non_company_domain"
        elif not item.company_name.strip():
            reason = "missing_company_name"
        else:
            reason = None

        if reason is not None:
            dropped.append((item, reason))
            continue
        accepted_domains.add(item.domain)
        kept.append(item)
    return HygieneResult(accepted=kept, rejected=dropped)
|
|
51
|
+
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from company_discovery.db.repository import MemoryRecord
|
|
6
|
+
from company_discovery.domain.models import FitVerdict
|
|
7
|
+
from company_discovery.domain.spec import CompanySearchSpec, NoveltyMode
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class SkippedMemoryRecord:
    """A memory record that a scan excluded, together with the reason."""

    # The record that was excluded from reuse/recheck.
    record: MemoryRecord
    # Reason code string for the exclusion (e.g. "memory_disabled_only_new"
    # or a value returned by MemoryMatcher._hard_mismatch).
    reason: str
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True)
class MemoryScanResult:
    """Result of scanning memory records against a search spec."""

    # Number of records that were not skipped; MemoryMatcher.scan sets this to
    # len(reusable) + len(recheck), or 0 when memory is disabled.
    matched: int
    # Records placed in the "reusable" bucket by the scan.
    reusable: list[MemoryRecord]
    # Records placed in the "recheck" bucket by the scan.
    recheck: list[MemoryRecord]
    # Records excluded from both buckets, each with its reason.
    skipped: list[SkippedMemoryRecord]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class MemoryMatcher:
    """Sorts stored memory records into reusable, recheck and skipped buckets."""

    def scan(self, spec: CompanySearchSpec, records: list[MemoryRecord]) -> MemoryScanResult:
        """Classify every record in *records* for the given *spec*.

        When the spec's novelty mode is ``NoveltyMode.ONLY_NEW`` every record
        is skipped with reason ``"memory_disabled_only_new"`` and ``matched``
        is 0. Otherwise a record is:

        * skipped, when ``_hard_mismatch`` returns a reason;
        * reusable, when its latest fit equals ``FitVerdict.GOOD.value``, it
          has a latest evaluation, and ``_requires_recheck`` is false;
        * recheck, in every other case.
        """
        if spec.novelty_mode == NoveltyMode.ONLY_NEW:
            return MemoryScanResult(
                matched=0,
                reusable=[],
                recheck=[],
                skipped=[
                    SkippedMemoryRecord(record, "memory_disabled_only_new") for record in records
                ],
            )

        reusable: list[MemoryRecord] = []
        recheck: list[MemoryRecord] = []
        skipped: list[SkippedMemoryRecord] = []
        for record in records:
            mismatch = self._hard_mismatch(spec, record)
            if mismatch:
                skipped.append(SkippedMemoryRecord(record, mismatch))
                continue
            if (
                record.latest_fit == FitVerdict.GOOD.value
                and record.latest_evaluation is not None
                and not self._requires_recheck(spec, record)
            ):
                reusable.append(record)
            else:
                recheck.append(record)

        return MemoryScanResult(
            matched=len(reusable) + len(recheck),
            reusable=reusable,
            recheck=recheck,
            skipped=skipped,
        )

    @staticmethod
    def _hard_mismatch(spec: CompanySearchSpec, record: MemoryRecord) -> str | None:
        """Return a reason code if *record* must be skipped for *spec*, else ``None``.

        Checks run in a fixed order and the first one that fires wins, so the
        order of the statements below determines which reason is reported.
        """
        candidate = record.candidate
        previous = record.latest_spec
        reason_codes = set(record.latest_reason_codes)
        if previous is not None:
            # A prior rejection is carried forward only when the part of the
            # spec it was rejected on is unchanged in the current spec.
            previous_target = (
                record.latest_evaluation.target_vertical
                if record.latest_evaluation is not None
                else None
            )
            same_vertical = previous_target == spec.vertical.key or (
                len(previous.verticals) == 1 and previous.verticals[0] == spec.vertical
            )
            if "vertical_mismatch" in reason_codes and same_vertical:
                return "prior_vertical_mismatch_same_spec"
            if (
                "geography_mismatch" in reason_codes
                and previous.geography == spec.geography
            ):
                return "prior_geography_mismatch_same_spec"
            if "size_mismatch" in reason_codes and previous.company_size == spec.company_size:
                return "prior_size_mismatch_same_spec"
            exclusion_codes = {
                "excluded_ownership",
                "excluded_keyword",
                "excluded_company_pattern",
            }
            if reason_codes & exclusion_codes and previous.exclude == spec.exclude:
                return "prior_exclusion_same_spec"
        if spec.novelty_mode == NoveltyMode.UNUSED_MEMORY and record.ever_selected:
            return "previously_selected"
        # From here on the stored candidate fields are compared directly with
        # the spec; a missing (falsy/None) candidate field never triggers a skip.
        if candidate.vertical and candidate.vertical != spec.vertical.key:
            return "vertical_mismatch"
        if candidate.country and candidate.country.upper() != spec.geography.country:
            return "country_mismatch"
        if spec.geography.states and candidate.state and candidate.state.upper() not in spec.geography.states:
            return "state_mismatch"
        size = spec.company_size
        # The candidate's employee range must overlap the requested range.
        if (
            size.employee_min is not None
            and candidate.employee_max is not None
            and candidate.employee_max < size.employee_min
        ):
            return "size_below_minimum"
        if (
            size.employee_max is not None
            and candidate.employee_min is not None
            and candidate.employee_min > size.employee_max
        ):
            return "size_above_maximum"
        if (
            candidate.ownership_type
            and candidate.ownership_type.lower() in spec.exclude.ownership_types
        ):
            return "excluded_ownership"
        # Substring search over the lowercased company name plus every
        # sighting title and text.
        # NOTE(review): the keywords themselves are not lowercased here -
        # presumably the spec normalizes them; confirm.
        searchable = " ".join(
            [candidate.company_name]
            + [sighting.title for sighting in candidate.sightings]
            + [sighting.text or "" for sighting in candidate.sightings]
        ).lower()
        if any(keyword in searchable for keyword in spec.exclude.keywords):
            return "excluded_keyword"
        return None

    @staticmethod
    def _requires_recheck(spec: CompanySearchSpec, record: MemoryRecord) -> bool:
        """Return ``True`` when *record* cannot be reused as-is for *spec*.

        A recheck is required when the stored candidate lacks a field the
        spec constrains (vertical, country, state, employee range), or when
        the spec has include/exclude criteria that differ from, or have no,
        previous spec to compare against.
        """
        candidate = record.candidate
        if candidate.vertical is None or candidate.country is None:
            return True
        if spec.geography.states and candidate.state is None:
            return True
        if not spec.company_size.is_unbounded and (
            candidate.employee_min is None or candidate.employee_max is None
        ):
            return True

        previous = record.latest_spec
        if spec.include.keywords or spec.include.subtypes:
            if previous is None or previous.include != spec.include:
                return True
        # True when any of the three exclusion lists on the spec is non-empty.
        has_custom_exclusions = any(
            (
                spec.exclude.keywords,
                spec.exclude.ownership_types,
                spec.exclude.company_patterns,
            )
        )
        if has_custom_exclusions and (previous is None or previous.exclude != spec.exclude):
            return True
        return False
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from collections import OrderedDict
|
|
5
|
+
from datetime import UTC, datetime
|
|
6
|
+
from urllib.parse import urlparse
|
|
7
|
+
|
|
8
|
+
import tldextract
|
|
9
|
+
|
|
10
|
+
from company_discovery.domain.models import ExaSearchResult, NormalizedCandidate, SourceSighting
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Shared domain extractor, configured with no suffix-list URLs and no cache
# directory.
# NOTE(review): per tldextract's documentation this should make it rely on the
# suffix list bundled with the package, with no network fetch and no on-disk
# cache - confirm against the pinned tldextract version.
_extract_domain = tldextract.TLDExtract(suffix_list_urls=(), cache_dir=None)
# Matches the first whitespace-delimited "|", "-", en dash or em dash in a
# title and everything after it; candidate_name removes the match.
_TITLE_SUFFIX = re.compile(r"\s+[|\-\u2013\u2014]\s+.*$")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def canonical_domain(url: str) -> str | None:
    """Return the ``domain.suffix`` form of *url*'s host, or ``None``.

    Accepts full ``http``/``https`` URLs as well as scheme-less values such
    as ``example.com``, ``example.com/path`` or ``example.com:8080/path``
    (these are parsed as if prefixed with ``https://``).

    Returns ``None`` when the value is blank, uses a non-HTTP scheme
    (``mailto:...``, ``tel:...``), cannot be parsed, has no hostname, or the
    hostname has no recognised domain/suffix pair.
    """
    value = url.strip()
    if not value:
        return None
    if "://" not in value:
        # Look for a colon only in the authority part (before the first "/",
        # "?" or "#"). A colon there must be a "host:port" pair; anything else
        # is a non-HTTP scheme such as "mailto:". Colons later in the path or
        # query are harmless and must not reject the URL.
        authority = re.split(r"[/?#]", value, maxsplit=1)[0]
        if ":" in authority:
            host, _, port = authority.partition(":")
            if not ("." in host and port.isdigit()):
                return None
        value = f"https://{value}"
    try:
        parsed = urlparse(value)
        hostname = parsed.hostname
    except ValueError:
        # urlparse rejects malformed bracketed hosts; treat them as unusable
        # input instead of aborting the caller's whole batch.
        return None
    if parsed.scheme not in {"http", "https"} or not hostname:
        return None
    extracted = _extract_domain(hostname.lower())
    if not extracted.domain or not extracted.suffix:
        return None
    return f"{extracted.domain}.{extracted.suffix}"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def candidate_name(title: str, domain: str) -> str:
    """Derive a display name for a company from a page title or its domain.

    The title is cut at the first ``_TITLE_SUFFIX`` match and trimmed. If
    nothing is left, the first label of *domain* is used instead, with
    hyphens turned into spaces and the result title-cased.
    """
    trimmed_title = _TITLE_SUFFIX.sub("", title).strip()
    if not trimmed_title:
        first_label, _, _ = domain.partition(".")
        return first_label.replace("-", " ").title()
    return trimmed_title
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _dict_or_empty(value: object) -> dict:
    """Return *value* if it is a dict, otherwise a fresh empty dict."""
    return value if isinstance(value, dict) else {}


def normalize_results(results: list[ExaSearchResult]) -> list[NormalizedCandidate]:
    """Collapse raw search results into one candidate per canonical domain.

    Results whose URL has no canonical domain are dropped. The first result
    seen for a domain creates the candidate (name, country and employee count
    are taken from that result's company entity when present); later results
    for the same domain only contribute a sighting, and only when their URL
    has not been recorded yet. Candidates are returned in first-seen order.

    The raw payload is external data, so nested sections that are missing,
    null or not objects are treated as empty rather than raising.
    """
    by_domain: OrderedDict[str, NormalizedCandidate] = OrderedDict()
    now = datetime.now(UTC)
    for result in results:
        domain = canonical_domain(result.url)
        if domain is None:
            continue
        sighting = SourceSighting(
            query=result.query,
            url=result.url,
            title=result.title,
            text=result.text,
            exa_id=result.exa_id,
            raw=result.raw,
        )
        existing = by_domain.get(domain)
        if existing is not None:
            if all(item.url != sighting.url for item in existing.sightings):
                existing.sightings.append(sighting)
            continue

        properties = _dict_or_empty(_company_entity(result.raw).get("properties"))
        workforce = _dict_or_empty(properties.get("workforce"))
        headquarters = _dict_or_empty(properties.get("headquarters"))

        employee_total = workforce.get("total")
        # bool is a subclass of int, so exclude it explicitly.
        if (
            isinstance(employee_total, bool)
            or not isinstance(employee_total, int)
            or employee_total < 1
        ):
            employee_total = None

        # Only a non-blank string is usable as a name; otherwise fall back to
        # the page title / domain.
        raw_name = properties.get("name")
        entity_name = raw_name.strip() if isinstance(raw_name, str) else ""

        by_domain[domain] = NormalizedCandidate(
            company_name=entity_name or candidate_name(result.title, domain),
            domain=domain,
            dedupe_key=domain,
            country=_country_code(headquarters.get("country")),
            # A single reported head count is stored as both range bounds.
            employee_min=employee_total,
            employee_max=employee_total,
            sightings=[sighting],
            first_seen_at=now,
            last_seen_at=now,
        )
    return list(by_domain.values())
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _company_entity(raw: dict) -> dict:
|
|
83
|
+
entities = raw.get("entities")
|
|
84
|
+
if not isinstance(entities, list):
|
|
85
|
+
return {}
|
|
86
|
+
for entity in entities:
|
|
87
|
+
if isinstance(entity, dict) and entity.get("type") in {None, "company"}:
|
|
88
|
+
return entity
|
|
89
|
+
return {}
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _country_code(value: object) -> str | None:
|
|
93
|
+
if not isinstance(value, str):
|
|
94
|
+
return None
|
|
95
|
+
normalized = value.strip().upper()
|
|
96
|
+
if normalized in {"UNITED STATES", "UNITED STATES OF AMERICA", "USA"}:
|
|
97
|
+
return "US"
|
|
98
|
+
return normalized if len(normalized) == 2 else None
|