enterprise-domain-mapper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- enterprise_domain_mapper-0.1.0/LICENSE +21 -0
- enterprise_domain_mapper-0.1.0/PKG-INFO +207 -0
- enterprise_domain_mapper-0.1.0/README.md +174 -0
- enterprise_domain_mapper-0.1.0/pyproject.toml +65 -0
- enterprise_domain_mapper-0.1.0/setup.cfg +4 -0
- enterprise_domain_mapper-0.1.0/src/domain_mapper/__init__.py +3 -0
- enterprise_domain_mapper-0.1.0/src/domain_mapper/cli.py +192 -0
- enterprise_domain_mapper-0.1.0/src/domain_mapper/dns_verifier.py +87 -0
- enterprise_domain_mapper-0.1.0/src/domain_mapper/mapper.py +117 -0
- enterprise_domain_mapper-0.1.0/src/domain_mapper/models.py +73 -0
- enterprise_domain_mapper-0.1.0/src/domain_mapper/output.py +84 -0
- enterprise_domain_mapper-0.1.0/src/domain_mapper/sources/__init__.py +7 -0
- enterprise_domain_mapper-0.1.0/src/domain_mapper/sources/sec_edgar.py +243 -0
- enterprise_domain_mapper-0.1.0/src/domain_mapper/sources/tld_generator.py +177 -0
- enterprise_domain_mapper-0.1.0/src/domain_mapper/sources/wikipedia.py +208 -0
- enterprise_domain_mapper-0.1.0/src/enterprise_domain_mapper.egg-info/PKG-INFO +207 -0
- enterprise_domain_mapper-0.1.0/src/enterprise_domain_mapper.egg-info/SOURCES.txt +22 -0
- enterprise_domain_mapper-0.1.0/src/enterprise_domain_mapper.egg-info/dependency_links.txt +1 -0
- enterprise_domain_mapper-0.1.0/src/enterprise_domain_mapper.egg-info/entry_points.txt +2 -0
- enterprise_domain_mapper-0.1.0/src/enterprise_domain_mapper.egg-info/requires.txt +11 -0
- enterprise_domain_mapper-0.1.0/src/enterprise_domain_mapper.egg-info/top_level.txt +1 -0
- enterprise_domain_mapper-0.1.0/tests/test_models.py +73 -0
- enterprise_domain_mapper-0.1.0/tests/test_output.py +69 -0
- enterprise_domain_mapper-0.1.0/tests/test_tld_generator.py +76 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 GTM Layer
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: enterprise-domain-mapper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Map enterprise corporate structures to enrichable domains
|
|
5
|
+
Author-email: GTM Layer <hello@gtmlayer.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/gtmlayer/enterprise-domain-mapper
|
|
8
|
+
Project-URL: Repository, https://github.com/gtmlayer/enterprise-domain-mapper
|
|
9
|
+
Project-URL: Issues, https://github.com/gtmlayer/enterprise-domain-mapper/issues
|
|
10
|
+
Keywords: enterprise,enrichment,abm,domains,subsidiaries,sales
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Office/Business
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: requests>=2.31.0
|
|
23
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
24
|
+
Requires-Dist: click>=8.1.0
|
|
25
|
+
Requires-Dist: rich>=13.0.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
28
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
29
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
30
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
31
|
+
Requires-Dist: responses>=0.23.0; extra == "dev"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# Enterprise Domain Mapper
|
|
35
|
+
|
|
36
|
+
[](https://github.com/gtmlayer/enterprise-domain-mapper/actions/workflows/ci.yml)
|
|
37
|
+
[](https://www.python.org/downloads/)
|
|
38
|
+
[](LICENSE)
|
|
39
|
+
|
|
40
|
+
**Map enterprise corporate structures to enrichable domains.** Feed it company names, get back subsidiaries, acquisitions, and regional domains that your enrichment tools are missing.
|
|
41
|
+
|
|
42
|
+
Every sales team doing enterprise ABM hits the same wall: large companies have dozens of subsidiaries, acquisitions, and regional entities, each with their own email domain. Without a complete domain map, tools like Clay and Apollo only find contacts at the parent domain. Entire business units get missed.
|
|
43
|
+
|
|
44
|
+
This tool fixes that.
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
$ domain-mapper "NTT Data"
|
|
48
|
+
|
|
49
|
+
NTT Data
|
|
50
|
+
├── NTT DATA Services nttdataservices.com [SEC EDGAR]
|
|
51
|
+
├── NTT DATA Business Solutions nttdata-solutions.com [Wikipedia]
|
|
52
|
+
├── Dimension Data dimensiondata.com [Wikipedia]
|
|
53
|
+
├── NTT DATA Italia nttdata.it [TLD guess ✓ DNS verified]
|
|
54
|
+
├── NTT DATA UK nttdata.co.uk [TLD guess ✓ DNS verified]
|
|
55
|
+
└── NTT DATA Japan nttdata.co.jp [TLD guess ✓ DNS verified]
|
|
56
|
+
|
|
57
|
+
Found 12 subsidiaries, 18 domains (6 confirmed, 12 guessed, 9 DNS verified)
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## The problem
|
|
61
|
+
|
|
62
|
+
Enterprise accounts don't operate under a single domain. A company like NTT Data has subsidiaries in 50+ countries, each with localised domains. Deloitte has member firms. Boeing has defence subsidiaries that use completely different brands.
|
|
63
|
+
|
|
64
|
+
If you're running enrichment against just `nttdata.com`, you're finding maybe 30% of the contacts you could be reaching. The rest are hiding behind `dimensiondata.com`, `nttdata.co.uk`, `nttdata.it`, and domains you didn't know existed.
|
|
65
|
+
|
|
66
|
+
Building these domain maps manually takes hours per account. We built this tool because we got tired of doing it by hand.
|
|
67
|
+
|
|
68
|
+
## Quick start
|
|
69
|
+
|
|
70
|
+
### Installation
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install enterprise-domain-mapper
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Or clone and install locally:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
git clone https://github.com/gtmlayer/enterprise-domain-mapper.git
|
|
80
|
+
cd enterprise-domain-mapper
|
|
81
|
+
pip install -e .
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### Single company lookup
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
domain-mapper "Boeing"
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Batch mode (CSV input)
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
domain-mapper accounts.csv --output results.csv
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Your input CSV just needs a column with company names. The tool auto-detects columns named `company_name`, `company`, `name`, `account`, `account_name`, or `organization` (case-insensitive). If you have a domain column (`domain`, `website`, `url`, `parent_domain`, or `company_domain`), it'll use that as the parent domain for TLD guessing.
|
|
97
|
+
|
|
98
|
+
### With DNS verification
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
domain-mapper accounts.csv --output results.csv --verify-dns
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
This checks whether guessed domains actually have mail infrastructure (MX records) or at minimum resolve (A records). Adds a few seconds per company but filters out the noise.
|
|
105
|
+
|
|
106
|
+
## What it does
|
|
107
|
+
|
|
108
|
+
The tool combines three data sources and a verification layer to build comprehensive domain maps:
|
|
109
|
+
|
|
110
|
+
### 1. SEC EDGAR Exhibit 21 scraper
|
|
111
|
+
|
|
112
|
+
For US-listed companies, SEC filings include Exhibit 21: a legally required list of all subsidiaries. The tool looks up the company's CIK, finds the latest 10-K filing, and parses the subsidiary list with jurisdictions.
|
|
113
|
+
|
|
114
|
+
This is the highest-quality source - it's legally mandated disclosure, so it's comprehensive and current.
|
|
115
|
+
|
|
116
|
+
### 2. Wikipedia corporate structure parser
|
|
117
|
+
|
|
118
|
+
For non-US companies (or supplementary data), the tool searches Wikipedia for the company page and extracts subsidiary and acquisition data from infoboxes and structured sections.
|
|
119
|
+
|
|
120
|
+
Covers companies globally, though data depth varies by how well-maintained the Wikipedia page is.
|
|
121
|
+
|
|
122
|
+
### 3. TLD pattern generator
|
|
123
|
+
|
|
124
|
+
Once subsidiaries are identified with their jurisdictions, the tool generates likely domain patterns. A subsidiary in Italy with parent domain `nttdata.com` produces guesses like `nttdata.it`. Covers 70+ countries with their standard corporate TLD patterns (e.g. UK produces `.co.uk` and `.uk`, Japan produces `.co.jp` and `.jp`).
|
|
125
|
+
|
|
126
|
+
### 4. DNS verification (optional)
|
|
127
|
+
|
|
128
|
+
MX record lookup with A record fallback to confirm guessed domains actually resolve. MX records are the strongest signal - if a domain has mail infrastructure, it's real. A records confirm the domain exists even without mail setup.
|
|
129
|
+
|
|
130
|
+
## Output format
|
|
131
|
+
|
|
132
|
+
### Detailed output (default)
|
|
133
|
+
|
|
134
|
+
Nine columns, one row per subsidiary-domain pair:
|
|
135
|
+
|
|
136
|
+
| Column | Description |
|
|
137
|
+
|--------|-------------|
|
|
138
|
+
| `parent_company` | The company you looked up |
|
|
139
|
+
| `parent_domain` | Known parent domain |
|
|
140
|
+
| `subsidiary_name` | Name of the subsidiary or entity |
|
|
141
|
+
| `subsidiary_type` | Subsidiary, acquisition, division, etc. |
|
|
142
|
+
| `jurisdiction` | Country or region |
|
|
143
|
+
| `domain` | Confirmed or guessed domain |
|
|
144
|
+
| `domain_source` | Where it came from (SEC EDGAR, Wikipedia, TLD guess) |
|
|
145
|
+
| `dns_verified` | Whether DNS verification passed |
|
|
146
|
+
| `confidence` | High (confirmed), Medium (guessed + verified), Low (guessed only) |
|
|
147
|
+
|
|
148
|
+
### Clay import format
|
|
149
|
+
|
|
150
|
+
One row per company with domains consolidated into a single field, ready for direct import into Clay as a data source.
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
domain-mapper accounts.csv --output results.csv --format clay
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
## Importing into Clay
|
|
157
|
+
|
|
158
|
+
1. Run the tool with `--format clay` to get the Clay-optimised output
|
|
159
|
+
2. In Clay, create a new table or add to an existing one
|
|
160
|
+
3. Import the CSV - the columns map directly to Clay's expected format
|
|
161
|
+
4. Use the domain list column with Clay's enrichment tools to find contacts across all mapped domains
|
|
162
|
+
|
|
163
|
+
This is the workflow that sparked the whole tool. We were manually building domain maps for a client's enterprise accounts and realised the process was repeatable enough to automate.
|
|
164
|
+
|
|
165
|
+
## Example
|
|
166
|
+
|
|
167
|
+
The `examples/` directory contains `input_sample.csv` with five test companies to get you started:
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
domain-mapper examples/input_sample.csv --output examples/results.csv --verify-dns
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
## Contributing
|
|
174
|
+
|
|
175
|
+
Want to add a new data source? The architecture makes it straightforward:
|
|
176
|
+
|
|
177
|
+
1. Create a new module in `src/domain_mapper/sources/`
|
|
178
|
+
2. Implement a class with a `get_subsidiaries(company_name)` method that returns a list of subsidiary objects
|
|
179
|
+
3. Add it to the orchestrator in `mapper.py`
|
|
180
|
+
4. Write tests in `tests/`
|
|
181
|
+
|
|
182
|
+
Some data sources we'd love to see contributed:
|
|
183
|
+
|
|
184
|
+
- Companies House (UK company registry)
|
|
185
|
+
- OpenCorporates API
|
|
186
|
+
- Crunchbase (acquisitions data)
|
|
187
|
+
- D&B corporate hierarchies
|
|
188
|
+
|
|
189
|
+
Pull requests welcome. Run `ruff check` and `black` before submitting, and make sure `pytest` passes.
|
|
190
|
+
|
|
191
|
+
## Tech stack
|
|
192
|
+
|
|
193
|
+
- Python 3.10+
|
|
194
|
+
- `requests` and `beautifulsoup4` for web scraping
|
|
195
|
+
- `click` for the CLI
|
|
196
|
+
- `rich` for terminal output
|
|
197
|
+
- No paid APIs, no API keys required
|
|
198
|
+
|
|
199
|
+
## Built by GTM Layer
|
|
200
|
+
|
|
201
|
+
[GTM Layer](https://gtmlayer.com) builds revenue systems for B2B sales teams. We work with companies on CRM architecture, enrichment pipelines, signal-driven outbound, and everything in between.
|
|
202
|
+
|
|
203
|
+
This tool came out of real client work - we built it to solve a problem we kept hitting on enterprise ABM engagements. If you're running into similar challenges, [get in touch](https://gtmlayer.com).
|
|
204
|
+
|
|
205
|
+
## Licence
|
|
206
|
+
|
|
207
|
+
MIT - use it however you want.
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# Enterprise Domain Mapper
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/gtmlayer/enterprise-domain-mapper/actions/workflows/ci.yml/badge.svg)](https://github.com/gtmlayer/enterprise-domain-mapper/actions/workflows/ci.yml)
|
|
4
|
+
[![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue.svg)](https://www.python.org/downloads/)
|
|
5
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
|
|
6
|
+
|
|
7
|
+
**Map enterprise corporate structures to enrichable domains.** Feed it company names, get back subsidiaries, acquisitions, and regional domains that your enrichment tools are missing.
|
|
8
|
+
|
|
9
|
+
Every sales team doing enterprise ABM hits the same wall: large companies have dozens of subsidiaries, acquisitions, and regional entities, each with their own email domain. Without a complete domain map, tools like Clay and Apollo only find contacts at the parent domain. Entire business units get missed.
|
|
10
|
+
|
|
11
|
+
This tool fixes that.
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
$ domain-mapper "NTT Data"
|
|
15
|
+
|
|
16
|
+
NTT Data
|
|
17
|
+
├── NTT DATA Services nttdataservices.com [SEC EDGAR]
|
|
18
|
+
├── NTT DATA Business Solutions nttdata-solutions.com [Wikipedia]
|
|
19
|
+
├── Dimension Data dimensiondata.com [Wikipedia]
|
|
20
|
+
├── NTT DATA Italia nttdata.it [TLD guess ✓ DNS verified]
|
|
21
|
+
├── NTT DATA UK nttdata.co.uk [TLD guess ✓ DNS verified]
|
|
22
|
+
└── NTT DATA Japan nttdata.co.jp [TLD guess ✓ DNS verified]
|
|
23
|
+
|
|
24
|
+
Found 12 subsidiaries, 18 domains (6 confirmed, 12 guessed, 9 DNS verified)
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## The problem
|
|
28
|
+
|
|
29
|
+
Enterprise accounts don't operate under a single domain. A company like NTT Data has subsidiaries in 50+ countries, each with localised domains. Deloitte has member firms. Boeing has defence subsidiaries that use completely different brands.
|
|
30
|
+
|
|
31
|
+
If you're running enrichment against just `nttdata.com`, you're finding maybe 30% of the contacts you could be reaching. The rest are hiding behind `dimensiondata.com`, `nttdata.co.uk`, `nttdata.it`, and domains you didn't know existed.
|
|
32
|
+
|
|
33
|
+
Building these domain maps manually takes hours per account. We built this tool because we got tired of doing it by hand.
|
|
34
|
+
|
|
35
|
+
## Quick start
|
|
36
|
+
|
|
37
|
+
### Installation
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
pip install enterprise-domain-mapper
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Or clone and install locally:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
git clone https://github.com/gtmlayer/enterprise-domain-mapper.git
|
|
47
|
+
cd enterprise-domain-mapper
|
|
48
|
+
pip install -e .
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Single company lookup
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
domain-mapper "Boeing"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### Batch mode (CSV input)
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
domain-mapper accounts.csv --output results.csv
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Your input CSV just needs a column with company names. The tool auto-detects columns named `company_name`, `company`, `name`, `account`, `account_name`, or `organization` (case-insensitive). If you have a domain column (`domain`, `website`, `url`, `parent_domain`, or `company_domain`), it'll use that as the parent domain for TLD guessing.
|
|
64
|
+
|
|
65
|
+
### With DNS verification
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
domain-mapper accounts.csv --output results.csv --verify-dns
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
This checks whether guessed domains actually have mail infrastructure (MX records) or at minimum resolve (A records). Adds a few seconds per company but filters out the noise.
|
|
72
|
+
|
|
73
|
+
## What it does
|
|
74
|
+
|
|
75
|
+
The tool combines three data sources and a verification layer to build comprehensive domain maps:
|
|
76
|
+
|
|
77
|
+
### 1. SEC EDGAR Exhibit 21 scraper
|
|
78
|
+
|
|
79
|
+
For US-listed companies, the annual 10-K filing includes Exhibit 21: a legally required list of the company's subsidiaries (subsidiaries that are not significant may be omitted). The tool looks up the company's CIK, finds the latest 10-K filing, and parses the subsidiary list with jurisdictions.
|
|
80
|
+
|
|
81
|
+
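This is the highest-quality source - it's legally mandated disclosure, refreshed with every annual 10-K filing. Note that companies may leave out subsidiaries that are not significant, so the list is authoritative but not always exhaustive.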
|
|
82
|
+
|
|
83
|
+
### 2. Wikipedia corporate structure parser
|
|
84
|
+
|
|
85
|
+
For non-US companies (or supplementary data), the tool searches Wikipedia for the company page and extracts subsidiary and acquisition data from infoboxes and structured sections.
|
|
86
|
+
|
|
87
|
+
Covers companies globally, though data depth varies by how well-maintained the Wikipedia page is.
|
|
88
|
+
|
|
89
|
+
### 3. TLD pattern generator
|
|
90
|
+
|
|
91
|
+
Once subsidiaries are identified with their jurisdictions, the tool generates likely domain patterns. A subsidiary in Italy with parent domain `nttdata.com` produces guesses like `nttdata.it`. Covers 70+ countries with their standard corporate TLD patterns (e.g. UK produces `.co.uk` and `.uk`, Japan produces `.co.jp` and `.jp`).
|
|
92
|
+
|
|
93
|
+
### 4. DNS verification (optional)
|
|
94
|
+
|
|
95
|
+
MX record lookup with A record fallback to confirm guessed domains actually resolve. MX records are the strongest signal - if a domain has mail infrastructure, it's real. A records confirm the domain exists even without mail setup.
|
|
96
|
+
|
|
97
|
+
## Output format
|
|
98
|
+
|
|
99
|
+
### Detailed output (default)
|
|
100
|
+
|
|
101
|
+
Nine columns, one row per subsidiary-domain pair:
|
|
102
|
+
|
|
103
|
+
| Column | Description |
|
|
104
|
+
|--------|-------------|
|
|
105
|
+
| `parent_company` | The company you looked up |
|
|
106
|
+
| `parent_domain` | Known parent domain |
|
|
107
|
+
| `subsidiary_name` | Name of the subsidiary or entity |
|
|
108
|
+
| `subsidiary_type` | Subsidiary, acquisition, division, etc. |
|
|
109
|
+
| `jurisdiction` | Country or region |
|
|
110
|
+
| `domain` | Confirmed or guessed domain |
|
|
111
|
+
| `domain_source` | Where it came from (SEC EDGAR, Wikipedia, TLD guess) |
|
|
112
|
+
| `dns_verified` | Whether DNS verification passed |
|
|
113
|
+
| `confidence` | High (confirmed), Medium (guessed + verified), Low (guessed only) |
|
|
114
|
+
|
|
115
|
+
### Clay import format
|
|
116
|
+
|
|
117
|
+
One row per company with domains consolidated into a single field, ready for direct import into Clay as a data source.
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
domain-mapper accounts.csv --output results.csv --format clay
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Importing into Clay
|
|
124
|
+
|
|
125
|
+
1. Run the tool with `--format clay` to get the Clay-optimised output
|
|
126
|
+
2. In Clay, create a new table or add to an existing one
|
|
127
|
+
3. Import the CSV - the columns map directly to Clay's expected format
|
|
128
|
+
4. Use the domain list column with Clay's enrichment tools to find contacts across all mapped domains
|
|
129
|
+
|
|
130
|
+
This is the workflow that sparked the whole tool. We were manually building domain maps for a client's enterprise accounts and realised the process was repeatable enough to automate.
|
|
131
|
+
|
|
132
|
+
## Example
|
|
133
|
+
|
|
134
|
+
The `examples/` directory of the GitHub repository (it is not shipped in the PyPI package) contains `input_sample.csv` with five test companies to get you started:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
domain-mapper examples/input_sample.csv --output examples/results.csv --verify-dns
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## Contributing
|
|
141
|
+
|
|
142
|
+
Want to add a new data source? The architecture makes it straightforward:
|
|
143
|
+
|
|
144
|
+
1. Create a new module in `src/domain_mapper/sources/`
|
|
145
|
+
2. Implement a class with a `get_subsidiaries(company_name)` method that returns a list of subsidiary objects
|
|
146
|
+
3. Add it to the orchestrator in `mapper.py`
|
|
147
|
+
4. Write tests in `tests/`
|
|
148
|
+
|
|
149
|
+
Some data sources we'd love to see contributed:
|
|
150
|
+
|
|
151
|
+
- Companies House (UK company registry)
|
|
152
|
+
- OpenCorporates API
|
|
153
|
+
- Crunchbase (acquisitions data)
|
|
154
|
+
- D&B corporate hierarchies
|
|
155
|
+
|
|
156
|
+
Pull requests welcome. Run `ruff check` and `black` before submitting, and make sure `pytest` passes.
|
|
157
|
+
|
|
158
|
+
## Tech stack
|
|
159
|
+
|
|
160
|
+
- Python 3.10+
|
|
161
|
+
- `requests` and `beautifulsoup4` for web scraping
|
|
162
|
+
- `click` for the CLI
|
|
163
|
+
- `rich` for terminal output
|
|
164
|
+
- No paid APIs, no API keys required
|
|
165
|
+
|
|
166
|
+
## Built by GTM Layer
|
|
167
|
+
|
|
168
|
+
[GTM Layer](https://gtmlayer.com) builds revenue systems for B2B sales teams. We work with companies on CRM architecture, enrichment pipelines, signal-driven outbound, and everything in between.
|
|
169
|
+
|
|
170
|
+
This tool came out of real client work - we built it to solve a problem we kept hitting on enterprise ABM engagements. If you're running into similar challenges, [get in touch](https://gtmlayer.com).
|
|
171
|
+
|
|
172
|
+
## Licence
|
|
173
|
+
|
|
174
|
+
MIT - use it however you want.
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "enterprise-domain-mapper"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Map enterprise corporate structures to enrichable domains"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "GTM Layer", email = "hello@gtmlayer.com"},
|
|
14
|
+
]
|
|
15
|
+
keywords = ["enterprise", "enrichment", "abm", "domains", "subsidiaries", "sales"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Topic :: Office/Business",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"requests>=2.31.0",
|
|
28
|
+
"beautifulsoup4>=4.12.0",
|
|
29
|
+
"click>=8.1.0",
|
|
30
|
+
"rich>=13.0.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest>=7.0",
|
|
36
|
+
"pytest-cov>=4.0",
|
|
37
|
+
"ruff>=0.1.0",
|
|
38
|
+
"black>=23.0",
|
|
39
|
+
"responses>=0.23.0",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.scripts]
|
|
43
|
+
domain-mapper = "domain_mapper.cli:main"
|
|
44
|
+
|
|
45
|
+
[project.urls]
|
|
46
|
+
Homepage = "https://github.com/gtmlayer/enterprise-domain-mapper"
|
|
47
|
+
Repository = "https://github.com/gtmlayer/enterprise-domain-mapper"
|
|
48
|
+
Issues = "https://github.com/gtmlayer/enterprise-domain-mapper/issues"
|
|
49
|
+
|
|
50
|
+
[tool.setuptools.packages.find]
|
|
51
|
+
where = ["src"]
|
|
52
|
+
|
|
53
|
+
[tool.ruff]
|
|
54
|
+
line-length = 100
|
|
55
|
+
target-version = "py310"
|
|
56
|
+
|
|
57
|
+
[tool.ruff.lint]
|
|
58
|
+
select = ["E", "F", "I", "N", "W"]
|
|
59
|
+
|
|
60
|
+
[tool.black]
|
|
61
|
+
line-length = 100
|
|
62
|
+
target-version = ["py310"]
|
|
63
|
+
|
|
64
|
+
[tool.pytest.ini_options]
|
|
65
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
"""CLI interface for the Enterprise Domain Mapper."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
11
|
+
from rich.tree import Tree
|
|
12
|
+
|
|
13
|
+
from domain_mapper.mapper import DomainMapper
|
|
14
|
+
from domain_mapper.models import CompanyResult
|
|
15
|
+
from domain_mapper.output import write_clay_csv, write_detailed_csv
|
|
16
|
+
|
|
17
|
+
# Shared Rich console; every status line, tree and summary in this module prints through it.
console = Console()
|
|
18
|
+
|
|
19
|
+
# Column name detection for CSV inputs
|
|
20
|
+
COMPANY_COLUMNS = ["company_name", "company", "name", "account", "account_name", "organization"]
|
|
21
|
+
DOMAIN_COLUMNS = ["domain", "website", "url", "parent_domain", "company_domain"]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _detect_columns(headers: list[str]) -> tuple[str | None, str | None]:
|
|
25
|
+
"""Auto-detect company name and domain columns from CSV headers."""
|
|
26
|
+
headers_lower = [h.lower().strip() for h in headers]
|
|
27
|
+
|
|
28
|
+
company_col = None
|
|
29
|
+
for candidate in COMPANY_COLUMNS:
|
|
30
|
+
if candidate in headers_lower:
|
|
31
|
+
company_col = headers[headers_lower.index(candidate)]
|
|
32
|
+
break
|
|
33
|
+
|
|
34
|
+
domain_col = None
|
|
35
|
+
for candidate in DOMAIN_COLUMNS:
|
|
36
|
+
if candidate in headers_lower:
|
|
37
|
+
domain_col = headers[headers_lower.index(candidate)]
|
|
38
|
+
break
|
|
39
|
+
|
|
40
|
+
return company_col, domain_col
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _print_tree(result: CompanyResult) -> None:
    """Print a rich tree representation of the mapping results.

    One branch is printed per entry in ``result.domains``, followed by a summary line
    with the subsidiary, domain, confirmed, guessed and DNS-verified counts.

    Names and source labels come from scraped data, so they are escaped before being
    embedded in Rich markup. Without escaping, text such as ``Foo [de]`` is consumed as
    a style tag and a stray ``[/x]`` raises ``MarkupError``.

    Args:
        result: Mapping result for a single company.
    """
    # Local import keeps this block self-contained; rich is already a hard dependency.
    from rich.markup import escape

    tree = Tree(f"[bold]{escape(result.company_name)}[/bold]")

    for domain in result.domains:
        icon = "[green]✓[/green]" if domain.dns_verified else "[dim]·[/dim]"

        # Escape the bracketed label as a whole so a lowercase source such as
        # "[wikipedia]" is printed literally instead of being parsed as a tag.
        source_label = escape(f"[{domain.domain_source}]")
        source_tag = f"[dim]{source_label}[/dim]"

        if domain.dns_verified:
            source_tag += " [green]✓ DNS verified[/green]"

        # Pad first, escape second: escaping may add backslashes that would otherwise
        # count towards the column width.
        subsidiary = escape(f"{domain.subsidiary_name:<30}")

        tree.add(
            f"{icon} [cyan]{subsidiary}[/cyan] "
            f"{domain.domain:<25} {source_tag}"
        )

    console.print(tree)

    # Summary line
    total = len(result.domains)
    confirmed = len(result.confirmed_domains)
    guessed = len(result.guessed_domains)
    verified = len(result.verified_domains)
    console.print(
        f"\n[dim]Found {len(result.subsidiaries)} subsidiaries, "
        f"{total} domains ({confirmed} confirmed, {guessed} guessed, "
        f"{verified} DNS verified)[/dim]"
    )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@click.command()
@click.argument("input", type=str)
@click.option("--output", "-o", type=click.Path(), help="Output CSV file path")
@click.option(
    "--format",
    "-f",
    "output_format",
    type=click.Choice(["detailed", "clay"]),
    default="detailed",
    help="Output format: detailed (one row per domain) or clay (one row per company)",
)
@click.option("--verify-dns", is_flag=True, help="Verify guessed domains via DNS lookups")
@click.option("--verbose", "-v", is_flag=True, help="Enable verbose logging")
def main(
    input: str, output: str | None, output_format: str, verify_dns: bool, verbose: bool
) -> None:
    """Map enterprise companies to their subsidiary domains.

    INPUT can be a company name (e.g. "Boeing") or a CSV file path.
    """
    # The docstring above is also the --help text click prints, so implementation
    # notes are kept in comments. `input` shadows the builtin, but click binds the
    # positional argument to the parameter of the same name.
    if verbose:
        logging.basicConfig(level=logging.INFO, format="%(name)s: %(message)s")
    else:
        logging.basicConfig(level=logging.WARNING)

    mapper = DomainMapper(verify_dns=verify_dns)
    results: list[CompanyResult] = []

    input_path = Path(input)
    if input_path.suffix.lower() == ".csv":
        # A value ending in .csv is always meant as a file. Fail loudly on a missing or
        # mistyped path instead of looking it up as a company called "accounts.csv".
        if not input_path.is_file():
            raise click.ClickException(f"CSV file not found: {input_path}")

        # Batch mode: CSV input
        results = _process_csv(mapper, input_path, verify_dns)
    else:
        # Single company mode
        console.print(f"\n[bold]Mapping domains for:[/bold] {input}\n")
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
        ) as progress:
            task = progress.add_task(
                "Searching SEC EDGAR, Wikipedia, generating TLDs...", total=None
            )
            result = mapper.map_company(input)
            progress.update(task, completed=True)

        results = [result]
        _print_tree(result)

    # Write output file
    if output:
        output_path = Path(output)
        with open(output_path, "w", newline="", encoding="utf-8") as f:
            if output_format == "clay":
                write_clay_csv(results, f)
            else:
                write_detailed_csv(results, f)
        console.print(f"\n[green]✓[/green] Results written to {output_path}")
        total_domains = sum(len(r.domains) for r in results)
        console.print(f"[dim] {len(results)} companies, {total_domains} domains[/dim]")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _process_csv(mapper: DomainMapper, csv_path: Path, verify_dns: bool) -> list[CompanyResult]:
    """Map every company listed in a CSV file and print a summary.

    Args:
        mapper: Mapper whose ``map_company(name, parent_domain)`` is called once per row.
        csv_path: Path of the input CSV; it must have a header row.
        verify_dns: When true, the summary also reports the DNS-verified domain count.

    Returns:
        One result per row that has a non-empty company name, in file order.

    Exits the process with status 1 when the file has no header row or when no
    company-name column can be detected.
    """
    results: list[CompanyResult] = []

    # "utf-8-sig" reads plain UTF-8 unchanged but drops the byte-order mark that Excel
    # writes, which would otherwise be glued to the first header name.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        if not reader.fieldnames:
            console.print("[red]Error: CSV file has no headers[/red]")
            sys.exit(1)

        company_col, domain_col = _detect_columns(list(reader.fieldnames))
        if not company_col:
            console.print(
                f"[red]Error: Could not detect company name column. "
                f"Expected one of: {', '.join(COMPANY_COLUMNS)}[/red]"
            )
            sys.exit(1)

        rows = list(reader)

    console.print(f"\n[bold]Processing {len(rows)} companies from {csv_path.name}[/bold]")
    if domain_col:
        console.print(
            f"[dim]Using '{company_col}' for names, '{domain_col}' for parent domains[/dim]"
        )
    else:
        console.print(f"[dim]Using '{company_col}' for names (no domain column detected)[/dim]")

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
    ) as progress:
        for i, row in enumerate(rows):
            # DictReader fills the missing cells of a short row with None, so the
            # default passed to .get() is not enough to guarantee a string.
            company_name = (row.get(company_col) or "").strip()
            parent_domain = (row.get(domain_col) or "").strip() if domain_col else ""

            if not company_name:
                continue

            task = progress.add_task(f"[{i + 1}/{len(rows)}] {company_name}...", total=None)
            result = mapper.map_company(company_name, parent_domain)
            results.append(result)
            # Give the task a total so it is marked finished; an indeterminate task
            # (total=None) would keep its spinner running for the rest of the batch.
            progress.update(
                task,
                total=1,
                completed=1,
                description=f"[{i + 1}/{len(rows)}] {company_name}: {len(result.domains)} domains",
            )

    # Print summary
    total_subs = sum(len(r.subsidiaries) for r in results)
    total_domains = sum(len(r.domains) for r in results)
    total_verified = sum(len(r.verified_domains) for r in results)

    console.print("\n[bold]Summary[/bold]")
    console.print(f" Companies processed: {len(results)}")
    console.print(f" Total subsidiaries found: {total_subs}")
    console.print(f" Total domains mapped: {total_domains}")
    if verify_dns:
        console.print(f" DNS verified: {total_verified}")

    return results
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# Allow running this module directly as a script; the installed `domain-mapper`
# command reaches main() through the `domain_mapper.cli:main` entry point instead.
if __name__ == "__main__":
    main()
|