autonitia-intel 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- autonitia_intel-0.2.0/LICENSE +21 -0
- autonitia_intel-0.2.0/PKG-INFO +119 -0
- autonitia_intel-0.2.0/README.md +88 -0
- autonitia_intel-0.2.0/autonitia_intel/__init__.py +27 -0
- autonitia_intel-0.2.0/autonitia_intel/__main__.py +3 -0
- autonitia_intel-0.2.0/autonitia_intel/cli.py +56 -0
- autonitia_intel-0.2.0/autonitia_intel/config.py +36 -0
- autonitia_intel-0.2.0/autonitia_intel/detection/__init__.py +4 -0
- autonitia_intel-0.2.0/autonitia_intel/detection/capabilities.py +136 -0
- autonitia_intel-0.2.0/autonitia_intel/detection/fingerprints.py +63 -0
- autonitia_intel-0.2.0/autonitia_intel/fetchers/__init__.py +4 -0
- autonitia_intel-0.2.0/autonitia_intel/fetchers/fetcher.py +146 -0
- autonitia_intel-0.2.0/autonitia_intel/fetchers/robots.py +50 -0
- autonitia_intel-0.2.0/autonitia_intel/graph/__init__.py +5 -0
- autonitia_intel-0.2.0/autonitia_intel/graph/base_graph.py +64 -0
- autonitia_intel-0.2.0/autonitia_intel/graph/base_node.py +13 -0
- autonitia_intel-0.2.0/autonitia_intel/graph/profile_graph.py +148 -0
- autonitia_intel-0.2.0/autonitia_intel/lenses/__init__.py +3 -0
- autonitia_intel-0.2.0/autonitia_intel/lenses/catalog.py +110 -0
- autonitia_intel-0.2.0/autonitia_intel/models.py +136 -0
- autonitia_intel-0.2.0/autonitia_intel/nodes/__init__.py +15 -0
- autonitia_intel-0.2.0/autonitia_intel/nodes/basic_assemble_node.py +76 -0
- autonitia_intel-0.2.0/autonitia_intel/nodes/fact_extraction_node.py +41 -0
- autonitia_intel-0.2.0/autonitia_intel/nodes/fetch_node.py +70 -0
- autonitia_intel-0.2.0/autonitia_intel/nodes/markdownify_node.py +35 -0
- autonitia_intel-0.2.0/autonitia_intel/nodes/positive_detection_node.py +24 -0
- autonitia_intel-0.2.0/autonitia_intel/nodes/repair_extraction_node.py +51 -0
- autonitia_intel-0.2.0/autonitia_intel/signal_packs/industries/real_estate.yaml +23 -0
- autonitia_intel-0.2.0/autonitia_intel/signal_packs/lenses/automation.yaml +57 -0
- autonitia_intel-0.2.0/autonitia_intel/signal_packs/lenses/marketing.yaml +37 -0
- autonitia_intel-0.2.0/autonitia_intel/signal_packs/lenses/sales.yaml +19 -0
- autonitia_intel-0.2.0/autonitia_intel/telemetry/__init__.py +3 -0
- autonitia_intel-0.2.0/autonitia_intel/telemetry/telemetry.py +84 -0
- autonitia_intel-0.2.0/autonitia_intel/usage.py +32 -0
- autonitia_intel-0.2.0/autonitia_intel.egg-info/PKG-INFO +119 -0
- autonitia_intel-0.2.0/autonitia_intel.egg-info/SOURCES.txt +45 -0
- autonitia_intel-0.2.0/autonitia_intel.egg-info/dependency_links.txt +1 -0
- autonitia_intel-0.2.0/autonitia_intel.egg-info/entry_points.txt +2 -0
- autonitia_intel-0.2.0/autonitia_intel.egg-info/requires.txt +12 -0
- autonitia_intel-0.2.0/autonitia_intel.egg-info/top_level.txt +1 -0
- autonitia_intel-0.2.0/pyproject.toml +48 -0
- autonitia_intel-0.2.0/setup.cfg +4 -0
- autonitia_intel-0.2.0/tests/test_catalog.py +76 -0
- autonitia_intel-0.2.0/tests/test_detection.py +87 -0
- autonitia_intel-0.2.0/tests/test_fetch.py +65 -0
- autonitia_intel-0.2.0/tests/test_graph.py +77 -0
- autonitia_intel-0.2.0/tests/test_integration.py +52 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Autonitia
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: autonitia-intel
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Turn any business website into a clean, structured company profile — a graph-based extraction engine.
|
|
5
|
+
Author: Syed Mukarramuddin
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Autonitia/autonitia-intel
|
|
8
|
+
Project-URL: Repository, https://github.com/Autonitia/autonitia-intel
|
|
9
|
+
Keywords: web-scraping,llm,company-data,lead-generation,openai,extraction,graph
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Requires-Python: >=3.10
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: requests>=2.31
|
|
20
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
21
|
+
Requires-Dist: lxml>=5.0
|
|
22
|
+
Requires-Dist: openai>=1.40
|
|
23
|
+
Requires-Dist: pydantic>=2.0
|
|
24
|
+
Requires-Dist: python-dotenv>=1.0
|
|
25
|
+
Requires-Dist: pyyaml>=6.0
|
|
26
|
+
Requires-Dist: cloudscraper>=1.2.71
|
|
27
|
+
Requires-Dist: playwright>=1.40
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
# autonitia-intel
|
|
33
|
+
|
|
34
|
+
Turn any business website into a clean, structured company profile — and a quick read on where the opportunities are.
|
|
35
|
+
|
|
36
|
+
Point it at a URL and get back the company's details (description, services, contact info, social presence) plus the tools and capabilities its site exposes. It also tells you how many opportunities a given **lens** (automation, marketing, sales…) would surface.
|
|
37
|
+
|
|
38
|
+
## Install
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pip install autonitia-intel
|
|
42
|
+
playwright install chromium # only needed for JavaScript-heavy sites
|
|
43
|
+
export OPENAI_API_KEY=sk-... # or pass api_key in the config
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Use it
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from autonitia_intel import ProfileGraph
|
|
50
|
+
|
|
51
|
+
config = {
|
|
52
|
+
"llm": {"model": "gpt-4o-mini"}, # add "api_key": "sk-..." or use the env var
|
|
53
|
+
"lens": "automation", # automation | marketing | sales | …
|
|
54
|
+
"verbose": True,
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
graph = ProfileGraph(source="https://example.com", config=config)
|
|
58
|
+
result = graph.run()
|
|
59
|
+
|
|
60
|
+
print(result.model_dump_json(indent=2))
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Prefer the command line?
|
|
64
|
+
|
|
65
|
+
```bash
|
|
66
|
+
python -m autonitia_intel analyse --target-url https://example.com --lens marketing
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## What you get
|
|
70
|
+
|
|
71
|
+
```json
|
|
72
|
+
{
|
|
73
|
+
"target_company": {
|
|
74
|
+
"name": "Example Co",
|
|
75
|
+
"industry": "Real Estate",
|
|
76
|
+
"description": "...",
|
|
77
|
+
"location": "Dubai, UAE",
|
|
78
|
+
"contact": { "phones": ["..."], "emails": ["..."], "addresses": ["..."] }
|
|
79
|
+
},
|
|
80
|
+
"digital_presence": { "social_media": { "linkedin": "...", "instagram": "..." } },
|
|
81
|
+
"capabilities_present": ["phone", "whatsapp", "online_booking"],
|
|
82
|
+
"pro_features": { "lens": "automation", "opportunities_found": 2 }
|
|
83
|
+
}
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## How it works
|
|
87
|
+
|
|
88
|
+
It fetches the site politely (respecting `robots.txt`, with retries and a real-browser fallback for JS-heavy pages), uses one LLM call to read out the company profile, and runs fast local checks to spot the tools and capabilities present. The opportunity count for a lens is computed locally — no guessing.
|
|
89
|
+
|
|
90
|
+
## Lenses
|
|
91
|
+
|
|
92
|
+
A **lens** is the perspective you analyse a site through — `automation`, `marketing`, `sales`, and more. Lenses and the signals they look for are defined as simple **YAML packs** in [`autonitia_intel/signal_packs/`](autonitia_intel/signal_packs), so you can add a new lens or industry pack without touching the Python.
|
|
93
|
+
|
|
94
|
+
## Contributing
|
|
95
|
+
|
|
96
|
+
Contributions welcome — the easiest place to start is a signal pack: drop a YAML file under `signal_packs/lenses/` or `signal_packs/industries/` and open a PR. Run the tests with `pytest -m "not integration"`.
|
|
97
|
+
|
|
98
|
+
## Hosted version
|
|
99
|
+
|
|
100
|
+
This open-source engine gives you the profile and the opportunity count. The hosted **Autonitia Intel** turns those opportunities into verified, ranked, outreach-ready intelligence over a REST API.
|
|
101
|
+
|
|
102
|
+
**→ Docs & access: [autonitia.ai/intel](https://autonitia.ai/intel)**
|
|
103
|
+
|
|
104
|
+
| | Free — `autonitia-intel` | Hosted — Autonitia Intel |
|
|
105
|
+
|---|:---:|:---:|
|
|
106
|
+
| Company profile + contact + socials | ✅ | ✅ |
|
|
107
|
+
| Tool & capability detection | ✅ | ✅ |
|
|
108
|
+
| Opportunity count | ✅ | — |
|
|
109
|
+
| Verified capability analysis | — | ✅ |
|
|
110
|
+
| Pain signals with evidence | — | ✅ |
|
|
111
|
+
| Scoring (fit / opportunity / confidence) | — | ✅ |
|
|
112
|
+
| Offer matching + ranked opportunities | — | ✅ |
|
|
113
|
+
| Outreach messages | — | ✅ |
|
|
114
|
+
| External enrichment (founders, HQ, funding) | — | ✅ |
|
|
115
|
+
| REST API, async jobs, webhooks, CRM export | — | ✅ |
|
|
116
|
+
|
|
117
|
+
## License
|
|
118
|
+
|
|
119
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# autonitia-intel
|
|
2
|
+
|
|
3
|
+
Turn any business website into a clean, structured company profile — and a quick read on where the opportunities are.
|
|
4
|
+
|
|
5
|
+
Point it at a URL and get back the company's details (description, services, contact info, social presence) plus the tools and capabilities its site exposes. It also tells you how many opportunities a given **lens** (automation, marketing, sales…) would surface.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install autonitia-intel
|
|
11
|
+
playwright install chromium # only needed for JavaScript-heavy sites
|
|
12
|
+
export OPENAI_API_KEY=sk-... # or pass api_key in the config
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Use it
|
|
16
|
+
|
|
17
|
+
```python
|
|
18
|
+
from autonitia_intel import ProfileGraph
|
|
19
|
+
|
|
20
|
+
config = {
|
|
21
|
+
"llm": {"model": "gpt-4o-mini"}, # add "api_key": "sk-..." or use the env var
|
|
22
|
+
"lens": "automation", # automation | marketing | sales | …
|
|
23
|
+
"verbose": True,
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
graph = ProfileGraph(source="https://example.com", config=config)
|
|
27
|
+
result = graph.run()
|
|
28
|
+
|
|
29
|
+
print(result.model_dump_json(indent=2))
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Prefer the command line?
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
python -m autonitia_intel analyse --target-url https://example.com --lens marketing
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## What you get
|
|
39
|
+
|
|
40
|
+
```json
|
|
41
|
+
{
|
|
42
|
+
"target_company": {
|
|
43
|
+
"name": "Example Co",
|
|
44
|
+
"industry": "Real Estate",
|
|
45
|
+
"description": "...",
|
|
46
|
+
"location": "Dubai, UAE",
|
|
47
|
+
"contact": { "phones": ["..."], "emails": ["..."], "addresses": ["..."] }
|
|
48
|
+
},
|
|
49
|
+
"digital_presence": { "social_media": { "linkedin": "...", "instagram": "..." } },
|
|
50
|
+
"capabilities_present": ["phone", "whatsapp", "online_booking"],
|
|
51
|
+
"pro_features": { "lens": "automation", "opportunities_found": 2 }
|
|
52
|
+
}
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## How it works
|
|
56
|
+
|
|
57
|
+
It fetches the site politely (respecting `robots.txt`, with retries and a real-browser fallback for JS-heavy pages), uses one LLM call to read out the company profile, and runs fast local checks to spot the tools and capabilities present. The opportunity count for a lens is computed locally — no guessing.
|
|
58
|
+
|
|
59
|
+
## Lenses
|
|
60
|
+
|
|
61
|
+
A **lens** is the perspective you analyse a site through — `automation`, `marketing`, `sales`, and more. Lenses and the signals they look for are defined as simple **YAML packs** in [`autonitia_intel/signal_packs/`](autonitia_intel/signal_packs), so you can add a new lens or industry pack without touching the Python.
|
|
62
|
+
|
|
63
|
+
## Contributing
|
|
64
|
+
|
|
65
|
+
Contributions welcome — the easiest place to start is a signal pack: drop a YAML file under `signal_packs/lenses/` or `signal_packs/industries/` and open a PR. Run the tests with `pytest -m "not integration"`.
|
|
66
|
+
|
|
67
|
+
## Hosted version
|
|
68
|
+
|
|
69
|
+
This open-source engine gives you the profile and the opportunity count. The hosted **Autonitia Intel** turns those opportunities into verified, ranked, outreach-ready intelligence over a REST API.
|
|
70
|
+
|
|
71
|
+
**→ Docs & access: [autonitia.ai/intel](https://autonitia.ai/intel)**
|
|
72
|
+
|
|
73
|
+
| | Free — `autonitia-intel` | Hosted — Autonitia Intel |
|
|
74
|
+
|---|:---:|:---:|
|
|
75
|
+
| Company profile + contact + socials | ✅ | ✅ |
|
|
76
|
+
| Tool & capability detection | ✅ | ✅ |
|
|
77
|
+
| Opportunity count | ✅ | — |
|
|
78
|
+
| Verified capability analysis | — | ✅ |
|
|
79
|
+
| Pain signals with evidence | — | ✅ |
|
|
80
|
+
| Scoring (fit / opportunity / confidence) | — | ✅ |
|
|
81
|
+
| Offer matching + ranked opportunities | — | ✅ |
|
|
82
|
+
| Outreach messages | — | ✅ |
|
|
83
|
+
| External enrichment (founders, HQ, funding) | — | ✅ |
|
|
84
|
+
| REST API, async jobs, webhooks, CRM export | — | ✅ |
|
|
85
|
+
|
|
86
|
+
## License
|
|
87
|
+
|
|
88
|
+
MIT — see [LICENSE](LICENSE).
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
autonitia-intel — open-source business-website profile extractor.
|
|
3
|
+
|
|
4
|
+
The FREE engine turns any business website into a clean structured profile
|
|
5
|
+
(company facts, contact details, social presence, detected tools/capabilities)
|
|
6
|
+
plus a *pro_features* count of opportunities. The intelligence layer — verified
|
|
7
|
+
signals, scoring, offer matching, outreach — is Autonitia Intel Pro, which
|
|
8
|
+
imports these same building blocks.
|
|
9
|
+
|
|
10
|
+
Quick start:
|
|
11
|
+
|
|
12
|
+
from autonitia_intel import ProfileGraph
|
|
13
|
+
|
|
14
|
+
graph = ProfileGraph(lens="automation") # bring your own key via env or args
|
|
15
|
+
profile = graph.run("https://example.com")
|
|
16
|
+
print(profile.model_dump_json(indent=2))
|
|
17
|
+
|
|
18
|
+
Bring your own model key:
|
|
19
|
+
|
|
20
|
+
ProfileGraph(api_key="sk-...", model="gpt-4o-mini")
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from .graph import ProfileGraph
|
|
24
|
+
from .models import CompanyProfile, ProfileResult
|
|
25
|
+
|
|
26
|
+
__version__ = "0.2.0"
|
|
27
|
+
__all__ = ["ProfileGraph", "CompanyProfile", "ProfileResult"]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLI: autonitia-intel analyse — extract a company profile (+ opportunity pro_features).
|
|
3
|
+
|
|
4
|
+
Example:
|
|
5
|
+
python -m autonitia_intel analyse --target-url https://example.com --lens automation
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import sys
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from .config import OUTPUT_DIR
|
|
13
|
+
from .graph import ProfileGraph
|
|
14
|
+
from .lenses import LENSES
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def main(argv=None):
    """Run the ``autonitia-intel`` command-line interface.

    Parses ``argv`` (``sys.argv[1:]`` when ``None``), runs a ``ProfileGraph``
    against ``--target-url``, writes the result as JSON to
    ``OUTPUT_DIR/<company>_profile.json`` and prints the same JSON to stdout.

    Exits with status 1 (after printing help) when no ``analyse`` subcommand
    is given.
    """
    parser = argparse.ArgumentParser(
        prog="autonitia-intel",
        description="Business-website profile extractor (free tier).",
    )
    sub = parser.add_subparsers(dest="command")

    a = sub.add_parser("analyse", help="Extract a profile from a business website")
    a.add_argument("--target-url", required=True)
    a.add_argument("--lens", default="automation", choices=LENSES,
                   help="Lens used only for the opportunity pro_features count")
    a.add_argument("--api-key", default=None, help="Bring your own model key (overrides env)")
    a.add_argument("--model", default=None, help="Model id (overrides env)")
    a.add_argument("--no-cache", action="store_true")
    a.add_argument("--no-telemetry", action="store_true")
    a.add_argument("--quiet", action="store_true")

    args = parser.parse_args(argv)
    if args.command != "analyse":
        parser.print_help()
        sys.exit(1)

    graph = ProfileGraph(
        lens=args.lens,
        telemetry=not args.no_telemetry,
        verbose=not args.quiet,
        api_key=args.api_key,
        model=args.model,
    )
    if not args.quiet:
        print(f"Analysing {args.target_url} (lens={args.lens}) ...")
    result = graph.run(args.target_url, use_cache=not args.no_cache)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # The company name is extracted from the analysed site, so treat it as
    # untrusted: map path separators, characters reserved on common
    # filesystems, and whitespace to "_" before using it as a file name.
    unsafe_chars = str.maketrans({ch: "_" for ch in '\\/:*?"<>| \t\r\n'})
    name = (result.target_company.name or "result").translate(unsafe_chars)
    path = OUTPUT_DIR / f"{name}_profile.json"

    # Serialize once; write as UTF-8 explicitly so non-ASCII company data
    # does not depend on the platform's default encoding.
    payload = result.model_dump_json(indent=2)
    path.write_text(payload, encoding="utf-8")

    print(f"\nSaved: {path}\n")
    print(payload)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
if __name__ == "__main__":
|
|
56
|
+
main()
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
|
|
4
|
+
from dotenv import load_dotenv
|
|
5
|
+
|
|
6
|
+
load_dotenv()
|
|
7
|
+
|
|
8
|
+
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
|
|
9
|
+
MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
|
|
10
|
+
|
|
11
|
+
BROWSER_HEADERS = {
|
|
12
|
+
"User-Agent": (
|
|
13
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
|
14
|
+
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
|
15
|
+
"Chrome/125.0.0.0 Safari/537.36"
|
|
16
|
+
),
|
|
17
|
+
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
|
18
|
+
"Accept-Language": "en-US,en;q=0.9",
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
CACHE_DIR = Path(__file__).parent.parent / "output" / ".cache"
|
|
22
|
+
OUTPUT_DIR = Path(__file__).parent.parent / "output"
|
|
23
|
+
REQUEST_TIMEOUT = 15
|
|
24
|
+
MAX_CONTENT_CHARS = 24_000 # overall budget sent to the LLM
|
|
25
|
+
PER_PAGE_CHARS = 6_000 # per-page cap so no single page (e.g. a bloated homepage) starves the rest
|
|
26
|
+
MAX_SUBPAGES = 3
|
|
27
|
+
|
|
28
|
+
# Crawling politeness/resilience
|
|
29
|
+
RESPECT_ROBOTS = os.getenv("AUTONITIA_RESPECT_ROBOTS", "true").lower() != "false"
|
|
30
|
+
FETCH_RETRIES = int(os.getenv("AUTONITIA_FETCH_RETRIES", "2")) # extra attempts on transient errors
|
|
31
|
+
ROBOTS_UA = "autonitia-intel"
|
|
32
|
+
|
|
33
|
+
# Telemetry — see telemetry/telemetry.py. Nothing is sent over the network in v0.1.
|
|
34
|
+
# Level 1 (execution metrics) is opt-OUT. Level 2 (dataset capture) is opt-IN.
|
|
35
|
+
TELEMETRY_ENABLED = os.getenv("AUTONITIA_TELEMETRY", "true").lower() != "false"
|
|
36
|
+
DATASET_CONTRIBUTION = os.getenv("AUTONITIA_DATASET", "false").lower() == "true"
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deterministic capability + digital-presence detection.
|
|
3
|
+
|
|
4
|
+
Inspects raw HTML across all fetched pages to determine observable facts:
|
|
5
|
+
lead-capture methods, social links, SEO basics, tracking. No LLM.
|
|
6
|
+
|
|
7
|
+
These are FACTS (present/absent), which become the evidence base for signals.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import re
|
|
11
|
+
|
|
12
|
+
from ..detection.fingerprints import detect_tools
|
|
13
|
+
from ..models import Capabilities, SEO, SocialMedia, Tracking
|
|
14
|
+
|
|
15
|
+
SOCIAL_PATTERNS = {
|
|
16
|
+
"facebook": r"https?://(?:www\.)?facebook\.com/[A-Za-z0-9_.\-/]+",
|
|
17
|
+
"instagram": r"https?://(?:www\.)?instagram\.com/[A-Za-z0-9_.\-/]+",
|
|
18
|
+
"linkedin": r"https?://(?:[a-z]{2}\.)?linkedin\.com/(?:company|in)/[A-Za-z0-9_.\-/]+",
|
|
19
|
+
"tiktok": r"https?://(?:www\.)?tiktok\.com/@[A-Za-z0-9_.\-/]+",
|
|
20
|
+
"youtube": r"https?://(?:www\.)?youtube\.com/[A-Za-z0-9_.\-/@]+",
|
|
21
|
+
"x": r"https?://(?:www\.)?(?:twitter|x)\.com/[A-Za-z0-9_]+",
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
# STRONG patterns = a real third-party tool / explicit URL → trustworthy, the
|
|
25
|
+
# LLM verifier may NOT downgrade these. WEAK patterns = generic text heuristics
|
|
26
|
+
# ("book now") that are easily wrong → the LLM verifier MAY override them.
|
|
27
|
+
BOOKING_STRONG = [r"calendly\.com", r"fresha\.com", r"acuityscheduling\.com", r"booksy\.com",
|
|
28
|
+
r"simplybook\.(me|it)", r"setmore\.com", r"squareup\.com/appointments"]
|
|
29
|
+
BOOKING_WEAK = [r"book\s*now", r"book\s*online", r"schedule\s*(an?\s*)?appointment", r"book\s*a\s*viewing"]
|
|
30
|
+
|
|
31
|
+
LIVE_CHAT_STRONG = [r"intercom", r"driftt?\.com", r"tidio", r"tawk\.to", r"crisp\.chat", r"hs-scripts"]
|
|
32
|
+
LIVE_CHAT_WEAK = [r"livechat", r"chat\s*with\s*us", r"live\s*chat"]
|
|
33
|
+
|
|
34
|
+
WHATSAPP_STRONG = [r"wa\.me/", r"api\.whatsapp\.com", r"whatsapp://", r"web\.whatsapp\.com",
|
|
35
|
+
r"chat\.whatsapp\.com", r"wa\.link/"]
|
|
36
|
+
WHATSAPP_WEAK = [r"click\s*to\s*whatsapp", r'aria-label=["\'][^"\']*whatsapp', r"whatsapp\s*us"]
|
|
37
|
+
|
|
38
|
+
NEWSLETTER_STRONG = [r"chimpstatic", r"klaviyo", r"list-manage\.com"]
|
|
39
|
+
NEWSLETTER_WEAK = [r"newsletter", r"subscribe"]
|
|
40
|
+
|
|
41
|
+
# These have no reliable "strong" structural signal — treat as weak (downgradable).
|
|
42
|
+
PRICING_WEAK = [r"/pricing", r">\s*pricing\s*<", r">\s*plans\s*<", r"per\s*month", r"/mo\b"]
|
|
43
|
+
CASE_STUDY_WEAK = [r"case\s*stud", r"success\s*stor", r"/portfolio", r"testimonial"]
|
|
44
|
+
FORM_WEAK = [r"<form[\s>]"] # a <form> could be search/login, not a contact form → downgradable
|
|
45
|
+
|
|
46
|
+
PHONE_PATTERN = r"tel:\+?[\d\s\-()]{7,}"
|
|
47
|
+
EMAIL_PATTERN = r"mailto:[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+"
|
|
48
|
+
|
|
49
|
+
# Capabilities the LLM verifier is allowed to downgrade when only a WEAK signal fired.
|
|
50
|
+
DOWNGRADABLE = {"has_online_booking", "has_whatsapp", "has_live_chat",
|
|
51
|
+
"has_pricing", "has_case_studies", "has_contact_form", "has_newsletter"}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _any(patterns: list[str], html: str) -> bool:
    """Tell whether at least one regex in ``patterns`` matches ``html`` (case-insensitive)."""
    for candidate in patterns:
        if re.search(candidate, html, flags=re.IGNORECASE) is not None:
            return True
    return False
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Email-shaped tokens anywhere in the markup (not only inside mailto: links).
_EMAIL_RE = re.compile(r"[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}")
# Number part of a tel: link.
_TEL_RE = re.compile(r'tel:(\+?[\d\s\-()]{7,})', re.IGNORECASE)
# Full WhatsApp deep link on any of the known WhatsApp hosts.
_WA_RE = re.compile(r'(https?://(?:wa\.me|wa\.link|api\.whatsapp\.com|web\.whatsapp\.com|chat\.whatsapp\.com)/[^\s"\'<>]+)', re.IGNORECASE)

# Asset names such as "logo@2x.png" look like emails to _EMAIL_RE; anything
# ending in one of these image suffixes is discarded.
_ASSET_SUFFIXES = (".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".avif", ".bmp", ".ico")


def extract_contacts(html: str) -> dict:
    """Deterministic contact extraction — a backstop/merge for the LLM output.

    Args:
        html: Raw HTML to scan.

    Returns:
        A dict with three keys:
        ``"emails"`` — sorted, de-duplicated email-shaped strings, excluding
        image-asset file names;
        ``"phones"`` — sorted, de-duplicated ``tel:`` numbers with runs of
        whitespace collapsed to a single space;
        ``"whatsapp"`` — the first WhatsApp link found, or ``""``.
    """
    emails = sorted({
        match.group(0)
        for match in _EMAIL_RE.finditer(html)
        if not match.group(0).lower().endswith(_ASSET_SUFFIXES)
    })
    phones = sorted({
        re.sub(r"\s+", " ", match.group(1)).strip()
        for match in _TEL_RE.finditer(html)
    })
    wa_match = _WA_RE.search(html)
    wa = wa_match.group(1) if wa_match else ""
    return {"emails": emails, "phones": phones, "whatsapp": wa}
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def detect_capabilities(combined_html: str):
    """
    Returns (capabilities, social, seo, tracking, tools, strongly_detected).

    `strongly_detected` is the set of capability names backed by a STRONG
    structural signal (a real tool/URL). The LLM verifier may only downgrade
    capabilities NOT in this set.

    Args:
        combined_html: Raw HTML to inspect; every check below is a
            case-insensitive regex search over this one string.
    """
    booking_strong = _any(BOOKING_STRONG, combined_html)
    chat_strong = _any(LIVE_CHAT_STRONG, combined_html)
    wa_strong = _any(WHATSAPP_STRONG, combined_html)
    news_strong = _any(NEWSLETTER_STRONG, combined_html)

    # A capability is reported when either its STRONG or its WEAK list matches.
    caps = Capabilities(
        has_phone=bool(re.search(PHONE_PATTERN, combined_html, re.IGNORECASE)),
        has_email=bool(re.search(EMAIL_PATTERN, combined_html, re.IGNORECASE)),
        has_contact_form=_any(FORM_WEAK, combined_html),
        has_whatsapp=wa_strong or _any(WHATSAPP_WEAK, combined_html),
        has_online_booking=booking_strong or _any(BOOKING_WEAK, combined_html),
        has_live_chat=chat_strong or _any(LIVE_CHAT_WEAK, combined_html),
        has_pricing=_any(PRICING_WEAK, combined_html),
        has_case_studies=_any(CASE_STUDY_WEAK, combined_html),
        has_newsletter=news_strong or _any(NEWSLETTER_WEAK, combined_html),
    )

    strong_hits = {
        "has_online_booking": booking_strong,
        "has_live_chat": chat_strong,
        "has_whatsapp": wa_strong,
        "has_newsletter": news_strong,
    }
    strongly_detected = {cap for cap, hit in strong_hits.items() if hit}

    social = SocialMedia()
    for field, pattern in SOCIAL_PATTERNS.items():
        # Walk every match rather than only the first: a share/intent link
        # that appears before the profile link must not hide the profile.
        for m in re.finditer(pattern, combined_html, re.IGNORECASE):
            url = m.group(0)
            # Skip share/intent links — keep only profile-looking URLs
            if "sharer" not in url and "intent" not in url and "/share" not in url:
                setattr(social, field, url)
                break
    caps.has_social_links = any(getattr(social, f) for f in SOCIAL_PATTERNS)

    seo = SEO(
        title_tag_present=bool(re.search(r"<title[\s>]", combined_html, re.IGNORECASE)),
        meta_description_present=bool(re.search(r'<meta[^>]+name=["\']description["\']', combined_html, re.IGNORECASE)),
    )

    # Tracking flags are derived from the fingerprint detector's tool names.
    tools = detect_tools(combined_html)
    tool_names = {t["name"] for t in tools}
    tracking = Tracking(
        google_analytics="Google Analytics" in tool_names,
        google_tag_manager="Google Tag Manager" in tool_names,
        meta_pixel="Meta Pixel" in tool_names,
        tiktok_pixel="TikTok Pixel" in tool_names,
        linkedin_pixel="LinkedIn Insight" in tool_names,
        hotjar="Hotjar" in tool_names,
    )

    return caps, social, seo, tracking, tools, strongly_detected
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Deterministic technology detection.
|
|
3
|
+
|
|
4
|
+
A pragmatic subset of Wappalyzer-style fingerprints: each entry matches a
|
|
5
|
+
substring/regex in the raw HTML. NO LLM involved — this is fact, not inference,
|
|
6
|
+
which is why it's the most defensible signal in the product.
|
|
7
|
+
|
|
8
|
+
For production, swap this dict for the full Wappalyzer fingerprint database
|
|
9
|
+
(https://github.com/enthec/webappanalyzer) — same matching approach, ~3000 apps.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
# name -> (category, [patterns], confidence)
|
|
15
|
+
FINGERPRINTS: dict[str, tuple[str, list[str], float]] = {
|
|
16
|
+
# CMS / site builders
|
|
17
|
+
"WordPress": ("cms", [r"wp-content", r"wp-includes"], 0.92),
|
|
18
|
+
"Shopify": ("ecommerce", [r"cdn\.shopify\.com", r"Shopify\.theme"], 0.95),
|
|
19
|
+
"Wix": ("cms", [r"static\.wixstatic\.com", r"_wixCssImports"], 0.9),
|
|
20
|
+
"Webflow": ("cms", [r"assets\.website-files\.com", r"webflow\.js", r"wf-"], 0.88),
|
|
21
|
+
"Squarespace": ("cms", [r"squarespace", r"static1\.squarespace\.com"], 0.9),
|
|
22
|
+
"WooCommerce": ("ecommerce", [r"woocommerce", r"wc-ajax"], 0.85),
|
|
23
|
+
# Analytics / tracking
|
|
24
|
+
"Google Analytics": ("analytics", [r"google-analytics\.com", r"gtag\(", r"ga\('create'"], 0.9),
|
|
25
|
+
"Google Tag Manager": ("analytics", [r"googletagmanager\.com"], 0.92),
|
|
26
|
+
"Meta Pixel": ("marketing_tracking", [r"fbq\(", r"connect\.facebook\.net/[a-z_]+/fbevents\.js"], 0.93),
|
|
27
|
+
"TikTok Pixel": ("marketing_tracking", [r"analytics\.tiktok\.com"], 0.9),
|
|
28
|
+
"LinkedIn Insight": ("marketing_tracking", [r"snap\.licdn\.com"], 0.9),
|
|
29
|
+
"Hotjar": ("analytics", [r"static\.hotjar\.com", r"hotjar"], 0.85),
|
|
30
|
+
# CRM / marketing / chat
|
|
31
|
+
"HubSpot": ("crm", [r"js\.hs-scripts\.com", r"hs-scripts"], 0.9),
|
|
32
|
+
"Intercom": ("live_chat", [r"widget\.intercom\.io", r"intercomSettings"], 0.9),
|
|
33
|
+
"Drift": ("live_chat", [r"js\.driftt\.com", r"drift\.com"], 0.88),
|
|
34
|
+
"Tidio": ("live_chat", [r"code\.tidio\.co"], 0.9),
|
|
35
|
+
"Tawk.to": ("live_chat", [r"embed\.tawk\.to"], 0.9),
|
|
36
|
+
"Crisp": ("live_chat", [r"client\.crisp\.chat"], 0.9),
|
|
37
|
+
"Mailchimp": ("email_marketing", [r"chimpstatic\.com", r"list-manage\.com"], 0.85),
|
|
38
|
+
"Klaviyo": ("email_marketing", [r"klaviyo"], 0.85),
|
|
39
|
+
# Booking / forms
|
|
40
|
+
"Calendly": ("booking", [r"calendly\.com"], 0.92),
|
|
41
|
+
"Fresha": ("booking", [r"fresha\.com"], 0.9),
|
|
42
|
+
"Acuity Scheduling": ("booking", [r"acuityscheduling\.com"], 0.9),
|
|
43
|
+
"Booksy": ("booking", [r"booksy\.com"], 0.9),
|
|
44
|
+
"SimplyBook": ("booking", [r"simplybook\.(me|it)"], 0.88),
|
|
45
|
+
"Typeform": ("forms", [r"typeform\.com"], 0.88),
|
|
46
|
+
"Jotform": ("forms", [r"jotform\.com"], 0.88),
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def detect_tools(html: str) -> list[dict]:
    """Scan ``html`` against every fingerprint and report the tools that match.

    Each reported tool is a dict with the keys ``name``, ``category``,
    ``confidence`` and ``evidence``; ``evidence`` names the first pattern of
    that tool which matched. A tool is reported at most once.
    """
    detections: list[dict] = []
    for tool, spec in FINGERPRINTS.items():
        kind, regexes, score = spec
        # First pattern (in declaration order) that hits, if any.
        hit = next(
            (rx for rx in regexes if re.search(rx, html, re.IGNORECASE)),
            None,
        )
        if hit is None:
            continue
        detections.append({
            "name": tool,
            "category": kind,
            "confidence": score,
            "evidence": f"matched /{hit}/",
        })
    return detections
|