seobuddy 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. seobuddy-0.2.0/.env.example +4 -0
  2. seobuddy-0.2.0/.github/workflows/publish-pypi.yml +29 -0
  3. seobuddy-0.2.0/.gitignore +22 -0
  4. seobuddy-0.2.0/LICENSE +21 -0
  5. seobuddy-0.2.0/PKG-INFO +449 -0
  6. seobuddy-0.2.0/README.md +417 -0
  7. seobuddy-0.2.0/docs/PUBLISHING.md +59 -0
  8. seobuddy-0.2.0/docs/REPORT_EXAMPLE.md +233 -0
  9. seobuddy-0.2.0/docs/TECHNICAL_MANUAL.md +411 -0
  10. seobuddy-0.2.0/docs/USER_MANUAL.md +348 -0
  11. seobuddy-0.2.0/docs/assets/terminal-preview.svg +80 -0
  12. seobuddy-0.2.0/plans/00-SEObuddy-initial-plan.md +209 -0
  13. seobuddy-0.2.0/pyproject.toml +57 -0
  14. seobuddy-0.2.0/scripts/publish-to-pypi.sh +61 -0
  15. seobuddy-0.2.0/src/seobuddy/__init__.py +10 -0
  16. seobuddy-0.2.0/src/seobuddy/__main__.py +3 -0
  17. seobuddy-0.2.0/src/seobuddy/auditor.py +85 -0
  18. seobuddy-0.2.0/src/seobuddy/checks/__init__.py +1 -0
  19. seobuddy-0.2.0/src/seobuddy/checks/base.py +167 -0
  20. seobuddy-0.2.0/src/seobuddy/checks/canonical.py +64 -0
  21. seobuddy-0.2.0/src/seobuddy/checks/content.py +61 -0
  22. seobuddy-0.2.0/src/seobuddy/checks/headings.py +57 -0
  23. seobuddy-0.2.0/src/seobuddy/checks/hreflang.py +155 -0
  24. seobuddy-0.2.0/src/seobuddy/checks/images.py +49 -0
  25. seobuddy-0.2.0/src/seobuddy/checks/jsonld.py +100 -0
  26. seobuddy-0.2.0/src/seobuddy/checks/links.py +119 -0
  27. seobuddy-0.2.0/src/seobuddy/checks/meta.py +59 -0
  28. seobuddy-0.2.0/src/seobuddy/checks/opengraph.py +37 -0
  29. seobuddy-0.2.0/src/seobuddy/checks/robots_check.py +51 -0
  30. seobuddy-0.2.0/src/seobuddy/checks/sitemap_check.py +100 -0
  31. seobuddy-0.2.0/src/seobuddy/checks/technical.py +53 -0
  32. seobuddy-0.2.0/src/seobuddy/checks/title.py +59 -0
  33. seobuddy-0.2.0/src/seobuddy/cli.py +225 -0
  34. seobuddy-0.2.0/src/seobuddy/crawler.py +180 -0
  35. seobuddy-0.2.0/src/seobuddy/display.py +257 -0
  36. seobuddy-0.2.0/src/seobuddy/html_utils.py +47 -0
  37. seobuddy-0.2.0/src/seobuddy/models.py +99 -0
  38. seobuddy-0.2.0/src/seobuddy/report.py +148 -0
  39. seobuddy-0.2.0/src/seobuddy/site_resources.py +251 -0
  40. seobuddy-0.2.0/src/seobuddy/url_utils.py +155 -0
  41. seobuddy-0.2.0/tests/conftest.py +24 -0
  42. seobuddy-0.2.0/tests/helpers.py +28 -0
  43. seobuddy-0.2.0/tests/test_auditor.py +42 -0
  44. seobuddy-0.2.0/tests/test_checks_base.py +31 -0
  45. seobuddy-0.2.0/tests/test_checks_headings.py +14 -0
  46. seobuddy-0.2.0/tests/test_checks_hreflang.py +57 -0
  47. seobuddy-0.2.0/tests/test_checks_jsonld.py +26 -0
  48. seobuddy-0.2.0/tests/test_checks_meta.py +18 -0
  49. seobuddy-0.2.0/tests/test_checks_opengraph.py +20 -0
  50. seobuddy-0.2.0/tests/test_checks_robots.py +32 -0
  51. seobuddy-0.2.0/tests/test_checks_sitemap.py +23 -0
  52. seobuddy-0.2.0/tests/test_checks_technical.py +17 -0
  53. seobuddy-0.2.0/tests/test_checks_title.py +27 -0
  54. seobuddy-0.2.0/tests/test_cli.py +62 -0
  55. seobuddy-0.2.0/tests/test_crawler.py +49 -0
  56. seobuddy-0.2.0/tests/test_crawler_limits.py +32 -0
  57. seobuddy-0.2.0/tests/test_crawler_robots.py +46 -0
  58. seobuddy-0.2.0/tests/test_models.py +9 -0
  59. seobuddy-0.2.0/tests/test_report.py +54 -0
  60. seobuddy-0.2.0/tests/test_robots_parser.py +27 -0
  61. seobuddy-0.2.0/tests/test_sitemap_parser.py +27 -0
  62. seobuddy-0.2.0/tests/test_url_utils.py +32 -0
@@ -0,0 +1,4 @@
1
+ # Copy to .env (gitignored) — never commit real values.
2
+ # Create token: https://pypi.org/manage/account/token/
3
+ TWINE_USERNAME=__token__
4
+ TWINE_PASSWORD=pypi-PASTE_YOUR_TOKEN_HERE
@@ -0,0 +1,29 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+ workflow_dispatch:
8
+
9
+ permissions:
10
+ id-token: write
11
+
12
+ jobs:
13
+ publish:
14
+ runs-on: ubuntu-latest
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.12"
21
+
22
+ - name: Install build tools
23
+ run: python -m pip install --upgrade build
24
+
25
+ - name: Build package
26
+ run: python -m build
27
+
28
+ - name: Publish to PyPI
29
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,22 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .pytest_cache/
8
+
9
+ # Local audit reports (may include crawled URLs, emails, etc.)
10
+ *-report.md
11
+
12
+ # Secrets and credentials — never commit
13
+ .env
14
+ .env.*
15
+ !.env.example
16
+ *.pem
17
+ *.key
18
+ *.p12
19
+ *.pfx
20
+ credentials.json
21
+ secrets.json
22
+ .secrets/
seobuddy-0.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 NikitaY.com
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,449 @@
1
+ Metadata-Version: 2.4
2
+ Name: seobuddy
3
+ Version: 0.2.0
4
+ Summary: Technical SEO audit CLI — crawl a site and get terminal scores plus a Markdown report (not affiliated with seobuddy.com)
5
+ Project-URL: Homepage, https://nikitay.com/
6
+ Project-URL: Documentation, https://github.com/nikitaycs50/SEObuddy
7
+ Project-URL: Repository, https://github.com/nikitaycs50/SEObuddy
8
+ Project-URL: Issues, https://github.com/nikitaycs50/SEObuddy/issues
9
+ Author: NikitaY.com
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Keywords: audit,cli,crawler,seo,technical-seo
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Environment :: Console
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Topic :: Internet :: WWW/HTTP :: Site Management
22
+ Requires-Python: >=3.11
23
+ Requires-Dist: beautifulsoup4>=4.12
24
+ Requires-Dist: httpx[http2]>=0.27
25
+ Requires-Dist: lxml>=5.0
26
+ Requires-Dist: rich>=13.7
27
+ Requires-Dist: typer>=0.12
28
+ Provides-Extra: dev
29
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
30
+ Requires-Dist: pytest>=8.0; extra == 'dev'
31
+ Description-Content-Type: text/markdown
32
+
33
+ # SEObuddy
34
+
35
+ <p align="center">
36
+ <strong>Technical SEO audits from your terminal — live scores, colour-coded results, Markdown reports.</strong>
37
+ </p>
38
+
39
+ <p align="center">
40
+ <a href="https://pypi.org/project/seobuddy/"><img src="https://img.shields.io/pypi/v/seobuddy.svg?style=for-the-badge&amp;labelColor=161b22&amp;color=3fb950" alt="PyPI version"></a>
41
+ <a href="https://pypi.org/project/seobuddy/"><img src="https://img.shields.io/pypi/pyversions/seobuddy.svg?style=for-the-badge&amp;labelColor=161b22&amp;color=58a6ff" alt="Python versions"></a>
42
+ <a href="LICENSE"><img src="https://img.shields.io/badge/License-MIT-58a6ff?style=for-the-badge&amp;labelColor=161b22" alt="License MIT"></a>
43
+ </p>
44
+
45
+ <p align="center">
46
+ <code>pip install seobuddy</code> &nbsp;→&nbsp; <code>seobuddy https://yoursite.com</code> &nbsp;→&nbsp; done.
47
+ </p>
48
+
49
+ <p align="center">
50
+ No clone. No build step. One package from <a href="https://pypi.org/project/seobuddy/">PyPI</a>.
51
+ </p>
52
+
53
+ ---
54
+
55
+ ## See it in action
56
+
57
+ <p align="center">
58
+ <img src="docs/assets/terminal-preview.svg" alt="SEObuddy terminal output: coloured progress, per-page score 64/100, category bars, and top issues" width="780">
59
+ </p>
60
+
61
+ <p align="center">
62
+ <em>Real Rich UI in your terminal — greens for passes, yellows for warnings, reds for failures.</em>
63
+ <br>
64
+ <a href="docs/REPORT_EXAMPLE.md">Full terminal + report examples</a>
65
+ </p>
66
+
67
+ **Install in seconds:**
68
+
69
+ ```bash
70
+ pip install seobuddy
71
+ seobuddy https://nikitay.com
72
+ ```
73
+
74
+ Prefer an isolated CLI? Use `pipx install seobuddy` instead of `pip`.
75
+
76
+ Crawl your site, watch scores update live, then open the timestamped `*-report.md` file. That is the whole workflow.
77
+
78
+ > **Note:** Open-source **technical SEO audit CLI** — not affiliated with the commercial product at [seobuddy.com](https://seobuddy.com/).
79
+
80
+ ## Documentation
81
+
82
+ Full manuals (same content as below, in more detail):
83
+
84
+ - [User Manual](docs/USER_MANUAL.md) — installation, usage, reading results, troubleshooting
85
+ - [Technical Manual](docs/TECHNICAL_MANUAL.md) — architecture, modules, scoring, tests, extension points
86
+ - [Report Example](docs/REPORT_EXAMPLE.md) — sample terminal summary and Markdown report (`google.com` audit)
87
+ - [Publishing](docs/PUBLISHING.md) — PyPI releases (manual upload and GitHub Actions)
88
+
89
+ ## Features
90
+
91
+ - **Async BFS crawler** — configurable depth, same-domain internal links only, deduplicated URLs
92
+ - **11 weighted per-page categories** — title, meta, Open Graph, JSON-LD, headings, content, links, images, canonical, hreflang, technical
93
+ - **Site-wide checks** — robots.txt validation and crawl enforcement, XML sitemap audit with coverage metrics
94
+ - **Per-page and site scores** — 0–100 with weighted category averages and letter grades
95
+ - **Rich terminal UI** — progress bars, color-coded scores, live per-page results, final summary panel
96
+ - **Markdown reports** — executive summary, score breakdown, page-by-page findings, prioritized recommendations
97
+
98
+ ## Requirements
99
+
100
+ - Python 3.11+
101
+
102
+ ## Quick start
103
+
104
+ | Step | Command |
105
+ |------|---------|
106
+ | **Install** | `pip install seobuddy` |
107
+ | **Audit** | `seobuddy https://nikitay.com` |
108
+ | **Report** | Open `yyyymmddhhmm-<hostname>-report.md` in the current directory |
109
+
110
+ Defaults: **depth 2**, **5** concurrent requests, **10s** timeout. Use `--depth 0` for a homepage-only check.
111
+
112
+ ```bash
113
+ seobuddy --help # all options
114
+ pip install -U seobuddy # upgrade
115
+ pipx install seobuddy # isolated global CLI (macOS/Linux)
116
+ ```
117
+
118
+ **Contributors** — clone and editable install:
119
+
120
+ ```bash
121
+ git clone https://github.com/nikitaycs50/SEObuddy.git && cd SEObuddy
122
+ python3 -m venv .venv && source .venv/bin/activate
123
+ pip install -e ".[dev]"
124
+ ```
125
+
126
+ ## Usage
127
+
128
+ ```bash
129
+ seobuddy <URL> [OPTIONS]
130
+ ```
131
+
132
+ ### Arguments
133
+
134
+ | Argument | Description |
135
+ |----------|-------------|
136
+ | `URL` | Full site URL to audit (`https://` recommended). Bare domains like `nikitay.com` are normalized to `https://`. |
137
+
138
+ ### Options
139
+
140
+ | Option | Default | Description |
141
+ |--------|---------|-------------|
142
+ | `--depth` | `2` | Link hops from the seed URL. `0` = seed page only. |
143
+ | `--max-pages` | `50` | Stop crawling after this many pages (avoids huge sites). |
144
+ | `--concurrency` | `5` | Maximum parallel HTTP requests during the crawl. |
145
+ | `--timeout` | `10` | Per-request timeout in seconds. |
146
+ | `--output-dir` | `.` | Folder where the Markdown report is saved. |
147
+ | `--user-agent` | Chrome 131 (desktop) | HTTP User-Agent on every request. Default mimics Chrome for compatibility; override to identify SEObuddy if required. |
148
+ | `--no-color` | off | Plain terminal output (no Rich colors). |
149
+
150
+ ### Examples
151
+
152
+ ```bash
153
+ # Default crawl (depth 2)
154
+ seobuddy https://example.com
155
+
156
+ # Homepage only (fast check)
157
+ seobuddy https://nikitay.com --depth 0
158
+
159
+ # Deeper crawl, more parallelism
160
+ seobuddy https://nikitay.com --depth 3 --concurrency 10
161
+
162
+ # Save reports to a folder
163
+ seobuddy https://nikitay.com --output-dir ./reports
164
+
165
+ # CI / logs (no colors)
166
+ seobuddy https://nikitay.com --no-color
167
+
168
+ # Custom User-Agent (default is Chrome-like)
169
+ seobuddy https://nikitay.com --user-agent "SEObuddy/0.2.0 (+https://nikitay.com)"
170
+
171
+ # Module entry point
172
+ python -m seobuddy https://nikitay.com
173
+ ```
174
+
175
+ ### Verified example: nikitay.com
176
+
177
+ | URL | Result |
178
+ |-----|--------|
179
+ | `https://nikitay.com` | Works — crawls the site, prints scores, writes a report |
180
+ | `https://niktiay.com` | **Fails** — hostname does not resolve (typo: **i** and **t** transposed in *nikitay*) |
181
+
182
+ After `seobuddy https://nikitay.com --depth 1` you should see: startup banner, progress bar, one line per page, **SITE AUDIT COMPLETE** panel, and a report file in the output directory.
183
+
184
+ ## Understanding the terminal output
185
+
186
+ Rich renders **colour-coded** bars and grades in real time (disable with `--no-color`).
187
+
188
+ ### Per-page line (live)
189
+
190
+ | Colour | Score | Example |
191
+ |--------|-------|---------|
192
+ | 🟢 Green | 80–100 | `██████████ 100/100` · category `✓` |
193
+ | 🟡 Yellow | 60–79 | `████████░░ 64/100` · category `~` |
194
+ | 🟠 Orange | 40–59 | `█████░░░░░ 47/100` |
195
+ | 🔴 Red | 0–39 | `██░░░░░░░░ 2/100` · category `✗` |
196
+
197
+ ```text
198
+ ████████░░ 64/100 / — title ~ meta ~ og ✓ h1 ✓
199
+ ```
200
+
201
+ | Symbol | Meaning |
202
+ |--------|---------|
203
+ | `█` / `░` | Score bar (colour matches score band) |
204
+ | `✓` `~` `✗` | Category pass / warn / fail |
205
+
206
+ ### Final summary
207
+
208
+ Overall score, **letter grade**, category table (average score + pages OK ≥ 80), and **TOP ISSUES** by impact.
209
+
210
+ | Grade | Score range |
211
+ |-------|-------------|
212
+ | A | 90+ |
213
+ | B | 80–89 |
214
+ | C+ | 70–79 |
215
+ | C | 60–69 |
216
+ | D | 50–59 |
217
+ | F | Below 50 |
218
+
219
+ ## The Markdown report
220
+
221
+ **Filename:** `yyyymmddhhmm-<hostname>-report.md` — example: `202606041408-nikitay.com-report.md`
222
+
223
+ | Section | Contents |
224
+ |---------|----------|
225
+ | **Executive Summary** | Site score, date, seed URL, page count, duration, top 5 issues |
226
+ | **Score Breakdown** | All categories with scores and pages OK |
227
+ | **Page-by-Page Analysis** | Collapsible `<details>` per URL (findings + suggestions) |
228
+ | **Recommendations** | Prioritized, deduplicated action items |
229
+
230
+ Open in VS Code, Cursor, GitHub, or any Markdown viewer that supports HTML `<details>`.
231
+
232
+ ## What SEObuddy checks
233
+
234
+ Page score = weighted average of categories. Site score = average of all page scores.
235
+
236
+ | Category | Weight | What it looks for |
237
+ |----------|--------|-------------------|
238
+ | **Title** | 15% | `<title>` present, ~50–60 characters, unique across crawled pages |
239
+ | **Meta description** | 10% | `meta name="description"`, ~150–160 characters, unique |
240
+ | **Open Graph** | 9% | `og:title`, `og:description`, `og:image`, `og:url` |
241
+ | **JSON-LD** | 9% | Valid `application/ld+json`, schema type, required fields |
242
+ | **Headings** | 9% | Exactly one H1, logical heading order (no skipped levels) |
243
+ | **Content** | 15% | Word count (300+ tiers), text vs HTML ratio (target ≥ 15%) |
244
+ | **Links** | 9% | Internal links reachable (HEAD/GET probes), descriptive anchor text |
245
+ | **Images** | 9% | Non-empty `alt` on images, lazy loading where applicable |
246
+ | **Canonical** | 5% | `link rel="canonical"` present and consistent |
247
+ | **Hreflang** | 5% | `link rel="alternate" hreflang` tags, reciprocity across crawled pages |
248
+ | **Technical** | 5% | Viewport meta, HTTPS, reasonable URL length and structure |
249
+
250
+ **Site-wide** (reported separately from per-page weighted scores):
251
+
252
+ | Check | What it looks for |
253
+ |-------|-------------------|
254
+ | **Robots.txt** | Fetchable file, valid rules, crawl respects `Disallow` for your User-Agent |
255
+ | **Sitemap** | XML sitemap reachable, valid structure, coverage vs crawled URLs |
256
+
257
+ **Content tiers:** &lt;300 words → 0; 300–499 → 60; 500–799 → 80; 800+ → 100 (combined with text/HTML ratio).
258
+
259
+ **Links check:** up to 20 unique internal URLs probed per page; penalizes generic anchors (“click here”, “read more”, etc.).
260
+
261
+ ## Auditing large sites (e.g. Google)
262
+
263
+ Mega-sites expose thousands of locale and redirect URLs. SEObuddy caps crawls at **`--max-pages`** (default 50), dedupes by **path** (not query), skips utility routes (`/ml`, `/intl/…`, policies), drops non-crawlable **redirect targets**, and truncates long paths in the terminal. For Google, use `--depth 0` or `--max-pages 10`.
264
+
265
+ ## Crawl behavior
266
+
267
+ - **BFS** with path-based dedup; seed at depth `0`, enqueue links only if `depth < config.depth`
268
+ - **Same domain only** — external links are ignored (`www.` stripped when comparing netloc)
269
+ - **Skips:** `mailto:`, `tel:`, `javascript:`, `data:`, fragments-only, non-HTML, duplicate paths, `/cdn-cgi/`, `/ml`, long query strings
270
+ - **Redirects** — followed up to 5 hops; non-crawlable final URLs are not audited
271
+ - **Per page:** URL, status, headers, HTML body, fetch time (ms); pages yielded as they complete for live UI
272
+
273
+ ## Architecture
274
+
275
+ SEObuddy is a **Python 3.11+** package (Hatchling, `src/` layout) with a **Typer** CLI:
276
+
277
+ 1. Validates and normalizes the seed URL.
278
+ 2. **BFS-crawls** same-domain HTML with **httpx** (async).
279
+ 3. **Audits** each page with **BeautifulSoup** + **lxml** and eleven pluggable checks, plus site-wide robots.txt and sitemap checks.
280
+ 4. Renders **Rich** terminal UI and writes a **Markdown** report.
281
+
282
+ ```mermaid
283
+ flowchart LR
284
+ CLI[cli.py] --> Crawler[crawler.py]
285
+ Crawler -->|PageData| Auditor[auditor.py]
286
+ Auditor -->|PageAudit| Display[display.py]
287
+ Auditor -->|SiteAudit| Report[report.py]
288
+ Checks[checks/*] --> Auditor
289
+ ```
290
+
291
+ ### Entry points
292
+
293
+ | Mechanism | Target |
294
+ |-----------|--------|
295
+ | Console script `seobuddy` | `seobuddy.cli:app` |
296
+ | `python -m seobuddy` | `__main__.py` → `app()` |
297
+
298
+ `asyncio.run()` wraps `_run_audit` from the Typer command. Connection failures on the seed fetch exit with code **1**.
299
+
300
+ ### Core data models (`models.py`)
301
+
302
+ | Type | Purpose |
303
+ |------|---------|
304
+ | `AuditConfig` | CLI settings (depth, concurrency, timeout, paths, UA) |
305
+ | `PageData` | Raw crawl result: URLs, status, headers, HTML, timing |
306
+ | `CheckResult` | Category outcome: score, weight, status, findings, suggestions |
307
+ | `PageAudit` | One page’s checks + weighted score |
308
+ | `SiteContext` | Cross-page state: titles, metas, canonicals, fetched URLs |
309
+ | `SiteAudit` | Full run: pages, timing, `site_score` |
310
+
311
+ `CheckStatus`: `pass` | `warn` | `fail` (from score thresholds in `base.status_from_score`).
312
+
313
+ ### URL layer (`url_utils.py`)
314
+
315
+ - `normalize_url` — http/https only, lowercase host, strip fragment; `None` for non-crawlable paths
316
+ - `is_crawlable_path` — skips `/cdn-cgi/` (e.g. Cloudflare email protection)
317
+ - `same_domain` — compares netloc with `www.` stripped
318
+
319
+ ### Crawler (`crawler.py`)
320
+
321
+ `AsyncCrawler` batches queue waves up to `concurrency` with `asyncio.Semaphore`. `GET` with redirect following; `ConnectError` propagates on seed failure; other HTTP errors may yield `status_code=0`. Link extraction from `<a href>` when depth allows and response is HTML.
322
+
323
+ ### Auditor (`auditor.py`)
324
+
325
+ 1. Parse HTML with BeautifulSoup + lxml.
326
+ 2. Non-HTML or error status: stub checks except **technical** (URL/headers still audited).
327
+ 3. Sync checks: title, meta, opengraph, jsonld, headings, content, images, canonical, hreflang, technical.
328
+ 4. Async `links.check_async` with shared `httpx` client.
329
+ 5. `weighted_page_score(results)` and `path_display(final_url)`.
330
+
331
+ Checks update `SiteContext` for cross-page deduplication (titles, meta descriptions).
332
+
333
+ ### Scoring helpers (`checks/base.py`)
334
+
335
+ - `clamp_score`, `weighted_page_score`, `letter_grade`
336
+ - `aggregate_category_scores` — per-category mean + pages_ok (≥ 80)
337
+ - `top_issues` — impact = weight × (100 − category_avg)
338
+ - `collect_recommendations` — deduplicated suggestions by impact
339
+
340
+ ### Project structure
341
+
342
+ ```text
343
+ SEObuddy/
344
+ ├── LICENSE
345
+ ├── pyproject.toml
346
+ ├── README.md
347
+ ├── docs/
348
+ │ ├── USER_MANUAL.md
349
+ │ ├── TECHNICAL_MANUAL.md
350
+ │ ├── REPORT_EXAMPLE.md
351
+ │ └── PUBLISHING.md
352
+ ├── scripts/
353
+ │ └── publish-to-pypi.sh
354
+ ├── src/seobuddy/
355
+ │ ├── __init__.py # __version__
356
+ │ ├── __main__.py
357
+ │ ├── cli.py
358
+ │ ├── crawler.py
359
+ │ ├── auditor.py
360
+ │ ├── models.py
361
+ │ ├── url_utils.py
362
+ │ ├── site_resources.py
363
+ │ ├── display.py
364
+ │ ├── report.py
365
+ │ └── checks/
366
+ │ ├── base.py
367
+ │ ├── title.py … technical.py, hreflang.py
368
+ │ └── robots_check.py, sitemap_check.py
369
+ └── tests/
370
+ ├── conftest.py
371
+ ├── helpers.py
372
+ └── test_*.py
373
+ ```
374
+
375
+ ## Development
376
+
377
+ ```bash
378
+ pip install -e ".[dev]"
379
+ pytest -q
380
+ ```
381
+
382
+ | Test file | Coverage |
383
+ |-----------|----------|
384
+ | `test_checks_*.py` | Individual check logic |
385
+ | `test_checks_base.py` | Grading and weighted scores |
386
+ | `test_auditor.py` | Full page audit integration |
387
+ | `test_crawler.py` | URL normalization, link extraction, mock-transport BFS |
388
+ | `test_report.py` | Filename format and report sections |
389
+
390
+ ### Manual verification
391
+
392
+ ```bash
393
+ pip install -e ".[dev]"
394
+ pytest -q
395
+ seobuddy https://nikitay.com --depth 2
396
+ seobuddy https://nikitay.com --depth 0
397
+ seobuddy https://nonexistent.invalid # expect exit 1
398
+ ```
399
+
400
+ Expect: terminal progress, per-page lines, report `*-nikitay.com-report.md` with four sections, graceful failure on invalid hosts (exit **1**).
401
+
402
+ ### Extension points
403
+
404
+ **New check:** add `checks/newcheck.py`, register weight in `CATEGORY_WEIGHTS` / `CATEGORY_ORDER` / `CATEGORY_LABELS`, wire in `auditor.py`, add tests.
405
+
406
+ **Crawl rules:** extend `_SKIP_PATH_PREFIXES` in `url_utils.py` or adjust BFS depth semantics in `AsyncCrawler.crawl`.
407
+
408
+ **CI / headless:** `--no-color` and a dedicated `--output-dir` for artifacts.
409
+
410
+ ## Known limitations (v0.2)
411
+
412
+ | Area | Limitation |
413
+ |------|------------|
414
+ | JavaScript rendering | Static HTML only (no browser execution) |
415
+ | Rate limiting | User-controlled via `--concurrency` only |
416
+ | Hreflang | HTML `<link>` tags only; no HTTP header alternates |
417
+ | Sitemap | Audited for coverage; not used to discover crawl URLs |
418
+ | Authentication | No logged-in page support |
419
+
420
+ ## Errors and troubleshooting
421
+
422
+ | Situation | Behavior | Exit code |
423
+ |-----------|----------|-----------|
424
+ | Successful audit | Report written, summary shown | `0` |
425
+ | Bad URL format | Clear error message | `1` |
426
+ | Host unreachable (DNS, connection) | `Could not connect to host` | `1` |
427
+ | Request timeout | Error message | `1` |
428
+
429
+ | Problem | What to try |
430
+ |---------|-------------|
431
+ | `command not found: seobuddy` | PyPI: `pip install seobuddy` or `pipx install seobuddy`. Dev: activate venv and `pip install -e .` |
432
+ | `externally-managed-environment` (Homebrew Python) | Use `pipx install seobuddy`, or a venv — avoid `pip install` into system Python |
433
+ | `Could not connect to host` | Check the URL spelling, confirm the site loads in a browser or with `curl`, and increase `--timeout` |
434
+ | Low content score on SPAs | Only the HTML returned over HTTP is analyzed, not the client-rendered DOM |
435
+ | Unexpected extra pages | Some sites inject CDN paths; `/cdn-cgi/` is skipped |
436
+
437
+ ## Privacy and etiquette
438
+
439
+ - SEObuddy only requests URLs you point it at, within the same domain and the depth you set.
440
+ - Default User-Agent mimics **Chrome desktop** so fewer sites block the crawler; set `--user-agent` explicitly if your policy requires an identifiable bot string.
441
+ - Crawl **respects robots.txt** `Disallow` rules for your User-Agent; use reasonable depth and concurrency on live sites.
442
+
443
+ ## License
444
+
445
+ MIT License — see [LICENSE](LICENSE).
446
+
447
+ Copyright © 2026 [NikitaY.com](https://nikitay.com/). Created by [NikitaY.com](https://nikitay.com/).
448
+
449
+ This project is a **technical SEO audit CLI** and is not affiliated with [seobuddy.com](https://seobuddy.com/).