seoextract 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- seoextract-0.1.0/PKG-INFO +294 -0
- seoextract-0.1.0/README.md +281 -0
- seoextract-0.1.0/pyproject.toml +24 -0
- seoextract-0.1.0/seoextract/__init__.py +3 -0
- seoextract-0.1.0/seoextract/crawler.py +148 -0
- seoextract-0.1.0/seoextract/init.py +84 -0
- seoextract-0.1.0/seoextract/models.py +116 -0
- seoextract-0.1.0/seoextract/parser.py +140 -0
- seoextract-0.1.0/seoextract/rules.py +268 -0
- seoextract-0.1.0/seoextract/safe_browsing.py +61 -0
- seoextract-0.1.0/seoextract/scorer.py +48 -0
- seoextract-0.1.0/seoextract.egg-info/PKG-INFO +294 -0
- seoextract-0.1.0/seoextract.egg-info/SOURCES.txt +15 -0
- seoextract-0.1.0/seoextract.egg-info/dependency_links.txt +1 -0
- seoextract-0.1.0/seoextract.egg-info/requires.txt +5 -0
- seoextract-0.1.0/seoextract.egg-info/top_level.txt +1 -0
- seoextract-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: seoextract
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight Python SEO audit engine that returns Pydantic structured output.
|
|
5
|
+
Author: Britto K
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: requests>=2.31.0
|
|
9
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
10
|
+
Requires-Dist: lxml>=5.0.0
|
|
11
|
+
Requires-Dist: pydantic>=2.0.0
|
|
12
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
13
|
+
|
|
14
|
+
# SEOExtractHF
|
|
15
|
+
|
|
16
|
+
<div align="center">
|
|
17
|
+
|
|
18
|
+
**A lightweight Python SEO audit engine with built-in Google Safe Browsing support.**
|
|
19
|
+
|
|
20
|
+
Returns validated **Pydantic structured output** that can be directly consumed by AI agents, dashboards, APIs, report generators, and automation pipelines.
|
|
21
|
+
|
|
22
|
+
</div>
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Features
|
|
27
|
+
|
|
28
|
+
- Website crawler
|
|
29
|
+
- Google Safe Browsing validation
|
|
30
|
+
- Technical SEO auditing
|
|
31
|
+
- Pydantic structured output
|
|
32
|
+
- Page-level SEO metrics
|
|
33
|
+
- Site-level SEO scoring
|
|
34
|
+
- Severity-based issue detection
|
|
35
|
+
- Duplicate title detection
|
|
36
|
+
- Duplicate meta description detection
|
|
37
|
+
- Canonical tag detection
|
|
38
|
+
- Viewport detection
|
|
39
|
+
- Schema.org detection
|
|
40
|
+
- Image alt-text validation
|
|
41
|
+
- Internal linking analysis
|
|
42
|
+
- Thin content detection
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
# Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install seoextract
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
or install from source
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install -e .
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
# Requirements
|
|
61
|
+
|
|
62
|
+
- Python 3.10+
|
|
63
|
+
- Google Safe Browsing API key (optional; without one, `safe_browsing.is_safe` is `None`)
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
# Google Safe Browsing Setup
|
|
68
|
+
|
|
69
|
+
SEOExtractHF checks every website against Google's Safe Browsing service **before crawling**.
|
|
70
|
+
|
|
71
|
+
If Google reports the website as unsafe, crawling is stopped automatically.
|
|
72
|
+
|
|
73
|
+
If no Google Safe Browsing API key is provided, `safe_browsing.is_safe` will be `None`.
|
|
74
|
+
***
|
|
75
|
+
|
|
76
|
+
## Option 1 (Recommended)
|
|
77
|
+
|
|
78
|
+
Create a `.env` file.
|
|
79
|
+
|
|
80
|
+
```text
|
|
81
|
+
.env
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Add your API key.
|
|
85
|
+
|
|
86
|
+
```env
|
|
87
|
+
GOOGLE_SAFE_BROWSING_API_KEY=YOUR_API_KEY
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
SEOExtractHF automatically loads the API key.
|
|
91
|
+
|
|
92
|
+
No additional code is required.
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Option 2
|
|
97
|
+
|
|
98
|
+
Pass the API key manually.
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from seoextract import SEOExtract
|
|
102
|
+
|
|
103
|
+
result = SEOExtract.audit(
|
|
104
|
+
"https://example.com",
|
|
105
|
+
safe_browsing_api_key="YOUR_API_KEY"
|
|
106
|
+
)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
When an API key is supplied manually, the `.env` file is **not used**.
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
# Quick Start
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
from seoextract import SEOExtract
|
|
117
|
+
|
|
118
|
+
result = SEOExtract.audit(
|
|
119
|
+
"https://example.com"
|
|
120
|
+
)
|
|
121
|
+
|
|
122
|
+
print(result.model_dump_json(indent=2))
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
# Returned Object
|
|
128
|
+
|
|
129
|
+
SEOExtractHF returns a validated Pydantic model.
|
|
130
|
+
|
|
131
|
+
```text
|
|
132
|
+
AuditResult
|
|
133
|
+
│
|
|
134
|
+
├── url
|
|
135
|
+
├── audit_date
|
|
136
|
+
├── pages_crawled
|
|
137
|
+
├── site_score
|
|
138
|
+
├── grade
|
|
139
|
+
├── total_issues
|
|
140
|
+
├── critical_count
|
|
141
|
+
├── warning_count
|
|
142
|
+
├── info_count
|
|
143
|
+
├── pages
|
|
144
|
+
├── issues
|
|
145
|
+
└── safe_browsing
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
# Example
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from seoextract import SEOExtract
|
|
154
|
+
|
|
155
|
+
result = SEOExtract.audit("https://example.com")
|
|
156
|
+
|
|
157
|
+
print(result.site_score)
|
|
158
|
+
print(result.grade)
|
|
159
|
+
print(result.safe_browsing)
|
|
160
|
+
|
|
161
|
+
for issue in result.issues:
|
|
162
|
+
    print(issue.issue_type)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
# Safe Browsing Result
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
{
|
|
171
|
+
"is_safe": True,
|
|
172
|
+
"threats": [],
|
|
173
|
+
"error": None
|
|
174
|
+
}
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
If Google reports a threat:
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
{
|
|
181
|
+
"is_safe": False,
|
|
182
|
+
"threats": [
|
|
183
|
+
"MALWARE"
|
|
184
|
+
],
|
|
185
|
+
"error": None
|
|
186
|
+
}
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
SEOExtractHF immediately stops crawling unsafe websites.
|
|
190
|
+
|
|
191
|
+
---
|
|
192
|
+
|
|
193
|
+
# Current SEO Checks
|
|
194
|
+
|
|
195
|
+
## Page Quality
|
|
196
|
+
|
|
197
|
+
- Title validation
|
|
198
|
+
- Meta description validation
|
|
199
|
+
- H1 validation
|
|
200
|
+
- Thin content detection
|
|
201
|
+
|
|
202
|
+
## Technical SEO
|
|
203
|
+
|
|
204
|
+
- Canonical tag
|
|
205
|
+
- Viewport meta tag
|
|
206
|
+
- Schema.org JSON-LD
|
|
207
|
+
- HTTP status validation
|
|
208
|
+
|
|
209
|
+
## Images
|
|
210
|
+
|
|
211
|
+
- Missing ALT attributes
|
|
212
|
+
|
|
213
|
+
## Links
|
|
214
|
+
|
|
215
|
+
- Internal link analysis
|
|
216
|
+
|
|
217
|
+
## Site-wide Checks
|
|
218
|
+
|
|
219
|
+
- Duplicate titles
|
|
220
|
+
- Duplicate meta descriptions
|
|
221
|
+
|
|
222
|
+
## Security
|
|
223
|
+
|
|
224
|
+
- Google Safe Browsing validation
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
# Example Output
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
AuditResult(
|
|
232
|
+
site_score=91.0,
|
|
233
|
+
grade="A",
|
|
234
|
+
total_issues=4,
|
|
235
|
+
pages_crawled=15
|
|
236
|
+
)
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
---
|
|
240
|
+
|
|
241
|
+
# Project Structure
|
|
242
|
+
|
|
243
|
+
```
|
|
244
|
+
seoextract/
|
|
245
|
+
│
|
|
246
|
+
├── crawler.py
|
|
247
|
+
├── parser.py
|
|
248
|
+
├── rules.py
|
|
249
|
+
├── scorer.py
|
|
250
|
+
├── safe_browsing.py
|
|
251
|
+
├── models.py
|
|
252
|
+
└── __init__.py
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
# Designed For
|
|
258
|
+
|
|
259
|
+
SEOExtractHF is designed to be used inside:
|
|
260
|
+
|
|
261
|
+
- AI SEO Agents
|
|
262
|
+
- LangGraph workflows
|
|
263
|
+
- FastAPI applications
|
|
264
|
+
- Streamlit dashboards
|
|
265
|
+
- Report generators
|
|
266
|
+
- CI/CD quality checks
|
|
267
|
+
- Data pipelines
|
|
268
|
+
- SEO automation tools
|
|
269
|
+
|
|
270
|
+
---
|
|
271
|
+
|
|
272
|
+
# Dependencies
|
|
273
|
+
|
|
274
|
+
- beautifulsoup4
|
|
275
|
+
- lxml
|
|
276
|
+
- requests
|
|
277
|
+
- pydantic
|
|
278
|
+
- python-dotenv
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
# License
|
|
283
|
+
|
|
284
|
+
MIT License
|
|
285
|
+
|
|
286
|
+
---
|
|
287
|
+
|
|
288
|
+
# Author
|
|
289
|
+
|
|
290
|
+
**Britto K**
|
|
291
|
+
|
|
292
|
+
GitHub:
|
|
293
|
+
|
|
294
|
+
https://github.com/Britto1221
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
# SEOExtractHF
|
|
2
|
+
|
|
3
|
+
<div align="center">
|
|
4
|
+
|
|
5
|
+
**A lightweight Python SEO audit engine with built-in Google Safe Browsing support.**
|
|
6
|
+
|
|
7
|
+
Returns validated **Pydantic structured output** that can be directly consumed by AI agents, dashboards, APIs, report generators, and automation pipelines.
|
|
8
|
+
|
|
9
|
+
</div>
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
## Features
|
|
14
|
+
|
|
15
|
+
- Website crawler
|
|
16
|
+
- Google Safe Browsing validation
|
|
17
|
+
- Technical SEO auditing
|
|
18
|
+
- Pydantic structured output
|
|
19
|
+
- Page-level SEO metrics
|
|
20
|
+
- Site-level SEO scoring
|
|
21
|
+
- Severity-based issue detection
|
|
22
|
+
- Duplicate title detection
|
|
23
|
+
- Duplicate meta description detection
|
|
24
|
+
- Canonical tag detection
|
|
25
|
+
- Viewport detection
|
|
26
|
+
- Schema.org detection
|
|
27
|
+
- Image alt-text validation
|
|
28
|
+
- Internal linking analysis
|
|
29
|
+
- Thin content detection
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
# Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install seoextract
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
or install from source
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install -e .
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
# Requirements
|
|
48
|
+
|
|
49
|
+
- Python 3.10+
|
|
50
|
+
- Google Safe Browsing API key (optional; without one, `safe_browsing.is_safe` is `None`)
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
# Google Safe Browsing Setup
|
|
55
|
+
|
|
56
|
+
SEOExtractHF checks every website against Google's Safe Browsing service **before crawling**.
|
|
57
|
+
|
|
58
|
+
If Google reports the website as unsafe, crawling is stopped automatically.
|
|
59
|
+
|
|
60
|
+
If no Google Safe Browsing API key is provided, `safe_browsing.is_safe` will be `None`.
|
|
61
|
+
***
|
|
62
|
+
|
|
63
|
+
## Option 1 (Recommended)
|
|
64
|
+
|
|
65
|
+
Create a `.env` file.
|
|
66
|
+
|
|
67
|
+
```text
|
|
68
|
+
.env
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Add your API key.
|
|
72
|
+
|
|
73
|
+
```env
|
|
74
|
+
GOOGLE_SAFE_BROWSING_API_KEY=YOUR_API_KEY
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
SEOExtractHF automatically loads the API key.
|
|
78
|
+
|
|
79
|
+
No additional code is required.
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Option 2
|
|
84
|
+
|
|
85
|
+
Pass the API key manually.
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from seoextract import SEOExtract
|
|
89
|
+
|
|
90
|
+
result = SEOExtract.audit(
|
|
91
|
+
"https://example.com",
|
|
92
|
+
safe_browsing_api_key="YOUR_API_KEY"
|
|
93
|
+
)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
When an API key is supplied manually, the `.env` file is **not used**.
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
|
|
100
|
+
# Quick Start
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
from seoextract import SEOExtract
|
|
104
|
+
|
|
105
|
+
result = SEOExtract.audit(
|
|
106
|
+
"https://example.com"
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
print(result.model_dump_json(indent=2))
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
# Returned Object
|
|
115
|
+
|
|
116
|
+
SEOExtractHF returns a validated Pydantic model.
|
|
117
|
+
|
|
118
|
+
```text
|
|
119
|
+
AuditResult
|
|
120
|
+
│
|
|
121
|
+
├── url
|
|
122
|
+
├── audit_date
|
|
123
|
+
├── pages_crawled
|
|
124
|
+
├── site_score
|
|
125
|
+
├── grade
|
|
126
|
+
├── total_issues
|
|
127
|
+
├── critical_count
|
|
128
|
+
├── warning_count
|
|
129
|
+
├── info_count
|
|
130
|
+
├── pages
|
|
131
|
+
├── issues
|
|
132
|
+
└── safe_browsing
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
# Example
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from seoextract import SEOExtract
|
|
141
|
+
|
|
142
|
+
result = SEOExtract.audit("https://example.com")
|
|
143
|
+
|
|
144
|
+
print(result.site_score)
|
|
145
|
+
print(result.grade)
|
|
146
|
+
print(result.safe_browsing)
|
|
147
|
+
|
|
148
|
+
for issue in result.issues:
|
|
149
|
+
    print(issue.issue_type)
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
---
|
|
153
|
+
|
|
154
|
+
# Safe Browsing Result
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
{
|
|
158
|
+
"is_safe": True,
|
|
159
|
+
"threats": [],
|
|
160
|
+
"error": None
|
|
161
|
+
}
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
If Google reports a threat:
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
{
|
|
168
|
+
"is_safe": False,
|
|
169
|
+
"threats": [
|
|
170
|
+
"MALWARE"
|
|
171
|
+
],
|
|
172
|
+
"error": None
|
|
173
|
+
}
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
SEOExtractHF immediately stops crawling unsafe websites.
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
# Current SEO Checks
|
|
181
|
+
|
|
182
|
+
## Page Quality
|
|
183
|
+
|
|
184
|
+
- Title validation
|
|
185
|
+
- Meta description validation
|
|
186
|
+
- H1 validation
|
|
187
|
+
- Thin content detection
|
|
188
|
+
|
|
189
|
+
## Technical SEO
|
|
190
|
+
|
|
191
|
+
- Canonical tag
|
|
192
|
+
- Viewport meta tag
|
|
193
|
+
- Schema.org JSON-LD
|
|
194
|
+
- HTTP status validation
|
|
195
|
+
|
|
196
|
+
## Images
|
|
197
|
+
|
|
198
|
+
- Missing ALT attributes
|
|
199
|
+
|
|
200
|
+
## Links
|
|
201
|
+
|
|
202
|
+
- Internal link analysis
|
|
203
|
+
|
|
204
|
+
## Site-wide Checks
|
|
205
|
+
|
|
206
|
+
- Duplicate titles
|
|
207
|
+
- Duplicate meta descriptions
|
|
208
|
+
|
|
209
|
+
## Security
|
|
210
|
+
|
|
211
|
+
- Google Safe Browsing validation
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
# Example Output
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
AuditResult(
|
|
219
|
+
site_score=91.0,
|
|
220
|
+
grade="A",
|
|
221
|
+
total_issues=4,
|
|
222
|
+
pages_crawled=15
|
|
223
|
+
)
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
# Project Structure
|
|
229
|
+
|
|
230
|
+
```
|
|
231
|
+
seoextract/
|
|
232
|
+
│
|
|
233
|
+
├── crawler.py
|
|
234
|
+
├── parser.py
|
|
235
|
+
├── rules.py
|
|
236
|
+
├── scorer.py
|
|
237
|
+
├── safe_browsing.py
|
|
238
|
+
├── models.py
|
|
239
|
+
└── __init__.py
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
# Designed For
|
|
245
|
+
|
|
246
|
+
SEOExtractHF is designed to be used inside:
|
|
247
|
+
|
|
248
|
+
- AI SEO Agents
|
|
249
|
+
- LangGraph workflows
|
|
250
|
+
- FastAPI applications
|
|
251
|
+
- Streamlit dashboards
|
|
252
|
+
- Report generators
|
|
253
|
+
- CI/CD quality checks
|
|
254
|
+
- Data pipelines
|
|
255
|
+
- SEO automation tools
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
# Dependencies
|
|
260
|
+
|
|
261
|
+
- beautifulsoup4
|
|
262
|
+
- lxml
|
|
263
|
+
- requests
|
|
264
|
+
- pydantic
|
|
265
|
+
- python-dotenv
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
# License
|
|
270
|
+
|
|
271
|
+
MIT License
|
|
272
|
+
|
|
273
|
+
---
|
|
274
|
+
|
|
275
|
+
# Author
|
|
276
|
+
|
|
277
|
+
**Britto K**
|
|
278
|
+
|
|
279
|
+
GitHub:
|
|
280
|
+
|
|
281
|
+
https://github.com/Britto1221
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "seoextract"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A lightweight Python SEO audit engine that returns Pydantic structured output."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "Britto K" }
|
|
9
|
+
]
|
|
10
|
+
dependencies = [
|
|
11
|
+
"requests>=2.31.0",
|
|
12
|
+
"beautifulsoup4>=4.12.0",
|
|
13
|
+
"lxml>=5.0.0",
|
|
14
|
+
"pydantic>=2.0.0",
|
|
15
|
+
"python-dotenv>=1.0.0"
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[build-system]
|
|
19
|
+
requires = ["setuptools>=68.0.0", "wheel"]
|
|
20
|
+
build-backend = "setuptools.build_meta"
|
|
21
|
+
|
|
22
|
+
[tool.setuptools.packages.find]
|
|
23
|
+
where = ["."]
|
|
24
|
+
include = ["seoextract*"]
|