seoextract 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,294 @@
1
+ Metadata-Version: 2.4
2
+ Name: seoextract
3
+ Version: 0.1.0
4
+ Summary: A lightweight Python SEO audit engine that returns Pydantic structured output.
5
+ Author: Britto K
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ Requires-Dist: requests>=2.31.0
9
+ Requires-Dist: beautifulsoup4>=4.12.0
10
+ Requires-Dist: lxml>=5.0.0
11
+ Requires-Dist: pydantic>=2.0.0
12
+ Requires-Dist: python-dotenv>=1.0.0
13
+
14
+ # SEOExtractHF
15
+
16
+ <div align="center">
17
+
18
+ **A lightweight Python SEO audit engine with built-in Google Safe Browsing support.**
19
+
20
+ Returns validated **Pydantic structured output** that can be directly consumed by AI agents, dashboards, APIs, report generators, and automation pipelines.
21
+
22
+ </div>
23
+
24
+ ---
25
+
26
+ ## Features
27
+
28
+ - Website crawler
29
+ - Google Safe Browsing validation
30
+ - Technical SEO auditing
31
+ - Pydantic structured output
32
+ - Page-level SEO metrics
33
+ - Site-level SEO scoring
34
+ - Severity-based issue detection
35
+ - Duplicate title detection
36
+ - Duplicate meta description detection
37
+ - Canonical tag detection
38
+ - Viewport detection
39
+ - Schema.org detection
40
+ - Image alt-text validation
41
+ - Internal linking analysis
42
+ - Thin content detection
43
+
44
+ ---
45
+
46
+ # Installation
47
+
48
+ ```bash
49
+ pip install seoextract
50
+ ```
51
+
52
+ or install from source
53
+
54
+ ```bash
55
+ pip install -e .
56
+ ```
57
+
58
+ ---
59
+
60
+ # Requirements
61
+
62
+ - Python 3.10+
63
+ - Google Safe Browsing API Key (optional — without one, `safe_browsing.is_safe` is `None`)
64
+
65
+ ---
66
+
67
+ # Google Safe Browsing Setup
68
+
69
+ SEOExtractHF checks every website against Google's Safe Browsing service **before crawling**.
70
+
71
+ If Google reports the website as unsafe, crawling is stopped automatically.
72
+
73
+ If no Google Safe Browsing API key is provided, `safe_browsing.is_safe` will be `None`.
74
+ ***
75
+
76
+ ## Option 1 (Recommended)
77
+
78
+ Create a `.env` file.
79
+
80
+ ```text
81
+ .env
82
+ ```
83
+
84
+ Add your API key.
85
+
86
+ ```env
87
+ GOOGLE_SAFE_BROWSING_API_KEY=YOUR_API_KEY
88
+ ```
89
+
90
+ SEOExtractHF automatically loads the API key.
91
+
92
+ No additional code is required.
93
+
94
+ ---
95
+
96
+ ## Option 2
97
+
98
+ Pass the API key manually.
99
+
100
+ ```python
101
+ from seoextracthf import SEOExtract
102
+
103
+ result = SEOExtract.audit(
104
+ "https://example.com",
105
+ safe_browsing_api_key="YOUR_API_KEY"
106
+ )
107
+ ```
108
+
109
+ When an API key is supplied manually, the `.env` file is **not used**.
110
+
111
+ ---
112
+
113
+ # Quick Start
114
+
115
+ ```python
116
+ from seoextracthf import SEOExtract
117
+
118
+ result = SEOExtract.audit(
119
+ "https://example.com"
120
+ )
121
+
122
+ print(result.model_dump_json(indent=2))
123
+ ```
124
+
125
+ ---
126
+
127
+ # Returned Object
128
+
129
+ SEOExtractHF returns a validated Pydantic model.
130
+
131
+ ```text
132
+ AuditResult
133
+
134
+ ├── url
135
+ ├── audit_date
136
+ ├── pages_crawled
137
+ ├── site_score
138
+ ├── grade
139
+ ├── total_issues
140
+ ├── critical_count
141
+ ├── warning_count
142
+ ├── info_count
143
+ ├── pages
144
+ ├── issues
145
+ └── safe_browsing
146
+ ```
147
+
148
+ ---
149
+
150
+ # Example
151
+
152
+ ```python
153
+ from seoextracthf import SEOExtract
154
+
155
+ result = SEOExtract.audit("https://example.com")
156
+
157
+ print(result.site_score)
158
+ print(result.grade)
159
+ print(result.safe_browsing)
160
+
161
+ for issue in result.issues:
162
+ print(issue.issue_type)
163
+ ```
164
+
165
+ ---
166
+
167
+ # Safe Browsing Result
168
+
169
+ ```python
170
+ {
171
+ "is_safe": True,
172
+ "threats": [],
173
+ "error": None
174
+ }
175
+ ```
176
+
177
+ If Google reports a threat:
178
+
179
+ ```python
180
+ {
181
+ "is_safe": False,
182
+ "threats": [
183
+ "MALWARE"
184
+ ],
185
+ "error": None
186
+ }
187
+ ```
188
+
189
+ SEOExtractHF immediately stops crawling unsafe websites.
190
+
191
+ ---
192
+
193
+ # Current SEO Checks
194
+
195
+ ## Page Quality
196
+
197
+ - Title validation
198
+ - Meta description validation
199
+ - H1 validation
200
+ - Thin content detection
201
+
202
+ ## Technical SEO
203
+
204
+ - Canonical tag
205
+ - Viewport meta tag
206
+ - Schema.org JSON-LD
207
+ - HTTP status validation
208
+
209
+ ## Images
210
+
211
+ - Missing ALT attributes
212
+
213
+ ## Links
214
+
215
+ - Internal link analysis
216
+
217
+ ## Site-wide Checks
218
+
219
+ - Duplicate titles
220
+ - Duplicate meta descriptions
221
+
222
+ ## Security
223
+
224
+ - Google Safe Browsing validation
225
+
226
+ ---
227
+
228
+ # Example Output
229
+
230
+ ```python
231
+ AuditResult(
232
+ site_score=91.0,
233
+ grade="A",
234
+ total_issues=4,
235
+ pages_crawled=15
236
+ )
237
+ ```
238
+
239
+ ---
240
+
241
+ # Project Structure
242
+
243
+ ```
244
+ seoextracthf/
245
+
246
+ ├── crawler.py
247
+ ├── parser.py
248
+ ├── rules.py
249
+ ├── scorer.py
250
+ ├── safe_browsing.py
251
+ ├── models.py
252
+ └── __init__.py
253
+ ```
254
+
255
+ ---
256
+
257
+ # Designed For
258
+
259
+ SEOExtractHF is designed to be used inside:
260
+
261
+ - AI SEO Agents
262
+ - LangGraph workflows
263
+ - FastAPI applications
264
+ - Streamlit dashboards
265
+ - Report generators
266
+ - CI/CD quality checks
267
+ - Data pipelines
268
+ - SEO automation tools
269
+
270
+ ---
271
+
272
+ # Dependencies
273
+
274
+ - beautifulsoup4
275
+ - lxml
276
+ - requests
277
+ - pydantic
278
+ - python-dotenv
279
+
280
+ ---
281
+
282
+ # License
283
+
284
+ MIT License
285
+
286
+ ---
287
+
288
+ # Author
289
+
290
+ **Britto K**
291
+
292
+ GitHub:
293
+
294
+ https://github.com/Britto1221
@@ -0,0 +1,281 @@
1
+ # SEOExtractHF
2
+
3
+ <div align="center">
4
+
5
+ **A lightweight Python SEO audit engine with built-in Google Safe Browsing support.**
6
+
7
+ Returns validated **Pydantic structured output** that can be directly consumed by AI agents, dashboards, APIs, report generators, and automation pipelines.
8
+
9
+ </div>
10
+
11
+ ---
12
+
13
+ ## Features
14
+
15
+ - Website crawler
16
+ - Google Safe Browsing validation
17
+ - Technical SEO auditing
18
+ - Pydantic structured output
19
+ - Page-level SEO metrics
20
+ - Site-level SEO scoring
21
+ - Severity-based issue detection
22
+ - Duplicate title detection
23
+ - Duplicate meta description detection
24
+ - Canonical tag detection
25
+ - Viewport detection
26
+ - Schema.org detection
27
+ - Image alt-text validation
28
+ - Internal linking analysis
29
+ - Thin content detection
30
+
31
+ ---
32
+
33
+ # Installation
34
+
35
+ ```bash
36
+ pip install seoextract
37
+ ```
38
+
39
+ or install from source
40
+
41
+ ```bash
42
+ pip install -e .
43
+ ```
44
+
45
+ ---
46
+
47
+ # Requirements
48
+
49
+ - Python 3.10+
50
+ - Google Safe Browsing API Key (optional — without one, `safe_browsing.is_safe` is `None`)
51
+
52
+ ---
53
+
54
+ # Google Safe Browsing Setup
55
+
56
+ SEOExtractHF checks every website against Google's Safe Browsing service **before crawling**.
57
+
58
+ If Google reports the website as unsafe, crawling is stopped automatically.
59
+
60
+ If no Google Safe Browsing API key is provided, `safe_browsing.is_safe` will be `None`.
61
+ ***
62
+
63
+ ## Option 1 (Recommended)
64
+
65
+ Create a `.env` file.
66
+
67
+ ```text
68
+ .env
69
+ ```
70
+
71
+ Add your API key.
72
+
73
+ ```env
74
+ GOOGLE_SAFE_BROWSING_API_KEY=YOUR_API_KEY
75
+ ```
76
+
77
+ SEOExtractHF automatically loads the API key.
78
+
79
+ No additional code is required.
80
+
81
+ ---
82
+
83
+ ## Option 2
84
+
85
+ Pass the API key manually.
86
+
87
+ ```python
88
+ from seoextracthf import SEOExtract
89
+
90
+ result = SEOExtract.audit(
91
+ "https://example.com",
92
+ safe_browsing_api_key="YOUR_API_KEY"
93
+ )
94
+ ```
95
+
96
+ When an API key is supplied manually, the `.env` file is **not used**.
97
+
98
+ ---
99
+
100
+ # Quick Start
101
+
102
+ ```python
103
+ from seoextracthf import SEOExtract
104
+
105
+ result = SEOExtract.audit(
106
+ "https://example.com"
107
+ )
108
+
109
+ print(result.model_dump_json(indent=2))
110
+ ```
111
+
112
+ ---
113
+
114
+ # Returned Object
115
+
116
+ SEOExtractHF returns a validated Pydantic model.
117
+
118
+ ```text
119
+ AuditResult
120
+
121
+ ├── url
122
+ ├── audit_date
123
+ ├── pages_crawled
124
+ ├── site_score
125
+ ├── grade
126
+ ├── total_issues
127
+ ├── critical_count
128
+ ├── warning_count
129
+ ├── info_count
130
+ ├── pages
131
+ ├── issues
132
+ └── safe_browsing
133
+ ```
134
+
135
+ ---
136
+
137
+ # Example
138
+
139
+ ```python
140
+ from seoextracthf import SEOExtract
141
+
142
+ result = SEOExtract.audit("https://example.com")
143
+
144
+ print(result.site_score)
145
+ print(result.grade)
146
+ print(result.safe_browsing)
147
+
148
+ for issue in result.issues:
149
+ print(issue.issue_type)
150
+ ```
151
+
152
+ ---
153
+
154
+ # Safe Browsing Result
155
+
156
+ ```python
157
+ {
158
+ "is_safe": True,
159
+ "threats": [],
160
+ "error": None
161
+ }
162
+ ```
163
+
164
+ If Google reports a threat:
165
+
166
+ ```python
167
+ {
168
+ "is_safe": False,
169
+ "threats": [
170
+ "MALWARE"
171
+ ],
172
+ "error": None
173
+ }
174
+ ```
175
+
176
+ SEOExtractHF immediately stops crawling unsafe websites.
177
+
178
+ ---
179
+
180
+ # Current SEO Checks
181
+
182
+ ## Page Quality
183
+
184
+ - Title validation
185
+ - Meta description validation
186
+ - H1 validation
187
+ - Thin content detection
188
+
189
+ ## Technical SEO
190
+
191
+ - Canonical tag
192
+ - Viewport meta tag
193
+ - Schema.org JSON-LD
194
+ - HTTP status validation
195
+
196
+ ## Images
197
+
198
+ - Missing ALT attributes
199
+
200
+ ## Links
201
+
202
+ - Internal link analysis
203
+
204
+ ## Site-wide Checks
205
+
206
+ - Duplicate titles
207
+ - Duplicate meta descriptions
208
+
209
+ ## Security
210
+
211
+ - Google Safe Browsing validation
212
+
213
+ ---
214
+
215
+ # Example Output
216
+
217
+ ```python
218
+ AuditResult(
219
+ site_score=91.0,
220
+ grade="A",
221
+ total_issues=4,
222
+ pages_crawled=15
223
+ )
224
+ ```
225
+
226
+ ---
227
+
228
+ # Project Structure
229
+
230
+ ```
231
+ seoextracthf/
232
+
233
+ ├── crawler.py
234
+ ├── parser.py
235
+ ├── rules.py
236
+ ├── scorer.py
237
+ ├── safe_browsing.py
238
+ ├── models.py
239
+ └── __init__.py
240
+ ```
241
+
242
+ ---
243
+
244
+ # Designed For
245
+
246
+ SEOExtractHF is designed to be used inside:
247
+
248
+ - AI SEO Agents
249
+ - LangGraph workflows
250
+ - FastAPI applications
251
+ - Streamlit dashboards
252
+ - Report generators
253
+ - CI/CD quality checks
254
+ - Data pipelines
255
+ - SEO automation tools
256
+
257
+ ---
258
+
259
+ # Dependencies
260
+
261
+ - beautifulsoup4
262
+ - lxml
263
+ - requests
264
+ - pydantic
265
+ - python-dotenv
266
+
267
+ ---
268
+
269
+ # License
270
+
271
+ MIT License
272
+
273
+ ---
274
+
275
+ # Author
276
+
277
+ **Britto K**
278
+
279
+ GitHub:
280
+
281
+ https://github.com/Britto1221
@@ -0,0 +1,24 @@
1
+ [project]
2
+ name = "seoextract"
3
+ version = "0.1.0"
4
+ description = "A lightweight Python SEO audit engine that returns Pydantic structured output."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ authors = [
8
+ { name = "Britto K" }
9
+ ]
10
+ dependencies = [
11
+ "requests>=2.31.0",
12
+ "beautifulsoup4>=4.12.0",
13
+ "lxml>=5.0.0",
14
+ "pydantic>=2.0.0",
15
+ "python-dotenv>=1.0.0"
16
+ ]
17
+
18
+ [build-system]
19
+ requires = ["setuptools>=68.0.0", "wheel"]
20
+ build-backend = "setuptools.build_meta"
21
+
22
+ [tool.setuptools.packages.find]
23
+ where = ["."]
24
+ include = ["seoextract*"]
@@ -0,0 +1,3 @@
1
+ from .init import SEOExtract
2
+
3
+ __all__ = ["SEOExtract"]