seo-analysis 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- seo_analysis-1.0.0/PKG-INFO +79 -0
- seo_analysis-1.0.0/README.md +57 -0
- seo_analysis-1.0.0/pyproject.toml +37 -0
- seo_analysis-1.0.0/setup.cfg +4 -0
- seo_analysis-1.0.0/src/seo_analysis/__init__.py +6 -0
- seo_analysis-1.0.0/src/seo_analysis/analyzer.py +331 -0
- seo_analysis-1.0.0/src/seo_analysis/cli.py +57 -0
- seo_analysis-1.0.0/src/seo_analysis.egg-info/PKG-INFO +79 -0
- seo_analysis-1.0.0/src/seo_analysis.egg-info/SOURCES.txt +11 -0
- seo_analysis-1.0.0/src/seo_analysis.egg-info/dependency_links.txt +1 -0
- seo_analysis-1.0.0/src/seo_analysis.egg-info/entry_points.txt +2 -0
- seo_analysis-1.0.0/src/seo_analysis.egg-info/requires.txt +4 -0
- seo_analysis-1.0.0/src/seo_analysis.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: seo-analysis
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Spreadsheet-driven on-page SEO analysis for a list of domains and keywords.
|
|
5
|
+
Author: Devharsh Trivedi
|
|
6
|
+
License: GPL-3.0-or-later
|
|
7
|
+
Project-URL: Homepage, https://github.com/com-puter-tips/SEO-Analysis
|
|
8
|
+
Project-URL: Repository, https://github.com/com-puter-tips/SEO-Analysis
|
|
9
|
+
Keywords: seo,analysis,keywords,scraping,openpyxl,beautifulsoup
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
15
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
Requires-Dist: beautifulsoup4>=4.11
|
|
19
|
+
Requires-Dist: lxml>=4.9
|
|
20
|
+
Requires-Dist: openpyxl>=3.0
|
|
21
|
+
Requires-Dist: requests>=2.28
|
|
22
|
+
|
|
23
|
+
# SEO-Analysis
|
|
24
|
+
|
|
25
|
+
Spreadsheet-driven on-page SEO analysis for a list of domains and keywords.
|
|
26
|
+
|
|
27
|
+
Put keywords and URLs in an Excel workbook, run the tool, and it fills the sheet
|
|
28
|
+
with title / meta-description / heading / image / link / video / list-item
|
|
29
|
+
analysis for each page and colour-codes the cells (red = missing, yellow =
|
|
30
|
+
partial keyword match, white = good).
|
|
31
|
+
|
|
32
|
+
## Install
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
pip install seo-analysis
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Use it
|
|
39
|
+
|
|
40
|
+
Prepare a workbook (see `Test.xlsx`): column A = space-separated keywords,
|
|
41
|
+
column B = the URL, one row per check (the first row is a header). Then:
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
seo-analysis Test.xlsx
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Options:
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
seo-analysis path/to/book.xlsx --sheet Sheet1 --timeout 30 --user-agent "Mozilla/5.0"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
- `--sheet` worksheet name (default `Sheet1`)
|
|
54
|
+
- `--timeout` per-request timeout in seconds (default: none)
|
|
55
|
+
- `--user-agent` custom User-Agent header (helps with sites that block the default)
|
|
56
|
+
|
|
57
|
+
The results are written back into the same workbook, into columns C–T.
|
|
58
|
+
|
|
59
|
+
## Use it from Python
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from seo_analysis import analyze
|
|
63
|
+
|
|
64
|
+
analyze("Test.xlsx", sheet_name="Sheet1", timeout=30)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Backward compatible
|
|
68
|
+
|
|
69
|
+
The original usage still works unchanged — from the repo root:
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
python SEO.py
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
which analyses `Test.xlsx` (Sheet1) in the current directory.
|
|
76
|
+
|
|
77
|
+
## License
|
|
78
|
+
|
|
79
|
+
GPL-3.0-or-later.
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# SEO-Analysis
|
|
2
|
+
|
|
3
|
+
Spreadsheet-driven on-page SEO analysis for a list of domains and keywords.
|
|
4
|
+
|
|
5
|
+
Put keywords and URLs in an Excel workbook, run the tool, and it fills the sheet
|
|
6
|
+
with title / meta-description / heading / image / link / video / list-item
|
|
7
|
+
analysis for each page and colour-codes the cells (red = missing, yellow =
|
|
8
|
+
partial keyword match, white = good).
|
|
9
|
+
|
|
10
|
+
## Install
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
pip install seo-analysis
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Use it
|
|
17
|
+
|
|
18
|
+
Prepare a workbook (see `Test.xlsx`): column A = space-separated keywords,
|
|
19
|
+
column B = the URL, one row per check (the first row is a header). Then:
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
seo-analysis Test.xlsx
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Options:
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
seo-analysis path/to/book.xlsx --sheet Sheet1 --timeout 30 --user-agent "Mozilla/5.0"
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
- `--sheet` worksheet name (default `Sheet1`)
|
|
32
|
+
- `--timeout` per-request timeout in seconds (default: none)
|
|
33
|
+
- `--user-agent` custom User-Agent header (helps with sites that block the default)
|
|
34
|
+
|
|
35
|
+
The results are written back into the same workbook, into columns C–T.
|
|
36
|
+
|
|
37
|
+
## Use it from Python
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from seo_analysis import analyze
|
|
41
|
+
|
|
42
|
+
analyze("Test.xlsx", sheet_name="Sheet1", timeout=30)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Backward compatible
|
|
46
|
+
|
|
47
|
+
The original usage still works unchanged — from the repo root:
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
python SEO.py
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
which analyses `Test.xlsx` (Sheet1) in the current directory.
|
|
54
|
+
|
|
55
|
+
## License
|
|
56
|
+
|
|
57
|
+
GPL-3.0-or-later.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=64", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "seo-analysis"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Spreadsheet-driven on-page SEO analysis for a list of domains and keywords."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "GPL-3.0-or-later" }
|
|
12
|
+
authors = [{ name = "Devharsh Trivedi" }]
|
|
13
|
+
keywords = ["seo", "analysis", "keywords", "scraping", "openpyxl", "beautifulsoup"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
|
20
|
+
"Topic :: Text Processing :: Markup :: HTML",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"beautifulsoup4>=4.11",
|
|
24
|
+
"lxml>=4.9",
|
|
25
|
+
"openpyxl>=3.0",
|
|
26
|
+
"requests>=2.28",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.urls]
|
|
30
|
+
Homepage = "https://github.com/com-puter-tips/SEO-Analysis"
|
|
31
|
+
Repository = "https://github.com/com-puter-tips/SEO-Analysis"
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
seo-analysis = "seo_analysis.cli:main"
|
|
35
|
+
|
|
36
|
+
[tool.setuptools.packages.find]
|
|
37
|
+
where = ["src"]
|
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
"""On-page SEO analysis driven by an Excel workbook.
|
|
2
|
+
|
|
3
|
+
Behavior is preserved from the original SEO.py: read keywords (column A) and
|
|
4
|
+
URLs (column B) from the given worksheet, fetch each page, and fill columns
|
|
5
|
+
C-T with title/description/header/image/link/video/list analysis, colour-coding
|
|
6
|
+
cells. Apart from the optional request timeout and User-Agent arguments, the only change is that the workbook path and sheet name are now
|
|
7
|
+
parameters (defaulting to "Test.xlsx" / "Sheet1") so it can be packaged.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from urllib.parse import urlparse
|
|
12
|
+
|
|
13
|
+
import openpyxl
|
|
14
|
+
import requests
|
|
15
|
+
from bs4 import BeautifulSoup
|
|
16
|
+
from openpyxl.styles import Font, PatternFill
|
|
17
|
+
|
|
18
|
+
# Cell background colours as [red, green, blue] channel values.
RED = [255, 0, 0]
YELLOW = [255, 255, 0]
WHITE = [255, 255, 255]


def _fill(cell, rgb):
    """Give ``cell`` a solid background fill of the colour ``rgb``.

    ``rgb`` is an iterable of integer channel values. Each channel is rendered
    as upper-case hex, left-padded to two digits, and the result is prefixed
    with "FF" before being used as both the start and end colour of the fill.
    """
    hex_channels = [hex(channel)[2:].upper().rjust(2, "0") for channel in rgb]
    argb = "FF" + "".join(hex_channels)
    cell.fill = PatternFill(fill_type="solid", start_color=argb, end_color=argb)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _change_count(sheet, index, keywords, col):
|
|
33
|
+
count = 0
|
|
34
|
+
for kw in keywords:
|
|
35
|
+
if kw.lower() in sheet[col + index].value.lower():
|
|
36
|
+
count += 1
|
|
37
|
+
return count
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _change_color(sheet, index, count, col, lnth):
    """Colour the cell ``sheet[col + index]`` according to a keyword hit count.

    The cell becomes red when ``count`` is zero, yellow when ``count`` is
    positive but below ``lnth`` (the number of keywords), and white otherwise.
    """
    if count == 0:
        shade = RED
    elif 0 < count < lnth:
        shade = YELLOW
    else:
        shade = WHITE
    _fill(sheet[col + index], shade)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _header_count(sheet, soup, index, keywords, tag, col, lnth):
    """Summarise keyword usage across every ``tag`` element of a parsed page.

    Writes ``"<all> - <full> - <partial>"`` into ``sheet[col + index]``, where
    ``all`` is the number of ``tag`` elements found, ``full`` is how many of
    them contain every keyword (hit count equal to ``lnth``) and ``partial``
    is how many contain at least one keyword but fewer than ``lnth``. Matching
    is a case-insensitive substring test on the element's text.

    The cell is filled red when no ``tag`` element exists, white otherwise.

    Args:
        sheet: Worksheet to write the summary into.
        soup: Parsed document offering ``find_all``.
        index: Row number as a string.
        keywords: Iterable of keyword strings.
        tag: Tag name to look for (for example ``"h2"``).
        col: Column letter that receives the summary.
        lnth: Number of keywords, used as the "all matched" threshold.
    """
    countall = 0
    countfull = 0
    countpart = 0
    lowered_keywords = [kw.lower() for kw in keywords]

    # find_all is the current Beautiful Soup spelling; findAll is a
    # deprecated alias kept only for backwards compatibility.
    for heading in soup.find_all(tag):
        countall += 1
        # Extract and lower-case the heading text once, not once per keyword.
        text = heading.get_text().lower()
        hits = sum(1 for kw in lowered_keywords if kw in text)

        if hits == lnth:
            countfull += 1
        elif 0 < hits < lnth:
            countpart += 1

    cell_ref = col + index
    sheet[cell_ref] = "{} - {} - {}".format(countall, countfull, countpart)
    _fill(sheet[cell_ref], RED if countall == 0 else WHITE)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _image_dimension(raw):
    """Return ``(cleaned, pixels)`` for an ``<img>`` width/height attribute.

    ``cleaned`` is the attribute text with every ";", "p" and "x" removed (the
    form that is written to the sheet), or ``0`` when the attribute is absent.
    ``pixels`` is the integer value of ``cleaned``, or ``0`` when the attribute
    is absent or is not a plain integer (for example "100%" or "auto").
    """
    if raw is None:
        return 0, 0
    cleaned = raw.replace(";", "").replace("p", "").replace("x", "")
    try:
        return cleaned, int(cleaned)
    except ValueError:
        return cleaned, 0


def analyze(filepath="Test.xlsx", sheet_name="Sheet1", timeout=None, user_agent=None):
    """Run the SEO analysis over the workbook at ``filepath``.

    Row 1 of the worksheet is treated as a header. For every following row,
    the keywords in column A (whitespace-separated) are checked against the
    page at the URL in column B, and columns C-T are filled in:

    * C - the URL, D - page title, E - meta description, F - H1 text
    * G-K - "all - full - partial" keyword summaries for h2..h6
    * L-P - src / alt / title / height / width of the first image whose
      width and height are both at least 300
    * Q / R - internal / external link counts
    * S - first YouTube iframe source, T - number of ``<li>`` elements

    Cells are colour-coded via ``_fill`` / ``_change_color``. The workbook is
    saved back to ``filepath`` after all rows are processed.

    Args:
        filepath: Path of the ``.xlsx`` workbook to read and overwrite.
        sheet_name: Name of the worksheet to analyse.
        timeout: Per-request timeout in seconds, or ``None`` for no timeout.
        user_agent: Custom User-Agent header, or ``None`` to send no custom
            headers.

    Returns:
        ``None``. Progress and errors are printed; exceptions are caught and
        reported rather than propagated, and the first error ends the run
        without saving.
    """
    headers = {"User-Agent": user_agent} if user_agent else None
    # Bound before the try so the request-error handler can always report it.
    url = None

    try:
        start = datetime.now()
        print("\nprogram started at " + str(start))

        wb = openpyxl.load_workbook(filepath)
        sheet = wb[sheet_name]
        list_URL = [cell.value for cell in sheet["B"]]
        total = len(list_URL) - 1

        for idx, url in enumerate(list_URL[1:]):
            progress = "\nprocessing [" + str(idx + 1) + "/" + str(total) + "] --> "

            if url is None or url == "":
                print(progress + "SKIPPING.. INVALID DOMAIN FOUND..")
                continue

            page_url = str(url)
            print(progress + page_url)

            page = requests.get(page_url, timeout=timeout, headers=headers)
            soup = BeautifulSoup(page.text, "lxml")
            # +1 for the header row, +1 because worksheet rows are 1-based.
            index = str(idx + 2)
            # NOTE(review): an empty keyword cell becomes the single keyword
            # "None" because of str(); kept as in the original.
            keywords = str(sheet["A" + index].value).split()
            lnth = len(keywords)

            # --- C: the URL itself -------------------------------------
            sheet["C" + index].value = url
            url_lower = page_url.lower()
            count = sum(1 for kw in keywords if kw.lower() in url_lower)
            _change_color(sheet, index, count, "C", lnth)

            # --- D: page title -----------------------------------------
            title_tag = soup.title
            title_text = title_tag.string if title_tag is not None else None
            if title_text is None:
                sheet["D" + index] = "NO TITLE FOUND"
                _fill(sheet["D" + index], RED)
            else:
                sheet["D" + index] = title_text
                title_lower = title_text.lower()
                count = sum(1 for kw in keywords if kw.lower() in title_lower)
                _change_color(sheet, index, count, "D", lnth)

            # --- E: meta description -----------------------------------
            desc = soup.find(attrs={"name": "Description"})
            if desc is None:
                desc = soup.find(attrs={"name": "description"})

            # .get() instead of ["content"]: a matching tag without a content
            # attribute must not abort the whole run with a KeyError.
            content = desc.get("content") if desc is not None else None
            if content is None or len(content.strip()) == 0:
                sheet["E" + index] = "NO DESCRIPTION FOUND"
            else:
                sheet["E" + index] = content

            count = _change_count(sheet, index, keywords, "E")
            _change_color(sheet, index, count, "E", lnth)

            # --- F: H1 heading -----------------------------------------
            h1_tags = soup.find_all("h1")
            count = len(h1_tags)

            if count == 0:
                sheet["F" + index] = "NO H1 FOUND"
            elif count == 1:
                h1_text = h1_tags[0].get_text()
                if len(h1_text) == 0:
                    sheet["F" + index] = "NO H1 FOUND"
                else:
                    sheet["F" + index] = h1_text
            else:
                sheet["F" + index] = (
                    "MEHR ALS 2 H1-ÜBERSCHRIFTEN (" + str(count) + " H1 FOUND)"
                )
                sheet["F" + index].font = Font(bold=True)

            count = _change_count(sheet, index, keywords, "F")
            _change_color(sheet, index, count, "F", lnth)

            # --- G-K: h2..h6 summaries ---------------------------------
            for tag, col in (
                ("h2", "G"),
                ("h3", "H"),
                ("h4", "I"),
                ("h5", "J"),
                ("h6", "K"),
            ):
                _header_count(sheet, soup, index, keywords, tag, col, lnth)

            # --- L-P: first image that is at least 300 x 300 ------------
            imflag = 0
            for im in soup.find_all("img"):
                wd, wd_px = _image_dimension(im.get("width"))
                if wd_px < 300:
                    continue

                ht, ht_px = _image_dimension(im.get("height"))
                if ht_px < 300:
                    continue

                src = im.get("src")
                if src is None or len(src.strip()) == 0:
                    src = "NO SRC FOUND"

                alt = im.get("alt")
                if alt is None or len(alt.strip()) == 0:
                    alt = "NO ALT FOUND"

                title = im.get("title")
                if title is None or len(title.strip()) == 0:
                    title = "NO IMAGE-TITLE FOUND"

                sheet["L" + index] = src
                sheet["M" + index] = alt
                sheet["N" + index] = title
                sheet["O" + index] = ht
                sheet["P" + index] = wd

                imflag = 1
                break

            if not imflag:
                for col in ("L", "M", "N"):
                    sheet[col + index] = "BILD FEHLT"
                    sheet[col + index].font = Font(bold=True)

            for col in ("L", "M", "N"):
                count = _change_count(sheet, index, keywords, col)
                _change_color(sheet, index, count, col, lnth)

            # --- Q / R: internal and external link counts ---------------
            extlink = 0
            intlink = 0
            # The page's own host does not change per link; parse it once.
            base_netloc = urlparse(page_url).netloc.lower()

            for a in soup.find_all("a", attrs={"href": True}):
                raw_href = a["href"]
                href = raw_href.strip()

                # Skip empty/one-character links, fragments and non-page schemes.
                if (
                    len(href) <= 1
                    or raw_href[0] == "#"
                    or "javascript:" in href
                    or "mailto:" in href
                    or "tel:" in href
                ):
                    continue

                if "http" not in href:
                    # No scheme at all: treated as a relative, internal link.
                    intlink += 1
                elif base_netloc in urlparse(href).netloc.lower():
                    intlink += 1
                else:
                    extlink += 1

            sheet["Q" + index] = intlink
            sheet["R" + index] = extlink
            _fill(sheet["Q" + index], RED if intlink == 0 else WHITE)
            _fill(sheet["R" + index], RED if extlink == 0 else WHITE)

            # --- S: first embedded YouTube video ------------------------
            imflag = 0
            for ifr in soup.find_all("iframe", attrs={"src": True}):
                if "youtube.com" in ifr["src"]:
                    imflag = 1
                    sheet["S" + index] = ifr["src"]
                    _fill(sheet["S" + index], WHITE)
                    break

            if not imflag:
                sheet["S" + index] = "NO YOUTUBE VIDEO FOUND"
                _fill(sheet["S" + index], RED)

            # --- T: number of list items --------------------------------
            li = soup.find_all("li")
            sheet["T" + index] = len(li)
            _fill(sheet["T" + index], RED if len(li) == 0 else WHITE)

        wb.save(filepath)
        end = datetime.now()
        print("\nprogram finished at " + str(end))
        # total_seconds() keeps the microseconds zero-padded and includes
        # whole days, unlike joining .seconds and .microseconds by hand.
        print(
            "\ntotal time taken is "
            + "{:.6f}".format((end - start).total_seconds())
            + " seconds"
        )

    except FileNotFoundError:
        print("\ncould not find the file (" + str(filepath) + "), please check path\n")

    except PermissionError:
        print("\nfile save failed, please close the file and run program again\n")

    except requests.exceptions.RequestException:
        print(
            "\nwebsite ("
            + str(url)
            + ") not found, exiting program, correct URL and run program again\n"
        )

    except IOError as err:
        # Any other I/O failure (for example while reading or saving the
        # workbook) is not a website problem; report it as it is.
        print(err)

    except (NameError, TypeError, RuntimeError, KeyError):
        print("\nsomething went wrong, exiting program\n")

    except Exception as e:
        print(e)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Command-line interface for SEO-Analysis."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
|
|
5
|
+
from . import __version__
|
|
6
|
+
from .analyzer import analyze
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main(argv=None):
    """Parse the command line and run ``analyze`` with the chosen options.

    ``argv`` is the list of arguments to parse; it is handed to
    ``ArgumentParser.parse_args`` unchanged.
    """
    summary = (
        "Spreadsheet-driven on-page SEO analysis. Reads keywords (column A) "
        "and URLs (column B) from an .xlsx workbook, fetches each page, and "
        "fills columns C-T with title, description, header, image, link, "
        "video and list-item analysis, colour-coding the cells."
    )
    cli = argparse.ArgumentParser(prog="seo-analysis", description=summary)

    # (name, add_argument keyword arguments), registered in this order.
    argument_specs = (
        (
            "file",
            {
                "nargs": "?",
                "default": "Test.xlsx",
                "help": "Path to the .xlsx workbook (default: Test.xlsx).",
            },
        ),
        (
            "--sheet",
            {
                "default": "Sheet1",
                "help": "Worksheet name to read and write (default: Sheet1).",
            },
        ),
        (
            "--timeout",
            {
                "type": float,
                "default": None,
                "help": "Per-request timeout in seconds (default: no timeout).",
            },
        ),
        (
            "--user-agent",
            {
                "default": None,
                "help": "Custom User-Agent header for requests (default: requests default).",
            },
        ),
        (
            "--version",
            {
                "action": "version",
                "version": "%(prog)s " + __version__,
            },
        ),
    )
    for name, options in argument_specs:
        cli.add_argument(name, **options)

    parsed = cli.parse_args(argv)
    analyze(
        filepath=parsed.file,
        sheet_name=parsed.sheet,
        timeout=parsed.timeout,
        user_agent=parsed.user_agent,
    )


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: seo-analysis
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Spreadsheet-driven on-page SEO analysis for a list of domains and keywords.
|
|
5
|
+
Author: Devharsh Trivedi
|
|
6
|
+
License: GPL-3.0-or-later
|
|
7
|
+
Project-URL: Homepage, https://github.com/com-puter-tips/SEO-Analysis
|
|
8
|
+
Project-URL: Repository, https://github.com/com-puter-tips/SEO-Analysis
|
|
9
|
+
Keywords: seo,analysis,keywords,scraping,openpyxl,beautifulsoup
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
15
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
16
|
+
Requires-Python: >=3.8
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
Requires-Dist: beautifulsoup4>=4.11
|
|
19
|
+
Requires-Dist: lxml>=4.9
|
|
20
|
+
Requires-Dist: openpyxl>=3.0
|
|
21
|
+
Requires-Dist: requests>=2.28
|
|
22
|
+
|
|
23
|
+
# SEO-Analysis
|
|
24
|
+
|
|
25
|
+
Spreadsheet-driven on-page SEO analysis for a list of domains and keywords.
|
|
26
|
+
|
|
27
|
+
Put keywords and URLs in an Excel workbook, run the tool, and it fills the sheet
|
|
28
|
+
with title / meta-description / heading / image / link / video / list-item
|
|
29
|
+
analysis for each page and colour-codes the cells (red = missing, yellow =
|
|
30
|
+
partial keyword match, white = good).
|
|
31
|
+
|
|
32
|
+
## Install
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
pip install seo-analysis
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Use it
|
|
39
|
+
|
|
40
|
+
Prepare a workbook (see `Test.xlsx`): column A = space-separated keywords,
|
|
41
|
+
column B = the URL, one row per check (the first row is a header). Then:
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
seo-analysis Test.xlsx
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Options:
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
seo-analysis path/to/book.xlsx --sheet Sheet1 --timeout 30 --user-agent "Mozilla/5.0"
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
- `--sheet` worksheet name (default `Sheet1`)
|
|
54
|
+
- `--timeout` per-request timeout in seconds (default: none)
|
|
55
|
+
- `--user-agent` custom User-Agent header (helps with sites that block the default)
|
|
56
|
+
|
|
57
|
+
The results are written back into the same workbook, into columns C–T.
|
|
58
|
+
|
|
59
|
+
## Use it from Python
|
|
60
|
+
|
|
61
|
+
```python
|
|
62
|
+
from seo_analysis import analyze
|
|
63
|
+
|
|
64
|
+
analyze("Test.xlsx", sheet_name="Sheet1", timeout=30)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Backward compatible
|
|
68
|
+
|
|
69
|
+
The original usage still works unchanged — from the repo root:
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
python SEO.py
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
which analyses `Test.xlsx` (Sheet1) in the current directory.
|
|
76
|
+
|
|
77
|
+
## License
|
|
78
|
+
|
|
79
|
+
GPL-3.0-or-later.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/seo_analysis/__init__.py
|
|
4
|
+
src/seo_analysis/analyzer.py
|
|
5
|
+
src/seo_analysis/cli.py
|
|
6
|
+
src/seo_analysis.egg-info/PKG-INFO
|
|
7
|
+
src/seo_analysis.egg-info/SOURCES.txt
|
|
8
|
+
src/seo_analysis.egg-info/dependency_links.txt
|
|
9
|
+
src/seo_analysis.egg-info/entry_points.txt
|
|
10
|
+
src/seo_analysis.egg-info/requires.txt
|
|
11
|
+
src/seo_analysis.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
seo_analysis
|