sitemapxml 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sitemapxml-0.1.0/PKG-INFO +49 -0
- sitemapxml-0.1.0/README.md +35 -0
- sitemapxml-0.1.0/pyproject.toml +28 -0
- sitemapxml-0.1.0/setup.cfg +4 -0
- sitemapxml-0.1.0/src/sitemapxml/__init__.py +6 -0
- sitemapxml-0.1.0/src/sitemapxml/cli.py +129 -0
- sitemapxml-0.1.0/src/sitemapxml.egg-info/PKG-INFO +49 -0
- sitemapxml-0.1.0/src/sitemapxml.egg-info/SOURCES.txt +10 -0
- sitemapxml-0.1.0/src/sitemapxml.egg-info/dependency_links.txt +1 -0
- sitemapxml-0.1.0/src/sitemapxml.egg-info/entry_points.txt +2 -0
- sitemapxml-0.1.0/src/sitemapxml.egg-info/requires.txt +2 -0
- sitemapxml-0.1.0/src/sitemapxml.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sitemapxml
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A CLI tool to parse sitemaps and extract URL metadata into a CSV format.
|
|
5
|
+
Author-email: Amal Alexander <amalalex95@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://www.linkedin.com/in/amal-alexander-305780131/
|
|
7
|
+
Project-URL: Bug Tracker, https://www.linkedin.com/in/amal-alexander-305780131/
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.7
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: requests
|
|
13
|
+
Requires-Dist: beautifulsoup4
|
|
14
|
+
|
|
15
|
+
# sitemapxml
|
|
16
|
+
|
|
17
|
+
`sitemapxml` is a powerful and fast command-line tool that extracts all URLs from a given XML sitemap, fetches each URL, and generates a comprehensive CSV report containing:
|
|
18
|
+
- All extracted sitemap URLs
|
|
19
|
+
- HTTP Status Code
|
|
20
|
+
- Title Tag
|
|
21
|
+
- Meta Description
|
|
22
|
+
- Content Length
|
|
23
|
+
- Canonical URL
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
Install via pip:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install sitemapxml
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
Simply run the CLI command and pass the URL of the sitemap:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
sitemapxml https://example.com/sitemap.xml
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
This will automatically create a `sitemap_report.csv` file in your current directory containing all the extracted metrics. You can also specify an output file:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
sitemapxml https://example.com/sitemap.xml -o my_report.csv
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Author
|
|
48
|
+
- Email: amalalex95@gmail.com
|
|
49
|
+
- LinkedIn: [Amal Alexander](https://www.linkedin.com/in/amal-alexander-305780131/)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# sitemapxml
|
|
2
|
+
|
|
3
|
+
`sitemapxml` is a powerful and fast command-line tool that extracts all URLs from a given XML sitemap, fetches each URL, and generates a comprehensive CSV report containing:
|
|
4
|
+
- All extracted sitemap URLs
|
|
5
|
+
- HTTP Status Code
|
|
6
|
+
- Title Tag
|
|
7
|
+
- Meta Description
|
|
8
|
+
- Content Length
|
|
9
|
+
- Canonical URL
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
|
|
13
|
+
Install via pip:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install sitemapxml
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Usage
|
|
20
|
+
|
|
21
|
+
Simply run the CLI command and pass the URL of the sitemap:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
sitemapxml https://example.com/sitemap.xml
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
This will automatically create a `sitemap_report.csv` file in your current directory containing all the extracted metrics. You can also specify an output file:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
sitemapxml https://example.com/sitemap.xml -o my_report.csv
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Author
|
|
34
|
+
- Email: amalalex95@gmail.com
|
|
35
|
+
- LinkedIn: [Amal Alexander](https://www.linkedin.com/in/amal-alexander-305780131/)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "sitemapxml"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Amal Alexander", email="amalalex95@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "A CLI tool to parse sitemaps and extract URL metadata into a CSV format."
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.7"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
]
|
|
18
|
+
dependencies = [
|
|
19
|
+
"requests",
|
|
20
|
+
"beautifulsoup4"
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
[project.urls]
|
|
24
|
+
"Homepage" = "https://www.linkedin.com/in/amal-alexander-305780131/"
|
|
25
|
+
"Bug Tracker" = "https://www.linkedin.com/in/amal-alexander-305780131/"
|
|
26
|
+
|
|
27
|
+
[project.scripts]
|
|
28
|
+
sitemapxml = "sitemapxml.cli:main"
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import csv
|
|
3
|
+
import concurrent.futures
|
|
4
|
+
import requests
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
import sys
|
|
7
|
+
import threading
|
|
8
|
+
|
|
9
|
+
def get_sitemap_urls(url, headers, _seen=None):
    """Recursively collect all page URLs from a sitemap or sitemap index.

    Args:
        url: URL of the sitemap.xml (or sitemap index) to fetch.
        headers: dict of HTTP headers to send (e.g. a User-Agent).
        _seen: internal set of sitemap URLs already visited; callers
            should leave this as None.

    Returns:
        A de-duplicated list of page URLs (order not guaranteed).
        Returns [] when the sitemap cannot be fetched.
    """
    # Guard against cyclic or duplicated sitemap-index references, which
    # would otherwise recurse without bound. None-sentinel (never a
    # mutable default) so each top-level call starts with a fresh set.
    if _seen is None:
        _seen = set()
    if url in _seen:
        return []
    _seen.add(url)

    print(f"Fetching sitemap: {url}")
    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except Exception as e:
        print(f"Error fetching sitemap {url}: {e}")
        return []

    # html.parser lowercases tag names, which is harmless here: the
    # sitemap protocol's tags (<sitemap>, <url>, <loc>) are lowercase.
    soup = BeautifulSoup(response.content, 'html.parser')

    urls = []
    # If the sitemap is an index, fetch sub-sitemaps recursively.
    for sitemap in soup.find_all('sitemap'):
        loc = sitemap.find('loc')
        if loc and loc.text:
            urls.extend(get_sitemap_urls(loc.text.strip(), headers, _seen))

    # If the sitemap contains page URLs, collect them directly.
    for url_tag in soup.find_all('url'):
        loc = url_tag.find('loc')
        if loc and loc.text:
            urls.append(loc.text.strip())

    return list(set(urls))
|
|
36
|
+
|
|
37
|
+
def process_url(url, headers):
    """Fetch a single URL and collect its SEO-relevant metadata.

    Returns a dict with keys: url, status_code, title, description,
    content_length, canonical. Fields that could not be determined are
    None; on a network failure, status_code holds the stringified
    exception so the error still appears in the CSV row.
    """
    info = {key: None for key in
            ('url', 'status_code', 'title', 'description',
             'content_length', 'canonical')}
    info['url'] = url

    try:
        response = requests.get(url, headers=headers, timeout=15)
        info['status_code'] = response.status_code
        info['content_length'] = len(response.content)

        # Meta tags are only meaningful for HTML responses.
        if 'text/html' in response.headers.get('Content-Type', ''):
            soup = BeautifulSoup(response.content, 'html.parser')

            # <title>
            title_tag = soup.title
            if title_tag and title_tag.string:
                info['title'] = title_tag.string.strip()

            # Meta description, falling back to the OpenGraph variant.
            meta = (soup.find('meta', attrs={'name': 'description'})
                    or soup.find('meta', attrs={'property': 'og:description'}))
            if meta and meta.get('content'):
                info['description'] = meta['content'].strip()

            # rel=canonical link.
            canonical = soup.find('link', attrs={'rel': 'canonical'})
            if canonical and canonical.get('href'):
                info['canonical'] = canonical['href'].strip()

    except requests.exceptions.RequestException as e:
        info['status_code'] = str(e)
    except Exception:
        # Best effort: keep whatever was gathered before the failure.
        pass

    return info
|
|
78
|
+
|
|
79
|
+
def main():
    """CLI entry point: crawl a sitemap and write a CSV metadata report.

    Parses the command line (sitemap URL, output path, worker count),
    expands the sitemap into page URLs, fetches each page concurrently,
    and writes one CSV row per URL. Exits with status 1 when no URLs are
    found or the report cannot be written.
    """
    parser = argparse.ArgumentParser(description="Extract URLs and metadata from an XML API Sitemap.")
    parser.add_argument("url", help="The URL of the sitemap.xml to parse")
    parser.add_argument("-o", "--output", default="sitemap_report.csv", help="Output CSV file name (default: sitemap_report.csv)")
    parser.add_argument("-w", "--workers", type=int, default=10, help="Number of concurrent workers (default: 10)")

    args = parser.parse_args()
    headers = {'User-Agent': 'sitemapxml/0.1.0'}

    print(f"Starting sitemap extraction from: {args.url}")
    urls = get_sitemap_urls(args.url, headers)

    if not urls:
        print("No URLs found in the sitemap or failed to fetch.")
        sys.exit(1)

    print(f"Found {len(urls)} URLs. Starting metadata extraction with {args.workers} concurrent workers...")

    results = []
    processed = 0
    total = len(urls)

    # Fan the fetches out to a thread pool. Results are gathered in
    # completion order; the progress counter is only touched here in the
    # main thread (inside the as_completed loop), so no lock is needed.
    with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
        future_to_url = {executor.submit(process_url, u, headers): u for u in urls}
        for future in concurrent.futures.as_completed(future_to_url):
            results.append(future.result())
            processed += 1
            # Throttle progress output to every 10th URL (and the last).
            if processed % 10 == 0 or processed == total:
                print(f"Progress: {processed}/{total} URLs processed", end='\r')

    print("\nExtraction complete! Saving output...")

    # Save to CSV; column order matches the dicts built by process_url.
    fieldnames = ['url', 'status_code', 'title', 'description', 'content_length', 'canonical']

    try:
        with open(args.output, mode='w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)
            print(f"Report saved to {args.output}")
    except Exception as e:
        print(f"Error saving to CSV: {e}")
        sys.exit(1)
|
|
127
|
+
|
|
128
|
+
# Allow running this module directly (e.g. `python cli.py`) in addition
# to the installed `sitemapxml` console-script entry point.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sitemapxml
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A CLI tool to parse sitemaps and extract URL metadata into a CSV format.
|
|
5
|
+
Author-email: Amal Alexander <amalalex95@gmail.com>
|
|
6
|
+
Project-URL: Homepage, https://www.linkedin.com/in/amal-alexander-305780131/
|
|
7
|
+
Project-URL: Bug Tracker, https://www.linkedin.com/in/amal-alexander-305780131/
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.7
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: requests
|
|
13
|
+
Requires-Dist: beautifulsoup4
|
|
14
|
+
|
|
15
|
+
# sitemapxml
|
|
16
|
+
|
|
17
|
+
`sitemapxml` is a powerful and fast command-line tool that extracts all URLs from a given XML sitemap, fetches each URL, and generates a comprehensive CSV report containing:
|
|
18
|
+
- All extracted sitemap URLs
|
|
19
|
+
- HTTP Status Code
|
|
20
|
+
- Title Tag
|
|
21
|
+
- Meta Description
|
|
22
|
+
- Content Length
|
|
23
|
+
- Canonical URL
|
|
24
|
+
|
|
25
|
+
## Installation
|
|
26
|
+
|
|
27
|
+
Install via pip:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install sitemapxml
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
Simply run the CLI command and pass the URL of the sitemap:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
sitemapxml https://example.com/sitemap.xml
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
This will automatically create a `sitemap_report.csv` file in your current directory containing all the extracted metrics. You can also specify an output file:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
sitemapxml https://example.com/sitemap.xml -o my_report.csv
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Author
|
|
48
|
+
- Email: amalalex95@gmail.com
|
|
49
|
+
- LinkedIn: [Amal Alexander](https://www.linkedin.com/in/amal-alexander-305780131/)
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/sitemapxml/__init__.py
|
|
4
|
+
src/sitemapxml/cli.py
|
|
5
|
+
src/sitemapxml.egg-info/PKG-INFO
|
|
6
|
+
src/sitemapxml.egg-info/SOURCES.txt
|
|
7
|
+
src/sitemapxml.egg-info/dependency_links.txt
|
|
8
|
+
src/sitemapxml.egg-info/entry_points.txt
|
|
9
|
+
src/sitemapxml.egg-info/requires.txt
|
|
10
|
+
src/sitemapxml.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
sitemapxml
|