sitemapxml 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
1
+ Metadata-Version: 2.4
2
+ Name: sitemapxml
3
+ Version: 0.1.0
4
+ Summary: A CLI tool to parse sitemaps and extract URL metadata into a CSV format.
5
+ Author-email: Amal Alexander <amalalex95@gmail.com>
6
+ Project-URL: Homepage, https://www.linkedin.com/in/amal-alexander-305780131/
7
+ Project-URL: Bug Tracker, https://www.linkedin.com/in/amal-alexander-305780131/
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.7
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: requests
13
+ Requires-Dist: beautifulsoup4
14
+
15
+ # sitemapxml
16
+
17
+ `sitemapxml` is a powerful and fast command-line tool that extracts all URLs from a given XML sitemap, fetches each URL, and generates a comprehensive CSV report containing:
18
+ - All extracted sitemap URLs
19
+ - HTTP Status Code
20
+ - Title Tag
21
+ - Meta Description
22
+ - Content Length
23
+ - Canonical URL
24
+
25
+ ## Installation
26
+
27
+ Install via pip:
28
+
29
+ ```bash
30
+ pip install sitemapxml
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ Simply run the CLI command and pass the URL of the sitemap:
36
+
37
+ ```bash
38
+ sitemapxml https://example.com/sitemap.xml
39
+ ```
40
+
41
+ This will automatically create a `sitemap_report.csv` file in your current directory containing all the extracted metrics. You can also specify an output file:
42
+
43
+ ```bash
44
+ sitemapxml https://example.com/sitemap.xml -o my_report.csv
45
+ ```
46
+
47
+ ## Author
48
+ - Email: amalalex95@gmail.com
49
+ - LinkedIn: [Amal Alexander](https://www.linkedin.com/in/amal-alexander-305780131/)
@@ -0,0 +1,35 @@
1
+ # sitemapxml
2
+
3
+ `sitemapxml` is a powerful and fast command-line tool that extracts all URLs from a given XML sitemap, fetches each URL, and generates a comprehensive CSV report containing:
4
+ - All extracted sitemap URLs
5
+ - HTTP Status Code
6
+ - Title Tag
7
+ - Meta Description
8
+ - Content Length
9
+ - Canonical URL
10
+
11
+ ## Installation
12
+
13
+ Install via pip:
14
+
15
+ ```bash
16
+ pip install sitemapxml
17
+ ```
18
+
19
+ ## Usage
20
+
21
+ Simply run the CLI command and pass the URL of the sitemap:
22
+
23
+ ```bash
24
+ sitemapxml https://example.com/sitemap.xml
25
+ ```
26
+
27
+ This will automatically create a `sitemap_report.csv` file in your current directory containing all the extracted metrics. You can also specify an output file:
28
+
29
+ ```bash
30
+ sitemapxml https://example.com/sitemap.xml -o my_report.csv
31
+ ```
32
+
33
+ ## Author
34
+ - Email: amalalex95@gmail.com
35
+ - LinkedIn: [Amal Alexander](https://www.linkedin.com/in/amal-alexander-305780131/)
@@ -0,0 +1,28 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "sitemapxml"
7
+ version = "0.1.0"
8
+ authors = [
9
+ { name="Amal Alexander", email="amalalex95@gmail.com" },
10
+ ]
11
+ description = "A CLI tool to parse sitemaps and extract URL metadata into a CSV format."
12
+ readme = "README.md"
13
+ requires-python = ">=3.7"
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "Operating System :: OS Independent",
17
+ ]
18
+ dependencies = [
19
+ "requests",
20
+ "beautifulsoup4"
21
+ ]
22
+
23
+ [project.urls]
24
+ "Homepage" = "https://www.linkedin.com/in/amal-alexander-305780131/"
25
+ "Bug Tracker" = "https://www.linkedin.com/in/amal-alexander-305780131/"
26
+
27
+ [project.scripts]
28
+ sitemapxml = "sitemapxml.cli:main"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,6 @@
1
+ """
2
+ sitemapxml package
3
+ Extract URLs from a sitemap and get metadata.
4
+ """
5
+
6
+ __version__ = "0.1.0"
@@ -0,0 +1,129 @@
1
+ import argparse
2
+ import csv
3
+ import concurrent.futures
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+ import sys
7
+ import threading
8
+
9
+ def get_sitemap_urls(url, headers):
10
+ print(f"Fetching sitemap: {url}")
11
+ try:
12
+ response = requests.get(url, headers=headers, timeout=15)
13
+ response.raise_for_status()
14
+ except Exception as e:
15
+ print(f"Error fetching sitemap {url}: {e}")
16
+ return []
17
+
18
+ soup = BeautifulSoup(response.content, 'html.parser')
19
+
20
+ urls = []
21
+ # If the sitemap is an index, fetch sub-sitemaps
22
+ sitemaps = soup.find_all('sitemap')
23
+ for sitemap in sitemaps:
24
+ loc = sitemap.find('loc')
25
+ if loc and loc.text:
26
+ urls.extend(get_sitemap_urls(loc.text.strip(), headers))
27
+
28
+ # If the sitemap contains urls
29
+ url_tags = soup.find_all('url')
30
+ for url_tag in url_tags:
31
+ loc = url_tag.find('loc')
32
+ if loc and loc.text:
33
+ urls.append(loc.text.strip())
34
+
35
+ return list(set(urls))
36
+
37
+ def process_url(url, headers):
38
+ result = {
39
+ 'url': url,
40
+ 'status_code': None,
41
+ 'title': None,
42
+ 'description': None,
43
+ 'content_length': None,
44
+ 'canonical': None
45
+ }
46
+ try:
47
+ response = requests.get(url, headers=headers, timeout=15)
48
+ result['status_code'] = response.status_code
49
+ result['content_length'] = len(response.content)
50
+
51
+ # Only parse HTML pages for meta tags
52
+ content_type = response.headers.get('Content-Type', '')
53
+ if 'text/html' in content_type:
54
+ soup = BeautifulSoup(response.content, 'html.parser')
55
+
56
+ # Extract Title
57
+ if soup.title and soup.title.string:
58
+ result['title'] = soup.title.string.strip()
59
+
60
+ # Extract Description
61
+ desc_tag = soup.find('meta', attrs={'name': 'description'})
62
+ if not desc_tag:
63
+ desc_tag = soup.find('meta', attrs={'property': 'og:description'})
64
+ if desc_tag and desc_tag.get('content'):
65
+ result['description'] = desc_tag['content'].strip()
66
+
67
+ # Extract Canonical
68
+ canonical_tag = soup.find('link', attrs={'rel': 'canonical'})
69
+ if canonical_tag and canonical_tag.get('href'):
70
+ result['canonical'] = canonical_tag['href'].strip()
71
+
72
+ except requests.exceptions.RequestException as e:
73
+ result['status_code'] = str(e)
74
+ except Exception:
75
+ pass
76
+
77
+ return result
78
+
79
+ def main():
80
+ parser = argparse.ArgumentParser(description="Extract URLs and metadata from an XML API Sitemap.")
81
+ parser.add_argument("url", help="The URL of the sitemap.xml to parse")
82
+ parser.add_argument("-o", "--output", default="sitemap_report.csv", help="Output CSV file name (default: sitemap_report.csv)")
83
+ parser.add_argument("-w", "--workers", type=int, default=10, help="Number of concurrent workers (default: 10)")
84
+
85
+ args = parser.parse_args()
86
+ headers = {'User-Agent': 'sitemapxml/0.1.0'}
87
+
88
+ print(f"Starting sitemap extraction from: {args.url}")
89
+ urls = get_sitemap_urls(args.url, headers)
90
+
91
+ if not urls:
92
+ print("No URLs found in the sitemap or failed to fetch.")
93
+ sys.exit(1)
94
+
95
+ print(f"Found {len(urls)} URLs. Starting metadata extraction with {args.workers} concurrent workers...")
96
+
97
+ results = []
98
+ processed = 0
99
+ total = len(urls)
100
+ lock = threading.Lock()
101
+
102
+ with concurrent.futures.ThreadPoolExecutor(max_workers=args.workers) as executor:
103
+ future_to_url = {executor.submit(process_url, u, headers): u for u in urls}
104
+ for future in concurrent.futures.as_completed(future_to_url):
105
+ data = future.result()
106
+ results.append(data)
107
+ with lock:
108
+ processed += 1
109
+ if processed % 10 == 0 or processed == total:
110
+ print(f"Progress: {processed}/{total} URLs processed", end='\r')
111
+
112
+ print("\nExtraction complete! Saving output...")
113
+
114
+ # Save to CSV
115
+ fieldnames = ['url', 'status_code', 'title', 'description', 'content_length', 'canonical']
116
+
117
+ try:
118
+ with open(args.output, mode='w', newline='', encoding='utf-8') as f:
119
+ writer = csv.DictWriter(f, fieldnames=fieldnames)
120
+ writer.writeheader()
121
+ for row in results:
122
+ writer.writerow(row)
123
+ print(f"Report saved to {args.output}")
124
+ except Exception as e:
125
+ print(f"Error saving to CSV: {e}")
126
+ sys.exit(1)
127
+
128
+ if __name__ == "__main__":
129
+ main()
@@ -0,0 +1,49 @@
1
+ Metadata-Version: 2.4
2
+ Name: sitemapxml
3
+ Version: 0.1.0
4
+ Summary: A CLI tool to parse sitemaps and extract URL metadata into a CSV format.
5
+ Author-email: Amal Alexander <amalalex95@gmail.com>
6
+ Project-URL: Homepage, https://www.linkedin.com/in/amal-alexander-305780131/
7
+ Project-URL: Bug Tracker, https://www.linkedin.com/in/amal-alexander-305780131/
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.7
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: requests
13
+ Requires-Dist: beautifulsoup4
14
+
15
+ # sitemapxml
16
+
17
+ `sitemapxml` is a powerful and fast command-line tool that extracts all URLs from a given XML sitemap, fetches each URL, and generates a comprehensive CSV report containing:
18
+ - All extracted sitemap URLs
19
+ - HTTP Status Code
20
+ - Title Tag
21
+ - Meta Description
22
+ - Content Length
23
+ - Canonical URL
24
+
25
+ ## Installation
26
+
27
+ Install via pip:
28
+
29
+ ```bash
30
+ pip install sitemapxml
31
+ ```
32
+
33
+ ## Usage
34
+
35
+ Simply run the CLI command and pass the URL of the sitemap:
36
+
37
+ ```bash
38
+ sitemapxml https://example.com/sitemap.xml
39
+ ```
40
+
41
+ This will automatically create a `sitemap_report.csv` file in your current directory containing all the extracted metrics. You can also specify an output file:
42
+
43
+ ```bash
44
+ sitemapxml https://example.com/sitemap.xml -o my_report.csv
45
+ ```
46
+
47
+ ## Author
48
+ - Email: amalalex95@gmail.com
49
+ - LinkedIn: [Amal Alexander](https://www.linkedin.com/in/amal-alexander-305780131/)
@@ -0,0 +1,10 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/sitemapxml/__init__.py
4
+ src/sitemapxml/cli.py
5
+ src/sitemapxml.egg-info/PKG-INFO
6
+ src/sitemapxml.egg-info/SOURCES.txt
7
+ src/sitemapxml.egg-info/dependency_links.txt
8
+ src/sitemapxml.egg-info/entry_points.txt
9
+ src/sitemapxml.egg-info/requires.txt
10
+ src/sitemapxml.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ sitemapxml = sitemapxml.cli:main
@@ -0,0 +1,2 @@
1
+ requests
2
+ beautifulsoup4
@@ -0,0 +1 @@
1
+ sitemapxml