super-gm-scraper 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,45 @@
1
+ Metadata-Version: 2.4
2
+ Name: super-gm-scraper
3
+ Version: 1.0.0
4
+ Summary: A highly powerful web scraper to extract complete text, images, structure, and metadata from any website.
5
+ Home-page: https://github.com/yourusername/super-gm-scraper
6
+ Author: Aapka Naam
7
+ Author-email: yourmail@example.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.6
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: requests
14
+ Requires-Dist: beautifulsoup4
15
+ Dynamic: author
16
+ Dynamic: author-email
17
+ Dynamic: classifier
18
+ Dynamic: description
19
+ Dynamic: description-content-type
20
+ Dynamic: home-page
21
+ Dynamic: requires-dist
22
+ Dynamic: requires-python
23
+ Dynamic: summary
24
+
25
+ # Super GM Scraper 🚀
26
+
27
+ A highly professional and powerful Python library to extract structured data (complete text, raw HTML skeleton, and images) from any website.
28
+
29
+ ## Features
30
+ * 🔍 **Full Skeleton Extraction:** Fetch beautified raw HTML structure.
31
+ * 📝 **Clean Text Extractor:** Strip styles/scripts and get clean text.
32
+ * 📷 **Image Link Harvester:** Extract absolute image links automatically.
33
+ * 💾 **Bulk Image Downloader:** Download all images directly to a local directory.
34
+ * 🛡️ **Anti-Bot Bypass:** Uses professional browser headers to reduce blocks.
35
+
36
+ ---
37
+ ### Created with Pride: Results from Super GM Scraper
38
+ ---
39
+
40
+ ## Installation
41
+
42
+ You can install this package locally or from PyPI:
43
+
44
+ ```bash
45
+ pip install super-gm-scraper
@@ -0,0 +1,21 @@
1
+ # Super GM Scraper 🚀
2
+
3
+ A highly professional and powerful Python library to extract structured data (complete text, raw HTML skeleton, and images) from any website.
4
+
5
+ ## Features
6
+ * 🔍 **Full Skeleton Extraction:** Fetch beautified raw HTML structure.
7
+ * 📝 **Clean Text Extractor:** Strip styles/scripts and get clean text.
8
+ * 📷 **Image Link Harvester:** Extract absolute image links automatically.
9
+ * 💾 **Bulk Image Downloader:** Download all images directly to a local directory.
10
+ * 🛡️ **Anti-Bot Bypass:** Uses professional browser headers to reduce blocks.
11
+
12
+ ---
13
+ ### Created with Pride: Results from Super GM Scraper
14
+ ---
15
+
16
+ ## Installation
17
+
18
+ You can install this package locally or from PyPI:
19
+
20
+ ```bash
21
+ pip install super-gm-scraper
22
+ ```
@@ -0,0 +1,5 @@
1
+ from .scraper import GMScraper
2
+
3
# Package identity metadata, readable as gm_scraper.__author__ / __version__.
__author__ = "Super GM Scraper Team"
__version__ = "1.0.0"

# Explicit public API: a star-import exposes only the scraper class rather
# than every public name bound in this module (e.g. the `scraper` submodule).
__all__ = ["GMScraper"]
@@ -0,0 +1,122 @@
1
+ import os
2
+ import json
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ from urllib.parse import urljoin, urlparse
6
+
7
class GMScraper:
    """Download a single web page and expose its structure, text, images and metadata.

    The page is fetched once, inside the constructor. If the fetch fails,
    ``soup`` stays ``None`` and every accessor returns its "no data" value
    instead of raising.

    Every public method prints the branding banner to stdout before doing
    its work.
    """

    def __init__(self, url):
        """Store ``url``, print the banner, and fetch the page immediately.

        Args:
            url: Address of the page to scrape. It is also used as the base
                for resolving relative image links.
        """
        # The banner text must exist before the banner is printed.
        self._branding = "results from super GM scraper"
        self._show_mandatory_branding()

        self.url = url
        # Browser-like headers so that sites are less likely to block the request.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": "https://www.google.com/"
        }
        self.soup = None
        self.html_content = ""
        self._fetch_page()

    def _show_mandatory_branding(self):
        """Print the hard-coded branding banner to stdout."""
        print("\n" + "=" * 60)
        print(f" >>> [★] {self._branding.upper()} [★] <<< ")
        print("=" * 60 + "\n")

    def _fetch_page(self):
        """Download ``self.url`` and parse it into ``self.soup``.

        Failures are reported on stdout and leave ``self.soup`` as ``None``;
        nothing is raised, so constructing a scraper never fails.
        """
        try:
            response = requests.get(self.url, headers=self.headers, timeout=15)
            response.raise_for_status()
            self.html_content = response.text
            self.soup = BeautifulSoup(self.html_content, 'html.parser')
        except Exception as e:
            # Deliberately broad: this is the best-effort boundary that keeps
            # the constructor from raising on any network or parsing problem.
            print(f"Error fetching website: {e}")
            self.soup = None

    def get_skeleton(self):
        """Return the prettified HTML of the page, or "No data fetched."."""
        self._show_mandatory_branding()
        if not self.soup:
            return "No data fetched."
        return self.soup.prettify()

    @staticmethod
    def _normalize_text(text):
        """Collapse raw page text into one non-empty, stripped chunk per line.

        Each line is stripped, then broken on runs of two spaces (which
        separate unrelated inline fragments); single spaces between words are
        kept. Blank chunks are dropped.
        """
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return "\n".join(chunk for chunk in chunks if chunk)

    def get_text(self):
        """Return the visible text of the page without script/style content.

        Returns:
            The cleaned text, or "No data fetched." when the page could not
            be loaded.
        """
        # Local import: keeps this change self-contained in the class.
        import copy

        self._show_mandatory_branding()
        if not self.soup:
            return "No data fetched."
        # Work on a copy so that removing <script>/<style> does not alter the
        # cached document that get_skeleton() and the other accessors read.
        working = copy.copy(self.soup)
        for tag in working(["script", "style"]):
            tag.extract()
        return self._normalize_text(working.get_text())

    @staticmethod
    def _first_image_reference(img):
        """Return the first usable URL token of an <img>, or None.

        Looks at ``src``, then ``data-src``, then ``srcset`` and skips
        attributes that are missing or blank. For ``srcset`` the first
        candidate is used and its trailing separator comma is removed.
        """
        for attr in ('src', 'data-src', 'srcset'):
            tokens = (img.get(attr) or "").split()
            if tokens:
                if attr == 'srcset':
                    return tokens[0].rstrip(",")
                return tokens[0]
        return None

    def get_images(self):
        """Return the absolute URLs of all images on the page.

        Returns:
            A list of unique URLs in document order; empty when the page
            could not be loaded or contains no usable images.
        """
        self._show_mandatory_branding()
        if not self.soup:
            return []
        images = []
        for img in self.soup.find_all('img'):
            reference = self._first_image_reference(img)
            if reference:
                # Turn a relative link into a complete URL.
                images.append(urljoin(self.url, reference))
        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # repeated runs number downloaded files the same way.
        return list(dict.fromkeys(images))

    def get_metadata(self):
        """Return the page title, meta description and meta keywords.

        Returns:
            A dict with the keys ``title``, ``description``, ``keywords`` and
            ``branding_verified``; an empty dict when the page could not be
            loaded.
        """
        self._show_mandatory_branding()
        if not self.soup:
            return {}
        title_tag = self.soup.title
        # .string is None for an empty <title> or one with nested markup.
        title = title_tag.string if title_tag else None
        meta_data = {
            "title": title if title is not None else "No Title",
            "description": "",
            "keywords": "",
            "branding_verified": self._branding
        }
        for meta in self.soup.find_all('meta'):
            name = (meta.get('name') or '').lower()
            if name == 'description':
                meta_data['description'] = meta.get('content', '')
            elif name == 'keywords':
                meta_data['keywords'] = meta.get('content', '')
        return meta_data

    def download_all_images(self, folder_name="gm_scraped_images"):
        """Download every image found by get_images() into ``folder_name``.

        Files are named ``image_<n><ext>`` where ``n`` is the 1-based position
        of the URL in the get_images() result. An image that cannot be fetched
        or written is reported on stdout and skipped.

        Args:
            folder_name: Target directory; created when it does not exist.
        """
        self._show_mandatory_branding()
        images = self.get_images()
        if not images:
            print("No images found to download.")
            return

        os.makedirs(folder_name, exist_ok=True)

        print(f"Downloading {len(images)} images to '{folder_name}' folder...")
        for i, img_url in enumerate(images, start=1):
            try:
                response = requests.get(img_url, headers=self.headers, timeout=10)
                # Without this an HTTP error page would be saved as an "image".
                response.raise_for_status()

                # Derive the file extension from the URL path.
                ext = os.path.splitext(urlparse(img_url).path)[1]
                if not ext or len(ext) > 5:
                    ext = ".jpg"

                file_path = os.path.join(folder_name, f"image_{i}{ext}")
                with open(file_path, 'wb') as handler:
                    handler.write(response.content)
            except (requests.RequestException, OSError) as exc:
                # Best effort: report the failure and continue with the rest.
                print(f"Skipping {img_url}: {exc}")
                continue
        print("[✔] Images download complete.")
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,24 @@
1
+ import os
2
+ from setuptools import setup, find_packages
3
+
4
def _read_long_description(filename="README.md", fallback="Super powerful website scraper."):
    """Return the README text used as the long description.

    The file is decoded explicitly as UTF-8 because it contains emoji; the
    locale default encoding (e.g. cp1252 on Windows) would fail to decode it.
    The handle is closed by the ``with`` block. When the file is missing or
    unreadable, ``fallback`` is returned instead.
    """
    try:
        with open(filename, encoding="utf-8") as readme:
            return readme.read()
    except OSError:
        return fallback


setup(
    name="super-gm-scraper",
    version="1.0.0",
    author="Aapka Naam",
    author_email="yourmail@example.com",
    description="A highly powerful web scraper to extract complete text, images, structure, and metadata from any website.",
    long_description=_read_long_description(),
    long_description_content_type="text/markdown",
    url="https://github.com/yourusername/super-gm-scraper",
    packages=find_packages(),
    install_requires=[
        "requests",
        "beautifulsoup4",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)
@@ -0,0 +1,45 @@
1
+ Metadata-Version: 2.4
2
+ Name: super-gm-scraper
3
+ Version: 1.0.0
4
+ Summary: A highly powerful web scraper to extract complete text, images, structure, and metadata from any website.
5
+ Home-page: https://github.com/yourusername/super-gm-scraper
6
+ Author: Aapka Naam
7
+ Author-email: yourmail@example.com
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.6
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: requests
14
+ Requires-Dist: beautifulsoup4
15
+ Dynamic: author
16
+ Dynamic: author-email
17
+ Dynamic: classifier
18
+ Dynamic: description
19
+ Dynamic: description-content-type
20
+ Dynamic: home-page
21
+ Dynamic: requires-dist
22
+ Dynamic: requires-python
23
+ Dynamic: summary
24
+
25
+ # Super GM Scraper 🚀
26
+
27
+ A highly professional and powerful Python library to extract structured data (complete text, raw HTML skeleton, and images) from any website.
28
+
29
+ ## Features
30
+ * 🔍 **Full Skeleton Extraction:** Fetch beautified raw HTML structure.
31
+ * 📝 **Clean Text Extractor:** Strip styles/scripts and get clean text.
32
+ * 📷 **Image Link Harvester:** Extract absolute image links automatically.
33
+ * 💾 **Bulk Image Downloader:** Download all images directly to a local directory.
34
+ * 🛡️ **Anti-Bot Bypass:** Uses professional browser headers to reduce blocks.
35
+
36
+ ---
37
+ ### Created with Pride: Results from Super GM Scraper
38
+ ---
39
+
40
+ ## Installation
41
+
42
+ You can install this package locally or from PyPI:
43
+
44
+ ```bash
45
+ pip install super-gm-scraper
@@ -0,0 +1,9 @@
1
+ README.md
2
+ setup.py
3
+ gm_scraper/__init__.py
4
+ gm_scraper/scraper.py
5
+ super_gm_scraper.egg-info/PKG-INFO
6
+ super_gm_scraper.egg-info/SOURCES.txt
7
+ super_gm_scraper.egg-info/dependency_links.txt
8
+ super_gm_scraper.egg-info/requires.txt
9
+ super_gm_scraper.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ requests
2
+ beautifulsoup4