super-gm-scraper 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- super_gm_scraper-1.0.0/PKG-INFO +45 -0
- super_gm_scraper-1.0.0/README.md +21 -0
- super_gm_scraper-1.0.0/gm_scraper/__init__.py +5 -0
- super_gm_scraper-1.0.0/gm_scraper/scraper.py +122 -0
- super_gm_scraper-1.0.0/setup.cfg +4 -0
- super_gm_scraper-1.0.0/setup.py +24 -0
- super_gm_scraper-1.0.0/super_gm_scraper.egg-info/PKG-INFO +45 -0
- super_gm_scraper-1.0.0/super_gm_scraper.egg-info/SOURCES.txt +9 -0
- super_gm_scraper-1.0.0/super_gm_scraper.egg-info/dependency_links.txt +1 -0
- super_gm_scraper-1.0.0/super_gm_scraper.egg-info/requires.txt +2 -0
- super_gm_scraper-1.0.0/super_gm_scraper.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: super-gm-scraper
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A highly powerful web scraper to extract complete text, images, structure, and metadata from any website.
|
|
5
|
+
Home-page: https://github.com/yourusername/super-gm-scraper
|
|
6
|
+
Author: Aapka Naam
|
|
7
|
+
Author-email: yourmail@example.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.6
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: requests
|
|
14
|
+
Requires-Dist: beautifulsoup4
|
|
15
|
+
Dynamic: author
|
|
16
|
+
Dynamic: author-email
|
|
17
|
+
Dynamic: classifier
|
|
18
|
+
Dynamic: description
|
|
19
|
+
Dynamic: description-content-type
|
|
20
|
+
Dynamic: home-page
|
|
21
|
+
Dynamic: requires-dist
|
|
22
|
+
Dynamic: requires-python
|
|
23
|
+
Dynamic: summary
|
|
24
|
+
|
|
25
|
+
# Super GM Scraper 🚀
|
|
26
|
+
|
|
27
|
+
A highly professional and powerful Python library to extract structured data (complete text, raw HTML skeleton, and images) from any website.
|
|
28
|
+
|
|
29
|
+
## Features
|
|
30
|
+
* 🔍 **Full Skeleton Extraction:** Fetch beautified raw HTML structure.
|
|
31
|
+
* 📝 **Clean Text Extractor:** Strip styles/scripts and get clean text.
|
|
32
|
+
* 📷 **Image Link Harvester:** Extract absolute image links automatically.
|
|
33
|
+
* 💾 **Bulk Image Downloader:** Download all images directly to a local directory.
|
|
34
|
+
* 🛡️ **Anti-Bot Bypass:** Uses professional browser headers to reduce blocks.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
### Created with Pride: Results from Super GM Scraper
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
You can install this package locally or from PyPI:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install super-gm-scraper
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Super GM Scraper 🚀
|
|
2
|
+
|
|
3
|
+
A highly professional and powerful Python library to extract structured data (complete text, raw HTML skeleton, and images) from any website.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
* 🔍 **Full Skeleton Extraction:** Fetch beautified raw HTML structure.
|
|
7
|
+
* 📝 **Clean Text Extractor:** Strip styles/scripts and get clean text.
|
|
8
|
+
* 📷 **Image Link Harvester:** Extract absolute image links automatically.
|
|
9
|
+
* 💾 **Bulk Image Downloader:** Download all images directly to a local directory.
|
|
10
|
+
* 🛡️ **Anti-Bot Bypass:** Uses professional browser headers to reduce blocks.
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
### Created with Pride: Results from Super GM Scraper
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
You can install this package locally or from PyPI:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install super-gm-scraper
```
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import requests
|
|
4
|
+
from bs4 import BeautifulSoup
|
|
5
|
+
from urllib.parse import urljoin, urlparse
|
|
6
|
+
|
|
7
|
+
class GMScraper:
    """Fetch one web page and expose its HTML, text, images and metadata.

    The page is downloaded once, in the constructor, using browser-like
    request headers. If the download or the parse fails, the error is
    printed and every accessor returns its "empty" value instead of raising.

    Every public method prints the branding banner to stdout before doing
    its work.
    """

    def __init__(self, url):
        """Print the banner, store the request settings and fetch ``url``.

        Args:
            url: Address of the page to scrape. It is also the base against
                which relative image links are resolved.
        """
        # Branding text printed by every public method and echoed in the
        # metadata dictionary.
        self._branding = "results from super GM scraper"
        self._show_mandatory_branding()

        self.url = url
        # Browser-like headers so that sites are less likely to reject the
        # request as coming from a script.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
            "Referer": "https://www.google.com/"
        }
        self.soup = None
        self.html_content = ""
        self._fetch_page()

    def _show_mandatory_branding(self):
        """Print the hard-coded branding banner to stdout."""
        print("\n" + "="*60)
        print(f" >>> [★] {self._branding.upper()} [★] <<< ")
        print("="*60 + "\n")

    def _fetch_page(self):
        """Download ``self.url`` and parse it into ``self.soup``.

        On any failure the error is printed and ``self.soup`` is left as
        ``None`` so that the accessors fall back to their empty results.
        """
        try:
            response = requests.get(self.url, headers=self.headers, timeout=15)
            response.raise_for_status()
            self.html_content = response.text
            self.soup = BeautifulSoup(self.html_content, 'html.parser')
        except Exception as e:
            # Deliberately broad: constructing a scraper must never raise.
            print(f"Error fetching website: {e}")
            self.soup = None

    def get_skeleton(self):
        """Return the page's HTML, pretty-printed.

        Returns:
            The prettified markup, or ``"No data fetched."`` when the page
            could not be loaded.
        """
        self._show_mandatory_branding()
        if self.soup is None:
            return "No data fetched."
        return self.soup.prettify()

    @staticmethod
    def _clean_text(text):
        """Strip each line, break it on double spaces and drop empty pieces."""
        lines = (line.strip() for line in text.splitlines())
        # Split on TWO spaces: runs of spaces separate phrases, while the
        # single spaces between words must be kept.
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        return "\n".join(chunk for chunk in chunks if chunk)

    def get_text(self):
        """Return the page's visible text without scripts and styles.

        Returns:
            One cleaned phrase per line, or ``"No data fetched."`` when the
            page could not be loaded.
        """
        self._show_mandatory_branding()
        if self.soup is None:
            return "No data fetched."
        # Remove <script>/<style> from a fresh parse, not from self.soup:
        # extract() mutates the tree, which would otherwise corrupt later
        # get_skeleton() results.
        working = BeautifulSoup(self.html_content, 'html.parser')
        for tag in working(["script", "style"]):
            tag.extract()
        return self._clean_text(working.get_text())

    @staticmethod
    def _first_image_source(img):
        """Return the first usable URL of an ``<img>`` tag, or ``None``.

        ``src`` is preferred, then ``data-src``, then the first candidate of
        ``srcset``. Blank attribute values are treated as missing.
        """
        for attr in ('src', 'data-src'):
            tokens = (img.get(attr) or '').split()
            if tokens:
                return tokens[0]
        # srcset is "url descriptor, url descriptor, ..."; keep only the URL
        # of the first candidate.
        tokens = (img.get('srcset') or '').split(',')[0].split()
        return tokens[0] if tokens else None

    def _collect_image_urls(self):
        """Return the unique absolute image URLs in document order."""
        if self.soup is None:
            return []
        seen = set()
        urls = []
        for img in self.soup.find_all('img'):
            src = self._first_image_source(img)
            if src is None:
                continue
            try:
                # Turn a relative link into a complete web URL.
                full_url = urljoin(self.url, src)
            except ValueError:
                # Malformed link (e.g. a broken IPv6 literal): skip it.
                continue
            if full_url not in seen:
                seen.add(full_url)
                urls.append(full_url)
        return urls

    def get_images(self):
        """Return the absolute URLs of all images on the page.

        Returns:
            A list without duplicates, in the order the images first appear;
            empty when the page could not be loaded.
        """
        self._show_mandatory_branding()
        return self._collect_image_urls()

    def get_metadata(self):
        """Return the page's title, description and keywords.

        Returns:
            A dict with the keys ``title``, ``description``, ``keywords`` and
            ``branding_verified``; an empty dict when the page could not be
            loaded.
        """
        self._show_mandatory_branding()
        if self.soup is None:
            return {}
        title_tag = self.soup.title
        title = None
        if title_tag is not None:
            # .string is None for an empty or nested <title>; fall back to
            # the tag's combined text in that case.
            title = title_tag.string or title_tag.get_text()
        meta_data = {
            "title": title or "No Title",
            "description": "",
            "keywords": "",
            "branding_verified": self._branding
        }
        for meta in self.soup.find_all('meta'):
            name = (meta.get('name') or '').lower()
            if name == 'description':
                meta_data['description'] = meta.get('content', '')
            elif name == 'keywords':
                meta_data['keywords'] = meta.get('content', '')
        return meta_data

    def download_all_images(self, folder_name="gm_scraped_images"):
        """Download every image on the page into ``folder_name``.

        Files are named ``image_<n><ext>``, where the extension is taken from
        the URL path and defaults to ``.jpg`` when it is missing or longer
        than five characters. An image that cannot be fetched or written is
        reported and skipped; the remaining images are still downloaded.

        Args:
            folder_name: Target directory; created when it does not exist.
        """
        self._show_mandatory_branding()
        # Use the banner-free collector so the banner is printed only once.
        images = self._collect_image_urls()
        if not images:
            print("No images found to download.")
            return

        os.makedirs(folder_name, exist_ok=True)

        print(f"Downloading {len(images)} images to '{folder_name}' folder...")
        failed = 0
        for index, img_url in enumerate(images, start=1):
            try:
                response = requests.get(img_url, headers=self.headers, timeout=10)
                # Without this check an HTML error page would be saved as an
                # image file.
                response.raise_for_status()

                # Work out the file extension from the URL path.
                ext = os.path.splitext(urlparse(img_url).path)[1]
                if not ext or len(ext) > 5:
                    ext = ".jpg"

                file_path = os.path.join(folder_name, f"image_{index}{ext}")
                with open(file_path, 'wb') as handler:
                    handler.write(response.content)
            except (requests.RequestException, OSError, ValueError) as exc:
                failed += 1
                print(f"Skipped {img_url}: {exc}")
        if failed:
            print(f"{failed} of {len(images)} images could not be downloaded.")
        print("[✔] Images download complete.")
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from setuptools import setup, find_packages
|
|
3
|
+
|
|
4
|
+
def _read_long_description(path="README.md"):
    """Return the README text for the package page, or a short fallback."""
    if not os.path.exists(path):
        return "Super powerful website scraper."
    # The README contains emoji, so decode it explicitly as UTF-8 instead of
    # relying on the platform's default encoding; the context manager also
    # closes the file handle.
    with open(path, encoding="utf-8") as readme:
        return readme.read()


# Package metadata. NOTE(review): author, author_email and url still hold
# template placeholder values - confirm the real ones before publishing.
setup(
    name="super-gm-scraper",
    version="1.0.0",
    author="Aapka Naam",
    author_email="yourmail@example.com",
    description="A highly powerful web scraper to extract complete text, images, structure, and metadata from any website.",
    long_description=_read_long_description(),
    long_description_content_type="text/markdown",
    url="https://github.com/yourusername/super-gm-scraper",
    packages=find_packages(),
    install_requires=[
        "requests",
        "beautifulsoup4"
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: super-gm-scraper
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A highly powerful web scraper to extract complete text, images, structure, and metadata from any website.
|
|
5
|
+
Home-page: https://github.com/yourusername/super-gm-scraper
|
|
6
|
+
Author: Aapka Naam
|
|
7
|
+
Author-email: yourmail@example.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.6
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: requests
|
|
14
|
+
Requires-Dist: beautifulsoup4
|
|
15
|
+
Dynamic: author
|
|
16
|
+
Dynamic: author-email
|
|
17
|
+
Dynamic: classifier
|
|
18
|
+
Dynamic: description
|
|
19
|
+
Dynamic: description-content-type
|
|
20
|
+
Dynamic: home-page
|
|
21
|
+
Dynamic: requires-dist
|
|
22
|
+
Dynamic: requires-python
|
|
23
|
+
Dynamic: summary
|
|
24
|
+
|
|
25
|
+
# Super GM Scraper 🚀
|
|
26
|
+
|
|
27
|
+
A highly professional and powerful Python library to extract structured data (complete text, raw HTML skeleton, and images) from any website.
|
|
28
|
+
|
|
29
|
+
## Features
|
|
30
|
+
* 🔍 **Full Skeleton Extraction:** Fetch beautified raw HTML structure.
|
|
31
|
+
* 📝 **Clean Text Extractor:** Strip styles/scripts and get clean text.
|
|
32
|
+
* 📷 **Image Link Harvester:** Extract absolute image links automatically.
|
|
33
|
+
* 💾 **Bulk Image Downloader:** Download all images directly to a local directory.
|
|
34
|
+
* 🛡️ **Anti-Bot Bypass:** Uses professional browser headers to reduce blocks.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
### Created with Pride: Results from Super GM Scraper
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
You can install this package locally or from PyPI:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install super-gm-scraper
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
gm_scraper/__init__.py
|
|
4
|
+
gm_scraper/scraper.py
|
|
5
|
+
super_gm_scraper.egg-info/PKG-INFO
|
|
6
|
+
super_gm_scraper.egg-info/SOURCES.txt
|
|
7
|
+
super_gm_scraper.egg-info/dependency_links.txt
|
|
8
|
+
super_gm_scraper.egg-info/requires.txt
|
|
9
|
+
super_gm_scraper.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
gm_scraper
|