longcat-scraper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- longcat_scraper-0.1.0/PKG-INFO +46 -0
- longcat_scraper-0.1.0/README.md +25 -0
- longcat_scraper-0.1.0/setup.cfg +4 -0
- longcat_scraper-0.1.0/setup.py +27 -0
- longcat_scraper-0.1.0/src/longcat_scraper/__init__.py +141 -0
- longcat_scraper-0.1.0/src/longcat_scraper.egg-info/PKG-INFO +46 -0
- longcat_scraper-0.1.0/src/longcat_scraper.egg-info/SOURCES.txt +9 -0
- longcat_scraper-0.1.0/src/longcat_scraper.egg-info/dependency_links.txt +1 -0
- longcat_scraper-0.1.0/src/longcat_scraper.egg-info/entry_points.txt +2 -0
- longcat_scraper-0.1.0/src/longcat_scraper.egg-info/requires.txt +1 -0
- longcat_scraper-0.1.0/src/longcat_scraper.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: longcat-scraper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A robust web scraper built with Longcat AI
|
|
5
|
+
Home-page: https://github.com/yourusername/longcat-scraper
|
|
6
|
+
Author: BotMaster
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.6
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: requests
|
|
13
|
+
Dynamic: author
|
|
14
|
+
Dynamic: classifier
|
|
15
|
+
Dynamic: description
|
|
16
|
+
Dynamic: description-content-type
|
|
17
|
+
Dynamic: home-page
|
|
18
|
+
Dynamic: requires-dist
|
|
19
|
+
Dynamic: requires-python
|
|
20
|
+
Dynamic: summary
|
|
21
|
+
|
|
22
|
+
# Longcat Scraper 🚀
|
|
23
|
+
|
|
24
|
+
A robust web scraper generated and managed by **Longcat AI**.
|
|
25
|
+
|
|
26
|
+
## Features
|
|
27
|
+
- ✅ Multi-URL scraping
|
|
28
|
+
- ✅ Retry logic with exponential backoff
|
|
29
|
+
- ✅ Detailed logging
|
|
30
|
+
- ✅ Error handling for timeouts and connection issues
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
```bash
|
|
34
|
+
pip install longcat-scraper
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
After installation, run:
|
|
39
|
+
```bash
|
|
40
|
+
longcat-scrape
|
|
41
|
+
```
|
|
42
|
+
OR use it in your code:
|
|
43
|
+
```python
|
|
44
|
+
import longcat_scraper
|
|
45
|
+
longcat_scraper.main()
|
|
46
|
+
```
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Longcat Scraper 🚀
|
|
2
|
+
|
|
3
|
+
A robust web scraper generated and managed by **Longcat AI**.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
- ✅ Multi-URL scraping
|
|
7
|
+
- ✅ Retry logic with exponential backoff
|
|
8
|
+
- ✅ Detailed logging
|
|
9
|
+
- ✅ Error handling for timeouts and connection issues
|
|
10
|
+
|
|
11
|
+
## Installation
|
|
12
|
+
```bash
|
|
13
|
+
pip install longcat-scraper
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
After installation, run:
|
|
18
|
+
```bash
|
|
19
|
+
longcat-scrape
|
|
20
|
+
```
|
|
21
|
+
Or call it from your own code:
|
|
22
|
+
```python
|
|
23
|
+
import longcat_scraper
|
|
24
|
+
longcat_scraper.main()
|
|
25
|
+
```
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from pathlib import Path

from setuptools import setup, find_packages

# Read the long description explicitly as UTF-8 and from the directory that
# holds this file. The README contains emoji, so relying on the platform
# default encoding (e.g. cp1252 on Windows) can raise UnicodeDecodeError, and
# a bare open() would also leave the file handle unclosed.
LONG_DESCRIPTION = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")

setup(
    name="longcat-scraper",
    version="0.1.0",
    author="BotMaster",
    description="A robust web scraper built with Longcat AI",
    long_description=LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    url="https://github.com/yourusername/longcat-scraper",
    # Packages live under src/ (src layout).
    package_dir={"": "src"},
    packages=find_packages(where="src"),
    install_requires=[
        "requests",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
    entry_points={
        # Installs the `longcat-scrape` command, which calls longcat_scraper.main().
        'console_scripts': [
            'longcat-scrape=longcat_scraper:main',
        ],
    },
)
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
import time
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
# ✅ Logging setup
# INFO-and-above records go to both ./scraper.log and the console.
# NOTE: this runs at import time and opens scraper.log in the current
# working directory.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        # Log messages contain emoji, so the file must be written as UTF-8
        # rather than in the platform default encoding.
        logging.FileHandler("scraper.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

# ✅ URLs to scrape
urls = [
    "https://invest-tracing.com/",
    "https://invest-tracing.com/latest-hyips.html",
    "https://invest-tracing.com/scam-hyips.html",
]

# ✅ Robust Headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    # Do not advertise "br": requests only decodes Brotli when an optional
    # Brotli library is installed, and this package depends on requests alone.
    # A Brotli-compressed reply would otherwise be saved as undecoded bytes.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}

# ✅ Retry settings
MAX_RETRIES = 3   # attempts per URL
RETRY_DELAY = 5   # seconds; base delay between attempts
TIMEOUT = 30      # seconds; per-request timeout

# ✅ Output folder
# Created at import time, relative to the current working directory.
OUTPUT_DIR = Path("scraped_pages")
OUTPUT_DIR.mkdir(exist_ok=True)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def download_page(url: str, retries: int = MAX_RETRIES) -> "str | None":
    """Download a page, retrying failed attempts with exponential backoff.

    The return annotation is quoted so that it is not evaluated when the
    function is defined; the ``X | Y`` union syntax only works at runtime on
    Python 3.10+, while the package declares support for 3.6+.

    Args:
        url: Address to fetch with an HTTP GET.
        retries: Maximum number of attempts before giving up.

    Returns:
        The decoded response body, or ``None`` if the server answered 404 or
        every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            logger.info(f"⏳ Downloading: {url} (Attempt {attempt}/{retries})")

            r = requests.get(
                url,
                headers=headers,
                timeout=TIMEOUT,
                allow_redirects=True,
            )
            r.raise_for_status()  # Raise HTTPError for 4xx/5xx status codes

            # len(r.content) is the body size in bytes; len(r.text) would
            # count decoded characters instead.
            logger.info(f"✅ Success! Status: {r.status_code}, Size: {len(r.content)} bytes")
            return r.text

        except requests.exceptions.Timeout:
            logger.warning(f"⏱️ Timeout on attempt {attempt}")

        except requests.exceptions.ConnectionError:
            logger.warning(f"🔌 Connection error on attempt {attempt}")

        except requests.exceptions.HTTPError as e:
            logger.warning(f"❌ HTTP Error: {e}")
            # Read the status from the exception itself rather than from `r`,
            # which may be unbound or left over from an earlier attempt.
            response = e.response
            if response is not None and response.status_code == 404:
                logger.error(f"🚫 Page not found: {url}")
                return None  # No point retrying 404

        except requests.exceptions.RequestException as e:
            logger.warning(f"⚠️ Request error: {e}")

        # Wait before retry
        if attempt < retries:
            # Exponential backoff: RETRY_DELAY, 2x, 4x, ... With the default
            # three attempts this gives the same 5s and 10s waits as before.
            wait_time = RETRY_DELAY * 2 ** (attempt - 1)
            logger.info(f"⏳ Waiting {wait_time}s before retry...")
            time.sleep(wait_time)

    logger.error(f"❌ Failed after {retries} attempts: {url}")
    return None
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def save_page(content: str, filename: str) -> bool:
    """Write ``content`` to ``OUTPUT_DIR / filename`` as UTF-8 text.

    Args:
        content: Text to store.
        filename: Name of the file to create inside ``OUTPUT_DIR``.

    Returns:
        ``True`` when the file was written, ``False`` if any exception
        occurred (the error is logged, not raised).
    """
    try:
        target = OUTPUT_DIR / filename
        with target.open("w", encoding="utf-8") as handle:
            handle.write(content)
        logger.info(f"💾 Saved: {target}")
    except Exception as exc:
        logger.error(f"❌ Failed to save {filename}: {exc}")
        return False
    return True
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def main():
    """Scrape every address in ``urls`` and log a summary.

    Each page is downloaded with ``download_page`` and stored with
    ``save_page`` as ``page_<index>.html``. The function pauses two seconds
    between URLs and finishes by logging success/failed/skipped counts.
    It returns ``None`` so the console-script exit status stays 0.
    """
    total = len(urls)

    logger.info("=" * 50)
    logger.info("🚀 Starting Web Scraper")
    logger.info(f"📄 Total URLs: {total}")
    logger.info("=" * 50)

    results = {
        "success": 0,
        "failed": 0,
        "skipped": 0,  # never incremented here; still reported in the summary
    }

    for i, url in enumerate(urls, start=1):
        logger.info(f"\n{'─' * 40}")
        logger.info(f"[{i}/{total}] Processing: {url}")

        # Download page
        content = download_page(url)

        # download_page signals failure with None. An empty string is a
        # successful download of an empty page and must still be saved.
        if content is not None and save_page(content, f"page_{i}.html"):
            results["success"] += 1
        else:
            results["failed"] += 1

        # Small delay between requests (be polite to server)
        if i < total:
            time.sleep(2)

    # Summary
    logger.info("\n" + "=" * 50)
    logger.info("📊 SCRAPING SUMMARY")
    logger.info(f"✅ Success: {results['success']}")
    logger.info(f"❌ Failed: {results['failed']}")
    logger.info(f"⏭️ Skipped: {results['skipped']}")
    logger.info("=" * 50)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: longcat-scraper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A robust web scraper built with Longcat AI
|
|
5
|
+
Home-page: https://github.com/yourusername/longcat-scraper
|
|
6
|
+
Author: BotMaster
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Requires-Python: >=3.6
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
Requires-Dist: requests
|
|
13
|
+
Dynamic: author
|
|
14
|
+
Dynamic: classifier
|
|
15
|
+
Dynamic: description
|
|
16
|
+
Dynamic: description-content-type
|
|
17
|
+
Dynamic: home-page
|
|
18
|
+
Dynamic: requires-dist
|
|
19
|
+
Dynamic: requires-python
|
|
20
|
+
Dynamic: summary
|
|
21
|
+
|
|
22
|
+
# Longcat Scraper 🚀
|
|
23
|
+
|
|
24
|
+
A robust web scraper generated and managed by **Longcat AI**.
|
|
25
|
+
|
|
26
|
+
## Features
|
|
27
|
+
- ✅ Multi-URL scraping
|
|
28
|
+
- ✅ Retry logic with exponential backoff
|
|
29
|
+
- ✅ Detailed logging
|
|
30
|
+
- ✅ Error handling for timeouts and connection issues
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
```bash
|
|
34
|
+
pip install longcat-scraper
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
After installation, run:
|
|
39
|
+
```bash
|
|
40
|
+
longcat-scrape
|
|
41
|
+
```
|
|
42
|
+
OR use it in your code:
|
|
43
|
+
```python
|
|
44
|
+
import longcat_scraper
|
|
45
|
+
longcat_scraper.main()
|
|
46
|
+
```
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
setup.py
|
|
3
|
+
src/longcat_scraper/__init__.py
|
|
4
|
+
src/longcat_scraper.egg-info/PKG-INFO
|
|
5
|
+
src/longcat_scraper.egg-info/SOURCES.txt
|
|
6
|
+
src/longcat_scraper.egg-info/dependency_links.txt
|
|
7
|
+
src/longcat_scraper.egg-info/entry_points.txt
|
|
8
|
+
src/longcat_scraper.egg-info/requires.txt
|
|
9
|
+
src/longcat_scraper.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
requests
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
longcat_scraper
|