longcat-scraper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,141 @@
1
+ import requests
2
+ import time
3
+ import logging
4
+ from pathlib import Path
5
+
6
+ # ✅ Logging setup
7
# Send INFO-and-above records to both a log file and the console.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[
        # The log messages in this module contain emoji. Without an explicit
        # encoding the file handler falls back to the locale default, which
        # cannot encode them on non-UTF-8 systems (e.g. cp1252 on Windows).
        logging.FileHandler("scraper.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
16
+
17
+ # ✅ URLs to scrape
18
# Pages fetched by main(), in order; the position determines the output
# file name (page_1.html, page_2.html, ...).
urls = [
    "https://invest-tracing.com/",
    "https://invest-tracing.com/latest-hyips.html",
    "https://invest-tracing.com/scam-hyips.html",
]

# Browser-like request headers sent with every download.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
    # Do not advertise "br": requests only decodes Brotli when an optional
    # brotli package is installed, and this package depends on requests
    # alone. A Brotli-compressed reply would otherwise be saved undecoded.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}

# Retry settings
MAX_RETRIES = 3   # attempts per URL
RETRY_DELAY = 5   # base delay between attempts, in seconds
TIMEOUT = 30      # per-request timeout, in seconds

# Output folder for downloaded pages; created on import if it is missing.
OUTPUT_DIR = Path("scraped_pages")
OUTPUT_DIR.mkdir(exist_ok=True)
41
+
42
+
43
# The return annotation is quoted so it is not evaluated at definition time:
# the bare ``str | None`` expression raises TypeError on Python < 3.10, while
# the package metadata declares support for Python >= 3.6.
def download_page(url: str, retries: int = MAX_RETRIES) -> "str | None":
    """Download a page with retry logic.

    Args:
        url: Address to fetch with an HTTP GET request.
        retries: Maximum number of attempts before giving up.

    Returns:
        The decoded response body on success, or ``None`` if the server
        answered 404 or every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            logger.info(f"⏳ Downloading: {url} (Attempt {attempt}/{retries})")

            r = requests.get(
                url,
                headers=headers,
                timeout=TIMEOUT,
                allow_redirects=True,
            )
            r.raise_for_status()  # Raise HTTPError for 4xx/5xx status codes

            # len(r.content) is the body size in bytes, matching the label;
            # len(r.text) would count decoded characters instead.
            logger.info(f"✅ Success! Status: {r.status_code}, Size: {len(r.content)} bytes")
            return r.text

        except requests.exceptions.Timeout:
            logger.warning(f"⏱️ Timeout on attempt {attempt}")

        except requests.exceptions.ConnectionError:
            logger.warning(f"🔌 Connection error on attempt {attempt}")

        except requests.exceptions.HTTPError as e:
            logger.warning(f"❌ HTTP Error: {e}")
            # Read the response from the exception rather than the local
            # variable, which is unbound if requests.get() itself raised.
            # Compare against None explicitly: a Response object is falsy
            # for error statuses.
            response = e.response
            if response is not None and response.status_code == 404:
                logger.error(f"🚫 Page not found: {url}")
                return None  # No point retrying 404

        except requests.exceptions.RequestException as e:
            logger.warning(f"⚠️ Request error: {e}")

        # Wait before the next attempt (no wait after the final one).
        if attempt < retries:
            # Exponential backoff: RETRY_DELAY, 2x, 4x, ...
            wait_time = RETRY_DELAY * 2 ** (attempt - 1)
            logger.info(f"⏳ Waiting {wait_time}s before retry...")
            time.sleep(wait_time)

    logger.error(f"❌ Failed after {retries} attempts: {url}")
    return None
83
+
84
+
85
def save_page(content: str, filename: str) -> bool:
    """Write ``content`` as UTF-8 text to ``filename`` in the output folder.

    Any error while writing is logged rather than raised.

    Returns:
        ``True`` if the file was written, ``False`` otherwise.
    """
    try:
        target = OUTPUT_DIR / filename
        with target.open("w", encoding="utf-8") as out:
            out.write(content)
        logger.info(f"💾 Saved: {target}")
    except Exception as err:
        logger.error(f"❌ Failed to save {filename}: {err}")
        return False
    return True
96
+
97
+
98
def main():
    """Scrape every configured URL and log a summary of the outcome.

    Each entry in ``urls`` is fetched with ``download_page`` and written to
    ``page_<n>.html`` through ``save_page``, with a two-second pause between
    consecutive requests. Every URL is tallied as one of:

    * success - downloaded and saved,
    * failed  - the download or the save failed,
    * skipped - the download succeeded but the body was empty.
    """
    logger.info("=" * 50)
    logger.info("🚀 Starting Web Scraper")
    logger.info(f"📄 Total URLs: {len(urls)}")
    logger.info("=" * 50)

    results = {
        "success": 0,
        "failed": 0,
        "skipped": 0,
    }

    for i, url in enumerate(urls, start=1):
        logger.info(f"\n{'─' * 40}")
        logger.info(f"[{i}/{len(urls)}] Processing: {url}")

        content = download_page(url)

        if content is None:
            # download_page returns None only when the download failed.
            results["failed"] += 1
        elif not content:
            # A successful but empty response is not a failure; record it
            # as skipped instead of silently counting it as failed.
            logger.warning(f"⏭️ Empty response, nothing to save: {url}")
            results["skipped"] += 1
        elif save_page(content, f"page_{i}.html"):
            results["success"] += 1
        else:
            results["failed"] += 1

        # Small delay between requests (be polite to the server)
        if i < len(urls):
            time.sleep(2)

    # Summary
    logger.info("\n" + "=" * 50)
    logger.info("📊 SCRAPING SUMMARY")
    logger.info(f"✅ Success: {results['success']}")
    logger.info(f"❌ Failed: {results['failed']}")
    logger.info(f"⏭️ Skipped: {results['skipped']}")
    logger.info("=" * 50)


# Run the scraper when the module is executed as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.4
2
+ Name: longcat-scraper
3
+ Version: 0.1.0
4
+ Summary: A robust web scraper built with Longcat AI
5
+ Home-page: https://github.com/yourusername/longcat-scraper
6
+ Author: BotMaster
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: License :: OSI Approved :: MIT License
9
+ Classifier: Operating System :: OS Independent
10
+ Requires-Python: >=3.6
11
+ Description-Content-Type: text/markdown
12
+ Requires-Dist: requests
13
+ Dynamic: author
14
+ Dynamic: classifier
15
+ Dynamic: description
16
+ Dynamic: description-content-type
17
+ Dynamic: home-page
18
+ Dynamic: requires-dist
19
+ Dynamic: requires-python
20
+ Dynamic: summary
21
+
22
+ # Longcat Scraper 🚀
23
+
24
+ A robust web scraper generated and managed by **Longcat AI**.
25
+
26
+ ## Features
27
+ - ✅ Multi-URL scraping
28
+ - ✅ Retry logic with exponential backoff
29
+ - ✅ Detailed logging
30
+ - ✅ Error handling for timeouts and connection issues
31
+
32
+ ## Installation
33
+ ```bash
34
+ pip install longcat-scraper
35
+ ```
36
+
37
+ ## Usage
38
+ After installation, run:
39
+ ```bash
40
+ longcat-scrape
41
+ ```
42
+ OR use it in your code:
43
+ ```python
44
+ import longcat_scraper
45
+ longcat_scraper.main()
46
+ ```
@@ -0,0 +1,6 @@
1
+ longcat_scraper/__init__.py,sha256=UKpTeVuGnzsJXlPQuvlCfsETso5TbL8kHDsN1yt3_Ck,4273
2
+ longcat_scraper-0.1.0.dist-info/METADATA,sha256=g4FQtDOOTixkQw10FPVcKfHJX781oNCNV8jRyaAmweU,1046
3
+ longcat_scraper-0.1.0.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
4
+ longcat_scraper-0.1.0.dist-info/entry_points.txt,sha256=2eUl0KQzgk4MiMbvfVLu2y8C4wVlWfQA8AokKCmbYgU,56
5
+ longcat_scraper-0.1.0.dist-info/top_level.txt,sha256=f4hsz5QelaUa6l_Z29l5R9e5WuESCAoZVNO67GimPeg,16
6
+ longcat_scraper-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ longcat-scrape = longcat_scraper:main
@@ -0,0 +1 @@
1
+ longcat_scraper