@zetagoaurum-dev/straw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ import httpx
2
+ from typing import Optional, Dict, Any
3
+ from .helpers import get_random_user_agent, async_sleep
4
+
5
class StrawClient:
    """Thin async HTTP client wrapper around httpx with retries, exponential
    backoff, and optional User-Agent rotation.

    Parameters:
        proxy: optional proxy URL forwarded to httpx.
        timeout: per-request timeout in seconds.
        retries: maximum number of attempts per request (clamped to >= 1).
        rotate_user_agent: when True, a random User-Agent header is attached
            to any request that does not already set one.
    """

    def __init__(self, proxy: Optional[str] = None, timeout: int = 10, retries: int = 3, rotate_user_agent: bool = True):
        self.proxy = proxy
        self.timeout = timeout
        self.retries = retries
        self.rotate_user_agent = rotate_user_agent

        # httpx.AsyncClient pools connections across requests out of the box,
        # so one shared client instance is enough for this scraper.
        self._client = httpx.AsyncClient(
            proxy=self.proxy,
            timeout=self.timeout,
            verify=False,  # NOTE: SSL verification disabled to match the JS version — unsafe for untrusted hosts
            follow_redirects=True
        )

    async def request(self, method: str, url: str, **kwargs) -> httpx.Response:
        """Perform an HTTP request with retries and exponential backoff.

        Responses with status 429/500/502/503/504 are treated as transient
        and retried. Raises Exception (chained to the last underlying error)
        once all attempts are exhausted.
        """
        attempts = 0
        max_retries = max(1, self.retries)

        headers = kwargs.pop('headers', {})
        if self.rotate_user_agent and 'User-Agent' not in headers:
            headers['User-Agent'] = get_random_user_agent()

        # Browser-like defaults help avoid trivial bot detection.
        if 'Accept' not in headers:
            headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
        if 'Accept-Language' not in headers:
            headers['Accept-Language'] = 'en-US,en;q=0.9'

        while attempts < max_retries:
            try:
                response = await self._client.request(method, url, headers=headers, **kwargs)

                # Treat rate limiting and transient server errors as retryable.
                if response.status_code in (429, 500, 502, 503, 504):
                    raise httpx.HTTPStatusError(f"HTTP Error {response.status_code}", request=response.request, response=response)

                return response
            except Exception as e:
                attempts += 1
                if attempts >= max_retries:
                    # FIX: chain the original error (`from e`) so the real
                    # traceback is preserved instead of being swallowed.
                    raise Exception(f"Failed to fetch {url} after {max_retries} attempts. Last error: {str(e)}") from e
                # Exponential backoff: 2s, 4s, 8s, ... between attempts.
                await async_sleep(1000 * (2 ** attempts))

        raise Exception("Unreachable")

    async def get_text(self, url: str, **kwargs) -> str:
        """GET *url* and return the decoded response body as text."""
        response = await self.request("GET", url, **kwargs)
        return response.text

    async def get_json(self, url: str, **kwargs) -> Any:
        """GET *url* and return the response body parsed as JSON."""
        response = await self.request("GET", url, **kwargs)
        return response.json()

    async def close(self):
        """Close the underlying httpx client and its connection pool."""
        await self._client.aclose()
@@ -0,0 +1,18 @@
1
+ import random
2
+ import asyncio
3
+
4
# Pool of realistic desktop browser User-Agent strings (Chrome, Firefox,
# Safari, Edge) drawn from when header rotation is enabled.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.2; rv:109.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
]


def get_random_user_agent() -> str:
    """Return one User-Agent string from the pool, chosen uniformly at random."""
    index = random.randrange(len(USER_AGENTS))
    return USER_AGENTS[index]
16
+
17
async def async_sleep(ms: int):
    """Asynchronously sleep for *ms* milliseconds (mirrors the JS helper)."""
    seconds = ms / 1000.0
    await asyncio.sleep(seconds)
package/straw/media.py ADDED
@@ -0,0 +1,38 @@
1
+ import re
2
+ from typing import Dict, List, Any
3
+ from bs4 import BeautifulSoup
4
+ from .client import StrawClient
5
+
6
class MediaScraper:
    """Scrapes a page and collects direct links to media and document files.

    Fetches with StrawClient and parses with BeautifulSoup (lxml backend).
    """

    def __init__(self, **client_options):
        # client_options are forwarded verbatim to StrawClient (proxy, timeout, ...).
        self.client = StrawClient(**client_options)

    async def extract_media(self, url: str) -> Dict[str, Any]:
        """Fetch *url* and return ``{'pageTitle': str, 'mediaLinks': list[str]}``.

        Links are gathered from <video>/<audio>/<source>/<img> tags, from
        anchors pointing at known file extensions, and from a raw regex scan
        of the HTML (which also catches URLs inside scripts). Duplicates are
        removed via a set, so the order of mediaLinks is unspecified.
        """
        html = await self.client.get_text(url)
        soup = BeautifulSoup(html, 'lxml')

        page_title = soup.title.string.strip() if soup.title and soup.title.string else ""
        media_links = set()

        # Media elements: src holds a single URL; srcset may contain several.
        for tag in soup.find_all(['video', 'audio', 'source', 'img']):
            src = tag.get('src') or tag.get('srcset')
            if src:
                # FIX: the original pattern r'https?:\/\/[^\s"',]+' was a
                # SyntaxError — the unescaped quote inside the character class
                # terminated the string literal. Triple quotes keep both quote
                # characters inside the class.
                urls = re.findall(r'''https?://[^\s"',]+''', src)
                for u in urls:
                    media_links.add(u)
                # Keep the raw attribute value too when it is itself a URL
                # (the set silently ignores duplicates).
                if src.startswith('http'):
                    media_links.add(src)

        # Anchors pointing directly at known media/document file types.
        for tag in soup.find_all('a', href=True):
            href = tag.get('href')
            if href and href.startswith('http') and re.search(r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|txt|csv|rtf|mp4|mp3|webm|wav|ogg|m4a|avi|mkv|mov|flv|png|jpg|jpeg|gif|svg|webp|avif|ico|bmp)(?:\?.*)?$', href, re.IGNORECASE):
                media_links.add(href)

        # Raw scan of the HTML source catches URLs embedded outside tags
        # (inline scripts, JSON blobs, styles).
        raw_links = re.findall(r'''https?://[^\s"',]+\.(?:png|jpg|jpeg|gif|svg|webp|avif|ico|bmp|pdf|doc|docx|xls|xlsx|ppt|pptx|txt|csv|rtf|mp4|mp3|webm|wav|ogg|m4a|avi|mkv|mov|flv)''', html, re.IGNORECASE)
        for link in raw_links:
            media_links.add(link)

        return {
            'pageTitle': page_title,
            'mediaLinks': list(media_links)
        }
package/straw/web.py ADDED
@@ -0,0 +1,51 @@
1
+ from typing import Dict, List, Optional, Any
2
+ from bs4 import BeautifulSoup
3
+ from .client import StrawClient
4
+
5
class WebScraper:
    """General-purpose page scraper: pulls title, description, meta tags,
    absolute links, and visible text from a URL."""

    def __init__(self, **client_options):
        # Options are passed straight through to StrawClient.
        self.client = StrawClient(**client_options)

    async def scrape(self, url: str) -> Dict[str, Any]:
        """Fetch *url* and return a dict with keys
        'title', 'description', 'text', 'links', and 'meta'."""
        html = await self.client.get_text(url)
        soup = BeautifulSoup(html, 'lxml')

        title_tag = soup.title
        title = title_tag.string.strip() if title_tag and title_tag.string else ""

        # Prefer the standard meta description; fall back to Open Graph.
        description = ""
        for selector in ({'name': 'description'}, {'property': 'og:description'}):
            if description:
                break
            meta = soup.find('meta', attrs=selector)
            if meta and meta.get('content'):
                description = meta['content']

        # Every name/property -> content pair; later tags win on name clashes.
        meta_tags = {}
        for meta in soup.find_all('meta'):
            key = meta.get('name') or meta.get('property')
            value = meta.get('content')
            if key and value:
                meta_tags[key] = value

        # Absolute links only, captured before non-content tags are stripped.
        links = [
            {'text': anchor.get_text(strip=True), 'href': anchor['href']}
            for anchor in soup.find_all('a', href=True)
            if anchor['href'].startswith('http')
        ]

        # Drop non-content elements so get_text() yields readable prose.
        for junk in soup(['script', 'style', 'noscript', 'iframe', 'svg']):
            junk.decompose()

        text_content = ' '.join(soup.get_text(separator=' ').split())

        return {
            'title': title,
            'description': description,
            'text': text_content,
            'links': links,
            'meta': meta_tags
        }
@@ -0,0 +1,55 @@
1
+ import json
2
+ import re
3
+ from typing import Dict, List, Any
4
+ from .client import StrawClient
5
+
6
class YouTubeScraper:
    """Scrapes video metadata and direct stream URLs from a YouTube watch
    page by parsing the embedded ytInitialPlayerResponse JSON."""

    def __init__(self, **client_options):
        # client_options are forwarded verbatim to StrawClient.
        self.client = StrawClient(**client_options)

    async def scrape_video(self, url: str) -> Dict[str, Any]:
        """Return title/author/description/views/duration/thumbnail/formats
        for the video at *url*.

        Raises Exception when the player response cannot be located or lacks
        video details (layout change, consent wall, or IP block).
        """
        # Pre-set the EU consent cookie so we receive the actual watch page
        # instead of the consent interstitial.
        headers = {
            'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430'
        }
        html = await self.client.get_text(url, headers=headers)

        # The player JSON is assigned on a single line, so '.' (no DOTALL)
        # is sufficient; the trailing anchor avoids over-capturing.
        match = re.search(r'ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)', html)
        if not match:
            raise Exception("ytInitialPlayerResponse not found. YouTube layout changed or IP blocked.")

        data = json.loads(match.group(1))
        details = data.get('videoDetails', {})
        streaming_data = data.get('streamingData', {})

        if not details:
            raise Exception("Video details not found inside player response.")

        formats = []
        # Muxed formats plus video-only/audio-only adaptive formats.
        raw_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])

        for f in raw_formats:
            # Entries without a plain 'url' are ciphered (signatureCipher)
            # and are skipped — decoding them needs the player JS.
            if 'url' in f:
                mime_type = f.get('mimeType', '')
                formats.append({
                    'url': f['url'],
                    'mimeType': mime_type,
                    'width': f.get('width'),
                    'height': f.get('height'),
                    'quality': f.get('qualityLabel') or f.get('quality'),
                    'bitrate': f.get('bitrate'),
                    # FIX: muxed formats report a video/* mimeType yet carry an
                    # audio track; YouTube marks them with an 'audioQuality'
                    # key, so check for that as well.
                    'hasAudio': 'audio/' in mime_type or 'audioQuality' in f,
                    'hasVideo': 'video/' in mime_type
                })

        # Thumbnails are listed smallest-to-largest; take the largest.
        thumbnails = details.get('thumbnail', {}).get('thumbnails', [])
        best_thumbnail = thumbnails[-1]['url'] if thumbnails else ''

        return {
            'title': details.get('title', ''),
            'author': details.get('author', ''),
            'description': details.get('shortDescription', ''),
            'views': details.get('viewCount', '0'),
            'durationSeconds': details.get('lengthSeconds', '0'),
            'thumbnail': best_thumbnail,
            'formats': formats
        }
package/tests/test.py ADDED
@@ -0,0 +1,52 @@
1
+ import asyncio
2
+ import sys
3
+ import os
4
+
5
+ # Ensure the parent directory is in the path to import 'straw'
6
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
7
+
8
+ from straw import WebScraper, YouTubeScraper, MediaScraper
9
+
10
async def run_tests():
    """Smoke-test the three scrapers against live public pages.

    Exits with status 1 on the first failure (network access required).
    """
    separator = "-" * 33
    print("Testing Straw Library (Python)...")
    print(separator)

    try:
        print("1. Testing Web Scraper on example.com...")
        web = WebScraper()
        web_res = await web.scrape("https://example.com")
        print(f"Web Scraper Output: Title = {web_res['title']}")
        print(f"Web Scraper Output: Text = {web_res['text'][:50]}...")
        await web.client.close()

        print("\n" + separator)

        print("2. Testing YouTube Scraper...")
        yt = YouTubeScraper()
        yt_res = await yt.scrape_video("https://www.youtube.com/watch?v=aqz-KE-bpKQ")
        print(f"YouTube Scraper Output: Title = {yt_res['title']}")
        print(f"YouTube Scraper Output: Duration = {yt_res['durationSeconds']} seconds")
        print(f"YouTube Scraper Output: Found {len(yt_res['formats'])} formats")
        await yt.client.close()

        print("\n" + separator)

        print("3. Testing Media extractor on a public media page...")
        media = MediaScraper()
        media_res = await media.extract_media("https://en.wikipedia.org/wiki/File:Big_Buck_Bunny_4K.webm")
        print(f"Media Scraper Output: Found {len(media_res['mediaLinks'])} media links")
        if len(media_res['mediaLinks']) > 0:
            print(f"Sample Link: {media_res['mediaLinks'][0][:50]}...")
        await media.client.close()

        print("\n" + separator)
        print("All tests completed successfully!")

    except Exception as e:
        print(f"Test failed: {e}")
        sys.exit(1)
48
+
49
+ if __name__ == "__main__":
50
+ if sys.platform == "win32":
51
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
52
+ asyncio.run(run_tests())
package/tests/test.ts ADDED
@@ -0,0 +1,45 @@
1
+ import straw from '../src/index';
2
+
3
+ async function runTests() {
4
+ console.log('Testing Straw Library...');
5
+ console.log('---------------------------------');
6
+
7
+ try {
8
+ // 1. Web Scraper Test
9
+ console.log('1. Testing Web Scraper on example.com...');
10
+ const webClient = straw.web();
11
+ const webResult = await webClient.scrape('https://example.com');
12
+ console.log(`Web Scraper Output: Title = ${webResult.title}`);
13
+ console.log(`Web Scraper Output: Text = ${webResult.text.substring(0, 50)}...`);
14
+
15
+ console.log('\n---------------------------------');
16
+
17
+ // 2. YouTube Scraper Test
18
+ console.log('2. Testing YouTube Scraper...');
19
+ const ytClient = straw.youtube();
20
+ // Use a generic test video like Big Buck Bunny
21
+ const ytResult = await ytClient.scrapeVideo('https://www.youtube.com/watch?v=aqz-KE-bpKQ');
22
+ console.log(`YouTube Scraper Output: Title = ${ytResult.title}`);
23
+ console.log(`YouTube Scraper Output: Duration = ${ytResult.durationSeconds} seconds`);
24
+ console.log(`YouTube Scraper Output: Found ${ytResult.formats.length} formats`);
25
+
26
+ console.log('\n---------------------------------');
27
+
28
+ // 3. Media Scraper Test
29
+ console.log('3. Testing Media extractor on a public media page (using a wikipedia sample file page)...');
30
+ const mediaClient = straw.media();
31
+ const mediaResult = await mediaClient.extractMedia('https://en.wikipedia.org/wiki/File:Big_Buck_Bunny_4K.webm');
32
+ console.log(`Media Scraper Output: Found ${mediaResult.mediaLinks.length} media links`);
33
+ if (mediaResult.mediaLinks.length > 0) {
34
+ console.log(`Sample Link: ${mediaResult.mediaLinks[0].substring(0, 50)}...`);
35
+ }
36
+
37
+ console.log('\n---------------------------------');
38
+ console.log('All tests completed successfully!');
39
+ } catch (error) {
40
+ console.error('Test failed:', error);
41
+ process.exit(1);
42
+ }
43
+ }
44
+
45
+ runTests();
package/tsconfig.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "es2022",
4
+ "module": "NodeNext",
5
+ "moduleResolution": "NodeNext",
6
+ "esModuleInterop": true,
7
+ "forceConsistentCasingInFileNames": true,
8
+ "strict": true,
9
+ "skipLibCheck": true,
10
+ "outDir": "dist"
11
+ },
12
+ "include": ["src/**/*", "tests/**/*"]
13
+ }