@zetagoaurum-dev/straw 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ import httpx
2
+ from typing import Optional, Dict, Any
3
+ from .helpers import get_random_user_agent, async_sleep
4
+
5
class StrawClient:
    """Thin async HTTP client wrapper around httpx with retries, exponential
    backoff, and optional User-Agent rotation.

    Parameters:
        proxy: optional proxy URL forwarded to httpx.
        timeout: per-request timeout in seconds.
        retries: maximum number of attempts per request (clamped to >= 1).
        rotate_user_agent: when True, a random User-Agent header is attached
            to any request that does not already set one.
    """

    def __init__(self, proxy: Optional[str] = None, timeout: int = 10, retries: int = 3, rotate_user_agent: bool = True):
        self.proxy = proxy
        self.timeout = timeout
        self.retries = retries
        self.rotate_user_agent = rotate_user_agent

        # httpx.AsyncClient pools connections across requests out of the box,
        # so one shared client instance is enough for this scraper.
        self._client = httpx.AsyncClient(
            proxy=self.proxy,
            timeout=self.timeout,
            verify=False,  # NOTE: SSL verification disabled to match the JS version — unsafe for untrusted hosts
            follow_redirects=True
        )

    async def request(self, method: str, url: str, **kwargs) -> httpx.Response:
        """Perform an HTTP request with retries and exponential backoff.

        Responses with status 429/500/502/503/504 are treated as transient
        and retried. Raises Exception (chained to the last underlying error)
        once all attempts are exhausted.
        """
        attempts = 0
        max_retries = max(1, self.retries)

        headers = kwargs.pop('headers', {})
        if self.rotate_user_agent and 'User-Agent' not in headers:
            headers['User-Agent'] = get_random_user_agent()

        # Browser-like defaults help avoid trivial bot detection.
        if 'Accept' not in headers:
            headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
        if 'Accept-Language' not in headers:
            headers['Accept-Language'] = 'en-US,en;q=0.9'

        while attempts < max_retries:
            try:
                response = await self._client.request(method, url, headers=headers, **kwargs)

                # Treat rate limiting and transient server errors as retryable.
                if response.status_code in (429, 500, 502, 503, 504):
                    raise httpx.HTTPStatusError(f"HTTP Error {response.status_code}", request=response.request, response=response)

                return response
            except Exception as e:
                attempts += 1
                if attempts >= max_retries:
                    # FIX: chain the original error (`from e`) so the real
                    # traceback is preserved instead of being swallowed.
                    raise Exception(f"Failed to fetch {url} after {max_retries} attempts. Last error: {str(e)}") from e
                # Exponential backoff: 2s, 4s, 8s, ... between attempts.
                await async_sleep(1000 * (2 ** attempts))

        raise Exception("Unreachable")

    async def get_text(self, url: str, **kwargs) -> str:
        """GET *url* and return the decoded response body as text."""
        response = await self.request("GET", url, **kwargs)
        return response.text

    async def get_json(self, url: str, **kwargs) -> Any:
        """GET *url* and return the response body parsed as JSON."""
        response = await self.request("GET", url, **kwargs)
        return response.json()

    async def close(self):
        """Close the underlying httpx client and its connection pool."""
        await self._client.aclose()
@@ -0,0 +1,18 @@
1
+ import random
2
+ import asyncio
3
+
4
# Pool of realistic desktop browser User-Agent strings (Chrome, Firefox,
# Safari, Edge) drawn from when header rotation is enabled.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14.2; rv:109.0) Gecko/20100101 Firefox/121.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_2_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
]


def get_random_user_agent() -> str:
    """Return one User-Agent string from the pool, chosen uniformly at random."""
    index = random.randrange(len(USER_AGENTS))
    return USER_AGENTS[index]
16
+
17
async def async_sleep(ms: int):
    """Asynchronously sleep for *ms* milliseconds (mirrors the JS helper)."""
    seconds = ms / 1000.0
    await asyncio.sleep(seconds)
package/straw/media.py ADDED
@@ -0,0 +1,38 @@
1
+ import re
2
+ from typing import Dict, List, Any
3
+ from bs4 import BeautifulSoup
4
+ from .client import StrawClient
5
+
6
class MediaScraper:
    """Scrapes a page and collects direct links to media and document files.

    Fetches with StrawClient and parses with BeautifulSoup (lxml backend).
    """

    def __init__(self, **client_options):
        # client_options are forwarded verbatim to StrawClient (proxy, timeout, ...).
        self.client = StrawClient(**client_options)

    async def extract_media(self, url: str) -> Dict[str, Any]:
        """Fetch *url* and return ``{'pageTitle': str, 'mediaLinks': list[str]}``.

        Links are gathered from <video>/<audio>/<source>/<img> tags, from
        anchors pointing at known file extensions, and from a raw regex scan
        of the HTML (which also catches URLs inside scripts). Duplicates are
        removed via a set, so the order of mediaLinks is unspecified.
        """
        html = await self.client.get_text(url)
        soup = BeautifulSoup(html, 'lxml')

        page_title = soup.title.string.strip() if soup.title and soup.title.string else ""
        media_links = set()

        # Media elements: src holds a single URL; srcset may contain several.
        for tag in soup.find_all(['video', 'audio', 'source', 'img']):
            src = tag.get('src') or tag.get('srcset')
            if src:
                # FIX: the original pattern r'https?:\/\/[^\s"',]+' was a
                # SyntaxError — the unescaped quote inside the character class
                # terminated the string literal. Triple quotes keep both quote
                # characters inside the class.
                urls = re.findall(r'''https?://[^\s"',]+''', src)
                for u in urls:
                    media_links.add(u)
                # Keep the raw attribute value too when it is itself a URL
                # (the set silently ignores duplicates).
                if src.startswith('http'):
                    media_links.add(src)

        # Anchors pointing directly at known media/document file types.
        for tag in soup.find_all('a', href=True):
            href = tag.get('href')
            if href and href.startswith('http') and re.search(r'\.(pdf|doc|docx|xls|xlsx|ppt|pptx|txt|csv|rtf|mp4|mp3|webm|wav|ogg|m4a|avi|mkv|mov|flv|png|jpg|jpeg|gif|svg|webp|avif|ico|bmp)(?:\?.*)?$', href, re.IGNORECASE):
                media_links.add(href)

        # Raw scan of the HTML source catches URLs embedded outside tags
        # (inline scripts, JSON blobs, styles).
        raw_links = re.findall(r'''https?://[^\s"',]+\.(?:png|jpg|jpeg|gif|svg|webp|avif|ico|bmp|pdf|doc|docx|xls|xlsx|ppt|pptx|txt|csv|rtf|mp4|mp3|webm|wav|ogg|m4a|avi|mkv|mov|flv)''', html, re.IGNORECASE)
        for link in raw_links:
            media_links.add(link)

        return {
            'pageTitle': page_title,
            'mediaLinks': list(media_links)
        }
package/straw/web.py ADDED
@@ -0,0 +1,51 @@
1
+ from typing import Dict, List, Optional, Any
2
+ from bs4 import BeautifulSoup
3
+ from .client import StrawClient
4
+
5
class WebScraper:
    """General-purpose page scraper: pulls title, description, meta tags,
    absolute links, and visible text from a URL."""

    def __init__(self, **client_options):
        # Options are passed straight through to StrawClient.
        self.client = StrawClient(**client_options)

    async def scrape(self, url: str) -> Dict[str, Any]:
        """Fetch *url* and return a dict with keys
        'title', 'description', 'text', 'links', and 'meta'."""
        html = await self.client.get_text(url)
        soup = BeautifulSoup(html, 'lxml')

        title_tag = soup.title
        title = title_tag.string.strip() if title_tag and title_tag.string else ""

        # Prefer the standard meta description; fall back to Open Graph.
        description = ""
        for selector in ({'name': 'description'}, {'property': 'og:description'}):
            if description:
                break
            meta = soup.find('meta', attrs=selector)
            if meta and meta.get('content'):
                description = meta['content']

        # Every name/property -> content pair; later tags win on name clashes.
        meta_tags = {}
        for meta in soup.find_all('meta'):
            key = meta.get('name') or meta.get('property')
            value = meta.get('content')
            if key and value:
                meta_tags[key] = value

        # Absolute links only, captured before non-content tags are stripped.
        links = [
            {'text': anchor.get_text(strip=True), 'href': anchor['href']}
            for anchor in soup.find_all('a', href=True)
            if anchor['href'].startswith('http')
        ]

        # Drop non-content elements so get_text() yields readable prose.
        for junk in soup(['script', 'style', 'noscript', 'iframe', 'svg']):
            junk.decompose()

        text_content = ' '.join(soup.get_text(separator=' ').split())

        return {
            'title': title,
            'description': description,
            'text': text_content,
            'links': links,
            'meta': meta_tags
        }
@@ -0,0 +1,55 @@
1
+ import json
2
+ import re
3
+ from typing import Dict, List, Any
4
+ from .client import StrawClient
5
+
6
class YouTubeScraper:
    """Scrapes video metadata and direct stream URLs from a YouTube watch
    page by parsing the embedded ytInitialPlayerResponse JSON."""

    def __init__(self, **client_options):
        # client_options are forwarded verbatim to StrawClient.
        self.client = StrawClient(**client_options)

    async def scrape_video(self, url: str) -> Dict[str, Any]:
        """Return title/author/description/views/duration/thumbnail/formats
        for the video at *url*.

        Raises Exception when the player response cannot be located or lacks
        video details (layout change, consent wall, or IP block).
        """
        # Pre-set the EU consent cookie so we receive the actual watch page
        # instead of the consent interstitial.
        headers = {
            'Cookie': 'CONSENT=YES+cb.20230501-14-p0.en+FX+430'
        }
        html = await self.client.get_text(url, headers=headers)

        # The player JSON is assigned on a single line, so '.' (no DOTALL)
        # is sufficient; the trailing anchor avoids over-capturing.
        match = re.search(r'ytInitialPlayerResponse\s*=\s*({.*?});(?:var|<\/script>)', html)
        if not match:
            raise Exception("ytInitialPlayerResponse not found. YouTube layout changed or IP blocked.")

        data = json.loads(match.group(1))
        details = data.get('videoDetails', {})
        streaming_data = data.get('streamingData', {})

        if not details:
            raise Exception("Video details not found inside player response.")

        formats = []
        # Muxed formats plus video-only/audio-only adaptive formats.
        raw_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])

        for f in raw_formats:
            # Entries without a plain 'url' are ciphered (signatureCipher)
            # and are skipped — decoding them needs the player JS.
            if 'url' in f:
                mime_type = f.get('mimeType', '')
                formats.append({
                    'url': f['url'],
                    'mimeType': mime_type,
                    'width': f.get('width'),
                    'height': f.get('height'),
                    'quality': f.get('qualityLabel') or f.get('quality'),
                    'bitrate': f.get('bitrate'),
                    # FIX: muxed formats report a video/* mimeType yet carry an
                    # audio track; YouTube marks them with an 'audioQuality'
                    # key, so check for that as well.
                    'hasAudio': 'audio/' in mime_type or 'audioQuality' in f,
                    'hasVideo': 'video/' in mime_type
                })

        # Thumbnails are listed smallest-to-largest; take the largest.
        thumbnails = details.get('thumbnail', {}).get('thumbnails', [])
        best_thumbnail = thumbnails[-1]['url'] if thumbnails else ''

        return {
            'title': details.get('title', ''),
            'author': details.get('author', ''),
            'description': details.get('shortDescription', ''),
            'views': details.get('viewCount', '0'),
            'durationSeconds': details.get('lengthSeconds', '0'),
            'thumbnail': best_thumbnail,
            'formats': formats
        }
package/tests/test.py ADDED
@@ -0,0 +1,52 @@
1
+ import asyncio
2
+ import sys
3
+ import os
4
+
5
+ # Ensure the parent directory is in the path to import 'straw'
6
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
7
+
8
+ from straw import WebScraper, YouTubeScraper, MediaScraper
9
+
10
async def run_tests():
    """Smoke-test the three scrapers against live public pages.

    Exits with status 1 on the first failure (network access required).
    """
    separator = "-" * 33
    print("Testing Straw Library (Python)...")
    print(separator)

    try:
        print("1. Testing Web Scraper on example.com...")
        web = WebScraper()
        web_res = await web.scrape("https://example.com")
        print(f"Web Scraper Output: Title = {web_res['title']}")
        print(f"Web Scraper Output: Text = {web_res['text'][:50]}...")
        await web.client.close()

        print("\n" + separator)

        print("2. Testing YouTube Scraper...")
        yt = YouTubeScraper()
        yt_res = await yt.scrape_video("https://www.youtube.com/watch?v=aqz-KE-bpKQ")
        print(f"YouTube Scraper Output: Title = {yt_res['title']}")
        print(f"YouTube Scraper Output: Duration = {yt_res['durationSeconds']} seconds")
        print(f"YouTube Scraper Output: Found {len(yt_res['formats'])} formats")
        await yt.client.close()

        print("\n" + separator)

        print("3. Testing Media extractor on a public media page...")
        media = MediaScraper()
        media_res = await media.extract_media("https://en.wikipedia.org/wiki/File:Big_Buck_Bunny_4K.webm")
        print(f"Media Scraper Output: Found {len(media_res['mediaLinks'])} media links")
        if len(media_res['mediaLinks']) > 0:
            print(f"Sample Link: {media_res['mediaLinks'][0][:50]}...")
        await media.client.close()

        print("\n" + separator)
        print("All tests completed successfully!")

    except Exception as e:
        print(f"Test failed: {e}")
        sys.exit(1)
48
+
49
+ if __name__ == "__main__":
50
+ if sys.platform == "win32":
51
+ asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
52
+ asyncio.run(run_tests())
package/tests/test.ts ADDED
@@ -0,0 +1,45 @@
1
+ import straw from '../src/index';
2
+
3
+ async function runTests() {
4
+ console.log('Testing Straw Library...');
5
+ console.log('---------------------------------');
6
+
7
+ try {
8
+ // 1. Web Scraper Test
9
+ console.log('1. Testing Web Scraper on example.com...');
10
+ const webClient = straw.web();
11
+ const webResult = await webClient.scrape('https://example.com');
12
+ console.log(`Web Scraper Output: Title = ${webResult.title}`);
13
+ console.log(`Web Scraper Output: Text = ${webResult.text.substring(0, 50)}...`);
14
+
15
+ console.log('\n---------------------------------');
16
+
17
+ // 2. YouTube Scraper Test
18
+ console.log('2. Testing YouTube Scraper...');
19
+ const ytClient = straw.youtube();
20
+ // Use a generic test video like Big Buck Bunny
21
+ const ytResult = await ytClient.scrapeVideo('https://www.youtube.com/watch?v=aqz-KE-bpKQ');
22
+ console.log(`YouTube Scraper Output: Title = ${ytResult.title}`);
23
+ console.log(`YouTube Scraper Output: Duration = ${ytResult.durationSeconds} seconds`);
24
+ console.log(`YouTube Scraper Output: Found ${ytResult.formats.length} formats`);
25
+
26
+ console.log('\n---------------------------------');
27
+
28
+ // 3. Media Scraper Test
29
+ console.log('3. Testing Media extractor on a public media page (using a wikipedia sample file page)...');
30
+ const mediaClient = straw.media();
31
+ const mediaResult = await mediaClient.extractMedia('https://en.wikipedia.org/wiki/File:Big_Buck_Bunny_4K.webm');
32
+ console.log(`Media Scraper Output: Found ${mediaResult.mediaLinks.length} media links`);
33
+ if (mediaResult.mediaLinks.length > 0) {
34
+ console.log(`Sample Link: ${mediaResult.mediaLinks[0].substring(0, 50)}...`);
35
+ }
36
+
37
+ console.log('\n---------------------------------');
38
+ console.log('All tests completed successfully!');
39
+ } catch (error) {
40
+ console.error('Test failed:', error);
41
+ process.exit(1);
42
+ }
43
+ }
44
+
45
+ runTests();
package/tsconfig.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "compilerOptions": {
3
+ "target": "es2022",
4
+ "module": "NodeNext",
5
+ "moduleResolution": "NodeNext",
6
+ "esModuleInterop": true,
7
+ "forceConsistentCasingInFileNames": true,
8
+ "strict": true,
9
+ "skipLibCheck": true,
10
+ "outDir": "dist"
11
+ },
12
+ "include": ["src/**/*", "tests/**/*"]
13
+ }