crawlix 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawlix-0.1.0/.gitignore +38 -0
- crawlix-0.1.0/AGENTS.md +384 -0
- crawlix-0.1.0/CHANGELOG.md +24 -0
- crawlix-0.1.0/CONTRIBUTING.md +53 -0
- crawlix-0.1.0/LICENSE +21 -0
- crawlix-0.1.0/PKG-INFO +189 -0
- crawlix-0.1.0/README.md +153 -0
- crawlix-0.1.0/SECURITY.md +20 -0
- crawlix-0.1.0/docs/superpowers/plans/2025-05-17-crawlix-implementation.md +2597 -0
- crawlix-0.1.0/docs/superpowers/specs/2025-05-17-crawlix-design.md +430 -0
- crawlix-0.1.0/pyproject.toml +76 -0
- crawlix-0.1.0/src/crawlix/__init__.py +28 -0
- crawlix-0.1.0/src/crawlix/_version.py +1 -0
- crawlix-0.1.0/src/crawlix/async_api.py +82 -0
- crawlix-0.1.0/src/crawlix/backends/__init__.py +79 -0
- crawlix-0.1.0/src/crawlix/backends/httpx.py +92 -0
- crawlix-0.1.0/src/crawlix/backends/playwright.py +223 -0
- crawlix-0.1.0/src/crawlix/backends/protocol.py +176 -0
- crawlix-0.1.0/src/crawlix/backends/requests.py +139 -0
- crawlix-0.1.0/src/crawlix/backends/selenium.py +270 -0
- crawlix-0.1.0/src/crawlix/browser.py +71 -0
- crawlix-0.1.0/src/crawlix/element.py +171 -0
- crawlix-0.1.0/src/crawlix/exceptions.py +26 -0
- crawlix-0.1.0/src/crawlix/page.py +223 -0
- crawlix-0.1.0/src/crawlix/utils.py +36 -0
- crawlix-0.1.0/tests/__init__.py +0 -0
- crawlix-0.1.0/tests/conftest.py +4 -0
- crawlix-0.1.0/tests/test_async_api.py +49 -0
- crawlix-0.1.0/tests/test_backends/__init__.py +0 -0
- crawlix-0.1.0/tests/test_backends/test_playwright.py +9 -0
- crawlix-0.1.0/tests/test_backends/test_requests.py +80 -0
- crawlix-0.1.0/tests/test_backends/test_selenium.py +11 -0
- crawlix-0.1.0/tests/test_browser.py +86 -0
- crawlix-0.1.0/tests/test_element.py +138 -0
- crawlix-0.1.0/tests/test_page.py +125 -0
crawlix-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*.so
|
|
4
|
+
.Python
|
|
5
|
+
build/
|
|
6
|
+
develop-eggs/
|
|
7
|
+
dist/
|
|
8
|
+
downloads/
|
|
9
|
+
eggs/
|
|
10
|
+
.eggs/
|
|
11
|
+
lib/
|
|
12
|
+
lib64/
|
|
13
|
+
parts/
|
|
14
|
+
sdist/
|
|
15
|
+
var/
|
|
16
|
+
wheels/
|
|
17
|
+
*.egg-info/
|
|
18
|
+
.installed.cfg
|
|
19
|
+
*.egg
|
|
20
|
+
MANIFEST
|
|
21
|
+
.pytest_cache/
|
|
22
|
+
.coverage
|
|
23
|
+
.coverage.*
|
|
24
|
+
htmlcov/
|
|
25
|
+
.mypy_cache/
|
|
26
|
+
.ruff_cache/
|
|
27
|
+
*.log
|
|
28
|
+
.env
|
|
29
|
+
.venv/
|
|
30
|
+
venv/
|
|
31
|
+
ENV/
|
|
32
|
+
.idea/
|
|
33
|
+
.vscode/
|
|
34
|
+
*.swp
|
|
35
|
+
*.swo
|
|
36
|
+
*~
|
|
37
|
+
.DS_Store
|
|
38
|
+
node_modules/
|
crawlix-0.1.0/AGENTS.md
ADDED
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
# crawlix
|
|
2
|
+
|
|
3
|
+
> One API. Any backend. Full browser automation to lightweight scraping.
|
|
4
|
+
|
|
5
|
+
**PyPI**: `pip install crawlix`
|
|
6
|
+
**Author**: keylordelrey
|
|
7
|
+
**License**: MIT
|
|
8
|
+
**Python**: 3.10+
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## What crawlix is
|
|
13
|
+
|
|
14
|
+
crawlix is a Python browser automation and web scraping library with a unified API across multiple backends. The same code works whether you're doing simple HTTP scraping or full Playwright-powered browser automation — you switch backends, not code.
|
|
15
|
+
|
|
16
|
+
```python
|
|
17
|
+
# Zero setup
|
|
18
|
+
from crawlix import Browser
|
|
19
|
+
|
|
20
|
+
with Browser() as b:
|
|
21
|
+
page = b.open("https://example.com")
|
|
22
|
+
print(page.find("h1").text)
|
|
23
|
+
|
|
24
|
+
# Full browser automation — same API
|
|
25
|
+
with Browser(backend="playwright") as b:
|
|
26
|
+
page = b.open("https://example.com")
|
|
27
|
+
page.click("#login")
|
|
28
|
+
page.type("#email", "user@example.com")
|
|
29
|
+
page.type("#password", "secret")
|
|
30
|
+
page.submit("form")
|
|
31
|
+
page.wait_for(".dashboard")
|
|
32
|
+
page.screenshot("result.png")
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
---
|
|
36
|
+
|
|
37
|
+
## Install
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
# Core — lightweight HTTP scraping
|
|
41
|
+
pip install crawlix
|
|
42
|
+
|
|
43
|
+
# With specific backend
|
|
44
|
+
pip install crawlix[requests] # requests + BeautifulSoup4
|
|
45
|
+
pip install crawlix[playwright] # full browser via Playwright
|
|
46
|
+
pip install crawlix[selenium] # full browser via Selenium
|
|
47
|
+
pip install crawlix[async] # async support via httpx
|
|
48
|
+
pip install crawlix[full] # everything
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Core Design Rules
|
|
54
|
+
|
|
55
|
+
1. **Same API across all backends** — switching backend never requires rewriting user code
|
|
56
|
+
2. **Auto-detect best available backend** — no config needed, crawlix figures it out
|
|
57
|
+
3. **Zero hard dependencies** — `pip install crawlix` always succeeds
|
|
58
|
+
4. **Fail with helpful errors** — if an operation needs a browser backend, the error tells you exactly what to install
|
|
59
|
+
5. **Context manager always** — resources always cleaned up properly
|
|
60
|
+
6. **Stealth on by default** — realistic headers, UA rotation, no bot fingerprint out of the box
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Backend Priority (auto-detect order)
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
playwright → selenium → httpx → requests+bs4 → error with install hint
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Override anytime:
|
|
71
|
+
```python
|
|
72
|
+
Browser(backend="playwright")
|
|
73
|
+
Browser(backend="requests")
|
|
74
|
+
Browser(backend="selenium")
|
|
75
|
+
Browser(backend="httpx")
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## Full API
|
|
81
|
+
|
|
82
|
+
### Browser
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
Browser(
|
|
86
|
+
backend="auto", # backend selection
|
|
87
|
+
headless=True, # browser backends
|
|
88
|
+
stealth=True, # realistic headers + UA rotation
|
|
89
|
+
timeout=30, # seconds, applies to all operations
|
|
90
|
+
proxy=None, # "http://user:pass@host:port"
|
|
91
|
+
locale="en-US", # browser locale
|
|
92
|
+
user_agent=None, # override UA
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
b.open(url) # → Page
|
|
96
|
+
b.new_page() # → blank Page
|
|
97
|
+
b.close()
|
|
98
|
+
b.backend_name # → str, which backend is active
|
|
99
|
+
b.supports_js # → bool
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Page
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
# Navigation
|
|
106
|
+
page.goto(url) # → Page (chainable)
|
|
107
|
+
page.reload() # → Page
|
|
108
|
+
page.back() # → Page
|
|
109
|
+
page.forward() # → Page
|
|
110
|
+
|
|
111
|
+
# Properties
|
|
112
|
+
page.url # → str
|
|
113
|
+
page.title # → str
|
|
114
|
+
page.html # → str, full HTML
|
|
115
|
+
page.text # → str, visible text only
|
|
116
|
+
page.status # → int, HTTP status code
|
|
117
|
+
page.headers # → dict
|
|
118
|
+
|
|
119
|
+
# Querying
|
|
120
|
+
page.find(selector) # → Element | None
|
|
121
|
+
page.find_all(selector) # → list[Element]
|
|
122
|
+
page.find_text(text) # → Element | None
|
|
123
|
+
page.xpath(expr) # → list[Element]
|
|
124
|
+
|
|
125
|
+
# Interaction (browser backends)
|
|
126
|
+
page.click(selector)
|
|
127
|
+
page.double_click(selector)
|
|
128
|
+
page.right_click(selector)
|
|
129
|
+
page.type(selector, text)
|
|
130
|
+
page.clear(selector)
|
|
131
|
+
page.submit(selector="form")
|
|
132
|
+
page.select(selector, value)
|
|
133
|
+
page.hover(selector)
|
|
134
|
+
page.focus(selector)
|
|
135
|
+
page.blur(selector)
|
|
136
|
+
page.scroll(x=0, y=500)
|
|
137
|
+
page.scroll_to(selector)
|
|
138
|
+
page.drag(source, target)
|
|
139
|
+
page.key(key) # e.g. "Enter", "Tab", "Escape"
|
|
140
|
+
page.upload(selector, path) # file upload
|
|
141
|
+
|
|
142
|
+
# Waiting (browser backends)
|
|
143
|
+
page.wait_for(selector, timeout=10)
|
|
144
|
+
page.wait_for_text(text, timeout=10)
|
|
145
|
+
page.wait_for_url(pattern, timeout=10)
|
|
146
|
+
page.wait_for_load(timeout=30)
|
|
147
|
+
page.wait_for_network_idle(timeout=30)
|
|
148
|
+
page.sleep(seconds)
|
|
149
|
+
|
|
150
|
+
# JavaScript (browser backends)
|
|
151
|
+
page.evaluate(js_code) # → any
|
|
152
|
+
page.evaluate_on(selector, js) # → any
|
|
153
|
+
|
|
154
|
+
# Network
|
|
155
|
+
page.set_headers(headers)
|
|
156
|
+
page.set_cookies(cookies)
|
|
157
|
+
page.get_cookies() # → list[dict]
|
|
158
|
+
page.clear_cookies()
|
|
159
|
+
page.intercept(pattern, handler) # intercept requests (playwright)
|
|
160
|
+
|
|
161
|
+
# Extraction helpers
|
|
162
|
+
page.links() # → list[str], all hrefs
|
|
163
|
+
page.images() # → list[str], all srcs
|
|
164
|
+
page.tables() # → list[list[list[str]]]
|
|
165
|
+
page.forms() # → list[dict]
|
|
166
|
+
page.json() # → dict, parse response as JSON
|
|
167
|
+
page.meta() # → dict, all meta tags
|
|
168
|
+
|
|
169
|
+
# Output
|
|
170
|
+
page.screenshot(path=None) # → bytes, saves if path given
|
|
171
|
+
page.pdf(path=None) # → bytes
|
|
172
|
+
page.save(path) # save HTML to file
|
|
173
|
+
page.show() # print pretty HTML (debug)
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### Element
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
# Properties
|
|
180
|
+
el.text # → str, inner text
|
|
181
|
+
el.html # → str, inner HTML
|
|
182
|
+
el.outer_html # → str, outer HTML
|
|
183
|
+
el.tag # → str
|
|
184
|
+
el.id # → str
|
|
185
|
+
el.classes # → list[str]
|
|
186
|
+
|
|
187
|
+
# Attributes
|
|
188
|
+
el.attr(name, default="") # → str
|
|
189
|
+
el.attrs # → dict, all attributes
|
|
190
|
+
el.has_attr(name) # → bool
|
|
191
|
+
|
|
192
|
+
# Traversal
|
|
193
|
+
el.find(selector) # → Element | None
|
|
194
|
+
el.find_all(selector) # → list[Element]
|
|
195
|
+
el.parent() # → Element | None
|
|
196
|
+
el.children() # → list[Element]
|
|
197
|
+
el.siblings() # → list[Element]
|
|
198
|
+
el.next() # → Element | None
|
|
199
|
+
el.prev() # → Element | None
|
|
200
|
+
|
|
201
|
+
# Interaction (browser backends)
|
|
202
|
+
el.click()
|
|
203
|
+
el.double_click()
|
|
204
|
+
el.type(text)
|
|
205
|
+
el.clear()
|
|
206
|
+
el.hover()
|
|
207
|
+
el.focus()
|
|
208
|
+
el.scroll_into_view()
|
|
209
|
+
el.is_visible() # → bool
|
|
210
|
+
el.is_enabled() # → bool
|
|
211
|
+
el.is_checked() # → bool (checkboxes)
|
|
212
|
+
el.bounding_box() # → dict {x, y, width, height}
|
|
213
|
+
el.screenshot(path=None) # → bytes, screenshot of element only
|
|
214
|
+
|
|
215
|
+
# Magic
|
|
216
|
+
el.__str__() # → .text
|
|
217
|
+
el.__repr__() # → "Element(<tag> .class #id)"
|
|
218
|
+
el.__bool__() # → True (so `if page.find("x"):` works naturally)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## Convenience Functions
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
from crawlix import get, fetch, browse
|
|
227
|
+
|
|
228
|
+
# One-liner scraping
|
|
229
|
+
page = get("https://example.com") # → Page
|
|
230
|
+
html = fetch("https://example.com") # → str HTML
|
|
231
|
+
page = browse("https://example.com") # → Page, forces browser backend
|
|
232
|
+
|
|
233
|
+
# Async variants
|
|
234
|
+
from crawlix.async_api import aget, afetch
|
|
235
|
+
|
|
236
|
+
page = await aget("https://example.com")
|
|
237
|
+
html = await afetch("https://example.com")
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
## Exceptions
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
from crawlix.exceptions import (
|
|
246
|
+
CrawlixError, # base — catch-all
|
|
247
|
+
BackendError, # backend unavailable or op not supported
|
|
248
|
+
TimeoutError, # wait exceeded timeout
|
|
249
|
+
NavigationError, # page failed to load
|
|
250
|
+
SelectorError, # invalid selector or element not found
|
|
251
|
+
NetworkError, # connection error
|
|
252
|
+
JavaScriptError, # JS evaluation failed
|
|
253
|
+
)
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
`BackendError` always includes install hint:
|
|
257
|
+
```
|
|
258
|
+
BackendError: screenshot() requires a browser backend.
|
|
259
|
+
Install one:
|
|
260
|
+
pip install crawlix[playwright] ← recommended
|
|
261
|
+
pip install crawlix[selenium]
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
---
|
|
265
|
+
|
|
266
|
+
## Usage Examples
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
# Basic scraping
|
|
270
|
+
from crawlix import Browser
|
|
271
|
+
|
|
272
|
+
with Browser() as b:
|
|
273
|
+
page = b.open("https://news.ycombinator.com")
|
|
274
|
+
for item in page.find_all(".titleline > a"):
|
|
275
|
+
print(item.text, item.attr("href"))
|
|
276
|
+
|
|
277
|
+
# Login flow
|
|
278
|
+
with Browser(backend="playwright") as b:
|
|
279
|
+
page = b.open("https://github.com/login")
|
|
280
|
+
page.type("#login_field", "username")
|
|
281
|
+
page.type("#password", "password")
|
|
282
|
+
page.click("[type=submit]")
|
|
283
|
+
page.wait_for(".dashboard-sidebar")
|
|
284
|
+
print("logged in:", page.url)
|
|
285
|
+
|
|
286
|
+
# Screenshot
|
|
287
|
+
with Browser(backend="playwright", headless=True) as b:
|
|
288
|
+
page = b.open("https://github.com/keyreyla/crawlix")
|
|
289
|
+
page.screenshot("crawlix.png")
|
|
290
|
+
|
|
291
|
+
# Intercept network requests
|
|
292
|
+
with Browser(backend="playwright") as b:
|
|
293
|
+
page = b.open("https://example.com")
|
|
294
|
+
page.intercept("**/api/**", lambda req: req.respond({"mocked": True}))
|
|
295
|
+
|
|
296
|
+
# JSON API
|
|
297
|
+
with Browser() as b:
|
|
298
|
+
data = b.open("https://api.github.com/users/keyreyla").json()
|
|
299
|
+
|
|
300
|
+
# Async
|
|
301
|
+
import asyncio
|
|
302
|
+
from crawlix.async_api import AsyncBrowser
|
|
303
|
+
|
|
304
|
+
async def main():
|
|
305
|
+
async with AsyncBrowser() as b:
|
|
306
|
+
page = await b.open("https://example.com")
|
|
307
|
+
items = await page.find_all("a")
|
|
308
|
+
print(len(items))
|
|
309
|
+
|
|
310
|
+
asyncio.run(main())
|
|
311
|
+
|
|
312
|
+
# Proxy
|
|
313
|
+
with Browser(proxy="http://user:pass@proxy:8080") as b:
|
|
314
|
+
page = b.open("https://ipinfo.io/json")
|
|
315
|
+
print(page.json()["ip"])
|
|
316
|
+
|
|
317
|
+
# Table extraction
|
|
318
|
+
with Browser() as b:
|
|
319
|
+
page = b.open("https://en.wikipedia.org/wiki/Python_(programming_language)")
|
|
320
|
+
for row in page.tables()[0]:
|
|
321
|
+
print(row)
|
|
322
|
+
|
|
323
|
+
# JavaScript evaluation
|
|
324
|
+
with Browser(backend="playwright") as b:
|
|
325
|
+
page = b.open("https://example.com")
|
|
326
|
+
title = page.evaluate("document.title")
|
|
327
|
+
count = page.evaluate_on("ul > li", "el => el.childElementCount")
|
|
328
|
+
|
|
329
|
+
# File upload
|
|
330
|
+
with Browser(backend="playwright") as b:
|
|
331
|
+
page = b.open("https://example.com/upload")
|
|
332
|
+
page.upload("#file-input", "/path/to/file.pdf")
|
|
333
|
+
page.click("#submit")
|
|
334
|
+
page.wait_for(".success")
|
|
335
|
+
|
|
336
|
+
# Chaining
|
|
337
|
+
with Browser(backend="playwright") as b:
|
|
338
|
+
page = (
|
|
339
|
+
b.open("https://example.com/login")
|
|
340
|
+
.type("#email", "user@example.com")
|
|
341
|
+
.type("#password", "secret")
|
|
342
|
+
.click("[type=submit]")
|
|
343
|
+
.wait_for(".dashboard")
|
|
344
|
+
)
|
|
345
|
+
print(page.title)
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
---
|
|
349
|
+
|
|
350
|
+
## Publish to TestPyPI & PyPI
|
|
351
|
+
|
|
352
|
+
```bash
|
|
353
|
+
# Build
|
|
354
|
+
rm -rf dist/
|
|
355
|
+
python -m build
|
|
356
|
+
|
|
357
|
+
# TestPyPI first — always
|
|
358
|
+
twine upload --repository testpypi dist/*
|
|
359
|
+
pip install --index-url https://test.pypi.org/simple/ crawlix
|
|
360
|
+
|
|
361
|
+
# Real PyPI
|
|
362
|
+
twine upload dist/*
|
|
363
|
+
```
|
|
364
|
+
|
|
365
|
+
Auto-publish via GitHub Actions on `git tag v*` push.
|
|
366
|
+
|
|
367
|
+
---
|
|
368
|
+
|
|
369
|
+
## Agent Rules
|
|
370
|
+
|
|
371
|
+
When implementing crawlix, always follow these non-negotiable rules:
|
|
372
|
+
|
|
373
|
+
1. Core `crawlix/` package must have **zero imports from optional libraries at module level** — all optional imports inside functions/methods only, wrapped in try/except ImportError
|
|
374
|
+
2. Every method that requires a browser backend must raise `BackendError` with a helpful install message on HTTP backends — never `NotImplementedError`, never silent failure
|
|
375
|
+
3. All public Page methods return `self` for chaining — `page.type(...).click(...).wait_for(...)` must always work
|
|
376
|
+
4. `Browser.__exit__` must always call `close()` even if an exception occurred inside the block
|
|
377
|
+
5. `Element.__bool__` always returns `True` — `if page.find("x"):` works naturally
|
|
378
|
+
6. Type hints on all public API, Python 3.10 compatible — modern union syntax like `str | None` is allowed
|
|
379
|
+
7. Auto-detect backend runs once at `Browser.__init__` and caches — never re-detect per request
|
|
380
|
+
8. Stealth headers applied by default unless `stealth=False` explicitly passed
|
|
381
|
+
9. All wait methods respect `timeout` set at `Browser()` level as default, overridable per-call
|
|
382
|
+
10. Unit tests use mocked HTTP responses only — never hit real URLs in test suite
|
|
383
|
+
11. Single source of truth for version — only in `src/crawlix/_version.py`
|
|
384
|
+
12. `pyproject.toml` uses hatchling, dynamic version from `_version.py`
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to crawlix will be documented in this file.
|
|
4
|
+
|
|
5
|
+
## [0.1.0] - 2025-05-17
|
|
6
|
+
|
|
7
|
+
### Added
|
|
8
|
+
|
|
9
|
+
- Initial release
|
|
10
|
+
- `Browser` class with context manager support and auto-detect backend
|
|
11
|
+
- `Page` class with navigation, querying, interaction, and chaining support
|
|
12
|
+
- `Element` class with traversal, attributes, and magic methods
|
|
13
|
+
- `Backend` ABC with strategy pattern for multiple backends
|
|
14
|
+
- `RequestsBackend` — lightweight HTTP scraping via requests + BeautifulSoup4
|
|
15
|
+
- `PlaywrightBackend` — full browser automation via Playwright
|
|
16
|
+
- `SeleniumBackend` — full browser automation via Selenium
|
|
17
|
+
- `HttpxBackend` — async HTTP support via httpx
|
|
18
|
+
- `AsyncBrowser` and `AsyncPage` for async workflows
|
|
19
|
+
- `aget()` and `afetch()` convenience functions
|
|
20
|
+
- Auto-detect backend (playwright > selenium > httpx > requests)
|
|
21
|
+
- Stealth mode with realistic headers and UA rotation
|
|
22
|
+
- BackendError with helpful install hints for browser-only features
|
|
23
|
+
- Full type hints (Python 3.10+)
|
|
24
|
+
- Comprehensive test suite with mocked responses
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Contributing to crawlix
|
|
2
|
+
|
|
3
|
+
Thank you for considering contributing to crawlix! This document outlines the guidelines for contributing.
|
|
4
|
+
|
|
5
|
+
## Development Setup
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
git clone https://github.com/keylordelrey/crawlix.git
|
|
9
|
+
cd crawlix
|
|
10
|
+
python -m venv .venv
|
|
11
|
+
source .venv/bin/activate
|
|
12
|
+
pip install -e ".[full]"
|
|
13
|
+
pip install pytest pytest-mock responses ruff mypy build
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Running Tests
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pytest
|
|
20
|
+
pytest -v # verbose
|
|
21
|
+
pytest --cov # coverage
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Code Quality
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
ruff check src/
|
|
28
|
+
mypy src/
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Pull Request Process
|
|
32
|
+
|
|
33
|
+
1. Fork the repository
|
|
34
|
+
2. Create a feature branch (`git checkout -b feat/my-feature`)
|
|
35
|
+
3. Commit your changes using conventional commits
|
|
36
|
+
4. Ensure all tests pass
|
|
37
|
+
5. Open a pull request
|
|
38
|
+
|
|
39
|
+
## Conventional Commits
|
|
40
|
+
|
|
41
|
+
- `feat:` — new feature
|
|
42
|
+
- `fix:` — bug fix
|
|
43
|
+
- `docs:` — documentation
|
|
44
|
+
- `test:` — tests
|
|
45
|
+
- `refactor:` — code restructuring
|
|
46
|
+
- `chore:` — maintenance
|
|
47
|
+
|
|
48
|
+
## Code Style
|
|
49
|
+
|
|
50
|
+
- Python 3.10+ type hints
|
|
51
|
+
- Ruff for linting and formatting
|
|
52
|
+
- Line length: 100
|
|
53
|
+
- Quotes: double
|
crawlix-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 keylordelrey
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
crawlix-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crawlix
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: One API. Any backend. Full browser automation to lightweight scraping.
|
|
5
|
+
Project-URL: Homepage, https://github.com/keylordelrey/crawlix
|
|
6
|
+
Project-URL: Source, https://github.com/keylordelrey/crawlix
|
|
7
|
+
Project-URL: Issues, https://github.com/keylordelrey/crawlix/issues
|
|
8
|
+
Author: keylordelrey
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: browser-automation,playwright,selenium,web-scraping
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
24
|
+
Requires-Dist: requests>=2.28
|
|
25
|
+
Provides-Extra: async
|
|
26
|
+
Requires-Dist: httpx>=0.24; extra == 'async'
|
|
27
|
+
Provides-Extra: full
|
|
28
|
+
Requires-Dist: httpx>=0.24; extra == 'full'
|
|
29
|
+
Requires-Dist: playwright>=1.40; extra == 'full'
|
|
30
|
+
Requires-Dist: selenium>=4.15; extra == 'full'
|
|
31
|
+
Provides-Extra: playwright
|
|
32
|
+
Requires-Dist: playwright>=1.40; extra == 'playwright'
|
|
33
|
+
Provides-Extra: selenium
|
|
34
|
+
Requires-Dist: selenium>=4.15; extra == 'selenium'
|
|
35
|
+
Description-Content-Type: text/markdown
|
|
36
|
+
|
|
37
|
+
# crawlix
|
|
38
|
+
|
|
39
|
+
> One API. Any backend. Full browser automation to lightweight scraping.
|
|
40
|
+
|
|
41
|
+
**PyPI**: `pip install crawlix`
|
|
42
|
+
**Author**: keylordelrey
|
|
43
|
+
**License**: MIT
|
|
44
|
+
**Python**: 3.10+
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## What crawlix is
|
|
49
|
+
|
|
50
|
+
crawlix is a Python browser automation and web scraping library with a unified API across multiple backends. The same code works whether you are doing simple HTTP scraping or full Playwright-powered browser automation — you switch backends, not code.
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
from crawlix import Browser
|
|
54
|
+
|
|
55
|
+
with Browser() as b:
|
|
56
|
+
page = b.open("https://example.com")
|
|
57
|
+
print(page.find("h1").text)
|
|
58
|
+
|
|
59
|
+
with Browser(backend="playwright") as b:
|
|
60
|
+
page = b.open("https://example.com")
|
|
61
|
+
page.click("#login")
|
|
62
|
+
page.type("#email", "user@example.com")
|
|
63
|
+
page.submit("form")
|
|
64
|
+
page.wait_for(".dashboard")
|
|
65
|
+
page.screenshot("result.png")
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## Install
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
pip install crawlix
|
|
74
|
+
pip install crawlix[playwright]
|
|
75
|
+
pip install crawlix[selenium]
|
|
76
|
+
pip install crawlix[async]
|
|
77
|
+
pip install crawlix[full]
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Core Design Rules
|
|
83
|
+
|
|
84
|
+
1. **Same API across all backends** — switching backend never requires rewriting user code
|
|
85
|
+
2. **Auto-detect best available backend** — no config needed, crawlix figures it out
|
|
86
|
+
3. **Zero hard dependencies** — `pip install crawlix` always succeeds
|
|
87
|
+
4. **Fail with helpful errors** — BackendError tells you exactly what to install
|
|
88
|
+
5. **Context manager always** — resources always cleaned up properly
|
|
89
|
+
6. **Stealth on by default** — realistic headers, UA rotation, no bot fingerprint
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Backend Priority
|
|
94
|
+
|
|
95
|
+
```
|
|
96
|
+
playwright > selenium > requests+bs4 (core)
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Override anytime:
|
|
100
|
+
```python
|
|
101
|
+
Browser(backend="playwright")
|
|
102
|
+
Browser(backend="requests")
|
|
103
|
+
Browser(backend="selenium")
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## Quick Examples
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from crawlix import Browser, get, fetch
|
|
112
|
+
|
|
113
|
+
with Browser() as b:
|
|
114
|
+
page = b.open("https://news.ycombinator.com")
|
|
115
|
+
for item in page.find_all(".titleline > a"):
|
|
116
|
+
print(item.text, item.attr("href"))
|
|
117
|
+
|
|
118
|
+
data = get("https://api.github.com/users/keyreyla").json()
|
|
119
|
+
html = fetch("https://example.com")
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
For async:
|
|
123
|
+
```python
|
|
124
|
+
import asyncio
|
|
125
|
+
from crawlix.async_api import AsyncBrowser
|
|
126
|
+
|
|
127
|
+
async def main():
|
|
128
|
+
async with AsyncBrowser() as b:
|
|
129
|
+
page = await b.open("https://example.com")
|
|
130
|
+
print(page.html)
|
|
131
|
+
|
|
132
|
+
asyncio.run(main())
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
---
|
|
136
|
+
|
|
137
|
+
## API Overview
|
|
138
|
+
|
|
139
|
+
### Browser
|
|
140
|
+
```python
|
|
141
|
+
Browser(backend="auto", headless=True, stealth=True, timeout=30, proxy=None, locale="en-US", user_agent=None)
|
|
142
|
+
b.open(url) -> Page
|
|
143
|
+
b.new_page() -> Page
|
|
144
|
+
b.close()
|
|
145
|
+
b.backend_name -> str
|
|
146
|
+
b.supports_js -> bool
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Page (all methods return `self` for chaining)
|
|
150
|
+
```python
|
|
151
|
+
page.find(selector) -> Element | None
|
|
152
|
+
page.find_all(selector) -> list[Element]
|
|
153
|
+
page.click(selector) -> Page
|
|
154
|
+
page.type(selector, text) -> Page
|
|
155
|
+
page.screenshot(path=None) -> bytes
|
|
156
|
+
page.html -> str
|
|
157
|
+
page.text -> str
|
|
158
|
+
page.json() -> dict
|
|
159
|
+
page.links() -> list[str]
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Element
|
|
163
|
+
```python
|
|
164
|
+
el.text -> str
|
|
165
|
+
el.attr(name) -> str
|
|
166
|
+
el.attrs -> dict
|
|
167
|
+
el.find(selector) -> Element | None
|
|
168
|
+
el.click() -> Element
|
|
169
|
+
bool(el) # always True
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## Exceptions
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from crawlix.exceptions import CrawlixError, BackendError, TimeoutError, NavigationError, SelectorError, NetworkError, JavaScriptError
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
## Development
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
git clone https://github.com/keylordelrey/crawlix.git
|
|
186
|
+
cd crawlix
|
|
187
|
+
pip install -e ".[full]"
|
|
188
|
+
pytest
|
|
189
|
+
```
|