souppot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,44 @@
1
+ name: CI
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ push:
6
+ branches:
7
+ - main
8
+
9
+ jobs:
10
+ quality:
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - name: Check out repository
15
+ uses: actions/checkout@v4
16
+
17
+ - name: Set up Python
18
+ uses: actions/setup-python@v5
19
+ with:
20
+ python-version: "3.11"
21
+
22
+ - name: Install Hatch
23
+ run: python -m pip install --upgrade pip hatch
24
+
25
+ - name: Check formatting
26
+ run: hatch run ruff format --check .
27
+
28
+ - name: Lint
29
+ run: hatch run ruff check .
30
+
31
+ - name: Type check
32
+ run: hatch run mypy src/souppot
33
+
34
+ - name: Run unit tests
35
+ run: hatch run pytest tests/test_core.py
36
+
37
+ - name: Install Playwright Chromium
38
+ run: hatch run python -m playwright install --with-deps chromium
39
+
40
+ - name: Run functional tests
41
+ run: hatch run pytest tests/functional
42
+
43
+ - name: Build package
44
+ run: hatch run python -m build
@@ -0,0 +1,51 @@
1
+ name: Docs
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ push:
6
+ branches:
7
+ - docs
8
+
9
+ permissions:
10
+ contents: read
11
+ pages: write
12
+ id-token: write
13
+
14
+ concurrency:
15
+ group: pages
16
+ cancel-in-progress: false
17
+
18
+ jobs:
19
+ deploy:
20
+ name: Deploy
21
+ runs-on: ubuntu-latest
22
+ environment:
23
+ name: github-pages
24
+ url: ${{ steps.deployment.outputs.page_url }}
25
+
26
+ steps:
27
+ - name: Check out repository
28
+ uses: actions/checkout@v4
29
+
30
+ - name: Set up Python
31
+ uses: actions/setup-python@v5
32
+ with:
33
+ python-version: "3.11"
34
+
35
+ - name: Install Hatch
36
+ run: python -m pip install --upgrade pip hatch
37
+
38
+ - name: Build docs
39
+ run: hatch run docs:build
40
+
41
+ - name: Configure Pages
42
+ uses: actions/configure-pages@v5
43
+
44
+ - name: Upload Pages artifact
45
+ uses: actions/upload-pages-artifact@v3
46
+ with:
47
+ path: docs/_build/html
48
+
49
+ - name: Deploy to GitHub Pages
50
+ id: deployment
51
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,51 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+ workflow_dispatch:
8
+
9
+ permissions:
10
+ contents: write
11
+ id-token: write
12
+
13
+ jobs:
14
+ build:
15
+ name: Build and publish distribution packages
16
+ runs-on: ubuntu-latest
17
+ environment:
18
+ name: pypi
19
+ url: https://pypi.org/p/souppot
20
+
21
+ steps:
22
+ - name: Check out repository
23
+ uses: actions/checkout@v4
24
+
25
+ - name: Set up Python
26
+ uses: actions/setup-python@v5
27
+ with:
28
+ python-version: "3.11"
29
+
30
+ - name: Install build tooling
31
+ run: python -m pip install --upgrade pip build twine
32
+
33
+ - name: Build sdist and wheel
34
+ run: python -m build
35
+
36
+ - name: Check distribution metadata
37
+ run: twine check dist/*
38
+
39
+ - name: Publish to PyPI
40
+ uses: pypa/gh-action-pypi-publish@release/v1
41
+ with:
42
+ skip-existing: true
43
+
44
+ - name: Generate SHA256 checksums
45
+ run: sha256sum dist/* > dist/checksums.txt
46
+
47
+ - name: Upload artifacts to GitHub Release
48
+ uses: softprops/action-gh-release@v3
49
+ with:
50
+ files: dist/*
51
+ generate_release_notes: true
@@ -0,0 +1,10 @@
1
+ __pycache__/
2
+ .pytest_cache/
3
+ .mypy_cache/
4
+ .ruff_cache/
5
+
6
+ build/
7
+ dist/
8
+ *.egg-info/
9
+
10
+ docs/_build/
@@ -0,0 +1,9 @@
1
+ # Changelog
2
+
3
+ ## v0.1.0
4
+
5
+ Initial release.
6
+
7
+ - Added `cold_soup` for fetching static HTML with `requests`.
8
+ - Added `hot_soup` for parsing JavaScript-rendered pages with Playwright Chromium.
9
+ - Added `hot_pot` for downloading files through Playwright's request context.
souppot-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 souppot contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
souppot-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,25 @@
1
+ Metadata-Version: 2.4
2
+ Name: souppot
3
+ Version: 0.1.0
4
+ Summary: Small helpers for fetching and parsing HTML with requests or Playwright.
5
+ Project-URL: Repository, https://github.com/octanima-labs/souppot
6
+ Project-URL: Documentation, https://octanima-labs.github.io/souppot/
7
+ Project-URL: Issues, https://github.com/octanima-labs/souppot/issues
8
+ Author-email: octanima-labs <octanima@tuta.io>
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Internet :: WWW/HTTP
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: >=3.11
22
+ Requires-Dist: 2ning
23
+ Requires-Dist: beautifulsoup4
24
+ Requires-Dist: playwright
25
+ Requires-Dist: requests
@@ -0,0 +1,73 @@
1
+ # souppot
2
+
3
+ Small helpers for fetching and parsing HTML with `requests` or Playwright.
4
+
5
+ ## Installation
6
+
7
+ From a checkout:
8
+
9
+ ```bash
10
+ pip install .
11
+ ```
12
+
13
+ From PyPI:
14
+
15
+ ```bash
16
+ pip install souppot
17
+ ```
18
+
19
+ For JavaScript-rendered pages and Playwright-backed downloads, install Chromium:
20
+
21
+ ```bash
22
+ python -m playwright install chromium
23
+ ```
24
+
25
+ ## Usage
26
+
27
+ Fetch static HTML:
28
+
29
+ ```python
30
+ from souppot import cold_soup
31
+
32
+ soup = cold_soup("https://example.com")
33
+
34
+ if soup:
35
+ print(soup.title.string)
36
+ ```
37
+
38
+ Fetch JavaScript-rendered HTML:
39
+
40
+ ```python
41
+ from souppot import hot_soup
42
+
43
+ soup = hot_soup("https://example.com", wait_selector=".loaded")
44
+
45
+ if soup:
46
+ print(soup.select_one(".loaded").get_text(strip=True))
47
+ ```
48
+
49
+ Download a file with Playwright:
50
+
51
+ ```python
52
+ from souppot import hot_pot
53
+
54
+ path = hot_pot(
55
+ "https://example.com/file.zip",
56
+ "downloads/file.zip",
57
+ referer="https://example.com",
58
+ )
59
+
60
+ print(path)
61
+ ```
62
+
63
+ ## Documentation
64
+
65
+ The extended API documentation can be found [here](https://octanima-labs.github.io/souppot/).
66
+
67
+ ## Changelog
68
+
69
+ See [CHANGELOG.md](CHANGELOG.md).
70
+
71
+ ## License
72
+
73
+ MIT. See [LICENSE](LICENSE).
@@ -0,0 +1,5 @@
1
+ API Reference
2
+ =============
3
+
4
+ .. automodule:: souppot
5
+ :members: cold_soup, hot_soup, hot_pot
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+
7
# Sphinx configuration for the souppot documentation.

# Two levels up from this file is the repository root; its ``src`` directory
# is put first on ``sys.path`` so the ``souppot`` package can be imported.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT.joinpath("src")))

# -- Project information ------------------------------------------------------
project = "souppot"
author = "souppot contributors"
copyright = f"2026, {author}"
release = "0.1.0"

# -- General configuration ----------------------------------------------------
extensions = [
    "myst_parser",
    "sphinx.ext.autodoc",
    "sphinx.ext.intersphinx",
    "sphinx.ext.napoleon",
    "sphinx_autodoc_typehints",
]

# Both Markdown and reStructuredText sources are accepted.
source_suffix = {".md": "markdown", ".rst": "restructuredtext"}
master_doc = "index"

# -- HTML output --------------------------------------------------------------
html_theme = "pydata_sphinx_theme"
html_title = "souppot"
html_sidebars = {"**": []}
html_theme_options = {"show_toc_level": 2}

# -- autodoc / napoleon -------------------------------------------------------
autodoc_default_options = {"members": True, "show-inheritance": True}
autodoc_typehints = "description"
autodoc_typehints_format = "short"
napoleon_google_docstring = True
napoleon_numpy_docstring = False

# -- intersphinx --------------------------------------------------------------
intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
    "requests": ("https://requests.readthedocs.io/en/latest/", None),
    "bs4": ("https://www.crummy.com/software/BeautifulSoup/bs4/doc/", None),
}
@@ -0,0 +1,37 @@
1
+ # souppot
2
+
3
+ Small helpers for fetching and parsing HTML with `requests` or Playwright.
4
+
5
+ Use `cold_soup` for normal server-rendered HTML, `hot_soup` for JavaScript-rendered pages, and `hot_pot` when a download needs Playwright's browser-like request stack.
6
+
7
+ ```{toctree}
8
+ :maxdepth: 2
9
+ :caption: Contents
10
+
11
+ usage
12
+ api
13
+ ```
14
+
15
+ ## Installation
16
+
17
+ From a checkout:
18
+
19
+ ```bash
20
+ pip install .
21
+ ```
22
+
23
+ From PyPI:
24
+
25
+ ```bash
26
+ pip install souppot
27
+ ```
28
+
29
+ For JavaScript-rendered pages and Playwright-backed downloads, install Chromium:
30
+
31
+ ```bash
32
+ python -m playwright install chromium
33
+ ```
34
+
35
+ ## License
36
+
37
+ souppot is released under the MIT license.
@@ -0,0 +1,49 @@
1
+ # Usage
2
+
3
+ ## Static HTML
4
+
5
+ Use `cold_soup` for pages that do not require JavaScript rendering.
6
+
7
+ ```python
8
+ from souppot import cold_soup
9
+
10
+ soup = cold_soup("https://example.com")
11
+
12
+ if soup:
13
+ print(soup.title.string)
14
+ ```
15
+
16
+ `cold_soup` returns a `BeautifulSoup` object for HTML responses, a raw `requests.Response` for other successful response types, and `None` for missing URLs or non-200 responses.
17
+
18
+ ## JavaScript-Rendered HTML
19
+
20
+ Use `hot_soup` when a page needs Playwright Chromium to render JavaScript before parsing.
21
+
22
+ ```python
23
+ from souppot import hot_soup
24
+
25
+ soup = hot_soup("https://example.com", wait_selector=".loaded")
26
+
27
+ if soup:
28
+ print(soup.select_one(".loaded").get_text(strip=True))
29
+ ```
30
+
31
+ If `wait_selector` times out, `hot_soup` logs the timeout and parses whatever DOM is available.
32
+
33
+ ## Downloads
34
+
35
+ Use `hot_pot` when a file should be downloaded through Playwright's request context.
36
+
37
+ ```python
38
+ from souppot import hot_pot
39
+
40
+ path = hot_pot(
41
+ "https://example.com/file.zip",
42
+ "downloads/file.zip",
43
+ referer="https://example.com",
44
+ )
45
+
46
+ print(path)
47
+ ```
48
+
49
+ Parent directories for the destination path are created automatically.
@@ -0,0 +1,89 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "souppot"
7
+ version = "0.1.0"
8
+ description = "Small helpers for fetching and parsing HTML with requests or Playwright."
9
+ license = "MIT"
10
+ authors = [
11
+ { name = "octanima-labs", email = "octanima@tuta.io" },
12
+ ]
13
+ requires-python = ">=3.11"
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ "Topic :: Internet :: WWW/HTTP",
24
+ "Topic :: Software Development :: Libraries :: Python Modules",
25
+ ]
26
+ dependencies = [
27
+ "beautifulsoup4",
28
+ "requests",
29
+ "playwright",
30
+ "2ning",
31
+ ]
32
+
33
+ [project.urls]
34
+ Repository = "https://github.com/octanima-labs/souppot"
35
+ Documentation = "https://octanima-labs.github.io/souppot/"
36
+ Issues = "https://github.com/octanima-labs/souppot/issues"
37
+
38
+ [dependency-groups]
39
+ dev = [
40
+ "build",
41
+ "mypy",
42
+ "pytest",
43
+ "ruff",
44
+ ]
45
+ docs = [
46
+ "myst-parser",
47
+ "pydata-sphinx-theme",
48
+ "sphinx",
49
+ "sphinx-autodoc-typehints",
50
+ ]
51
+
52
+ [tool.ruff]
53
+ target-version = "py311"
54
+
55
+ [tool.mypy]
56
+ python_version = "3.11"
57
+
58
+ [[tool.mypy.overrides]]
59
+ module = ["tuning"]
60
+ ignore_missing_imports = true
61
+
62
+ [tool.hatch.envs.default]
63
+ dependencies = [
64
+ "build",
65
+ "mypy",
66
+ "pytest",
67
+ "ruff",
68
+ ]
69
+
70
+ [tool.hatch.envs.docs]
71
+ dependencies = [
72
+ "myst-parser",
73
+ "pydata-sphinx-theme",
74
+ "sphinx",
75
+ "sphinx-autodoc-typehints",
76
+ ]
77
+
78
+ [tool.hatch.envs.docs.scripts]
79
+ build = "sphinx-build -W -b html docs docs/_build/html"
80
+
81
+ [tool.hatch.build.targets.wheel]
82
+ packages = ["src/souppot"]
83
+
84
+ [tool.pytest.ini_options]
85
+ testpaths = ["tests"]
86
+ pythonpath = ["src"]
87
+ markers = [
88
+ "functional: local HTTP server tests that exercise real requests and Playwright paths",
89
+ ]
@@ -0,0 +1,5 @@
1
+ """Public package interface for souppot HTML fetching helpers."""
2
+
3
+ from .core import cold_soup, hot_pot, hot_soup
4
+
5
+ __all__ = ("cold_soup", "hot_soup", "hot_pot")
@@ -0,0 +1,261 @@
1
+ """Core helpers for fetching static pages, rendered pages, and downloads.
2
+
3
+ ``cold_soup`` uses ``requests`` for normal HTTP responses. ``hot_soup`` and
4
+ ``hot_pot`` use Playwright Chromium for JavaScript-rendered pages and
5
+ browser-like download requests.
6
+ """
7
+
8
+ import time
9
+ from pathlib import Path
10
+ from typing import Final
11
+ from urllib.parse import urlparse
12
+
13
+ import requests
14
+ import tuning
15
+ from bs4 import BeautifulSoup
16
+ from playwright.sync_api import Error as PlaywrightError
17
+ from playwright.sync_api import Browser
18
+ from playwright.sync_api import BrowserContext
19
+ from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
20
+ from playwright.sync_api import sync_playwright
21
+
22
+
23
+ logger = tuning.getLogger(__name__)
24
+
25
# Desktop Chrome user agent used for both the ``requests`` headers below and
# the Playwright browser contexts.
BROWSER_USER_AGENT: Final[str] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)
# ``Accept`` value sent when requesting HTML documents.
HTML_ACCEPT: Final[str] = (
    "text/html,application/xhtml+xml,application/xml;q=0.9,"
    "image/avif,image/webp,image/apng,*/*;q=0.8"
)
# Request headers used by ``cold_soup`` (sent through ``requests``).
HTML_HEADERS: Final[dict[str, str]] = {
    "User-Agent": BROWSER_USER_AGENT,
    "Accept": HTML_ACCEPT,
    "Accept-Language": "en-US,en;q=0.9",
    # Advertise only encodings ``requests`` can always decode. Brotli ("br")
    # is decoded only when an optional brotli package is installed, and none
    # is declared as a dependency, so offering it could yield a compressed
    # body that ``Response.text`` returns undecoded.
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "DNT": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
}
# Extra headers applied to the Playwright browser context in ``hot_soup``.
PLAYWRIGHT_HTML_HEADERS: Final[dict[str, str]] = {
    "Accept": HTML_ACCEPT,
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1",
}
# Headers sent with the Playwright request issued by ``hot_pot``.
DOWNLOAD_HEADERS: Final[dict[str, str]] = {
    "Accept": "application/octet-stream,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
}
63
+
64
+ __all__: Final[tuple[str, ...]] = ("cold_soup", "hot_soup", "hot_pot")
65
+
66
+
67
+ def _clean_url(url: str | None) -> str | None:
68
+ """Strip URL input and normalize missing values to ``None``."""
69
+ if url is None:
70
+ return None
71
+ url = str(url).strip()
72
+ return url or None
73
+
74
+
75
def cold_soup(
    url: str | None,
    check_errors: bool = False,
) -> BeautifulSoup | requests.Response | None:
    """Fetch a URL with ``requests`` and parse HTML responses.

    The request is sent with ``HTML_HEADERS`` plus a ``Referer`` pointing at
    the URL's own origin (when the URL has both a scheme and a host), follows
    redirects, and uses a 15 second timeout.

    Args:
        url: URL to fetch. ``None`` and blank strings are treated as missing.
        check_errors: If true, call ``raise_for_status()`` before normal status
            handling.

    Returns:
        ``BeautifulSoup`` for ``200`` responses with a ``text/html`` content
        type, the raw ``requests.Response`` for other ``200`` responses, and
        ``None`` for missing URLs or non-``200`` responses.

    Raises:
        requests.HTTPError: If ``check_errors`` is true and the response status
            is an HTTP error.
        requests.RequestException: Transport-level failures raised by
            ``requests.get`` (connection errors, timeouts, ...) are not caught
            here and propagate to the caller.
    """
    url = _clean_url(url)
    if url is None:
        logger.warning("URL not provided")
        return None
    logger.debug("GET %s", url)

    headers = HTML_HEADERS.copy()
    parsed = urlparse(url)
    if parsed.scheme and parsed.netloc:
        # Present the site's own root as the referring page.
        headers["Referer"] = f"{parsed.scheme}://{parsed.netloc}/"

    res = requests.get(url=url, headers=headers, timeout=15, allow_redirects=True)
    if check_errors:
        res.raise_for_status()
    if res.history:
        logger.debug("Redirected (%s hops) -> %s", len(res.history), res.url)
        for hop in res.history:
            logger.debug("  %s %s", hop.status_code, hop.url)
    if res.status_code != 200:
        logger.error("HTTP error: %s", res.status_code)
        return None

    ct = res.headers.get("Content-Type", "").lower()
    if "text/html" not in ct:
        logger.debug("Not an HTML page (%s)", ct)
        return res

    # Read ``res.text`` once: ``requests`` decodes the body on every access,
    # so reusing the local avoids decoding the page a second time for the log.
    html = res.text
    soup = BeautifulSoup(html, "html.parser")
    logger.debug("✅ Soup is served (%s chars)", len(html))
    return soup
129
+
130
+
131
def hot_soup(
    url: str | None,
    wait_seconds: float = 3,
    wait_selector: str | None = None,
) -> BeautifulSoup | None:
    """Load a page in headless Playwright Chromium and parse the rendered DOM.

    Args:
        url: Page to render. ``None`` or a blank string counts as missing.
        wait_seconds: With no ``wait_selector``, how long to sleep once
            ``domcontentloaded`` has fired. With a selector, the selector
            timeout in seconds (converted to milliseconds, at least 1000 ms).
        wait_selector: CSS selector to wait for before reading the DOM. A
            timeout is logged and whatever has rendered is parsed regardless.

    Returns:
        The parsed page as ``BeautifulSoup``; ``None`` when the URL is missing
        or Playwright raises an error.
    """
    target = _clean_url(url)
    if target is None:
        logger.warning("URL not provided")
        return None

    logger.debug("RENDER %s", target)
    try:
        with sync_playwright() as p:
            browser: Browser | None = None
            context: BrowserContext | None = None
            try:
                browser = p.chromium.launch(headless=True)
                context = browser.new_context(
                    user_agent=BROWSER_USER_AGENT,
                    locale="en-US",
                    viewport={"width": 1920, "height": 1080},
                )
                context.set_extra_http_headers(PLAYWRIGHT_HTML_HEADERS.copy())

                page = context.new_page()
                response = page.goto(
                    target, wait_until="domcontentloaded", timeout=30_000
                )
                if page.url != target:
                    status = "?" if response is None else response.status
                    logger.info("Redirected -> %s (status: %s)", page.url, status)

                if not wait_selector:
                    # No selector to wait on: give scripts a fixed grace period.
                    time.sleep(max(0, float(wait_seconds)))
                else:
                    selector_timeout_ms = max(1000, int(wait_seconds * 1000))
                    try:
                        page.wait_for_selector(
                            wait_selector, timeout=selector_timeout_ms
                        )
                    except PlaywrightTimeoutError:
                        # Best effort: fall through and parse the current DOM.
                        logger.error(
                            "Timeout waiting for selector: %s", wait_selector
                        )

                html = page.content()
            finally:
                # Close the context and browser whether or not rendering worked.
                if context is not None:
                    context.close()
                if browser is not None:
                    browser.close()

        soup = BeautifulSoup(html, "html.parser")
        logger.debug("✅ Soup is served (JS-rendered - %s chars)", len(html))
        return soup

    except PlaywrightError as e:
        logger.error("Playwright error: %s", e, exc_info=True)
        return None
201
+
202
+
203
def hot_pot(
    url: str | None,
    dest: str | Path,
    referer: str | None = None,
    timeout_ms: int = 60_000,
) -> Path:
    """Fetch a URL through a Playwright browser context and write it to a file.

    Args:
        url: Resource to download. ``None`` or a blank string raises
            ``ValueError``.
        dest: Target file path; missing parent directories are created before
            the request is made.
        referer: ``Referer`` header value to send, if any.
        timeout_ms: Timeout for the Playwright request, in milliseconds.

    Returns:
        ``dest`` converted to a ``Path``.

    Raises:
        ValueError: If ``url`` is missing.
        playwright.sync_api.Error: If Playwright cannot complete the request
            (the request is made with ``fail_on_status_code=True``).
    """
    target = _clean_url(url)
    if target is None:
        raise ValueError("URL not provided")

    out_path = Path(dest)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Per-call header set: the shared defaults plus the optional referer.
    request_headers = dict(DOWNLOAD_HEADERS)
    if referer:
        request_headers["Referer"] = referer

    logger.debug("PLAYWRIGHT GET %s", target)
    with sync_playwright() as p:
        browser: Browser | None = None
        context: BrowserContext | None = None
        try:
            browser = p.chromium.launch(headless=True)
            context = browser.new_context(
                user_agent=BROWSER_USER_AGENT,
                locale="en-US",
            )
            response = context.request.get(
                target,
                headers=request_headers,
                fail_on_status_code=True,
                timeout=timeout_ms,
            )
            body = response.body()
        finally:
            # Close the context and browser even when the request failed.
            if context is not None:
                context.close()
            if browser is not None:
                browser.close()

    out_path.write_bytes(body)
    logger.debug("Download saved to: %s", out_path)
    return out_path
File without changes
@@ -0,0 +1,12 @@
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <title>Soup Pot Fixture</title>
5
+ </head>
6
+ <body>
7
+ <main id="content">
8
+ <h1 id="title">Soup Pot</h1>
9
+ <p class="message">Fixture HTML for parser unit tests.</p>
10
+ </main>
11
+ </body>
12
+ </html>
@@ -0,0 +1 @@
1
+ souppot functional download fixture
@@ -0,0 +1,21 @@
1
+ <!doctype html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="utf-8">
5
+ <title>Souppot Functional Fixture</title>
6
+ </head>
7
+ <body>
8
+ <main id="content">
9
+ <h1 id="title">Souppot Functional Fixture</h1>
10
+ <p class="static">This element is present in the original HTML.</p>
11
+ </main>
12
+ <script>
13
+ setTimeout(() => {
14
+ const node = document.createElement("p");
15
+ node.className = "delayed";
16
+ node.textContent = "This element was created by JavaScript.";
17
+ document.querySelector("#content").appendChild(node);
18
+ }, 200);
19
+ </script>
20
+ </body>
21
+ </html>
@@ -0,0 +1,91 @@
1
from collections.abc import Iterator
from functools import partial
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from pathlib import Path
from threading import Thread

import pytest
from bs4 import BeautifulSoup
from playwright.sync_api import Error as PlaywrightError
from playwright.sync_api import sync_playwright
from souppot import cold_soup, hot_pot, hot_soup
11
+
12
+
13
+ pytestmark = pytest.mark.functional
14
+
15
+ FIXTURES = Path(__file__).parent / "fixtures"
16
+
17
+
18
class QuietHandler(SimpleHTTPRequestHandler):
    """Static file handler whose ``log_message`` override emits nothing."""

    def log_message(self, format: str, *args: object) -> None:
        """Discard the log line instead of writing it anywhere."""
21
+
22
+
23
@pytest.fixture(scope="module")
def fixture_server() -> Iterator[str]:
    """Serve the fixtures directory over local HTTP for the whole module.

    The server binds to ``127.0.0.1`` on an OS-assigned port (port ``0``) and
    runs in a daemon thread. It is shut down and joined on teardown.

    Yields:
        The server's base URL, ``http://<host>:<port>``, without a trailing
        slash.
    """
    handler = partial(QuietHandler, directory=str(FIXTURES))
    server = ThreadingHTTPServer(("127.0.0.1", 0), handler)
    thread = Thread(target=server.serve_forever, daemon=True)
    thread.start()

    try:
        host, port = server.server_address
        yield f"http://{host}:{port}"
    finally:
        # Stop the serve loop first, then release the socket and the thread.
        server.shutdown()
        server.server_close()
        thread.join(timeout=5)
37
+
38
+
39
@pytest.fixture(scope="module")
def chromium_available() -> None:
    """Skip the requesting tests when headless Chromium cannot be launched."""
    try:
        with sync_playwright() as playwright:
            # Launch and immediately close: only launchability matters here.
            playwright.chromium.launch(headless=True).close()
    except PlaywrightError as exc:
        pytest.skip(f"Playwright Chromium is not available: {exc}")
47
+
48
+
49
def test_cold_soup_fetches_local_html(fixture_server: str) -> None:
    """``cold_soup`` parses the served HTML and sees no script-created node."""
    page_url = f"{fixture_server}/page.html"
    soup = cold_soup(page_url)

    assert isinstance(soup, BeautifulSoup)

    title_text = soup.select_one("#title").get_text(strip=True)
    assert title_text == "Souppot Functional Fixture"

    static_text = soup.select_one(".static").get_text(strip=True)
    assert static_text == "This element is present in the original HTML."

    # The ``.delayed`` paragraph is only added by the page's script.
    assert soup.select_one(".delayed") is None
61
+
62
+
63
def test_hot_soup_waits_for_javascript_created_element(
    fixture_server: str,
    chromium_available: None,
) -> None:
    """``hot_soup`` returns a DOM containing the script-inserted paragraph."""
    page_url = f"{fixture_server}/page.html"
    soup = hot_soup(page_url, wait_selector=".delayed", wait_seconds=2)

    assert isinstance(soup, BeautifulSoup)

    delayed_text = soup.select_one(".delayed").get_text(strip=True)
    assert delayed_text == "This element was created by JavaScript."
76
+
77
+
78
def test_hot_pot_downloads_local_file(
    fixture_server: str,
    chromium_available: None,
    tmp_path: Path,
) -> None:
    """``hot_pot`` saves a byte-identical copy and returns the destination."""
    expected_bytes_path = FIXTURES / "dummy.bin"
    # ``downloads`` does not exist yet, so this also covers directory creation.
    dest = tmp_path / "downloads" / "dummy.bin"

    file_url = f"{fixture_server}/dummy.bin"
    referring_page = f"{fixture_server}/page.html"
    result = hot_pot(file_url, dest, referer=referring_page)

    assert result == dest
    assert dest.read_bytes() == expected_bytes_path.read_bytes()
@@ -0,0 +1,353 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+ import souppot
5
+ from bs4 import BeautifulSoup
6
+ from souppot import core
7
+
8
+
9
+ FIXTURES = Path(__file__).parent / "fixtures"
10
+
11
+
12
+ class FakeResponse:
13
+ def __init__(
14
+ self,
15
+ *,
16
+ status_code: int = 200,
17
+ text: str = "",
18
+ headers: dict[str, str] | None = None,
19
+ url: str = "https://example.com/page",
20
+ history: list[object] | None = None,
21
+ error: Exception | None = None,
22
+ ) -> None:
23
+ self.status_code = status_code
24
+ self.text = text
25
+ self.headers = headers or {}
26
+ self.url = url
27
+ self.history = history or []
28
+ self.error = error
29
+
30
+ def raise_for_status(self) -> None:
31
+ if self.error is not None:
32
+ raise self.error
33
+
34
+
35
class FakeRenderedResponse:
    """Stand-in for the response object returned by ``FakePage.goto``."""

    # Fixed status value exposed on every instance.
    status: int = 200
37
+
38
+
39
class FakePage:
    """Test double for a Playwright page that serves canned HTML.

    ``goto`` and ``wait_for_selector`` record how they were called so tests
    can inspect the arguments afterwards.
    """

    def __init__(self, html: str, *, wait_raises: bool = False) -> None:
        self.html = html
        self.wait_raises = wait_raises
        self.url = "https://example.com/page"
        self.wait_selector_calls: list[tuple[str, int]] = []

    def goto(self, url: str, *, wait_until: str, timeout: int) -> FakeRenderedResponse:
        """Remember the navigation arguments and return a canned response."""
        self.url = url
        self.goto_call = dict(url=url, wait_until=wait_until, timeout=timeout)
        return FakeRenderedResponse()

    def wait_for_selector(self, selector: str, *, timeout: int) -> None:
        """Log the call; raise a Playwright timeout when ``wait_raises`` is set."""
        self.wait_selector_calls.append((selector, timeout))
        if not self.wait_raises:
            return
        raise core.PlaywrightTimeoutError("selector timed out")

    def content(self) -> str:
        """Return the HTML supplied at construction time."""
        return self.html
58
+
59
+
60
class FakeBrowserContext:
    """Test double for a browser context that always yields one ``FakePage``."""

    def __init__(self, page: FakePage) -> None:
        self.page = page
        self.closed = False
        self.extra_headers: dict[str, str] | None = None

    def set_extra_http_headers(self, headers: dict[str, str]) -> None:
        """Store ``headers`` so a test can inspect them later."""
        self.extra_headers = headers

    def new_page(self) -> FakePage:
        """Return the page supplied at construction time."""
        return self.page

    def close(self) -> None:
        """Flag this context as closed."""
        self.closed = True
74
+
75
+
76
class FakeBrowser:
    """Test double for a browser that hands out one prepared context."""

    def __init__(self, context: FakeBrowserContext) -> None:
        self.closed = False
        self.context = context

    def new_context(self, **kwargs: object) -> FakeBrowserContext:
        """Record the keyword arguments and return the prepared context."""
        self.new_context_kwargs = kwargs
        return self.context

    def close(self) -> None:
        """Flag this browser as closed."""
        self.closed = True
87
+
88
+
89
class FakeChromium:
    """Test double for the ``chromium`` launcher; returns a prepared browser."""

    def __init__(self, browser: FakeBrowser) -> None:
        self.browser = browser

    def launch(self, *, headless: bool) -> FakeBrowser:
        """Record the ``headless`` flag and return the prepared browser."""
        self.launch_kwargs = dict(headless=headless)
        return self.browser
96
+
97
+
98
class FakeSyncPlaywright:
    """Stand-in for the context manager returned by ``sync_playwright()``."""

    def __init__(self, chromium: FakeChromium) -> None:
        self.chromium = chromium

    def __enter__(self) -> "FakeSyncPlaywright":
        """Enter the ``with`` block, yielding this object itself."""
        return self

    def __exit__(self, *exc_info: object) -> None:
        """Leave the ``with`` block; returning ``None`` suppresses nothing."""
        return None
107
+
108
+
109
class FakeDownloadResponse:
    """Stand-in for a download response that exposes a fixed byte payload."""

    def __init__(self, body: bytes) -> None:
        self._body = body

    def body(self) -> bytes:
        """Return the payload supplied at construction time."""
        payload = self._body
        return payload
115
+
116
+
117
class FakeRequestContext:
    """Test double for a request context that logs every ``get`` call."""

    def __init__(self, body: bytes) -> None:
        self.calls: list[dict[str, object]] = []
        self.body = body

    def get(self, url: str, **kwargs: object) -> FakeDownloadResponse:
        """Log the URL plus keyword arguments and return the canned body."""
        record: dict[str, object] = {"url": url}
        record.update(kwargs)
        self.calls.append(record)
        return FakeDownloadResponse(self.body)
125
+
126
+
127
class FakeDownloadContext:
    """Test double for a browser context exposing a fake ``request`` client."""

    def __init__(self, body: bytes) -> None:
        self.closed = False
        # Every ``request.get`` call will answer with ``body``.
        self.request = FakeRequestContext(body)

    def close(self) -> None:
        """Flag this context as closed."""
        self.closed = True
134
+
135
+
136
class FakeDownloadBrowser:
    """Test double for a browser that hands out one download context."""

    def __init__(self, context: FakeDownloadContext) -> None:
        self.closed = False
        self.context = context

    def new_context(self, **kwargs: object) -> FakeDownloadContext:
        """Record the keyword arguments and return the prepared context."""
        self.new_context_kwargs = kwargs
        return self.context

    def close(self) -> None:
        """Flag this browser as closed."""
        self.closed = True
147
+
148
+
149
class FakeDownloadChromium:
    """Test double for the ``chromium`` launcher used by the download test."""

    def __init__(self, browser: FakeDownloadBrowser) -> None:
        self.browser = browser

    def launch(self, *, headless: bool) -> FakeDownloadBrowser:
        """Record the ``headless`` flag and return the prepared browser."""
        self.launch_kwargs = dict(headless=headless)
        return self.browser
156
+
157
+
158
@pytest.fixture
def fixture_html() -> str:
    """Return the text of ``basic.html`` from ``FIXTURES``, decoded as UTF-8."""
    html_path = FIXTURES / "basic.html"
    return html_path.read_text(encoding="utf-8")
161
+
162
+
163
def test_package_exports_public_api() -> None:
    """``souppot.__all__`` names the three entry points re-exported from ``core``."""
    exported = ("cold_soup", "hot_soup", "hot_pot")

    assert souppot.__all__ == exported
    # Each exported name must be the very same object as in ``core``.
    for name in exported:
        assert getattr(souppot, name) is getattr(core, name)
168
+
169
+
170
@pytest.mark.parametrize("url", [None, "", " "])
def test_cold_soup_missing_url_returns_none_without_request(
    monkeypatch: pytest.MonkeyPatch, url: str | None
) -> None:
    """A missing or blank URL yields ``None`` and never reaches ``requests.get``."""

    def _unexpected_get(**kwargs: object) -> None:
        raise AssertionError("requests.get should not be called")

    monkeypatch.setattr(core.requests, "get", _unexpected_get)

    outcome = core.cold_soup(url)
    assert outcome is None
180
+
181
+
182
def test_cold_soup_sends_browser_like_request_headers(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """cold_soup strips the URL and passes browser-like keyword arguments."""
    canned = FakeResponse(headers={"Content-Type": "application/json"})
    recorded: list[dict[str, object]] = []

    def _recording_get(**kwargs: object) -> FakeResponse:
        recorded.append(kwargs)
        return canned

    monkeypatch.setattr(core.requests, "get", _recording_get)

    returned = core.cold_soup(" https://example.com/path ")
    assert returned is canned

    request_kwargs = recorded[0]
    sent_headers = request_kwargs["headers"]

    assert request_kwargs["url"] == "https://example.com/path"
    assert "stream" not in request_kwargs
    assert request_kwargs["timeout"] == 15
    assert request_kwargs["allow_redirects"] is True
    assert isinstance(sent_headers, dict)
    assert "Mozilla/5.0" in sent_headers["User-Agent"]
    assert sent_headers["Referer"] == "https://example.com/"
205
+
206
+
207
def test_cold_soup_returns_beautifulsoup_for_html_response(
    monkeypatch: pytest.MonkeyPatch,
    fixture_html: str,
) -> None:
    """A ``text/html`` response is returned as a parsed ``BeautifulSoup``."""
    html_headers = {"Content-Type": "text/html; charset=utf-8"}
    canned = FakeResponse(text=fixture_html, headers=html_headers)

    def _canned_get(**kwargs: object) -> FakeResponse:
        return canned

    monkeypatch.setattr(core.requests, "get", _canned_get)

    soup = core.cold_soup("https://example.com/page")

    assert isinstance(soup, BeautifulSoup)
    title_text = soup.select_one("#title").get_text(strip=True)
    assert title_text == "Soup Pot"
221
+
222
+
223
def test_cold_soup_returns_response_for_non_html_response(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """An ``application/json`` response comes back as the response object."""
    json_headers = {"Content-Type": "application/json"}
    canned = FakeResponse(text='{"ok": true}', headers=json_headers)

    def _canned_get(**kwargs: object) -> FakeResponse:
        return canned

    monkeypatch.setattr(core.requests, "get", _canned_get)

    returned = core.cold_soup("https://example.com/data.json")
    assert returned is canned
233
+
234
+
235
def test_cold_soup_returns_none_for_non_200_response(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A 404 response makes cold_soup return ``None``."""
    not_found = FakeResponse(status_code=404, headers={"Content-Type": "text/html"})

    def _canned_get(**kwargs: object) -> FakeResponse:
        return not_found

    monkeypatch.setattr(core.requests, "get", _canned_get)

    outcome = core.cold_soup("https://example.com/missing")
    assert outcome is None
243
+
244
+
245
def test_cold_soup_check_errors_raises_before_status_handling(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``check_errors=True`` the error from ``raise_for_status`` propagates."""

    class MarkerError(Exception):
        """Unique exception type so the test can tell where the error came from."""

    failing = FakeResponse(status_code=500, error=MarkerError("boom"))

    def _canned_get(**kwargs: object) -> FakeResponse:
        return failing

    monkeypatch.setattr(core.requests, "get", _canned_get)

    with pytest.raises(MarkerError):
        core.cold_soup("https://example.com/error", check_errors=True)
257
+
258
+
259
@pytest.mark.parametrize("url", [None, "", " "])
def test_hot_soup_missing_url_returns_none_without_playwright(
    monkeypatch: pytest.MonkeyPatch, url: str | None
) -> None:
    """A missing or blank URL yields ``None`` without starting Playwright."""

    def _unexpected_playwright() -> None:
        raise AssertionError("sync_playwright should not be called")

    monkeypatch.setattr(core, "sync_playwright", _unexpected_playwright)

    outcome = core.hot_soup(url)
    assert outcome is None
269
+
270
+
271
def test_hot_soup_parses_rendered_html_from_fake_playwright(
    monkeypatch: pytest.MonkeyPatch,
    fixture_html: str,
) -> None:
    """hot_soup parses the fake page's HTML and closes context and browser."""
    context = FakeBrowserContext(FakePage(fixture_html))
    browser = FakeBrowser(context)
    playwright = FakeSyncPlaywright(FakeChromium(browser))
    sleep_calls: list[float] = []

    def _fake_sync_playwright() -> FakeSyncPlaywright:
        return playwright

    def _record_sleep(seconds: float) -> None:
        sleep_calls.append(seconds)

    monkeypatch.setattr(core, "sync_playwright", _fake_sync_playwright)
    monkeypatch.setattr(core.time, "sleep", _record_sleep)

    soup = core.hot_soup("https://example.com/page", wait_seconds=0)

    assert isinstance(soup, BeautifulSoup)
    message_text = soup.select_one(".message").get_text(strip=True)
    assert message_text == "Fixture HTML for parser unit tests."
    # Exactly one sleep, with the requested duration.
    assert sleep_calls == [0]
    assert context.closed is True
    assert browser.closed is True
294
+
295
+
296
def test_hot_soup_wait_selector_timeout_still_parses_html(
    monkeypatch: pytest.MonkeyPatch,
    fixture_html: str,
) -> None:
    """A selector timeout does not stop hot_soup from returning parsed HTML."""
    page = FakePage(fixture_html, wait_raises=True)
    browser = FakeBrowser(FakeBrowserContext(page))
    playwright = FakeSyncPlaywright(FakeChromium(browser))

    def _fake_sync_playwright() -> FakeSyncPlaywright:
        return playwright

    monkeypatch.setattr(core, "sync_playwright", _fake_sync_playwright)

    soup = core.hot_soup(
        "https://example.com/page",
        wait_selector="#missing",
        wait_seconds=0.2,
    )

    assert isinstance(soup, BeautifulSoup)
    assert soup.select_one("#content") is not None
    # One wait was attempted, with a 1000 ms timeout.
    expected_waits = [("#missing", 1000)]
    assert page.wait_selector_calls == expected_waits
314
+
315
+
316
@pytest.mark.parametrize("url", [None, "", " "])
def test_hot_pot_missing_url_raises_value_error(
    url: str | None, tmp_path: Path
) -> None:
    """A missing or blank URL makes hot_pot raise ``ValueError``."""
    destination = tmp_path / "out.bin"

    with pytest.raises(ValueError, match="URL not provided"):
        core.hot_pot(url, destination)
322
+
323
+
324
def test_hot_pot_creates_parent_dirs_and_writes_body(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """hot_pot writes the fetched body under a not-yet-existing directory."""
    payload = b"downloaded bytes"
    context = FakeDownloadContext(payload)
    browser = FakeDownloadBrowser(context)
    playwright = FakeSyncPlaywright(FakeDownloadChromium(browser))
    target = tmp_path / "nested" / "out.bin"

    def _fake_sync_playwright() -> FakeSyncPlaywright:
        return playwright

    monkeypatch.setattr(core, "sync_playwright", _fake_sync_playwright)

    saved_to = core.hot_pot(
        " https://example.com/file.bin ",
        target,
        referer="https://example.com/page",
        timeout_ms=123,
    )

    assert saved_to == target
    assert target.read_bytes() == payload
    assert context.closed is True
    assert browser.closed is True

    # Inspect the single request recorded by the fake request context.
    request_kwargs = context.request.calls[0]
    sent_headers = request_kwargs["headers"]
    assert request_kwargs["url"] == "https://example.com/file.bin"
    assert request_kwargs["fail_on_status_code"] is True
    assert request_kwargs["timeout"] == 123
    assert isinstance(sent_headers, dict)
    assert sent_headers["Referer"] == "https://example.com/page"