souppot 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- souppot-0.1.0/.github/workflows/ci.yml +44 -0
- souppot-0.1.0/.github/workflows/docs.yml +51 -0
- souppot-0.1.0/.github/workflows/release.yml +51 -0
- souppot-0.1.0/.gitignore +10 -0
- souppot-0.1.0/CHANGELOG.md +9 -0
- souppot-0.1.0/LICENSE +21 -0
- souppot-0.1.0/PKG-INFO +25 -0
- souppot-0.1.0/README.md +73 -0
- souppot-0.1.0/docs/api.rst +5 -0
- souppot-0.1.0/docs/conf.py +51 -0
- souppot-0.1.0/docs/index.md +37 -0
- souppot-0.1.0/docs/usage.md +49 -0
- souppot-0.1.0/pyproject.toml +89 -0
- souppot-0.1.0/src/souppot/__init__.py +5 -0
- souppot-0.1.0/src/souppot/core.py +261 -0
- souppot-0.1.0/src/souppot/py.typed +0 -0
- souppot-0.1.0/tests/fixtures/basic.html +12 -0
- souppot-0.1.0/tests/functional/fixtures/dummy.bin +1 -0
- souppot-0.1.0/tests/functional/fixtures/page.html +21 -0
- souppot-0.1.0/tests/functional/test_functional.py +91 -0
- souppot-0.1.0/tests/test_core.py +353 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
push:
|
|
6
|
+
branches:
|
|
7
|
+
- main
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
quality:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- name: Check out repository
|
|
15
|
+
uses: actions/checkout@v4
|
|
16
|
+
|
|
17
|
+
- name: Set up Python
|
|
18
|
+
uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.11"
|
|
21
|
+
|
|
22
|
+
- name: Install Hatch
|
|
23
|
+
run: python -m pip install --upgrade pip hatch
|
|
24
|
+
|
|
25
|
+
- name: Check formatting
|
|
26
|
+
run: hatch run ruff format --check .
|
|
27
|
+
|
|
28
|
+
- name: Lint
|
|
29
|
+
run: hatch run ruff check .
|
|
30
|
+
|
|
31
|
+
- name: Type check
|
|
32
|
+
run: hatch run mypy src/souppot
|
|
33
|
+
|
|
34
|
+
- name: Run unit tests
|
|
35
|
+
run: hatch run pytest tests/test_core.py
|
|
36
|
+
|
|
37
|
+
- name: Install Playwright Chromium
|
|
38
|
+
run: hatch run python -m playwright install --with-deps chromium
|
|
39
|
+
|
|
40
|
+
- name: Run functional tests
|
|
41
|
+
run: hatch run pytest tests/functional
|
|
42
|
+
|
|
43
|
+
- name: Build package
|
|
44
|
+
run: hatch run python -m build
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
name: Docs
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
push:
|
|
6
|
+
branches:
|
|
7
|
+
- docs
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
pages: write
|
|
12
|
+
id-token: write
|
|
13
|
+
|
|
14
|
+
concurrency:
|
|
15
|
+
group: pages
|
|
16
|
+
cancel-in-progress: false
|
|
17
|
+
|
|
18
|
+
jobs:
|
|
19
|
+
deploy:
|
|
20
|
+
name: Deploy
|
|
21
|
+
runs-on: ubuntu-latest
|
|
22
|
+
environment:
|
|
23
|
+
name: github-pages
|
|
24
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
25
|
+
|
|
26
|
+
steps:
|
|
27
|
+
- name: Check out repository
|
|
28
|
+
uses: actions/checkout@v4
|
|
29
|
+
|
|
30
|
+
- name: Set up Python
|
|
31
|
+
uses: actions/setup-python@v5
|
|
32
|
+
with:
|
|
33
|
+
python-version: "3.11"
|
|
34
|
+
|
|
35
|
+
- name: Install Hatch
|
|
36
|
+
run: python -m pip install --upgrade pip hatch
|
|
37
|
+
|
|
38
|
+
- name: Build docs
|
|
39
|
+
run: hatch run docs:build
|
|
40
|
+
|
|
41
|
+
- name: Configure Pages
|
|
42
|
+
uses: actions/configure-pages@v5
|
|
43
|
+
|
|
44
|
+
- name: Upload Pages artifact
|
|
45
|
+
uses: actions/upload-pages-artifact@v3
|
|
46
|
+
with:
|
|
47
|
+
path: docs/_build/html
|
|
48
|
+
|
|
49
|
+
- name: Deploy to GitHub Pages
|
|
50
|
+
id: deployment
|
|
51
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: write
|
|
11
|
+
id-token: write
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
build:
|
|
15
|
+
name: Build and publish distribution packages
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
environment:
|
|
18
|
+
name: pypi
|
|
19
|
+
url: https://pypi.org/p/souppot
|
|
20
|
+
|
|
21
|
+
steps:
|
|
22
|
+
- name: Check out repository
|
|
23
|
+
uses: actions/checkout@v4
|
|
24
|
+
|
|
25
|
+
- name: Set up Python
|
|
26
|
+
uses: actions/setup-python@v5
|
|
27
|
+
with:
|
|
28
|
+
python-version: "3.11"
|
|
29
|
+
|
|
30
|
+
- name: Install build tooling
|
|
31
|
+
run: python -m pip install --upgrade pip build twine
|
|
32
|
+
|
|
33
|
+
- name: Build sdist and wheel
|
|
34
|
+
run: python -m build
|
|
35
|
+
|
|
36
|
+
- name: Check distribution metadata
|
|
37
|
+
run: twine check dist/*
|
|
38
|
+
|
|
39
|
+
- name: Publish to PyPI
|
|
40
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
41
|
+
with:
|
|
42
|
+
skip-existing: true
|
|
43
|
+
|
|
44
|
+
- name: Generate SHA256 checksums
|
|
45
|
+
run: sha256sum dist/* > dist/checksums.txt
|
|
46
|
+
|
|
47
|
+
- name: Upload artifacts to GitHub Release
|
|
48
|
+
uses: softprops/action-gh-release@v3
|
|
49
|
+
with:
|
|
50
|
+
files: dist/*
|
|
51
|
+
generate_release_notes: true
|
souppot-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## v0.1.0
|
|
4
|
+
|
|
5
|
+
Initial release.
|
|
6
|
+
|
|
7
|
+
- Added `cold_soup` for fetching static HTML with `requests`.
|
|
8
|
+
- Added `hot_soup` for parsing JavaScript-rendered pages with Playwright Chromium.
|
|
9
|
+
- Added `hot_pot` for downloading files through Playwright's request context.
|
souppot-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 souppot contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
souppot-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: souppot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Small helpers for fetching and parsing HTML with requests or Playwright.
|
|
5
|
+
Project-URL: Repository, https://github.com/octanima-labs/souppot
|
|
6
|
+
Project-URL: Documentation, https://octanima-labs.github.io/souppot/
|
|
7
|
+
Project-URL: Issues, https://github.com/octanima-labs/souppot/issues
|
|
8
|
+
Author-email: octanima-labs <octanima@tuta.io>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Requires-Dist: 2ning
|
|
23
|
+
Requires-Dist: beautifulsoup4
|
|
24
|
+
Requires-Dist: playwright
|
|
25
|
+
Requires-Dist: requests
|
souppot-0.1.0/README.md
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# souppot
|
|
2
|
+
|
|
3
|
+
Small helpers for fetching and parsing HTML with `requests` or Playwright.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
From a checkout:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install .
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
When published to PyPI:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install souppot
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
For JavaScript-rendered pages and Playwright-backed downloads, install Chromium:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
python -m playwright install chromium
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Usage
|
|
26
|
+
|
|
27
|
+
Fetch static HTML:
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from souppot import cold_soup
|
|
31
|
+
|
|
32
|
+
soup = cold_soup("https://example.com")
|
|
33
|
+
|
|
34
|
+
if soup:
|
|
35
|
+
print(soup.title.string)
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Fetch JavaScript-rendered HTML:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from souppot import hot_soup
|
|
42
|
+
|
|
43
|
+
soup = hot_soup("https://example.com", wait_selector=".loaded")
|
|
44
|
+
|
|
45
|
+
if soup:
|
|
46
|
+
print(soup.select_one(".loaded").get_text(strip=True))
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Download a file with Playwright:
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from souppot import hot_pot
|
|
53
|
+
|
|
54
|
+
path = hot_pot(
|
|
55
|
+
"https://example.com/file.zip",
|
|
56
|
+
"downloads/file.zip",
|
|
57
|
+
referer="https://example.com",
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
print(path)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Documentation
|
|
64
|
+
|
|
65
|
+
The extended API documentation can be found [here](https://octanima-labs.github.io/souppot/).
|
|
66
|
+
|
|
67
|
+
## Changelog
|
|
68
|
+
|
|
69
|
+
See [CHANGELOG.md](CHANGELOG.md).
|
|
70
|
+
|
|
71
|
+
## License
|
|
72
|
+
|
|
73
|
+
MIT. See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Directory one level above the folder that contains this configuration file.
ROOT = Path(__file__).resolve().parents[1]
# Put ``<ROOT>/src`` first on the import path.
# NOTE(review): presumably so autodoc imports the in-tree ``souppot`` package
# rather than an installed copy — confirm against docs/api.rst.
sys.path.insert(0, str(ROOT / "src"))

# Project metadata shown in the generated documentation.
project = "souppot"
copyright = "2026, souppot contributors"
author = "souppot contributors"
release = "0.1.0"

# Sphinx extensions enabled for this build.
extensions = [
    "myst_parser",
    "sphinx.ext.autodoc",
    "sphinx.ext.intersphinx",
    "sphinx.ext.napoleon",
    "sphinx_autodoc_typehints",
]

# Source file suffixes mapped to the parser name used for each.
source_suffix = {
    ".md": "markdown",
    ".rst": "restructuredtext",
}
# Name of the root document (without suffix).
master_doc = "index"

# HTML output settings.
html_theme = "pydata_sphinx_theme"
html_title = "souppot"
# The ``**`` pattern is mapped to an empty sidebar template list.
html_sidebars = {
    "**": [],
}
html_theme_options = {
    "show_toc_level": 2,
}

# autodoc / napoleon settings: Google-style docstrings are enabled and
# NumPy-style docstrings are disabled.
autodoc_default_options = {
    "members": True,
    "show-inheritance": True,
}
autodoc_typehints = "description"
autodoc_typehints_format = "short"
napoleon_google_docstring = True
napoleon_numpy_docstring = False

# External documentation sets that cross-references may resolve against.
intersphinx_mapping = {
    "python": ("https://docs.python.org/3", None),
    "requests": ("https://requests.readthedocs.io/en/latest/", None),
    "bs4": ("https://www.crummy.com/software/BeautifulSoup/bs4/doc/", None),
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# souppot
|
|
2
|
+
|
|
3
|
+
Small helpers for fetching and parsing HTML with `requests` or Playwright.
|
|
4
|
+
|
|
5
|
+
Use `cold_soup` for normal server-rendered HTML, `hot_soup` for JavaScript-rendered pages, and `hot_pot` when a download needs Playwright's browser-like request stack.
|
|
6
|
+
|
|
7
|
+
```{toctree}
|
|
8
|
+
:maxdepth: 2
|
|
9
|
+
:caption: Contents
|
|
10
|
+
|
|
11
|
+
usage
|
|
12
|
+
api
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
From a checkout:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install .
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
When published to PyPI:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install souppot
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
For JavaScript-rendered pages and Playwright-backed downloads, install Chromium:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
python -m playwright install chromium
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## License
|
|
36
|
+
|
|
37
|
+
souppot is released under the MIT license.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Usage
|
|
2
|
+
|
|
3
|
+
## Static HTML
|
|
4
|
+
|
|
5
|
+
Use `cold_soup` for pages that do not require JavaScript rendering.
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from souppot import cold_soup
|
|
9
|
+
|
|
10
|
+
soup = cold_soup("https://example.com")
|
|
11
|
+
|
|
12
|
+
if soup:
|
|
13
|
+
print(soup.title.string)
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
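`cold_soup` returns a `BeautifulSoup` object for `200` responses whose content type is `text/html`, the raw `requests.Response` for other `200` responses, and `None` for missing URLs or non-200 responses. Because a non-HTML result is a `Response` rather than a parsed document, check `isinstance(soup, BeautifulSoup)` before calling parser methods such as `soup.title`.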
|
|
17
|
+
|
|
18
|
+
## JavaScript-Rendered HTML
|
|
19
|
+
|
|
20
|
+
Use `hot_soup` when a page needs Playwright Chromium to render JavaScript before parsing.
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from souppot import hot_soup
|
|
24
|
+
|
|
25
|
+
soup = hot_soup("https://example.com", wait_selector=".loaded")
|
|
26
|
+
|
|
27
|
+
if soup:
|
|
28
|
+
print(soup.select_one(".loaded").get_text(strip=True))
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
If `wait_selector` times out, `hot_soup` logs the timeout and parses whatever DOM is available.
|
|
32
|
+
|
|
33
|
+
## Downloads
|
|
34
|
+
|
|
35
|
+
Use `hot_pot` when a file should be downloaded through Playwright's request context.
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from souppot import hot_pot
|
|
39
|
+
|
|
40
|
+
path = hot_pot(
|
|
41
|
+
"https://example.com/file.zip",
|
|
42
|
+
"downloads/file.zip",
|
|
43
|
+
referer="https://example.com",
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
print(path)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Parent directories for the destination path are created automatically.
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "souppot"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Small helpers for fetching and parsing HTML with requests or Playwright."
|
|
9
|
+
license = "MIT"
|
|
10
|
+
authors = [
|
|
11
|
+
{ name = "octanima-labs", email = "octanima@tuta.io" },
|
|
12
|
+
]
|
|
13
|
+
requires-python = ">=3.11"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"beautifulsoup4",
|
|
28
|
+
"requests",
|
|
29
|
+
"playwright",
|
|
30
|
+
"2ning",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Repository = "https://github.com/octanima-labs/souppot"
|
|
35
|
+
Documentation = "https://octanima-labs.github.io/souppot/"
|
|
36
|
+
Issues = "https://github.com/octanima-labs/souppot/issues"
|
|
37
|
+
|
|
38
|
+
[dependency-groups]
|
|
39
|
+
dev = [
|
|
40
|
+
"build",
|
|
41
|
+
"mypy",
|
|
42
|
+
"pytest",
|
|
43
|
+
"ruff",
|
|
44
|
+
]
|
|
45
|
+
docs = [
|
|
46
|
+
"myst-parser",
|
|
47
|
+
"pydata-sphinx-theme",
|
|
48
|
+
"sphinx",
|
|
49
|
+
"sphinx-autodoc-typehints",
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
[tool.ruff]
|
|
53
|
+
target-version = "py311"
|
|
54
|
+
|
|
55
|
+
[tool.mypy]
|
|
56
|
+
python_version = "3.11"
|
|
57
|
+
|
|
58
|
+
[[tool.mypy.overrides]]
|
|
59
|
+
module = ["tuning"]
|
|
60
|
+
ignore_missing_imports = true
|
|
61
|
+
|
|
62
|
+
[tool.hatch.envs.default]
|
|
63
|
+
dependencies = [
|
|
64
|
+
"build",
|
|
65
|
+
"mypy",
|
|
66
|
+
"pytest",
|
|
67
|
+
"ruff",
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
[tool.hatch.envs.docs]
|
|
71
|
+
dependencies = [
|
|
72
|
+
"myst-parser",
|
|
73
|
+
"pydata-sphinx-theme",
|
|
74
|
+
"sphinx",
|
|
75
|
+
"sphinx-autodoc-typehints",
|
|
76
|
+
]
|
|
77
|
+
|
|
78
|
+
[tool.hatch.envs.docs.scripts]
|
|
79
|
+
build = "sphinx-build -W -b html docs docs/_build/html"
|
|
80
|
+
|
|
81
|
+
[tool.hatch.build.targets.wheel]
|
|
82
|
+
packages = ["src/souppot"]
|
|
83
|
+
|
|
84
|
+
[tool.pytest.ini_options]
|
|
85
|
+
testpaths = ["tests"]
|
|
86
|
+
pythonpath = ["src"]
|
|
87
|
+
markers = [
|
|
88
|
+
"functional: local HTTP server tests that exercise real requests and Playwright paths",
|
|
89
|
+
]
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
"""Core helpers for fetching static pages, rendered pages, and downloads.
|
|
2
|
+
|
|
3
|
+
``cold_soup`` uses ``requests`` for normal HTTP responses. ``hot_soup`` and
|
|
4
|
+
``hot_pot`` use Playwright Chromium for JavaScript-rendered pages and
|
|
5
|
+
browser-like download requests.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import time
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Final
|
|
11
|
+
from urllib.parse import urlparse
|
|
12
|
+
|
|
13
|
+
import requests
|
|
14
|
+
import tuning
|
|
15
|
+
from bs4 import BeautifulSoup
|
|
16
|
+
from playwright.sync_api import Error as PlaywrightError
|
|
17
|
+
from playwright.sync_api import Browser
|
|
18
|
+
from playwright.sync_api import BrowserContext
|
|
19
|
+
from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
|
|
20
|
+
from playwright.sync_api import sync_playwright
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# Module-level logger, obtained from the ``tuning`` package.
logger = tuning.getLogger(__name__)

# Desktop Chrome user-agent string. It is sent in ``HTML_HEADERS`` and passed
# as ``user_agent`` to the Playwright browser contexts created in this module.
BROWSER_USER_AGENT: Final[str] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)
# ``Accept`` value shared by the two HTML header sets below.
HTML_ACCEPT: Final[str] = (
    "text/html,application/xhtml+xml,application/xml;q=0.9,"
    "image/avif,image/webp,image/apng,*/*;q=0.8"
)
# Headers used by ``cold_soup``, which copies this dict and may add ``Referer``.
HTML_HEADERS: Final[dict[str, str]] = {
    "User-Agent": BROWSER_USER_AGENT,
    "Accept": HTML_ACCEPT,
    "Accept-Language": "en-US,en;q=0.9",
    # NOTE(review): "br" is advertised here; presumably ``requests`` can only
    # decode Brotli bodies when a brotli package is installed — confirm.
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "DNT": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
}
# Extra headers that ``hot_soup`` sets on its browser context. There is no
# ``User-Agent`` entry here; the context receives it via ``user_agent``.
PLAYWRIGHT_HTML_HEADERS: Final[dict[str, str]] = {
    "Accept": HTML_ACCEPT,
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
    "DNT": "1",
    "Upgrade-Insecure-Requests": "1",
}
# Headers used by ``hot_pot``, which copies this dict and may add ``Referer``.
DOWNLOAD_HEADERS: Final[dict[str, str]] = {
    "Accept": "application/octet-stream,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Cache-Control": "no-cache",
    "Pragma": "no-cache",
}

# Public names of this module.
__all__: Final[tuple[str, ...]] = ("cold_soup", "hot_soup", "hot_pot")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _clean_url(url: str | None) -> str | None:
|
|
68
|
+
"""Strip URL input and normalize missing values to ``None``."""
|
|
69
|
+
if url is None:
|
|
70
|
+
return None
|
|
71
|
+
url = str(url).strip()
|
|
72
|
+
return url or None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def cold_soup(
    url: str | None,
    check_errors: bool = False,
) -> BeautifulSoup | requests.Response | None:
    """Fetch ``url`` with ``requests`` and parse the body when it is HTML.

    The request is sent with a copy of ``HTML_HEADERS``, a 15 second timeout,
    and redirects enabled. When the URL has both a scheme and a network
    location, a ``Referer`` header pointing at that origin (with a trailing
    slash) is added.

    Args:
        url: URL to fetch. ``None`` and blank strings count as missing.
        check_errors: When true, ``raise_for_status()`` is called on the
            response before any other status handling.

    Returns:
        A ``BeautifulSoup`` document when the status is ``200`` and the
        ``Content-Type`` header contains ``text/html``; the raw
        ``requests.Response`` for any other ``200`` response; ``None`` when
        the URL is missing or the status is not ``200``.

    Raises:
        requests.HTTPError: If ``check_errors`` is true and the response
            status is an HTTP error.
    """
    target = _clean_url(url)
    if target is None:
        logger.warning("URL not provided")
        return None

    logger.debug("GET %s", target)

    parts = urlparse(target)
    request_headers = HTML_HEADERS.copy()
    if parts.scheme and parts.netloc:
        # Use the site's own origin as the referer.
        request_headers["Referer"] = f"{parts.scheme}://{parts.netloc}" + "/"

    response = requests.get(
        url=target,
        headers=request_headers,
        timeout=15,
        allow_redirects=True,
    )
    if check_errors:
        response.raise_for_status()

    hops = response.history
    if hops:
        logger.debug("Redirected (%s hops) -> %s", len(hops), response.url)
        for hop in hops:
            logger.debug(" %s %s", hop.status_code, hop.url)

    if response.status_code != 200:
        logger.error("HTTP error: %s", response.status_code)
        return None

    content_type = response.headers.get("Content-Type", "").lower()
    if "text/html" not in content_type:
        # Hand the untouched response back so the caller can deal with it.
        logger.debug("Not an HTML page (%s)", content_type)
        return response

    markup = response.text
    parsed_document = BeautifulSoup(markup, "html.parser")
    logger.debug("✅ Soup is served (%s chars)", len(markup))
    return parsed_document
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def hot_soup(
    url: str | None,
    wait_seconds: float = 3,
    wait_selector: str | None = None,
) -> BeautifulSoup | None:
    """Load ``url`` in headless Playwright Chromium and parse the resulting DOM.

    Args:
        url: URL to render. ``None`` and blank strings count as missing.
        wait_seconds: Without ``wait_selector``, the number of seconds to sleep
            after navigation (negative values sleep for zero seconds). With
            ``wait_selector``, this becomes the selector timeout in
            milliseconds, never below 1000 ms.
        wait_selector: Optional CSS selector to wait for before reading the
            page. A timeout is logged and whatever has rendered is parsed.

    Returns:
        ``BeautifulSoup`` built from the page content, or ``None`` when the
        URL is missing or a Playwright error is raised.
    """
    target = _clean_url(url)
    if target is None:
        logger.warning("URL not provided")
        return None

    logger.debug("RENDER %s", target)
    try:
        with sync_playwright() as playwright:
            browser: Browser | None = None
            context: BrowserContext | None = None
            try:
                browser = playwright.chromium.launch(headless=True)
                context = browser.new_context(
                    user_agent=BROWSER_USER_AGENT,
                    locale="en-US",
                    viewport={"width": 1920, "height": 1080},
                )
                context.set_extra_http_headers(PLAYWRIGHT_HTML_HEADERS.copy())

                page = context.new_page()
                response = page.goto(
                    target,
                    wait_until="domcontentloaded",
                    timeout=30_000,
                )
                if page.url != target:
                    status = "?" if response is None else response.status
                    logger.info("Redirected -> %s (status: %s)", page.url, status)

                if not wait_selector:
                    # No selector to wait on: pause for a fixed time instead.
                    time.sleep(max(0, float(wait_seconds)))
                else:
                    selector_timeout_ms = max(1000, int(wait_seconds * 1000))
                    try:
                        page.wait_for_selector(
                            wait_selector,
                            timeout=selector_timeout_ms,
                        )
                    except PlaywrightTimeoutError:
                        # Best effort: fall through and parse the current DOM.
                        logger.error(
                            "Timeout waiting for selector: %s", wait_selector
                        )

                rendered = page.content()
            finally:
                # Close the context before the browser, whichever were created.
                if context is not None:
                    context.close()
                if browser is not None:
                    browser.close()

        parsed_document = BeautifulSoup(rendered, "html.parser")
        logger.debug("✅ Soup is served (JS-rendered - %s chars)", len(rendered))
        return parsed_document

    except PlaywrightError as exc:
        logger.error("Playwright error: %s", exc, exc_info=True)
        return None
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def hot_pot(
    url: str | None,
    dest: str | Path,
    referer: str | None = None,
    timeout_ms: int = 60_000,
) -> Path:
    """Fetch ``url`` through a Playwright request context and write it to ``dest``.

    The destination's parent directories are created before the request is
    made. The response body is read while the browser context is open and
    written to disk after the context and browser have been closed.

    Args:
        url: URL to download. ``None`` and blank strings raise ``ValueError``.
        dest: Destination file path.
        referer: Optional ``Referer`` header value added to the request.
        timeout_ms: Timeout passed to the Playwright request, in milliseconds.

    Returns:
        ``dest`` converted to a ``Path``.

    Raises:
        ValueError: If ``url`` is missing.
        playwright.sync_api.Error: If Playwright cannot complete the request.
    """
    source_url = _clean_url(url)
    if source_url is None:
        raise ValueError("URL not provided")

    target = Path(dest)
    target.parent.mkdir(parents=True, exist_ok=True)

    logger.debug("PLAYWRIGHT GET %s", source_url)
    with sync_playwright() as playwright:
        browser: Browser | None = None
        context: BrowserContext | None = None
        try:
            browser = playwright.chromium.launch(headless=True)
            context = browser.new_context(
                user_agent=BROWSER_USER_AGENT,
                locale="en-US",
            )

            request_headers = DOWNLOAD_HEADERS.copy()
            if referer:
                request_headers["Referer"] = referer

            # fail_on_status_code makes Playwright raise on error statuses.
            download = context.request.get(
                source_url,
                headers=request_headers,
                fail_on_status_code=True,
                timeout=timeout_ms,
            )
            payload = download.body()
        finally:
            # Close the context before the browser, whichever were created.
            if context is not None:
                context.close()
            if browser is not None:
                browser.close()

    target.write_bytes(payload)
    logger.debug("Download saved to: %s", target)
    return target
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
souppot functional download fixture
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
<!doctype html>
|
|
2
|
+
<html lang="en">
|
|
3
|
+
<head>
|
|
4
|
+
<meta charset="utf-8">
|
|
5
|
+
<title>Souppot Functional Fixture</title>
|
|
6
|
+
</head>
|
|
7
|
+
<body>
|
|
8
|
+
<main id="content">
|
|
9
|
+
<h1 id="title">Souppot Functional Fixture</h1>
|
|
10
|
+
<p class="static">This element is present in the original HTML.</p>
|
|
11
|
+
</main>
|
|
12
|
+
<script>
|
|
13
|
+
setTimeout(() => {
|
|
14
|
+
const node = document.createElement("p");
|
|
15
|
+
node.className = "delayed";
|
|
16
|
+
node.textContent = "This element was created by JavaScript.";
|
|
17
|
+
document.querySelector("#content").appendChild(node);
|
|
18
|
+
}, 200);
|
|
19
|
+
</script>
|
|
20
|
+
</body>
|
|
21
|
+
</html>
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
from functools import partial
|
|
2
|
+
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from threading import Thread
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
from playwright.sync_api import Error as PlaywrightError
|
|
9
|
+
from playwright.sync_api import sync_playwright
|
|
10
|
+
from souppot import cold_soup, hot_pot, hot_soup
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
pytestmark = pytest.mark.functional
|
|
14
|
+
|
|
15
|
+
FIXTURES = Path(__file__).parent / "fixtures"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class QuietHandler(SimpleHTTPRequestHandler):
    """``SimpleHTTPRequestHandler`` whose ``log_message`` does nothing."""

    def log_message(self, format: str, *args: object) -> None:
        """Ignore the log message and return ``None``."""
        return
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@pytest.fixture(scope="module")
def fixture_server() -> str:
    """Serve the ``FIXTURES`` directory over local HTTP and yield its base URL.

    A ``ThreadingHTTPServer`` is bound to ``127.0.0.1`` on port 0 and run in a
    daemon thread. After the dependent tests finish, the server is shut down
    and closed, and the thread is joined with a 5 second timeout.
    """
    # NOTE(review): this is a generator fixture; ``-> str`` describes the
    # yielded value, not the generator itself.
    make_handler = partial(QuietHandler, directory=str(FIXTURES))
    httpd = ThreadingHTTPServer(("127.0.0.1", 0), make_handler)
    worker = Thread(target=httpd.serve_forever, daemon=True)
    worker.start()

    try:
        host, port = httpd.server_address
        yield f"http://{host}:{port}"
    finally:
        httpd.shutdown()
        httpd.server_close()
        worker.join(timeout=5)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@pytest.fixture(scope="module")
def chromium_available() -> None:
    """Skip the dependent tests when headless Chromium cannot be launched.

    The check launches Chromium through Playwright and closes it immediately;
    a Playwright error is turned into a ``pytest.skip``.
    """
    try:
        with sync_playwright() as playwright:
            playwright.chromium.launch(headless=True).close()
    except PlaywrightError as exc:
        pytest.skip(f"Playwright Chromium is not available: {exc}")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_cold_soup_fetches_local_html(fixture_server: str) -> None:
    """``cold_soup`` parses the static markup and sees no script-created nodes."""
    soup = cold_soup(f"{fixture_server}/page.html")

    assert isinstance(soup, BeautifulSoup)

    title_text = soup.select_one("#title").get_text(strip=True)
    assert title_text == "Souppot Functional Fixture"

    static_text = soup.select_one(".static").get_text(strip=True)
    assert static_text == "This element is present in the original HTML."

    # The ".delayed" paragraph is only added by the page's script.
    assert soup.select_one(".delayed") is None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def test_hot_soup_waits_for_javascript_created_element(
    fixture_server: str,
    chromium_available: None,
) -> None:
    """``hot_soup`` returns a DOM containing the script-created paragraph."""
    page_url = f"{fixture_server}/page.html"
    soup = hot_soup(page_url, wait_selector=".delayed", wait_seconds=2)

    assert isinstance(soup, BeautifulSoup)

    delayed_text = soup.select_one(".delayed").get_text(strip=True)
    assert delayed_text == "This element was created by JavaScript."
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def test_hot_pot_downloads_local_file(
    fixture_server: str,
    chromium_available: None,
    tmp_path: Path,
) -> None:
    """``hot_pot`` saves the served file byte-for-byte and returns its path."""
    expected_file = FIXTURES / "dummy.bin"
    # The "downloads" directory does not exist yet; hot_pot must create it.
    target = tmp_path / "downloads" / "dummy.bin"

    saved_path = hot_pot(
        f"{fixture_server}/dummy.bin",
        target,
        referer=f"{fixture_server}/page.html",
    )

    assert saved_path == target
    assert target.read_bytes() == expected_file.read_bytes()
|
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
import souppot
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from souppot import core
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
FIXTURES = Path(__file__).parent / "fixtures"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class FakeResponse:
|
|
13
|
+
def __init__(
|
|
14
|
+
self,
|
|
15
|
+
*,
|
|
16
|
+
status_code: int = 200,
|
|
17
|
+
text: str = "",
|
|
18
|
+
headers: dict[str, str] | None = None,
|
|
19
|
+
url: str = "https://example.com/page",
|
|
20
|
+
history: list[object] | None = None,
|
|
21
|
+
error: Exception | None = None,
|
|
22
|
+
) -> None:
|
|
23
|
+
self.status_code = status_code
|
|
24
|
+
self.text = text
|
|
25
|
+
self.headers = headers or {}
|
|
26
|
+
self.url = url
|
|
27
|
+
self.history = history or []
|
|
28
|
+
self.error = error
|
|
29
|
+
|
|
30
|
+
def raise_for_status(self) -> None:
|
|
31
|
+
if self.error is not None:
|
|
32
|
+
raise self.error
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class FakeRenderedResponse:
    """Object returned by ``FakePage.goto``; it only carries a ``status`` value."""

    # Class-level attribute, shared by every instance.
    status = 200
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class FakePage:
    """In-memory stand-in for a browser page that serves canned HTML.

    The arguments of ``goto`` and ``wait_for_selector`` are recorded so tests
    can assert on how the code under test drove the page.
    """

    def __init__(self, html: str, *, wait_raises: bool = False) -> None:
        """Store the HTML to serve and whether selector waits should time out."""
        self.html = html
        self.url = "https://example.com/page"
        self.wait_raises = wait_raises
        self.wait_selector_calls: list[tuple[str, int]] = []
        # Declared up front so that reading it before any navigation yields
        # None instead of raising AttributeError.
        self.goto_call: dict[str, object] | None = None

    def goto(self, url: str, *, wait_until: str, timeout: int) -> FakeRenderedResponse:
        """Record the navigation arguments, update ``url`` and return a fake response."""
        self.url = url
        self.goto_call = {"url": url, "wait_until": wait_until, "timeout": timeout}
        return FakeRenderedResponse()

    def wait_for_selector(self, selector: str, *, timeout: int) -> None:
        """Record the call and raise a timeout error when ``wait_raises`` is set.

        Raises:
            core.PlaywrightTimeoutError: if the page was built with
                ``wait_raises=True``.
        """
        self.wait_selector_calls.append((selector, timeout))
        if self.wait_raises:
            raise core.PlaywrightTimeoutError("selector timed out")

    def content(self) -> str:
        """Return the canned HTML passed to the constructor."""
        return self.html
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class FakeBrowserContext:
    """Fake browser context that hands out one pre-built page and tracks state."""

    def __init__(self, page: FakePage) -> None:
        self.page = page
        # Stays None until set_extra_http_headers is called.
        self.extra_headers: dict[str, str] | None = None
        self.closed = False

    def set_extra_http_headers(self, headers: dict[str, str]) -> None:
        """Remember the headers given by the caller."""
        self.extra_headers = headers

    def new_page(self) -> FakePage:
        """Return the page supplied at construction time (the same object every call)."""
        return self.page

    def close(self) -> None:
        """Flag the context as closed."""
        self.closed = True
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class FakeBrowser:
    """Fake browser that returns one pre-built context and tracks closure."""

    def __init__(self, context: FakeBrowserContext) -> None:
        """Store the context that ``new_context`` will hand out."""
        self.context = context
        self.closed = False
        # Declared up front so that reading it before new_context() yields
        # None instead of raising AttributeError.
        self.new_context_kwargs: dict[str, object] | None = None

    def new_context(self, **kwargs: object) -> FakeBrowserContext:
        """Record the keyword arguments and return the stored context."""
        self.new_context_kwargs = kwargs
        return self.context

    def close(self) -> None:
        """Flag the browser as closed."""
        self.closed = True
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class FakeChromium:
    """Fake ``chromium`` launcher that always returns one pre-built browser."""

    def __init__(self, browser: FakeBrowser) -> None:
        """Store the browser that ``launch`` will hand out."""
        self.browser = browser
        # Declared up front so that reading it before launch() yields None
        # instead of raising AttributeError.
        self.launch_kwargs: dict[str, bool] | None = None

    def launch(self, *, headless: bool) -> FakeBrowser:
        """Record the ``headless`` flag and return the stored browser."""
        self.launch_kwargs = {"headless": headless}
        return self.browser
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class FakeSyncPlaywright:
    """Context-manager stand-in for the object returned by ``sync_playwright()``."""

    def __init__(self, chromium: FakeChromium) -> None:
        # Exposed under the attribute name the code under test looks up.
        self.chromium = chromium

    def __enter__(self) -> "FakeSyncPlaywright":
        """Enter the ``with`` block, yielding this same object."""
        return self

    def __exit__(self, *args: object) -> None:
        """Do nothing on exit; returning None lets exceptions propagate."""
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
class FakeDownloadResponse:
    """Fake download response whose ``body()`` returns preset bytes."""

    def __init__(self, body: bytes) -> None:
        # Kept private; callers read the payload through body().
        self._body = body

    def body(self) -> bytes:
        """Return the bytes given at construction time."""
        payload = self._body
        return payload
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
class FakeRequestContext:
    """Fake request API that logs every ``get`` call and serves preset bytes."""

    def __init__(self, body: bytes) -> None:
        self.body = body
        # One dict per get() call: the URL plus any keyword arguments.
        self.calls: list[dict[str, object]] = []

    def get(self, url: str, **kwargs: object) -> FakeDownloadResponse:
        """Log the call and return a ``FakeDownloadResponse`` wrapping ``body``."""
        recorded = {"url": url, **kwargs}
        self.calls.append(recorded)
        return FakeDownloadResponse(self.body)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class FakeDownloadContext:
    """Fake browser context exposing a ``request`` API for download tests."""

    def __init__(self, body: bytes) -> None:
        self.closed = False
        # Every get() on this request object serves the same body.
        self.request = FakeRequestContext(body)

    def close(self) -> None:
        """Flag the context as closed."""
        self.closed = True
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class FakeDownloadBrowser:
    """Fake browser for download tests; returns one pre-built context."""

    def __init__(self, context: FakeDownloadContext) -> None:
        """Store the context that ``new_context`` will hand out."""
        self.context = context
        self.closed = False
        # Declared up front so that reading it before new_context() yields
        # None instead of raising AttributeError.
        self.new_context_kwargs: dict[str, object] | None = None

    def new_context(self, **kwargs: object) -> FakeDownloadContext:
        """Record the keyword arguments and return the stored context."""
        self.new_context_kwargs = kwargs
        return self.context

    def close(self) -> None:
        """Flag the browser as closed."""
        self.closed = True
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class FakeDownloadChromium:
    """Fake ``chromium`` launcher for download tests."""

    def __init__(self, browser: FakeDownloadBrowser) -> None:
        """Store the browser that ``launch`` will hand out."""
        self.browser = browser
        # Declared up front so that reading it before launch() yields None
        # instead of raising AttributeError.
        self.launch_kwargs: dict[str, bool] | None = None

    def launch(self, *, headless: bool) -> FakeDownloadBrowser:
        """Record the ``headless`` flag and return the stored browser."""
        self.launch_kwargs = {"headless": headless}
        return self.browser
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@pytest.fixture
def fixture_html() -> str:
    """Provide the text of ``fixtures/basic.html``, decoded as UTF-8."""
    html_path = FIXTURES / "basic.html"
    return html_path.read_text(encoding="utf-8")
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def test_package_exports_public_api() -> None:
    """Check ``souppot.__all__`` and that each export is the ``core`` object itself."""
    expected_names = ("cold_soup", "hot_soup", "hot_pot")

    assert souppot.__all__ == expected_names
    # Each package-level name must be the very same object as in souppot.core.
    for name in expected_names:
        assert getattr(souppot, name) is getattr(core, name)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
@pytest.mark.parametrize("url", [None, "", " "])
def test_cold_soup_missing_url_returns_none_without_request(
    monkeypatch: pytest.MonkeyPatch, url: str | None
) -> None:
    """``cold_soup`` must return None for a missing URL without calling ``requests.get``."""

    def forbidden_get(**kwargs: object) -> None:
        # Any call here means cold_soup attempted a request.
        raise AssertionError("requests.get should not be called")

    monkeypatch.setattr(core.requests, "get", forbidden_get)

    outcome = core.cold_soup(url)
    assert outcome is None
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def test_cold_soup_sends_browser_like_request_headers(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Inspect the keyword arguments ``cold_soup`` passes to ``requests.get``."""
    canned = FakeResponse(headers={"Content-Type": "application/json"})
    recorded: list[dict[str, object]] = []

    def recording_get(**kwargs: object) -> FakeResponse:
        recorded.append(kwargs)
        return canned

    monkeypatch.setattr(core.requests, "get", recording_get)

    # The URL is passed with surrounding whitespace on purpose; the call
    # recorded below must carry the trimmed form.
    returned = core.cold_soup(" https://example.com/path ")
    assert returned is canned

    first_call = recorded[0]
    sent_headers = first_call["headers"]

    assert first_call["url"] == "https://example.com/path"
    assert "stream" not in first_call
    assert first_call["timeout"] == 15
    assert first_call["allow_redirects"] is True
    assert isinstance(sent_headers, dict)
    assert "Mozilla/5.0" in sent_headers["User-Agent"]
    assert sent_headers["Referer"] == "https://example.com/"
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def test_cold_soup_returns_beautifulsoup_for_html_response(
    monkeypatch: pytest.MonkeyPatch,
    fixture_html: str,
) -> None:
    """A ``text/html`` response must come back as a parsed ``BeautifulSoup``."""
    html_response = FakeResponse(
        text=fixture_html,
        headers={"Content-Type": "text/html; charset=utf-8"},
    )

    def canned_get(**kwargs: object) -> FakeResponse:
        return html_response

    monkeypatch.setattr(core.requests, "get", canned_get)

    soup = core.cold_soup("https://example.com/page")

    assert isinstance(soup, BeautifulSoup)
    title_text = soup.select_one("#title").get_text(strip=True)
    assert title_text == "Soup Pot"
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def test_cold_soup_returns_response_for_non_html_response(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A JSON response must be returned as the response object itself."""
    json_response = FakeResponse(
        text='{"ok": true}',
        headers={"Content-Type": "application/json"},
    )

    def canned_get(**kwargs: object) -> FakeResponse:
        return json_response

    monkeypatch.setattr(core.requests, "get", canned_get)

    returned = core.cold_soup("https://example.com/data.json")
    assert returned is json_response
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def test_cold_soup_returns_none_for_non_200_response(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A 404 response must make ``cold_soup`` return None."""
    not_found = FakeResponse(status_code=404, headers={"Content-Type": "text/html"})

    def canned_get(**kwargs: object) -> FakeResponse:
        return not_found

    monkeypatch.setattr(core.requests, "get", canned_get)

    outcome = core.cold_soup("https://example.com/missing")
    assert outcome is None
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def test_cold_soup_check_errors_raises_before_status_handling(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """With ``check_errors=True`` the error from ``raise_for_status`` must propagate."""

    class MarkerError(Exception):
        """Test-local exception type so the assertion cannot match anything else."""

    failing = FakeResponse(status_code=500, error=MarkerError("boom"))

    def canned_get(**kwargs: object) -> FakeResponse:
        return failing

    monkeypatch.setattr(core.requests, "get", canned_get)

    with pytest.raises(MarkerError):
        core.cold_soup("https://example.com/error", check_errors=True)
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
@pytest.mark.parametrize("url", [None, "", " "])
def test_hot_soup_missing_url_returns_none_without_playwright(
    monkeypatch: pytest.MonkeyPatch, url: str | None
) -> None:
    """``hot_soup`` must return None for a missing URL without starting Playwright."""

    def forbidden_sync_playwright() -> None:
        # Any call here means hot_soup tried to start a browser.
        raise AssertionError("sync_playwright should not be called")

    monkeypatch.setattr(core, "sync_playwright", forbidden_sync_playwright)

    outcome = core.hot_soup(url)
    assert outcome is None
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def test_hot_soup_parses_rendered_html_from_fake_playwright(
    monkeypatch: pytest.MonkeyPatch,
    fixture_html: str,
) -> None:
    """Drive ``hot_soup`` with the fake Playwright stack and check parsing and cleanup."""
    fake_page = FakePage(fixture_html)
    fake_context = FakeBrowserContext(fake_page)
    fake_browser = FakeBrowser(fake_context)
    fake_playwright = FakeSyncPlaywright(FakeChromium(fake_browser))
    recorded_sleeps: list[float] = []

    def fake_sync_playwright() -> FakeSyncPlaywright:
        return fake_playwright

    def fake_sleep(seconds: float) -> None:
        # Record the requested delay instead of actually sleeping.
        recorded_sleeps.append(seconds)

    monkeypatch.setattr(core, "sync_playwright", fake_sync_playwright)
    monkeypatch.setattr(core.time, "sleep", fake_sleep)

    soup = core.hot_soup("https://example.com/page", wait_seconds=0)

    assert isinstance(soup, BeautifulSoup)
    message_text = soup.select_one(".message").get_text(strip=True)
    assert message_text == "Fixture HTML for parser unit tests."
    assert recorded_sleeps == [0]
    # Both the context and the browser must have been closed.
    assert fake_context.closed is True
    assert fake_browser.closed is True
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def test_hot_soup_wait_selector_timeout_still_parses_html(
    monkeypatch: pytest.MonkeyPatch,
    fixture_html: str,
) -> None:
    """A selector timeout must not stop ``hot_soup`` from returning parsed HTML."""
    # wait_raises=True makes wait_for_selector raise the timeout error.
    fake_page = FakePage(fixture_html, wait_raises=True)
    fake_context = FakeBrowserContext(fake_page)
    fake_browser = FakeBrowser(fake_context)
    fake_playwright = FakeSyncPlaywright(FakeChromium(fake_browser))

    def fake_sync_playwright() -> FakeSyncPlaywright:
        return fake_playwright

    monkeypatch.setattr(core, "sync_playwright", fake_sync_playwright)

    soup = core.hot_soup(
        "https://example.com/page",
        wait_selector="#missing",
        wait_seconds=0.2,
    )

    assert isinstance(soup, BeautifulSoup)
    assert soup.select_one("#content") is not None
    # The selector wait is expected to be called once with timeout 1000
    # although wait_seconds was 0.2.
    assert fake_page.wait_selector_calls == [("#missing", 1000)]
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
@pytest.mark.parametrize("url", [None, "", " "])
def test_hot_pot_missing_url_raises_value_error(
    url: str | None, tmp_path: Path
) -> None:
    """``hot_pot`` must raise ``ValueError`` when the URL is None, empty or blank."""
    destination = tmp_path / "out.bin"

    with pytest.raises(ValueError, match="URL not provided"):
        core.hot_pot(url, destination)
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def test_hot_pot_creates_parent_dirs_and_writes_body(
    monkeypatch: pytest.MonkeyPatch, tmp_path: Path
) -> None:
    """Drive ``hot_pot`` with the fake download stack and check file, cleanup and request."""
    payload = b"downloaded bytes"
    fake_context = FakeDownloadContext(payload)
    fake_browser = FakeDownloadBrowser(fake_context)
    fake_playwright = FakeSyncPlaywright(FakeDownloadChromium(fake_browser))
    # "nested" does not exist yet under tmp_path.
    target = tmp_path / "nested" / "out.bin"

    def fake_sync_playwright() -> FakeSyncPlaywright:
        return fake_playwright

    monkeypatch.setattr(core, "sync_playwright", fake_sync_playwright)

    returned = core.hot_pot(
        " https://example.com/file.bin ",
        target,
        referer="https://example.com/page",
        timeout_ms=123,
    )

    assert returned == target
    assert target.read_bytes() == payload
    assert fake_context.closed is True
    assert fake_browser.closed is True

    # Inspect the single request recorded by the fake request context.
    first_call = fake_context.request.calls[0]
    sent_headers = first_call["headers"]
    assert first_call["url"] == "https://example.com/file.bin"
    assert first_call["fail_on_status_code"] is True
    assert first_call["timeout"] == 123
    assert isinstance(sent_headers, dict)
    assert sent_headers["Referer"] == "https://example.com/page"
|