nscraper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nscraper-0.1.0/LICENSE +21 -0
- nscraper-0.1.0/PKG-INFO +125 -0
- nscraper-0.1.0/README.md +116 -0
- nscraper-0.1.0/build_backend.py +160 -0
- nscraper-0.1.0/pyproject.toml +22 -0
- nscraper-0.1.0/src/nscraper/__init__.py +42 -0
- nscraper-0.1.0/src/nscraper/__main__.py +66 -0
- nscraper-0.1.0/src/nscraper/core.py +18 -0
- nscraper-0.1.0/src/nscraper/errors.py +25 -0
- nscraper-0.1.0/src/nscraper/models.py +24 -0
- nscraper-0.1.0/src/nscraper/scraper/__init__.py +7 -0
- nscraper-0.1.0/src/nscraper/scraper/base.py +34 -0
- nscraper-0.1.0/src/nscraper/scraper/http.py +32 -0
- nscraper-0.1.0/src/nscraper/scraper/seleniumbase.py +12 -0
- nscraper-0.1.0/src/nscraper/utils.py +116 -0
- nscraper-0.1.0/tests/test_core.py +94 -0
- nscraper-0.1.0/tests/test_http_scraper.py +113 -0
- nscraper-0.1.0/tests/test_main.py +101 -0
- nscraper-0.1.0/tests/test_utils.py +54 -0
nscraper-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 mikerr1@github.com
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
nscraper-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: nscraper
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A small importable Python module.
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Dist: niquests==3.18.4
|
|
7
|
+
Requires-Dist: justhtml==1.14.0
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
|
|
10
|
+
# nscraper
|
|
11
|
+
|
|
12
|
+
`nscraper` is a small Python package scaffolded for two use cases:
|
|
13
|
+
|
|
14
|
+
- import it from other projects
|
|
15
|
+
- run it directly with `python -m nscraper`
|
|
16
|
+
|
|
17
|
+
## License
|
|
18
|
+
|
|
19
|
+
MIT. You can fork, modify, and reuse it with minimal restrictions as long as
|
|
20
|
+
the license notice is kept with the software.
|
|
21
|
+
|
|
22
|
+
## Install
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install nscraper
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
For development:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv sync --dev
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Use as a module
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from nscraper import HttpScraper, ScrapeOptions
|
|
38
|
+
|
|
39
|
+
options = ScrapeOptions(
|
|
40
|
+
url="https://example.com",
|
|
41
|
+
headers={"Accept": "text/html"},
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
content = HttpScraper(options).scrape()
|
|
45
|
+
print(content)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Run the Module
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
python -m nscraper -u https://example.com -H default
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
Fetch a URL:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
python -m nscraper -u https://example.com -H default
|
|
58
|
+
python -m nscraper -u https://example.com -H '{"Accept": "text/html"}'
|
|
59
|
+
python -m nscraper -u https://example.com -H default -c cookies.json
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Current API
|
|
63
|
+
|
|
64
|
+
- `nscraper.ScrapeOptions`
|
|
65
|
+
- `nscraper.BaseScraper`
|
|
66
|
+
- `nscraper.HttpScraper`
|
|
67
|
+
- `nscraper.SeleniumBaseScraper`
|
|
68
|
+
- `nscraper.get_scraper(options: ScrapeOptions) -> BaseScraper`
|
|
69
|
+
- `nscraper.validate_url(url: str) -> str`
|
|
70
|
+
- `nscraper.parse_headers(raw_headers: str | None) -> dict[str, str]`
|
|
71
|
+
- `nscraper.load_cookies_file(path: Path | str | None) -> dict[str, str] | None`
|
|
72
|
+
- `nscraper.basic_html_transform(content: str) -> str`
|
|
73
|
+
- runtime dependency: `niquests==3.18.4`
|
|
74
|
+
- runtime dependency: `justhtml==1.14.0`
|
|
75
|
+
- development dependency: `pytest`
|
|
76
|
+
|
|
77
|
+
## Module Flags
|
|
78
|
+
|
|
79
|
+
- `-u` / `--url` required
|
|
80
|
+
- `-H` / `--headers` required, or `default`
|
|
81
|
+
- `-e` / `--engine` with `http` or `seleniumbase`
|
|
82
|
+
- `-p` / `--proxy`
|
|
83
|
+
- `--timeout` default `3`
|
|
84
|
+
- `-o` / `--output`
|
|
85
|
+
- `-c` / `--cookies-file` optional JSON file
|
|
86
|
+
- `-t` / `--transform` default `raw`
|
|
87
|
+
|
|
88
|
+
Behavior:
|
|
89
|
+
|
|
90
|
+
- invalid or malformed URLs raise `InvalidUrlError`
|
|
91
|
+
- missing or malformed headers raise `InvalidHeadersError`
|
|
92
|
+
- missing or malformed cookie files raise `InvalidCookiesError`
|
|
93
|
+
- use `-H default` to apply the built-in `Accept` and `User-Agent` header dict
|
|
94
|
+
- use `-c` only when you want to send cookies; omit it to keep current behavior
|
|
95
|
+
- output files are always overwritten
|
|
96
|
+
- `basic_html` removes non-content elements and writes cleaned HTML output
|
|
97
|
+
|
|
98
|
+
Default `User-Agent`:
|
|
99
|
+
|
|
100
|
+
```text
|
|
101
|
+
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
The package is intentionally minimal so you can extend it into a reusable library
|
|
105
|
+
and publish it to PyPI.
|
|
106
|
+
|
|
107
|
+
## GitHub And PyPI Release Flow
|
|
108
|
+
|
|
109
|
+
- pull requests to `master` run tests in GitHub Actions
|
|
110
|
+
- published GitHub releases run tests, build `sdist` and `wheel`, then publish to PyPI
|
|
111
|
+
- the publish workflow is in [.github/workflows/release.yml](.github/workflows/release.yml)
|
|
112
|
+
|
|
113
|
+
Before the release workflow can publish, configure Trusted Publishing in PyPI:
|
|
114
|
+
|
|
115
|
+
1. create the project on PyPI if it does not exist yet
|
|
116
|
+
2. in PyPI, open the project publishing settings
|
|
117
|
+
3. add a trusted publisher for this GitHub repository
|
|
118
|
+
4. use the `release` workflow on the `master` branch
|
|
119
|
+
|
|
120
|
+
After that, the normal flow is:
|
|
121
|
+
|
|
122
|
+
1. push code to GitHub
|
|
123
|
+
2. merge to `master`
|
|
124
|
+
3. create a GitHub release for the version tag
|
|
125
|
+
4. let GitHub Actions test, build, and publish the package
|
nscraper-0.1.0/README.md
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# nscraper
|
|
2
|
+
|
|
3
|
+
`nscraper` is a small Python package scaffolded for two use cases:
|
|
4
|
+
|
|
5
|
+
- import it from other projects
|
|
6
|
+
- run it directly with `python -m nscraper`
|
|
7
|
+
|
|
8
|
+
## License
|
|
9
|
+
|
|
10
|
+
MIT. You can fork, modify, and reuse it with minimal restrictions as long as
|
|
11
|
+
the license notice is kept with the software.
|
|
12
|
+
|
|
13
|
+
## Install
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install nscraper
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
For development:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
uv sync --dev
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Use as a module
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from nscraper import HttpScraper, ScrapeOptions
|
|
29
|
+
|
|
30
|
+
options = ScrapeOptions(
|
|
31
|
+
url="https://example.com",
|
|
32
|
+
headers={"Accept": "text/html"},
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
content = HttpScraper(options).scrape()
|
|
36
|
+
print(content)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Run the Module
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
python -m nscraper -u https://example.com -H default
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Fetch a URL:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
python -m nscraper -u https://example.com -H default
|
|
49
|
+
python -m nscraper -u https://example.com -H '{"Accept": "text/html"}'
|
|
50
|
+
python -m nscraper -u https://example.com -H default -c cookies.json
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Current API
|
|
54
|
+
|
|
55
|
+
- `nscraper.ScrapeOptions`
|
|
56
|
+
- `nscraper.BaseScraper`
|
|
57
|
+
- `nscraper.HttpScraper`
|
|
58
|
+
- `nscraper.SeleniumBaseScraper`
|
|
59
|
+
- `nscraper.get_scraper(options: ScrapeOptions) -> BaseScraper`
|
|
60
|
+
- `nscraper.validate_url(url: str) -> str`
|
|
61
|
+
- `nscraper.parse_headers(raw_headers: str | None) -> dict[str, str]`
|
|
62
|
+
- `nscraper.load_cookies_file(path: Path | str | None) -> dict[str, str] | None`
|
|
63
|
+
- `nscraper.basic_html_transform(content: str) -> str`
|
|
64
|
+
- runtime dependency: `niquests==3.18.4`
|
|
65
|
+
- runtime dependency: `justhtml==1.14.0`
|
|
66
|
+
- development dependency: `pytest`
|
|
67
|
+
|
|
68
|
+
## Module Flags
|
|
69
|
+
|
|
70
|
+
- `-u` / `--url` required
|
|
71
|
+
- `-H` / `--headers` required, or `default`
|
|
72
|
+
- `-e` / `--engine` with `http` or `seleniumbase`
|
|
73
|
+
- `-p` / `--proxy`
|
|
74
|
+
- `--timeout` default `3`
|
|
75
|
+
- `-o` / `--output`
|
|
76
|
+
- `-c` / `--cookies-file` optional JSON file
|
|
77
|
+
- `-t` / `--transform` default `raw`
|
|
78
|
+
|
|
79
|
+
Behavior:
|
|
80
|
+
|
|
81
|
+
- invalid or malformed URLs raise `InvalidUrlError`
|
|
82
|
+
- missing or malformed headers raise `InvalidHeadersError`
|
|
83
|
+
- missing or malformed cookie files raise `InvalidCookiesError`
|
|
84
|
+
- use `-H default` to apply the built-in `Accept` and `User-Agent` header dict
|
|
85
|
+
- use `-c` only when you want to send cookies; omit it to keep current behavior
|
|
86
|
+
- output files are always overwritten
|
|
87
|
+
- `basic_html` removes non-content elements and writes cleaned HTML output
|
|
88
|
+
|
|
89
|
+
Default `User-Agent`:
|
|
90
|
+
|
|
91
|
+
```text
|
|
92
|
+
Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
The package is intentionally minimal so you can extend it into a reusable library
|
|
96
|
+
and publish it to PyPI.
|
|
97
|
+
|
|
98
|
+
## GitHub And PyPI Release Flow
|
|
99
|
+
|
|
100
|
+
- pull requests to `master` run tests in GitHub Actions
|
|
101
|
+
- published GitHub releases run tests, build `sdist` and `wheel`, then publish to PyPI
|
|
102
|
+
- the publish workflow is in [.github/workflows/release.yml](.github/workflows/release.yml)
|
|
103
|
+
|
|
104
|
+
Before the release workflow can publish, configure Trusted Publishing in PyPI:
|
|
105
|
+
|
|
106
|
+
1. create the project on PyPI if it does not exist yet
|
|
107
|
+
2. in PyPI, open the project publishing settings
|
|
108
|
+
3. add a trusted publisher for this GitHub repository
|
|
109
|
+
4. use the `release` workflow on the `master` branch
|
|
110
|
+
|
|
111
|
+
After that, the normal flow is:
|
|
112
|
+
|
|
113
|
+
1. push code to GitHub
|
|
114
|
+
2. merge to `master`
|
|
115
|
+
3. create a GitHub release for the version tag
|
|
116
|
+
4. let GitHub Actions test, build, and publish the package
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""Minimal PEP 517 backend for offline editable and wheel installs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
import base64
|
|
8
|
+
import csv
|
|
9
|
+
import hashlib
|
|
10
|
+
import io
|
|
11
|
+
import os
|
|
12
|
+
import tarfile
|
|
13
|
+
import tempfile
|
|
14
|
+
import zipfile
|
|
15
|
+
|
|
16
|
+
# Distribution identity used throughout the backend.
NAME = "nscraper"
VERSION = "0.1.0"
DIST_INFO = f"{NAME}-{VERSION}.dist-info"
SUMMARY = "A small importable Python module."
# README.md sits next to this file and becomes the long description.
README_PATH = Path(__file__).resolve().parent / "README.md"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True)
class FileRecord:
    """An in-memory archive member: archive-relative path plus raw bytes."""

    path: str  # always '/'-separated (see _normalize)
    data: bytes  # exact contents written into the wheel
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _normalize(path: str) -> str:
|
|
30
|
+
return path.replace(os.sep, "/")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _metadata_text() -> str:
    """Render the core-metadata payload (METADATA / PKG-INFO) as text."""
    long_description = README_PATH.read_text(encoding="utf-8")
    header_lines = [
        "Metadata-Version: 2.1",
        f"Name: {NAME}",
        f"Version: {VERSION}",
        f"Summary: {SUMMARY}",
        "License: MIT",
        "Requires-Dist: niquests==3.18.4",
        "Requires-Dist: justhtml==1.14.0",
        "Description-Content-Type: text/markdown",
        "",
    ]
    # Headers, a blank separator line, then the markdown body.
    return "\n".join(header_lines) + "\n" + long_description
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _wheel_metadata() -> list[FileRecord]:
    """Build the dist-info records shared by regular and editable wheels."""
    wheel_text = (
        "Wheel-Version: 1.0\n"
        "Generator: nscraper.build_backend\n"
        "Root-Is-Purelib: true\n"
        "Tag: py3-none-any\n"
    )
    script_text = "[console_scripts]\nnscraper = nscraper.__main__:main\n"
    names_and_bodies = (
        ("WHEEL", wheel_text),
        ("METADATA", _metadata_text()),
        ("entry_points.txt", script_text),
    )
    return [
        FileRecord(f"{DIST_INFO}/{name}", body.encode())
        for name, body in names_and_bodies
    ]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _package_files() -> list[FileRecord]:
    """Collect every .py file under src/<NAME> as wheel members."""
    pkg_root = Path(__file__).resolve().parent / "src" / NAME
    return [
        FileRecord(_normalize(str(py.relative_to(pkg_root.parent))), py.read_bytes())
        for py in pkg_root.rglob("*.py")
    ]
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _sdist_files(root: Path) -> list[tuple[Path, str]]:
    """List (source path, archive name) pairs that go into the sdist."""
    top_level = ("build_backend.py", "pyproject.toml", "README.md", "LICENSE")
    selected = [(root / name, f"{NAME}-{VERSION}/{name}") for name in top_level]
    for tree in ("src", "tests"):
        for candidate in (root / tree).rglob("*"):
            # Skip directories and compiled-bytecode artifacts.
            unwanted = (
                candidate.is_dir()
                or "__pycache__" in candidate.parts
                or candidate.suffix == ".pyc"
            )
            if unwanted:
                continue
            selected.append(
                (candidate, f"{NAME}-{VERSION}/{candidate.relative_to(root)}")
            )
    return selected
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _build_wheel_dir(wheel_directory: str, editable: bool = False) -> str:
    """Write a wheel (or editable wheel) into *wheel_directory*.

    Returns the wheel's file name, as PEP 517 requires.  An editable
    build ships only a .pth file pointing at the src/ tree; a regular
    build embeds the package sources directly.
    """
    out = Path(wheel_directory)
    out.mkdir(parents=True, exist_ok=True)
    wheel_name = f"{NAME}-{VERSION}-py3-none-any.whl"
    wheel_path = out / wheel_name
    records = _wheel_metadata()
    if editable:
        # Editable install: a single .pth line puts src/ on sys.path.
        pth = f"{NAME}.pth"
        src_dir = (Path(__file__).resolve().parent / "src").as_posix()
        records.append(FileRecord(pth, f"{src_dir}\n".encode()))
    else:
        records.extend(_package_files())
    with tempfile.TemporaryDirectory() as td:
        tmp = Path(td)
        with zipfile.ZipFile(wheel_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
            for record in records:
                zf.writestr(record.path, record.data)
            # RECORD lists every member with its sha256 (urlsafe base64,
            # '=' padding stripped) and byte size; the RECORD entry itself
            # gets empty hash/size fields per the wheel spec.
            dist_info = tmp / DIST_INFO
            dist_info.mkdir()
            record_file = dist_info / "RECORD"
            rows = []
            for record in records:
                digest = base64.urlsafe_b64encode(hashlib.sha256(record.data).digest()).rstrip(b"=").decode()
                rows.append((record.path, f"sha256={digest}", str(len(record.data))))
            rows.append((f"{DIST_INFO}/RECORD", "", ""))
            with record_file.open("w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerows(rows)
            zf.write(record_file, f"{DIST_INFO}/RECORD")
    return wheel_name
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def get_requires_for_build_wheel(config_settings=None):
    """PEP 517 hook: this backend needs no extra build requirements."""
    return list()
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def get_requires_for_build_editable(config_settings=None):
    """PEP 660 hook: editable builds also need no extra requirements."""
    return list()
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def prepare_metadata_for_build_wheel(metadata_directory, config_settings=None):
    """PEP 517 hook: materialize the dist-info metadata files on disk."""
    target = Path(metadata_directory) / DIST_INFO
    target.mkdir(parents=True, exist_ok=True)
    wanted = ("WHEEL", "METADATA", "entry_points.txt")
    for record in _wheel_metadata():
        if record.path.endswith(wanted):
            (target / Path(record.path).name).write_bytes(record.data)
    return DIST_INFO
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def build_wheel(wheel_directory, config_settings=None, metadata_directory=None):
    """PEP 517 hook: build a regular (non-editable) wheel."""
    return _build_wheel_dir(wheel_directory, editable=False)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def build_editable(wheel_directory, config_settings=None, metadata_directory=None):
    """PEP 660 hook: build an editable wheel (.pth pointing at src/)."""
    return _build_wheel_dir(wheel_directory, editable=True)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def prepare_metadata_for_build_editable(metadata_directory, config_settings=None):
    """PEP 660 hook: editable metadata is identical to the wheel metadata."""
    return prepare_metadata_for_build_wheel(metadata_directory, config_settings)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def build_sdist(sdist_directory, config_settings=None):
    """PEP 517 hook: build a .tar.gz source distribution.

    Returns the sdist file name.  PKG-INFO is synthesized in memory from
    the same metadata text used for wheel METADATA.
    """
    out = Path(sdist_directory)
    out.mkdir(parents=True, exist_ok=True)
    sdist_name = f"{NAME}-{VERSION}.tar.gz"
    sdist_path = out / sdist_name
    root = Path(__file__).resolve().parent
    pkg_info = _metadata_text().encode("utf-8")
    with tarfile.open(sdist_path, "w:gz") as tf:
        for path, arcname in _sdist_files(root):
            tf.add(path, arcname=arcname)
        # PKG-INFO has no on-disk source; add it from an in-memory buffer.
        info = tarfile.TarInfo(name=f"{NAME}-{VERSION}/PKG-INFO")
        info.size = len(pkg_info)
        tf.addfile(info, fileobj=io.BytesIO(pkg_info))
    return sdist_name
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Self-hosted PEP 517 backend: build_backend.py in the project root.
[build-system]
requires = []
build-backend = "build_backend"
backend-path = ["."]

[project]
name = "nscraper"
version = "0.1.0"
description = "A small importable Python module."
readme = "README.md"
requires-python = ">=3.10"
license = { text = "MIT" }
authors = [
    { name = "mikerr1@github.com" }
]
dependencies = ["niquests==3.18.4", "justhtml==1.14.0"]

# Console entry point: `nscraper` runs the CLI in nscraper/__main__.py.
[project.scripts]
nscraper = "nscraper.__main__:main"

# Dependency groups (PEP 735); installed for development via `uv sync --dev`.
[dependency-groups]
dev = ["pytest"]
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Public package interface for nscraper."""
|
|
2
|
+
|
|
3
|
+
from .core import get_scraper, hello
|
|
4
|
+
from .errors import (
|
|
5
|
+
InvalidHeadersError,
|
|
6
|
+
InvalidCookiesError,
|
|
7
|
+
InvalidUrlError,
|
|
8
|
+
NetworkError,
|
|
9
|
+
NscraperError,
|
|
10
|
+
RequestError,
|
|
11
|
+
)
|
|
12
|
+
from .models import ScrapeOptions
|
|
13
|
+
from .scraper import BaseScraper, HttpScraper, SeleniumBaseScraper
|
|
14
|
+
from .utils import (
|
|
15
|
+
DEFAULT_HEADERS,
|
|
16
|
+
basic_html_transform,
|
|
17
|
+
load_cookies_file,
|
|
18
|
+
parse_headers,
|
|
19
|
+
validate_url,
|
|
20
|
+
write_output,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Public API surface.  Kept strictly alphabetical (case-sensitive sort)
# so additions are easy to audit; the original list mixed orderings
# ("InvalidHeadersError" before "InvalidCookiesError", "hello" after
# "load_cookies_file").  The exported set is unchanged.
__all__ = [
    "BaseScraper",
    "DEFAULT_HEADERS",
    "HttpScraper",
    "InvalidCookiesError",
    "InvalidHeadersError",
    "InvalidUrlError",
    "NetworkError",
    "NscraperError",
    "RequestError",
    "ScrapeOptions",
    "SeleniumBaseScraper",
    "basic_html_transform",
    "get_scraper",
    "hello",
    "load_cookies_file",
    "parse_headers",
    "validate_url",
    "write_output",
]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Module entry point for nscraper."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .core import get_scraper
|
|
9
|
+
from .errors import NscraperError
|
|
10
|
+
from .models import ScrapeOptions
|
|
11
|
+
from .utils import DEFAULT_HEADERS, load_cookies_file, parse_headers
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def build_parser() -> argparse.ArgumentParser:
    """Create the CLI argument parser for `python -m nscraper`."""
    parser = argparse.ArgumentParser(prog="nscraper")
    parser.add_argument("-u", "--url", required=True, help="Target URL")
    parser.add_argument(
        "-e",
        "--engine",
        choices=("http", "seleniumbase"),
        default="http",
        help="Request engine",
    )
    parser.add_argument("-p", "--proxy", help="Proxy URL")
    parser.add_argument(
        "-H",
        "--headers",
        required=True,
        help='Headers as JSON string or "default"',
    )
    parser.add_argument("-c", "--cookies-file", help="Path to a JSON cookies file")
    parser.add_argument("--timeout", type=float, default=3.0, help="Timeout in seconds")
    parser.add_argument("-o", "--output", help="Output path for HTML")
    parser.add_argument(
        "-t",
        "--transform",
        choices=("raw", "basic_html"),
        default="raw",
        help="Transform mode",
    )
    return parser
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _build_options(args: argparse.Namespace) -> ScrapeOptions:
    """Translate parsed CLI arguments into a ScrapeOptions instance."""
    # "-H default" selects the built-in browser-like header set.
    if args.headers == "default":
        headers = DEFAULT_HEADERS
    else:
        headers = parse_headers(args.headers)
    output = Path(args.output) if args.output else None
    return ScrapeOptions(
        url=args.url,
        engine=args.engine,
        proxy=args.proxy,
        headers=headers,
        cookies=load_cookies_file(args.cookies_file),
        timeout=args.timeout,
        output_path=output,
        transform=args.transform,
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: scrape one URL, then print or persist the result."""
    args = build_parser().parse_args(argv)
    try:
        options = _build_options(args)
        content = get_scraper(options).scrape()
    except NscraperError as exc:
        # Surface domain errors as a clean exit message, not a traceback.
        raise SystemExit(str(exc)) from exc
    # When an output path was given, scrape() already wrote the file.
    if options.output_path is None:
        print(content)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Core library functions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .models import ScrapeOptions
|
|
6
|
+
from .scraper import BaseScraper, HttpScraper, SeleniumBaseScraper
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def hello(name: str = "world") -> str:
    """Return a friendly greeting, falling back to "world" for blank names."""
    who = name.strip()
    if not who:
        who = "world"
    return f"Hello, {who}!"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_scraper(options: ScrapeOptions) -> BaseScraper:
    """Select a scraper implementation based on the requested engine."""
    scraper_cls = SeleniumBaseScraper if options.engine == "seleniumbase" else HttpScraper
    return scraper_cls(options)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Project-specific exceptions for nscraper."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class NscraperError(Exception):
    """Root of the nscraper exception hierarchy; catch this for any failure."""


class InvalidUrlError(NscraperError):
    """The supplied URL is absent or does not parse as a valid http(s) URL."""


class InvalidHeadersError(NscraperError):
    """The supplied headers are absent or not a valid header mapping."""


class InvalidCookiesError(NscraperError):
    """The supplied cookies are absent or not a valid cookie mapping."""


class RequestError(NscraperError):
    """The server answered the HTTP request with a non-success status."""


class NetworkError(NscraperError):
    """The request could not be completed because of a network-level problem."""
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Data models for nscraper operations."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Literal
|
|
8
|
+
|
|
9
|
+
# Allowed engine / transform identifiers (enforced as argparse choices in
# __main__; typed here for library users).
Engine = Literal["http", "seleniumbase"]
Transform = Literal["raw", "basic_html"]


@dataclass(frozen=True, slots=True)
class ScrapeOptions:
    """Normalized scrape configuration.

    Immutable value object consumed by every scraper; built by the CLI in
    __main__ or constructed directly by library users.
    """

    url: str  # target URL; validated later by utils.validate_url
    engine: Engine = "http"
    proxy: str | None = None  # single proxy URL used for both http and https
    headers: dict[str, str] | None = None
    cookies: dict[str, str] | None = None
    timeout: float = 3.0  # request timeout in seconds
    output_path: Path | None = None  # when set, scrape() also writes the result here
    transform: Transform = "raw"
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Base scraper contract."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
|
|
7
|
+
from ..models import ScrapeOptions
|
|
8
|
+
from ..utils import basic_html_transform, write_output
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseScraper(ABC):
    """Base scraper contract.

    Subclasses implement ``send_request``; ``scrape`` then runs the shared
    request -> transform -> store pipeline.
    """

    def __init__(self, options: ScrapeOptions) -> None:
        self.options = options

    @abstractmethod
    def send_request(self) -> str:
        """Send the underlying request and return raw content."""

    def transform(self, content: str) -> str:
        """Apply the configured transform; "raw" passes content through."""
        if self.options.transform == "basic_html":
            return basic_html_transform(content)
        return content

    def store(self, content: str) -> None:
        """Write content to ``options.output_path`` when one is configured."""
        if self.options.output_path:
            write_output(self.options.output_path, content)

    def scrape(self) -> str:
        """Fetch, transform, optionally persist, and return the content."""
        content = self.send_request()
        content = self.transform(content)
        self.store(content)
        return content
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""HTTP scraper implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import niquests
|
|
6
|
+
|
|
7
|
+
from ..errors import InvalidHeadersError, NetworkError, RequestError
|
|
8
|
+
from ..utils import validate_url
|
|
9
|
+
from .base import BaseScraper
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class HttpScraper(BaseScraper):
    """Scraper implementation backed by niquests."""

    def send_request(self) -> str:
        """GET ``options.url`` and return the response body as text.

        Raises:
            InvalidUrlError: from validate_url for malformed URLs.
            InvalidHeadersError: when no headers were supplied.
            RequestError: when the response has a non-success status.
            NetworkError: for other request failures (connection, timeout).
        """
        url = validate_url(self.options.url)
        headers = self.options.headers or {}
        if not headers:
            raise InvalidHeadersError("headers are required")
        kwargs: dict[str, object] = {"headers": headers, "timeout": self.options.timeout}
        if self.options.proxy:
            # A single proxy URL is applied to both schemes.
            kwargs["proxies"] = {"http": self.options.proxy, "https": self.options.proxy}
        if self.options.cookies:
            kwargs["cookies"] = self.options.cookies
        try:
            response = niquests.get(url, **kwargs)
            response.raise_for_status()
        except niquests.exceptions.HTTPError as exc:
            # Caught before the broader RequestException so status failures
            # map to RequestError rather than NetworkError.
            raise RequestError(f"request failed for {url}") from exc
        except niquests.exceptions.RequestException as exc:
            raise NetworkError(f"network failure while fetching {url}") from exc
        return response.text
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""SeleniumBase scraper implementation placeholder."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import BaseScraper
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class SeleniumBaseScraper(BaseScraper):
    """Scraper placeholder for SeleniumBase-driven scraping."""

    def send_request(self) -> str:
        """Not implemented yet; always raises NotImplementedError."""
        raise NotImplementedError("seleniumbase scraper is not implemented yet")
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Reusable utility functions for nscraper."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from urllib.parse import urlparse
|
|
9
|
+
|
|
10
|
+
from justhtml import JustHTML
|
|
11
|
+
|
|
12
|
+
from .errors import InvalidCookiesError, InvalidHeadersError, InvalidUrlError
|
|
13
|
+
|
|
14
|
+
# Browser-like defaults applied when the CLI is invoked with `-H default`.
DEFAULT_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36",
}

# CSS selectors whose matching nodes are removed by basic_html_transform.
BASIC_HTML_CLEANUP_SELECTORS = (
    "script",
    "style",
    "noscript",
    "iframe",
    "source",
    "svg",
    "template",
    "[aria-hidden='true']",
    "[hidden]",
    ".ads",
    ".advertisement",
    ".banner",
    ".social-share",
    ".newsletter",
)

# Regex pre-pass: elements whose opening tag carries aria-hidden="true"
# (single or double quotes) are stripped, including their content.
# NOTE(review): regex matching of HTML is a heuristic; nested same-tag
# elements may not be handled exactly — confirm acceptable for inputs.
_ARIA_HIDDEN_RE = re.compile(
    r"<(?P<tag>[a-zA-Z][\w:-]*)(?=[^>]*\baria-hidden=(['\"])true\2)[^>]*>.*?</(?P=tag)>",
    re.IGNORECASE | re.DOTALL,
)
# Regex pre-pass: elements carrying a bare or valued `hidden` attribute.
_HIDDEN_RE = re.compile(
    r"<(?P<tag>[a-zA-Z][\w:-]*)(?=[^>]*\bhidden(?:\s|>|=))[^>]*>.*?</(?P=tag)>",
    re.IGNORECASE | re.DOTALL,
)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def validate_url(url: str) -> str:
    """Return *url* stripped of surrounding whitespace, or raise InvalidUrlError.

    A valid URL is non-empty, uses the http or https scheme, has a host,
    and contains no embedded spaces.
    """
    candidate = url.strip()
    parts = urlparse(candidate)
    acceptable = (
        bool(candidate)
        and parts.scheme in {"http", "https"}
        and bool(parts.netloc)
        and " " not in candidate
        and bool(parts.hostname)
    )
    if not acceptable:
        raise InvalidUrlError("invalid url")
    return candidate
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def parse_headers(raw_headers: str | None) -> dict[str, str]:
|
|
61
|
+
if raw_headers is None or not raw_headers.strip():
|
|
62
|
+
raise InvalidHeadersError("headers are required")
|
|
63
|
+
try:
|
|
64
|
+
value = json.loads(raw_headers)
|
|
65
|
+
except json.JSONDecodeError as exc:
|
|
66
|
+
raise InvalidHeadersError("headers must be valid JSON") from exc
|
|
67
|
+
if not isinstance(value, dict) or not value:
|
|
68
|
+
raise InvalidHeadersError("headers must be a non-empty object")
|
|
69
|
+
headers: dict[str, str] = {}
|
|
70
|
+
for key, item in value.items():
|
|
71
|
+
if not isinstance(key, str) or not key.strip():
|
|
72
|
+
raise InvalidHeadersError("header names must be non-empty strings")
|
|
73
|
+
if not isinstance(item, str):
|
|
74
|
+
raise InvalidHeadersError("header values must be strings")
|
|
75
|
+
headers[key.strip()] = item
|
|
76
|
+
return headers
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def load_cookies_file(cookies_file: Path | str | None) -> dict[str, str] | None:
|
|
80
|
+
if cookies_file is None:
|
|
81
|
+
return None
|
|
82
|
+
path = Path(cookies_file)
|
|
83
|
+
if not path.exists():
|
|
84
|
+
raise InvalidCookiesError(f"cookies file not found: {path}")
|
|
85
|
+
try:
|
|
86
|
+
payload = json.loads(path.read_text(encoding="utf-8"))
|
|
87
|
+
except json.JSONDecodeError as exc:
|
|
88
|
+
raise InvalidCookiesError("cookies file must contain valid JSON") from exc
|
|
89
|
+
if not isinstance(payload, dict) or not payload:
|
|
90
|
+
raise InvalidCookiesError("cookies file must contain a non-empty JSON object")
|
|
91
|
+
cookies: dict[str, str] = {}
|
|
92
|
+
for key, value in payload.items():
|
|
93
|
+
if not isinstance(key, str) or not key.strip():
|
|
94
|
+
raise InvalidCookiesError("cookie names must be non-empty strings")
|
|
95
|
+
if not isinstance(value, str):
|
|
96
|
+
raise InvalidCookiesError("cookie values must be strings")
|
|
97
|
+
cookies[key.strip()] = value
|
|
98
|
+
return cookies
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def basic_html_transform(content: str) -> str:
    """Strip hidden and boilerplate markup and return pretty-printed HTML.

    Elements carrying aria-hidden / hidden attributes are removed by the
    module regexes before parsing; nodes matching the cleanup selectors and
    the entire contents of <head> are then dropped from the parsed document.
    """
    # Regex pre-pass: remove hidden markup before handing it to the parser
    # (same order as before: aria-hidden first, then the hidden attribute).
    stripped = _HIDDEN_RE.sub("", _ARIA_HIDDEN_RE.sub("", content))
    document = JustHTML(stripped, fragment=False)
    # Drop boilerplate/ad-like nodes matched by the configured selectors.
    for css_selector in BASIC_HTML_CLEANUP_SELECTORS:
        for match in document.query(css_selector):
            if match.parent:
                match.parent.remove_child(match)
    # Empty out <head> entirely (styles, scripts, metadata all go).
    for head_node in document.query("head"):
        while head_node.has_child_nodes():
            head_node.remove_child(head_node.children[0])
    return document.to_html(pretty=True)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def write_output(output_path: Path, content: str) -> None:
    """Persist *content* to *output_path* as UTF-8 text."""
    with output_path.open("w", encoding="utf-8") as handle:
        handle.write(content)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Tests for URL validation, header parsing, and the basic HTML transform."""

import pytest

from nscraper import InvalidHeadersError, InvalidUrlError, basic_html_transform, parse_headers, validate_url


@pytest.mark.parametrize(
    ("url", "expected"),
    [
        ("http://example.com", "http://example.com"),
        ("https://example.com", "https://example.com"),
    ],
)
def test_validate_url_accepts_standard_urls(url, expected):
    """Well-formed http(s) URLs are returned unchanged."""
    assert validate_url(url) == expected


@pytest.mark.parametrize(
    "url",
    [
        "not-a-url",
        "https://example.com/path with space",
        "ftp://example.com",
        "http://",
    ],
)
def test_validate_url_rejects_invalid_urls(url):
    """Malformed URLs and unsupported schemes raise InvalidUrlError."""
    with pytest.raises(InvalidUrlError, match="invalid url"):
        validate_url(url)


def test_parse_headers_rejects_missing():
    """Passing no headers at all is an error."""
    with pytest.raises(InvalidHeadersError):
        parse_headers(None)


def test_parse_headers_accepts_json_object():
    """A JSON object string parses into a plain dict."""
    assert parse_headers('{"Accept": "text/html"}') == {"Accept": "text/html"}


def test_basic_html_transform_removes_tags():
    """The document skeleton and text content survive the transform."""
    result = basic_html_transform("<html><body>Hello</body></html>")
    assert "<html" in result
    assert "<body>" in result
    assert "Hello" in result


def test_basic_html_transform_strips_scripts_and_styles():
    """Script, style, and noscript nodes are removed along with their text."""
    markup = """
    <html>
    <head>
    <style>body { color: red; }</style>
    <script>alert('x')</script>
    </head>
    <body><noscript>ignore</noscript><div>Hello</div></body>
    </html>
    """
    result = basic_html_transform(markup)
    for forbidden in ("<script>", "<style>", "<noscript>", "body { color: red; }", "alert('x')"):
        assert forbidden not in result
    assert "Hello" in result
    assert "<html>" in result


def test_basic_html_transform_handles_broken_markup():
    """Unclosed tags do not prevent the cleanup from working."""
    markup = "<div>Hello<script>bad()</script><style>.x{}</style><span>World</span>"
    result = basic_html_transform(markup)
    assert "<script>" not in result
    assert "<style>" not in result
    assert "Hello" in result
    assert "World" in result
    assert "<div>" in result or "<span>" in result


def test_basic_html_transform_removes_hidden_and_ad_like_nodes():
    """Ad/newsletter selectors and hidden attributes strip whole nodes."""
    markup = """
    <html>
    <body>
    <div class="ads">Ad</div>
    <div class="newsletter">Newsletter</div>
    <div aria-hidden="true">Hidden</div>
    <div hidden>Hidden attr</div>
    <div class="content">Keep</div>
    </body>
    </html>
    """
    result = basic_html_transform(markup)
    assert "Ad" not in result
    assert "Newsletter" not in result
    assert "Hidden" not in result
    assert "Keep" in result
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"""Tests for HttpScraper request wiring, transforms, and error mapping."""

from dataclasses import replace

import niquests
import pytest

from nscraper import InvalidHeadersError, NetworkError, RequestError, ScrapeOptions
from nscraper.scraper.http import HttpScraper


class DummyResponse:
    """Minimal stand-in for a niquests response object."""

    def __init__(self, text: str, *, raise_exc: Exception | None = None) -> None:
        self.text = text
        self._raise_exc = raise_exc

    def raise_for_status(self) -> None:
        if self._raise_exc is not None:
            raise self._raise_exc


def make_options(**overrides):
    """Return ScrapeOptions with test defaults, customised via *overrides*."""
    defaults = ScrapeOptions(
        url="https://example.com",
        headers={"Accept": "text/html"},
        cookies=None,
        output_path=None,
        transform="raw",
    )
    return replace(defaults, **overrides)


def test_http_scraper_returns_raw_response(monkeypatch):
    """With transform='raw' the scraper returns the response text verbatim."""
    response = DummyResponse("<html>Hello</html>")
    captured = {}

    def fake_get(url, **kwargs):
        captured["url"] = url
        captured["kwargs"] = kwargs
        return response

    monkeypatch.setattr("niquests.get", fake_get)
    scraper = HttpScraper(make_options())

    assert scraper.scrape() == "<html>Hello</html>"
    assert captured["url"] == "https://example.com"
    assert captured["kwargs"]["headers"] == {"Accept": "text/html"}


def test_http_scraper_forwards_cookies(monkeypatch):
    """Cookies from the options are passed through to niquests.get."""
    captured = {}

    def fake_get(url, **kwargs):
        captured["kwargs"] = kwargs
        return DummyResponse("<html>Hello</html>")

    monkeypatch.setattr("niquests.get", fake_get)
    scraper = HttpScraper(make_options(cookies={"sessionid": "abc123"}))

    assert scraper.scrape() == "<html>Hello</html>"
    assert captured["kwargs"]["cookies"] == {"sessionid": "abc123"}


def test_http_scraper_applies_basic_html_transform(monkeypatch):
    """transform='basic_html' strips scripts while keeping content."""
    monkeypatch.setattr("niquests.get", lambda *args, **kwargs: DummyResponse("<script>x</script><div>Hello</div>"))
    scraper = HttpScraper(make_options(transform="basic_html"))

    result = scraper.scrape()
    assert "<div>Hello</div>" in result
    assert "<script>" not in result


def test_http_scraper_writes_output(monkeypatch, tmp_path):
    """When output_path is set, the scraped text is also written to disk."""
    monkeypatch.setattr("niquests.get", lambda *args, **kwargs: DummyResponse("Hello"))
    destination = tmp_path / "page.html"
    scraper = HttpScraper(make_options(output_path=destination))

    assert scraper.scrape() == "Hello"
    assert destination.read_text(encoding="utf-8") == "Hello"


def test_http_scraper_rejects_missing_headers(monkeypatch):
    """Scraping without headers fails fast with InvalidHeadersError."""
    monkeypatch.setattr("niquests.get", lambda *args, **kwargs: DummyResponse("Hello"))
    scraper = HttpScraper(make_options(headers=None))

    with pytest.raises(InvalidHeadersError, match="headers are required"):
        scraper.scrape()


def test_http_scraper_maps_request_error(monkeypatch):
    """HTTP status errors surface as RequestError."""

    class Boom(niquests.exceptions.HTTPError):
        pass

    monkeypatch.setattr(
        "niquests.get",
        lambda *args, **kwargs: DummyResponse("ignored", raise_exc=Boom("bad status")),
    )
    scraper = HttpScraper(make_options())

    with pytest.raises(RequestError, match="request failed for"):
        scraper.scrape()


def test_http_scraper_maps_network_error(monkeypatch):
    """Transport-level failures surface as NetworkError."""

    class Boom(niquests.exceptions.RequestException):
        pass

    def fake_get(*args, **kwargs):
        raise Boom("network down")

    monkeypatch.setattr("niquests.get", fake_get)
    scraper = HttpScraper(make_options())

    with pytest.raises(NetworkError, match="network failure while fetching"):
        scraper.scrape()
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""Tests for CLI option building in nscraper.__main__.

The original module repeated the same nine-field Namespace literal in every
test; the duplication is factored into the _namespace helper so each test
states only what it overrides.
"""

from argparse import Namespace

import pytest

from nscraper import InvalidCookiesError, ScrapeOptions
from nscraper.__main__ import _build_options
from nscraper.utils import DEFAULT_HEADERS


def _namespace(**overrides):
    """Build a parsed-args Namespace with test defaults, overridable per test."""
    values = {
        "url": "https://example.com",
        "engine": "http",
        "proxy": None,
        "headers": "default",
        "cookies_file": None,
        "timeout": 3.0,
        "output": None,
        "transform": "raw",
    }
    values.update(overrides)
    return Namespace(**values)


def test_default_headers_are_used_when_explicitly_requested():
    """headers='default' resolves to the packaged DEFAULT_HEADERS."""
    options = _build_options(_namespace())

    assert options.headers == DEFAULT_HEADERS
    assert options.headers["User-Agent"] == (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"
    )


def test_explicit_headers_override_defaults():
    """A JSON headers string replaces the defaults entirely."""
    options = _build_options(_namespace(headers='{"Accept": "application/json"}'))

    assert options.headers == {"Accept": "application/json"}
    assert isinstance(options, ScrapeOptions)


def test_cookie_file_is_optional(tmp_path):
    """With no cookies file, options.cookies stays None."""
    options = _build_options(_namespace())

    assert options.cookies is None


def test_cookie_file_is_loaded(tmp_path):
    """A valid cookies JSON file populates options.cookies."""
    cookie_file = tmp_path / "cookies.json"
    cookie_file.write_text('{"sessionid": "abc123"}', encoding="utf-8")

    options = _build_options(_namespace(cookies_file=cookie_file))

    assert options.cookies == {"sessionid": "abc123"}


def test_cookie_file_invalid_json_fails_fast(tmp_path):
    """Malformed cookies JSON raises InvalidCookiesError during option build."""
    cookie_file = tmp_path / "cookies.json"
    cookie_file.write_text("{not json}", encoding="utf-8")

    with pytest.raises(InvalidCookiesError, match="valid JSON"):
        _build_options(_namespace(cookies_file=cookie_file))
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Tests for nscraper.utils.load_cookies_file.

The original module repeated the write-a-cookies-file boilerplate in six
tests; the duplication is factored into the _cookie_file helper.
"""

from pathlib import Path

import pytest

from nscraper import InvalidCookiesError
from nscraper.utils import load_cookies_file


def _cookie_file(tmp_path: Path, text: str) -> Path:
    """Write *text* to a cookies.json under *tmp_path* and return its path."""
    path = tmp_path / "cookies.json"
    path.write_text(text, encoding="utf-8")
    return path


def test_load_cookies_file_returns_none_when_missing():
    """No path given means no cookies, not an error."""
    assert load_cookies_file(None) is None


def test_load_cookies_file_reads_json(tmp_path):
    """A valid JSON object file is parsed into a dict."""
    path = _cookie_file(tmp_path, '{"sessionid": "abc123"}')

    assert load_cookies_file(path) == {"sessionid": "abc123"}


def test_load_cookies_file_rejects_missing_file(tmp_path):
    """A path that does not exist raises InvalidCookiesError."""
    with pytest.raises(InvalidCookiesError, match="cookies file not found"):
        load_cookies_file(tmp_path / "missing.json")


def test_load_cookies_file_rejects_invalid_json(tmp_path):
    """Unparseable JSON raises InvalidCookiesError."""
    with pytest.raises(InvalidCookiesError, match="valid JSON"):
        load_cookies_file(_cookie_file(tmp_path, "{not json}"))


def test_load_cookies_file_rejects_non_object_json(tmp_path):
    """A JSON array (not an object) is rejected."""
    with pytest.raises(InvalidCookiesError, match="non-empty JSON object"):
        load_cookies_file(_cookie_file(tmp_path, '["sessionid", "abc123"]'))


def test_load_cookies_file_rejects_empty_object(tmp_path):
    """An empty JSON object is rejected."""
    with pytest.raises(InvalidCookiesError, match="non-empty JSON object"):
        load_cookies_file(_cookie_file(tmp_path, "{}"))


def test_load_cookies_file_rejects_non_string_values(tmp_path):
    """Non-string cookie values are rejected."""
    with pytest.raises(InvalidCookiesError, match="cookie values must be strings"):
        load_cookies_file(_cookie_file(tmp_path, '{"sessionid": 123}'))
|