nscraper 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nscraper-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 mikerr1@github.com
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,125 @@
1
+ Metadata-Version: 2.1
2
+ Name: nscraper
3
+ Version: 0.1.0
4
+ Summary: A small importable Python module.
5
+ License: MIT
6
+ Requires-Dist: niquests==3.18.4
7
+ Requires-Dist: justhtml==1.14.0
8
+ Description-Content-Type: text/markdown
9
+
10
+ # nscraper
11
+
12
+ `nscraper` is a small Python package scaffolded for two use cases:
13
+
14
+ - import it from other projects
15
+ - run it directly with `python -m nscraper`
16
+
17
+ ## License
18
+
19
+ MIT. You can fork, modify, and reuse it with minimal restrictions as long as
20
+ the license notice is kept with the software.
21
+
22
+ ## Install
23
+
24
+ ```bash
25
+ pip install nscraper
26
+ ```
27
+
28
+ For development:
29
+
30
+ ```bash
31
+ uv sync --dev
32
+ ```
33
+
34
+ ## Use as a module
35
+
36
+ ```python
37
+ from nscraper import HttpScraper, ScrapeOptions
38
+
39
+ options = ScrapeOptions(
40
+ url="https://example.com",
41
+ headers={"Accept": "text/html"},
42
+ )
43
+
44
+ content = HttpScraper(options).scrape()
45
+ print(content)
46
+ ```
47
+
48
+ ## Run the Module
49
+
50
+ ```bash
51
+ python -m nscraper -u https://example.com -H default
52
+ ```
53
+
54
+ Fetch a URL:
55
+
56
+ ```bash
57
+ python -m nscraper -u https://example.com -H default
58
+ python -m nscraper -u https://example.com -H '{"Accept": "text/html"}'
59
+ python -m nscraper -u https://example.com -H default -c cookies.json
60
+ ```
61
+
62
+ ## Current API
63
+
64
+ - `nscraper.ScrapeOptions`
65
+ - `nscraper.BaseScraper`
66
+ - `nscraper.HttpScraper`
67
+ - `nscraper.SeleniumBaseScraper`
68
+ - `nscraper.get_scraper(options: ScrapeOptions) -> BaseScraper`
69
+ - `nscraper.validate_url(url: str) -> str`
70
+ - `nscraper.parse_headers(raw_headers: str | None) -> dict[str, str]`
71
+ - `nscraper.load_cookies_file(path: Path | str | None) -> dict[str, str] | None`
72
+ - `nscraper.basic_html_transform(content: str) -> str`
73
+ - runtime dependency: `niquests==3.18.4`
74
+ - runtime dependency: `justhtml==1.14.0`
75
+ - development dependency: `pytest`
76
+
77
+ ## Module Flags
78
+
79
+ - `-u` / `--url` required
80
+ - `-H` / `--headers` required, or `default`
81
+ - `-e` / `--engine` with `http` or `seleniumbase`
82
+ - `-p` / `--proxy`
83
+ - `--timeout` default `3`
84
+ - `-o` / `--output`
85
+ - `-c` / `--cookies-file` optional JSON file
86
+ - `-t` / `--transform` default `raw`
87
+
88
+ Behavior:
89
+
90
+ - invalid or malformed URLs raise `InvalidUrlError`
91
+ - missing or malformed headers raise `InvalidHeadersError`
92
+ - missing or malformed cookie files raise `InvalidCookiesError`
93
+ - use `-H default` to apply the built-in `Accept` and `User-Agent` header dict
94
+ - use `-c` only when you want to send cookies; omit it to keep current behavior
95
+ - output files are always overwritten
96
+ - `basic_html` removes non-content elements and writes cleaned HTML output
97
+
98
+ Default `User-Agent`:
99
+
100
+ ```text
101
+ Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36
102
+ ```
103
+
104
+ The package is intentionally minimal so you can extend it into a reusable library
105
+ and publish it to PyPI.
106
+
107
+ ## GitHub And PyPI Release Flow
108
+
109
+ - pull requests to `master` run tests in GitHub Actions
110
+ - published GitHub releases run tests, build `sdist` and `wheel`, then publish to PyPI
111
+ - the publish workflow is in [.github/workflows/release.yml](.github/workflows/release.yml)
112
+
113
+ Before the release workflow can publish, configure Trusted Publishing in PyPI:
114
+
115
+ 1. create the project on PyPI if it does not exist yet
116
+ 2. in PyPI, open the project publishing settings
117
+ 3. add a trusted publisher for this GitHub repository
118
+ 4. use the `release` workflow on the `master` branch
119
+
120
+ After that, the normal flow is:
121
+
122
+ 1. push code to GitHub
123
+ 2. merge to `master`
124
+ 3. create a GitHub release for the version tag
125
+ 4. let GitHub Actions test, build, and publish the package
@@ -0,0 +1,116 @@
1
+ # nscraper
2
+
3
+ `nscraper` is a small Python package scaffolded for two use cases:
4
+
5
+ - import it from other projects
6
+ - run it directly with `python -m nscraper`
7
+
8
+ ## License
9
+
10
+ MIT. You can fork, modify, and reuse it with minimal restrictions as long as
11
+ the license notice is kept with the software.
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ pip install nscraper
17
+ ```
18
+
19
+ For development:
20
+
21
+ ```bash
22
+ uv sync --dev
23
+ ```
24
+
25
+ ## Use as a module
26
+
27
+ ```python
28
+ from nscraper import HttpScraper, ScrapeOptions
29
+
30
+ options = ScrapeOptions(
31
+ url="https://example.com",
32
+ headers={"Accept": "text/html"},
33
+ )
34
+
35
+ content = HttpScraper(options).scrape()
36
+ print(content)
37
+ ```
38
+
39
+ ## Run the Module
40
+
41
+ ```bash
42
+ python -m nscraper -u https://example.com -H default
43
+ ```
44
+
45
+ Fetch a URL:
46
+
47
+ ```bash
48
+ python -m nscraper -u https://example.com -H default
49
+ python -m nscraper -u https://example.com -H '{"Accept": "text/html"}'
50
+ python -m nscraper -u https://example.com -H default -c cookies.json
51
+ ```
52
+
53
+ ## Current API
54
+
55
+ - `nscraper.ScrapeOptions`
56
+ - `nscraper.BaseScraper`
57
+ - `nscraper.HttpScraper`
58
+ - `nscraper.SeleniumBaseScraper`
59
+ - `nscraper.get_scraper(options: ScrapeOptions) -> BaseScraper`
60
+ - `nscraper.validate_url(url: str) -> str`
61
+ - `nscraper.parse_headers(raw_headers: str | None) -> dict[str, str]`
62
+ - `nscraper.load_cookies_file(path: Path | str | None) -> dict[str, str] | None`
63
+ - `nscraper.basic_html_transform(content: str) -> str`
64
+ - runtime dependency: `niquests==3.18.4`
65
+ - runtime dependency: `justhtml==1.14.0`
66
+ - development dependency: `pytest`
67
+
68
+ ## Module Flags
69
+
70
+ - `-u` / `--url` required
71
+ - `-H` / `--headers` required, or `default`
72
+ - `-e` / `--engine` with `http` or `seleniumbase`
73
+ - `-p` / `--proxy`
74
+ - `--timeout` default `3`
75
+ - `-o` / `--output`
76
+ - `-c` / `--cookies-file` optional JSON file
77
+ - `-t` / `--transform` default `raw`
78
+
79
+ Behavior:
80
+
81
+ - invalid or malformed URLs raise `InvalidUrlError`
82
+ - missing or malformed headers raise `InvalidHeadersError`
83
+ - missing or malformed cookie files raise `InvalidCookiesError`
84
+ - use `-H default` to apply the built-in `Accept` and `User-Agent` header dict
85
+ - use `-c` only when you want to send cookies; omit it to keep current behavior
86
+ - output files are always overwritten
87
+ - `basic_html` removes non-content elements and writes cleaned HTML output
88
+
89
+ Default `User-Agent`:
90
+
91
+ ```text
92
+ Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36
93
+ ```
94
+
95
+ The package is intentionally minimal so you can extend it into a reusable library
96
+ and publish it to PyPI.
97
+
98
+ ## GitHub And PyPI Release Flow
99
+
100
+ - pull requests to `master` run tests in GitHub Actions
101
+ - published GitHub releases run tests, build `sdist` and `wheel`, then publish to PyPI
102
+ - the publish workflow is in [.github/workflows/release.yml](.github/workflows/release.yml)
103
+
104
+ Before the release workflow can publish, configure Trusted Publishing in PyPI:
105
+
106
+ 1. create the project on PyPI if it does not exist yet
107
+ 2. in PyPI, open the project publishing settings
108
+ 3. add a trusted publisher for this GitHub repository
109
+ 4. use the `release` workflow on the `master` branch
110
+
111
+ After that, the normal flow is:
112
+
113
+ 1. push code to GitHub
114
+ 2. merge to `master`
115
+ 3. create a GitHub release for the version tag
116
+ 4. let GitHub Actions test, build, and publish the package
@@ -0,0 +1,160 @@
1
+ """Minimal PEP 517 backend for offline editable and wheel installs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ import base64
8
+ import csv
9
+ import hashlib
10
+ import io
11
+ import os
12
+ import tarfile
13
+ import tempfile
14
+ import zipfile
15
+
16
+ NAME = "nscraper"
17
+ VERSION = "0.1.0"
18
+ DIST_INFO = f"{NAME}-{VERSION}.dist-info"
19
+ SUMMARY = "A small importable Python module."
20
+ README_PATH = Path(__file__).resolve().parent / "README.md"
21
+
22
+
23
+ @dataclass(frozen=True)
24
+ class FileRecord:
25
+ path: str
26
+ data: bytes
27
+
28
+
29
+ def _normalize(path: str) -> str:
30
+ return path.replace(os.sep, "/")
31
+
32
+
33
+ def _metadata_text() -> str:
34
+ description = README_PATH.read_text(encoding="utf-8")
35
+ return (
36
+ "Metadata-Version: 2.1\n"
37
+ f"Name: {NAME}\n"
38
+ f"Version: {VERSION}\n"
39
+ f"Summary: {SUMMARY}\n"
40
+ "License: MIT\n"
41
+ "Requires-Dist: niquests==3.18.4\n"
42
+ "Requires-Dist: justhtml==1.14.0\n"
43
+ "Description-Content-Type: text/markdown\n"
44
+ "\n"
45
+ f"{description}"
46
+ )
47
+
48
+
49
+ def _wheel_metadata() -> list[FileRecord]:
50
+ wheel = (
51
+ "Wheel-Version: 1.0\n"
52
+ "Generator: nscraper.build_backend\n"
53
+ "Root-Is-Purelib: true\n"
54
+ "Tag: py3-none-any\n"
55
+ )
56
+ meta = _metadata_text()
57
+ entry_points = "[console_scripts]\nnscraper = nscraper.__main__:main\n"
58
+ return [
59
+ FileRecord(f"{DIST_INFO}/WHEEL", wheel.encode()),
60
+ FileRecord(f"{DIST_INFO}/METADATA", meta.encode()),
61
+ FileRecord(f"{DIST_INFO}/entry_points.txt", entry_points.encode()),
62
+ ]
63
+
64
+
65
+ def _package_files() -> list[FileRecord]:
66
+ root = Path(__file__).resolve().parent / "src" / NAME
67
+ records: list[FileRecord] = []
68
+ for path in root.rglob("*.py"):
69
+ rel = _normalize(str(path.relative_to(root.parent)))
70
+ records.append(FileRecord(rel, path.read_bytes()))
71
+ return records
72
+
73
+
74
+ def _sdist_files(root: Path) -> list[tuple[Path, str]]:
75
+ members = ["build_backend.py", "pyproject.toml", "README.md", "LICENSE"]
76
+ files = [(root / member, f"{NAME}-{VERSION}/{member}") for member in members]
77
+ for base in ("src", "tests"):
78
+ for path in (root / base).rglob("*"):
79
+ if path.is_dir() or "__pycache__" in path.parts or path.suffix == ".pyc":
80
+ continue
81
+ arcname = f"{NAME}-{VERSION}/{path.relative_to(root)}"
82
+ files.append((path, arcname))
83
+ return files
84
+
85
+
86
+ def _build_wheel_dir(wheel_directory: str, editable: bool = False) -> str:
87
+ out = Path(wheel_directory)
88
+ out.mkdir(parents=True, exist_ok=True)
89
+ wheel_name = f"{NAME}-{VERSION}-py3-none-any.whl"
90
+ wheel_path = out / wheel_name
91
+ records = _wheel_metadata()
92
+ if editable:
93
+ pth = f"{NAME}.pth"
94
+ src_dir = (Path(__file__).resolve().parent / "src").as_posix()
95
+ records.append(FileRecord(pth, f"{src_dir}\n".encode()))
96
+ else:
97
+ records.extend(_package_files())
98
+ with tempfile.TemporaryDirectory() as td:
99
+ tmp = Path(td)
100
+ with zipfile.ZipFile(wheel_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
101
+ for record in records:
102
+ zf.writestr(record.path, record.data)
103
+ dist_info = tmp / DIST_INFO
104
+ dist_info.mkdir()
105
+ record_file = dist_info / "RECORD"
106
+ rows = []
107
+ for record in records:
108
+ digest = base64.urlsafe_b64encode(hashlib.sha256(record.data).digest()).rstrip(b"=").decode()
109
+ rows.append((record.path, f"sha256={digest}", str(len(record.data))))
110
+ rows.append((f"{DIST_INFO}/RECORD", "", ""))
111
+ with record_file.open("w", newline="", encoding="utf-8") as f:
112
+ writer = csv.writer(f)
113
+ writer.writerows(rows)
114
+ zf.write(record_file, f"{DIST_INFO}/RECORD")
115
+ return wheel_name
116
+
117
+
118
def get_requires_for_build_wheel(config_settings=None):
    """PEP 517 hook: this backend needs no extra build requirements."""
    return list()
120
+
121
+
122
def get_requires_for_build_editable(config_settings=None):
    """PEP 660 hook: editable installs need no extra build requirements."""
    return list()
124
+
125
+
126
+ def prepare_metadata_for_build_wheel(metadata_directory, config_settings=None):
127
+ dist = Path(metadata_directory) / DIST_INFO
128
+ dist.mkdir(parents=True, exist_ok=True)
129
+ for record in _wheel_metadata():
130
+ if record.path.endswith(("WHEEL", "METADATA", "entry_points.txt")):
131
+ (dist / Path(record.path).name).write_bytes(record.data)
132
+ return DIST_INFO
133
+
134
+
135
+ def build_wheel(wheel_directory, config_settings=None, metadata_directory=None):
136
+ return _build_wheel_dir(wheel_directory, editable=False)
137
+
138
+
139
+ def build_editable(wheel_directory, config_settings=None, metadata_directory=None):
140
+ return _build_wheel_dir(wheel_directory, editable=True)
141
+
142
+
143
+ def prepare_metadata_for_build_editable(metadata_directory, config_settings=None):
144
+ return prepare_metadata_for_build_wheel(metadata_directory, config_settings)
145
+
146
+
147
+ def build_sdist(sdist_directory, config_settings=None):
148
+ out = Path(sdist_directory)
149
+ out.mkdir(parents=True, exist_ok=True)
150
+ sdist_name = f"{NAME}-{VERSION}.tar.gz"
151
+ sdist_path = out / sdist_name
152
+ root = Path(__file__).resolve().parent
153
+ pkg_info = _metadata_text().encode("utf-8")
154
+ with tarfile.open(sdist_path, "w:gz") as tf:
155
+ for path, arcname in _sdist_files(root):
156
+ tf.add(path, arcname=arcname)
157
+ info = tarfile.TarInfo(name=f"{NAME}-{VERSION}/PKG-INFO")
158
+ info.size = len(pkg_info)
159
+ tf.addfile(info, fileobj=io.BytesIO(pkg_info))
160
+ return sdist_name
@@ -0,0 +1,22 @@
1
+ [build-system]
2
+ requires = []
3
+ build-backend = "build_backend"
4
+ backend-path = ["."]
5
+
6
+ [project]
7
+ name = "nscraper"
8
+ version = "0.1.0"
9
+ description = "A small importable Python module."
10
+ readme = "README.md"
11
+ requires-python = ">=3.10"
12
+ license = { text = "MIT" }
13
+ authors = [
14
+ { name = "mikerr1@github.com" }
15
+ ]
16
+ dependencies = ["niquests==3.18.4", "justhtml==1.14.0"]
17
+
18
+ [project.scripts]
19
+ nscraper = "nscraper.__main__:main"
20
+
21
+ [dependency-groups]
22
+ dev = ["pytest"]
@@ -0,0 +1,42 @@
1
+ """Public package interface for nscraper."""
2
+
3
+ from .core import get_scraper, hello
4
+ from .errors import (
5
+ InvalidHeadersError,
6
+ InvalidCookiesError,
7
+ InvalidUrlError,
8
+ NetworkError,
9
+ NscraperError,
10
+ RequestError,
11
+ )
12
+ from .models import ScrapeOptions
13
+ from .scraper import BaseScraper, HttpScraper, SeleniumBaseScraper
14
+ from .utils import (
15
+ DEFAULT_HEADERS,
16
+ basic_html_transform,
17
+ load_cookies_file,
18
+ parse_headers,
19
+ validate_url,
20
+ write_output,
21
+ )
22
+
23
+ __all__ = [
24
+ "BaseScraper",
25
+ "DEFAULT_HEADERS",
26
+ "HttpScraper",
27
+ "InvalidHeadersError",
28
+ "InvalidCookiesError",
29
+ "InvalidUrlError",
30
+ "NetworkError",
31
+ "NscraperError",
32
+ "RequestError",
33
+ "ScrapeOptions",
34
+ "SeleniumBaseScraper",
35
+ "basic_html_transform",
36
+ "load_cookies_file",
37
+ "hello",
38
+ "get_scraper",
39
+ "parse_headers",
40
+ "validate_url",
41
+ "write_output",
42
+ ]
@@ -0,0 +1,66 @@
1
+ """Module entry point for nscraper."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ from pathlib import Path
7
+
8
+ from .core import get_scraper
9
+ from .errors import NscraperError
10
+ from .models import ScrapeOptions
11
+ from .utils import DEFAULT_HEADERS, load_cookies_file, parse_headers
12
+
13
+
14
def build_parser() -> argparse.ArgumentParser:
    """Create the command-line argument parser for ``python -m nscraper``."""
    parser = argparse.ArgumentParser(prog="nscraper")
    # Required inputs.
    parser.add_argument("-u", "--url", required=True, help="Target URL")
    parser.add_argument(
        "-e", "--engine",
        choices=("http", "seleniumbase"),
        default="http",
        help="Request engine",
    )
    parser.add_argument("-p", "--proxy", help="Proxy URL")
    parser.add_argument(
        "-H", "--headers",
        required=True,
        help='Headers as JSON string or "default"',
    )
    parser.add_argument("-c", "--cookies-file", help="Path to a JSON cookies file")
    parser.add_argument("--timeout", type=float, default=3.0, help="Timeout in seconds")
    parser.add_argument("-o", "--output", help="Output path for HTML")
    parser.add_argument(
        "-t", "--transform",
        choices=("raw", "basic_html"),
        default="raw",
        help="Transform mode",
    )
    return parser
37
+
38
+
39
def _build_options(args: argparse.Namespace) -> ScrapeOptions:
    """Translate parsed CLI arguments into a normalized ``ScrapeOptions``.

    The literal headers value ``"default"`` selects the built-in header set;
    anything else is parsed as a JSON object.
    """
    if args.headers == "default":
        headers = DEFAULT_HEADERS
    else:
        headers = parse_headers(args.headers)
    output_path = Path(args.output) if args.output else None
    return ScrapeOptions(
        url=args.url,
        engine=args.engine,
        proxy=args.proxy,
        headers=headers,
        cookies=load_cookies_file(args.cookies_file),
        timeout=args.timeout,
        output_path=output_path,
        transform=args.transform,
    )
51
+
52
+
53
def main(argv: list[str] | None = None) -> int:
    """CLI entry point: scrape one URL, then print or store the result.

    Known nscraper failures are converted into a ``SystemExit`` carrying the
    error message; success returns 0.
    """
    parsed_args = build_parser().parse_args(argv)
    try:
        options = _build_options(parsed_args)
        content = get_scraper(options).scrape()
    except NscraperError as exc:
        raise SystemExit(str(exc)) from exc
    # When an output path was given, the scraper already wrote the file.
    if options.output_path is None:
        print(content)
    return 0
63
+
64
+
65
+ if __name__ == "__main__":
66
+ raise SystemExit(main())
@@ -0,0 +1,18 @@
1
+ """Core library functions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .models import ScrapeOptions
6
+ from .scraper import BaseScraper, HttpScraper, SeleniumBaseScraper
7
+
8
+
9
+ def hello(name: str = "world") -> str:
10
+ """Return a friendly greeting."""
11
+ cleaned = name.strip() or "world"
12
+ return f"Hello, {cleaned}!"
13
+
14
+
15
def get_scraper(options: ScrapeOptions) -> BaseScraper:
    """Return the scraper implementation matching ``options.engine``."""
    scraper_cls = SeleniumBaseScraper if options.engine == "seleniumbase" else HttpScraper
    return scraper_cls(options)
@@ -0,0 +1,25 @@
1
+ """Project-specific exceptions for nscraper."""
2
+
3
+
4
class NscraperError(Exception):
    """Root of the nscraper exception hierarchy."""


class InvalidUrlError(NscraperError):
    """The supplied URL is missing or not a valid http(s) URL."""


class InvalidHeadersError(NscraperError):
    """The supplied headers are missing or malformed."""


class InvalidCookiesError(NscraperError):
    """The supplied cookies are missing or malformed."""


class RequestError(NscraperError):
    """An HTTP request completed with a non-success status."""


class NetworkError(NscraperError):
    """The request could not be completed due to network problems."""
@@ -0,0 +1,24 @@
1
+ """Data models for nscraper operations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Literal
8
+
9
+ Engine = Literal["http", "seleniumbase"]
10
+ Transform = Literal["raw", "basic_html"]
11
+
12
+
13
+ @dataclass(frozen=True, slots=True)
14
+ class ScrapeOptions:
15
+ """Normalized scrape configuration."""
16
+
17
+ url: str
18
+ engine: Engine = "http"
19
+ proxy: str | None = None
20
+ headers: dict[str, str] | None = None
21
+ cookies: dict[str, str] | None = None
22
+ timeout: float = 3.0
23
+ output_path: Path | None = None
24
+ transform: Transform = "raw"
@@ -0,0 +1,7 @@
1
+ """Scraper implementations."""
2
+
3
+ from .base import BaseScraper
4
+ from .http import HttpScraper
5
+ from .seleniumbase import SeleniumBaseScraper
6
+
7
+ __all__ = ["BaseScraper", "HttpScraper", "SeleniumBaseScraper"]
@@ -0,0 +1,34 @@
1
+ """Base scraper contract."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+
7
+ from ..models import ScrapeOptions
8
+ from ..utils import basic_html_transform, write_output
9
+
10
+
11
class BaseScraper(ABC):
    """Template for scrapers: fetch, optionally transform, optionally store."""

    def __init__(self, options: ScrapeOptions) -> None:
        self.options = options

    @abstractmethod
    def send_request(self) -> str:
        """Send the underlying request and return raw content."""

    def transform(self, content: str) -> str:
        """Apply the configured post-processing to *content*."""
        if self.options.transform != "basic_html":
            return content
        return basic_html_transform(content)

    def store(self, content: str) -> None:
        """Persist *content* when an output path is configured."""
        target = self.options.output_path
        if target:
            write_output(target, content)

    def scrape(self) -> str:
        """Run the full pipeline: request, transform, store, return content."""
        raw = self.send_request()
        processed = self.transform(raw)
        self.store(processed)
        return processed
@@ -0,0 +1,32 @@
1
+ """HTTP scraper implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import niquests
6
+
7
+ from ..errors import InvalidHeadersError, NetworkError, RequestError
8
+ from ..utils import validate_url
9
+ from .base import BaseScraper
10
+
11
+
12
class HttpScraper(BaseScraper):
    """Scraper implementation backed by niquests."""

    def send_request(self) -> str:
        """Fetch the configured URL and return the response body as text.

        Raises:
            InvalidHeadersError: If no headers were configured.
            RequestError: On a non-success HTTP status.
            NetworkError: On connection-level failures.
        """
        url = validate_url(self.options.url)
        if not self.options.headers:
            raise InvalidHeadersError("headers are required")
        request_kwargs: dict[str, object] = {
            "headers": self.options.headers,
            "timeout": self.options.timeout,
        }
        proxy = self.options.proxy
        if proxy:
            # The same proxy serves both plain and TLS traffic.
            request_kwargs["proxies"] = {"http": proxy, "https": proxy}
        if self.options.cookies:
            request_kwargs["cookies"] = self.options.cookies
        try:
            response = niquests.get(url, **request_kwargs)
            response.raise_for_status()
        except niquests.exceptions.HTTPError as exc:
            raise RequestError(f"request failed for {url}") from exc
        except niquests.exceptions.RequestException as exc:
            raise NetworkError(f"network failure while fetching {url}") from exc
        return response.text
@@ -0,0 +1,12 @@
1
+ """SeleniumBase scraper implementation placeholder."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base import BaseScraper
6
+
7
+
8
class SeleniumBaseScraper(BaseScraper):
    """Scraper placeholder for SeleniumBase-driven scraping."""

    def send_request(self) -> str:
        """Always raise: the SeleniumBase engine is not implemented yet."""
        raise NotImplementedError("seleniumbase scraper is not implemented yet")
@@ -0,0 +1,116 @@
1
+ """Reusable utility functions for nscraper."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ from pathlib import Path
8
+ from urllib.parse import urlparse
9
+
10
+ from justhtml import JustHTML
11
+
12
+ from .errors import InvalidCookiesError, InvalidHeadersError, InvalidUrlError
13
+
14
+ DEFAULT_HEADERS = {
15
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
16
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36",
17
+ }
18
+
19
+ BASIC_HTML_CLEANUP_SELECTORS = (
20
+ "script",
21
+ "style",
22
+ "noscript",
23
+ "iframe",
24
+ "source",
25
+ "svg",
26
+ "template",
27
+ "[aria-hidden='true']",
28
+ "[hidden]",
29
+ ".ads",
30
+ ".advertisement",
31
+ ".banner",
32
+ ".social-share",
33
+ ".newsletter",
34
+ )
35
+
36
+ _ARIA_HIDDEN_RE = re.compile(
37
+ r"<(?P<tag>[a-zA-Z][\w:-]*)(?=[^>]*\baria-hidden=(['\"])true\2)[^>]*>.*?</(?P=tag)>",
38
+ re.IGNORECASE | re.DOTALL,
39
+ )
40
+ _HIDDEN_RE = re.compile(
41
+ r"<(?P<tag>[a-zA-Z][\w:-]*)(?=[^>]*\bhidden(?:\s|>|=))[^>]*>.*?</(?P=tag)>",
42
+ re.IGNORECASE | re.DOTALL,
43
+ )
44
+
45
+
46
def validate_url(url: str) -> str:
    """Validate *url* and return it stripped of surrounding whitespace.

    Only absolute http/https URLs with a non-empty host and no embedded
    spaces are accepted.

    Raises:
        InvalidUrlError: If the URL is empty, relative, uses another scheme,
            contains spaces, or has a missing/malformed host.
    """
    cleaned = url.strip()
    try:
        parsed = urlparse(cleaned)
        # urlparse / .hostname raise ValueError for malformed netlocs
        # (e.g. an unbalanced IPv6 bracket: "http://[::1"); map that to
        # the library's own error instead of leaking ValueError to callers.
        hostname = parsed.hostname
    except ValueError as exc:
        raise InvalidUrlError("invalid url") from exc
    if (
        not cleaned
        or parsed.scheme not in {"http", "https"}
        or not parsed.netloc
        or " " in cleaned
        or not hostname
    ):
        raise InvalidUrlError("invalid url")
    return cleaned
58
+
59
+
60
def parse_headers(raw_headers: str | None) -> dict[str, str]:
    """Parse a JSON object string into a header mapping.

    Header names are stripped of surrounding whitespace; values are kept
    verbatim.

    Raises:
        InvalidHeadersError: If the input is missing/blank, not valid JSON,
            not a non-empty JSON object, or contains non-string names or
            values.
    """
    if not (raw_headers and raw_headers.strip()):
        raise InvalidHeadersError("headers are required")
    try:
        parsed = json.loads(raw_headers)
    except json.JSONDecodeError as exc:
        raise InvalidHeadersError("headers must be valid JSON") from exc
    if not (isinstance(parsed, dict) and parsed):
        raise InvalidHeadersError("headers must be a non-empty object")
    result: dict[str, str] = {}
    for name, value in parsed.items():
        if not (isinstance(name, str) and name.strip()):
            raise InvalidHeadersError("header names must be non-empty strings")
        if not isinstance(value, str):
            raise InvalidHeadersError("header values must be strings")
        result[name.strip()] = value
    return result
77
+
78
+
79
def load_cookies_file(cookies_file: Path | str | None) -> dict[str, str] | None:
    """Load cookies from a JSON file, or return None when no path is given.

    The file must contain a non-empty JSON object mapping cookie names to
    string values; names are stripped of surrounding whitespace.

    Raises:
        InvalidCookiesError: If the file is missing, not valid JSON, not a
            non-empty object, or contains non-string names or values.
    """
    if cookies_file is None:
        return None
    cookie_path = Path(cookies_file)
    if not cookie_path.exists():
        raise InvalidCookiesError(f"cookies file not found: {cookie_path}")
    try:
        data = json.loads(cookie_path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        raise InvalidCookiesError("cookies file must contain valid JSON") from exc
    if not (isinstance(data, dict) and data):
        raise InvalidCookiesError("cookies file must contain a non-empty JSON object")
    result: dict[str, str] = {}
    for name, value in data.items():
        if not (isinstance(name, str) and name.strip()):
            raise InvalidCookiesError("cookie names must be non-empty strings")
        if not isinstance(value, str):
            raise InvalidCookiesError("cookie values must be strings")
        result[name.strip()] = value
    return result
99
+
100
+
101
+ def basic_html_transform(content: str) -> str:
102
+ cleaned_input = _ARIA_HIDDEN_RE.sub("", content)
103
+ cleaned_input = _HIDDEN_RE.sub("", cleaned_input)
104
+ doc = JustHTML(cleaned_input, fragment=False)
105
+ for selector in BASIC_HTML_CLEANUP_SELECTORS:
106
+ for node in doc.query(selector):
107
+ if node.parent:
108
+ node.parent.remove_child(node)
109
+ for head in doc.query("head"):
110
+ while head.has_child_nodes():
111
+ head.remove_child(head.children[0])
112
+ return doc.to_html(pretty=True)
113
+
114
+
115
def write_output(output_path: Path, content: str) -> None:
    """Write *content* to *output_path* as UTF-8, overwriting any existing file.

    Parent directories are created as needed so that writing to a fresh
    output location does not fail with ``FileNotFoundError``.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(content, encoding="utf-8")
@@ -0,0 +1,94 @@
1
+ import pytest
2
+
3
+ from nscraper import InvalidHeadersError, InvalidUrlError, basic_html_transform, parse_headers, validate_url
4
+
5
+
6
+ @pytest.mark.parametrize(
7
+ ("url", "expected"),
8
+ [
9
+ ("http://example.com", "http://example.com"),
10
+ ("https://example.com", "https://example.com"),
11
+ ],
12
+ )
13
+ def test_validate_url_accepts_standard_urls(url, expected):
14
+ assert validate_url(url) == expected
15
+
16
+
17
+ @pytest.mark.parametrize(
18
+ "url",
19
+ [
20
+ "not-a-url",
21
+ "https://example.com/path with space",
22
+ "ftp://example.com",
23
+ "http://",
24
+ ],
25
+ )
26
+ def test_validate_url_rejects_invalid_urls(url):
27
+ with pytest.raises(InvalidUrlError, match="invalid url"):
28
+ validate_url(url)
29
+
30
+
31
+ def test_parse_headers_rejects_missing():
32
+ with pytest.raises(InvalidHeadersError):
33
+ parse_headers(None)
34
+
35
+
36
+ def test_parse_headers_accepts_json_object():
37
+ headers = parse_headers('{"Accept": "text/html"}')
38
+ assert headers == {"Accept": "text/html"}
39
+
40
+
41
+ def test_basic_html_transform_removes_tags():
42
+ cleaned = basic_html_transform("<html><body>Hello</body></html>")
43
+ assert "<html" in cleaned
44
+ assert "<body>" in cleaned
45
+ assert "Hello" in cleaned
46
+
47
+
48
+ def test_basic_html_transform_strips_scripts_and_styles():
49
+ html = """
50
+ <html>
51
+ <head>
52
+ <style>body { color: red; }</style>
53
+ <script>alert('x')</script>
54
+ </head>
55
+ <body><noscript>ignore</noscript><div>Hello</div></body>
56
+ </html>
57
+ """
58
+ cleaned = basic_html_transform(html)
59
+ assert "<script>" not in cleaned
60
+ assert "<style>" not in cleaned
61
+ assert "<noscript>" not in cleaned
62
+ assert "body { color: red; }" not in cleaned
63
+ assert "alert('x')" not in cleaned
64
+ assert "Hello" in cleaned
65
+ assert "<html>" in cleaned
66
+
67
+
68
+ def test_basic_html_transform_handles_broken_markup():
69
+ html = "<div>Hello<script>bad()</script><style>.x{}</style><span>World</span>"
70
+ cleaned = basic_html_transform(html)
71
+ assert "<script>" not in cleaned
72
+ assert "<style>" not in cleaned
73
+ assert "Hello" in cleaned
74
+ assert "World" in cleaned
75
+ assert "<div>" in cleaned or "<span>" in cleaned
76
+
77
+
78
+ def test_basic_html_transform_removes_hidden_and_ad_like_nodes():
79
+ html = """
80
+ <html>
81
+ <body>
82
+ <div class="ads">Ad</div>
83
+ <div class="newsletter">Newsletter</div>
84
+ <div aria-hidden="true">Hidden</div>
85
+ <div hidden>Hidden attr</div>
86
+ <div class="content">Keep</div>
87
+ </body>
88
+ </html>
89
+ """
90
+ cleaned = basic_html_transform(html)
91
+ assert "Ad" not in cleaned
92
+ assert "Newsletter" not in cleaned
93
+ assert "Hidden" not in cleaned
94
+ assert "Keep" in cleaned
@@ -0,0 +1,113 @@
1
+ from dataclasses import replace
2
+ import niquests
3
+ import pytest
4
+
5
+ from nscraper import InvalidHeadersError, NetworkError, RequestError, ScrapeOptions
6
+ from nscraper.scraper.http import HttpScraper
7
+
8
+
9
+ class DummyResponse:
10
+ def __init__(self, text: str, *, raise_exc: Exception | None = None) -> None:
11
+ self.text = text
12
+ self._raise_exc = raise_exc
13
+
14
+ def raise_for_status(self) -> None:
15
+ if self._raise_exc is not None:
16
+ raise self._raise_exc
17
+
18
+
19
+ def make_options(**overrides):
20
+ base = ScrapeOptions(
21
+ url="https://example.com",
22
+ headers={"Accept": "text/html"},
23
+ cookies=None,
24
+ output_path=None,
25
+ transform="raw",
26
+ )
27
+ return replace(base, **overrides)
28
+
29
+
30
+ def test_http_scraper_returns_raw_response(monkeypatch):
31
+ response = DummyResponse("<html>Hello</html>")
32
+ calls = {}
33
+
34
+ def fake_get(url, **kwargs):
35
+ calls["url"] = url
36
+ calls["kwargs"] = kwargs
37
+ return response
38
+
39
+ monkeypatch.setattr("niquests.get", fake_get)
40
+ scraper = HttpScraper(make_options())
41
+
42
+ assert scraper.scrape() == "<html>Hello</html>"
43
+ assert calls["url"] == "https://example.com"
44
+ assert calls["kwargs"]["headers"] == {"Accept": "text/html"}
45
+
46
+
47
+ def test_http_scraper_forwards_cookies(monkeypatch):
48
+ response = DummyResponse("<html>Hello</html>")
49
+ calls = {}
50
+
51
+ def fake_get(url, **kwargs):
52
+ calls["kwargs"] = kwargs
53
+ return response
54
+
55
+ monkeypatch.setattr("niquests.get", fake_get)
56
+ scraper = HttpScraper(make_options(cookies={"sessionid": "abc123"}))
57
+
58
+ assert scraper.scrape() == "<html>Hello</html>"
59
+ assert calls["kwargs"]["cookies"] == {"sessionid": "abc123"}
60
+
61
+
62
+ def test_http_scraper_applies_basic_html_transform(monkeypatch):
63
+ monkeypatch.setattr("niquests.get", lambda *args, **kwargs: DummyResponse("<script>x</script><div>Hello</div>"))
64
+ scraper = HttpScraper(make_options(transform="basic_html"))
65
+
66
+ cleaned = scraper.scrape()
67
+ assert "<div>Hello</div>" in cleaned
68
+ assert "<script>" not in cleaned
69
+
70
+
71
+ def test_http_scraper_writes_output(monkeypatch, tmp_path):
72
+ monkeypatch.setattr("niquests.get", lambda *args, **kwargs: DummyResponse("Hello"))
73
+ output = tmp_path / "page.html"
74
+ scraper = HttpScraper(make_options(output_path=output))
75
+
76
+ assert scraper.scrape() == "Hello"
77
+ assert output.read_text(encoding="utf-8") == "Hello"
78
+
79
+
80
+ def test_http_scraper_rejects_missing_headers(monkeypatch):
81
+ monkeypatch.setattr("niquests.get", lambda *args, **kwargs: DummyResponse("Hello"))
82
+ scraper = HttpScraper(make_options(headers=None))
83
+
84
+ with pytest.raises(InvalidHeadersError, match="headers are required"):
85
+ scraper.scrape()
86
+
87
+
88
+ def test_http_scraper_maps_request_error(monkeypatch):
89
+ class Boom(niquests.exceptions.HTTPError):
90
+ pass
91
+
92
+ def fake_get(*args, **kwargs):
93
+ return DummyResponse("ignored", raise_exc=Boom("bad status"))
94
+
95
+ monkeypatch.setattr("niquests.get", fake_get)
96
+ scraper = HttpScraper(make_options())
97
+
98
+ with pytest.raises(RequestError, match="request failed for"):
99
+ scraper.scrape()
100
+
101
+
102
+ def test_http_scraper_maps_network_error(monkeypatch):
103
+ class Boom(niquests.exceptions.RequestException):
104
+ pass
105
+
106
+ def fake_get(*args, **kwargs):
107
+ raise Boom("network down")
108
+
109
+ monkeypatch.setattr("niquests.get", fake_get)
110
+ scraper = HttpScraper(make_options())
111
+
112
+ with pytest.raises(NetworkError, match="network failure while fetching"):
113
+ scraper.scrape()
@@ -0,0 +1,101 @@
1
+ from argparse import Namespace
2
+
3
+ import pytest
4
+
5
+ from nscraper import InvalidCookiesError, ScrapeOptions
6
+ from nscraper.__main__ import _build_options
7
+ from nscraper.utils import DEFAULT_HEADERS
8
+
9
+
10
def test_default_headers_are_used_when_explicitly_requested():
    """The CLI value headers="default" resolves to the package DEFAULT_HEADERS."""
    args = Namespace(
        url="https://example.com",
        engine="http",
        proxy=None,
        headers="default",
        cookies_file=None,
        timeout=3.0,
        output=None,
        transform="raw",
    )

    options = _build_options(args)

    expected_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36"
    )
    assert options.headers == DEFAULT_HEADERS
    assert options.headers["User-Agent"] == expected_agent
29
+
30
+
31
def test_explicit_headers_override_defaults():
    """A JSON headers string on the CLI replaces the built-in defaults entirely."""
    args = Namespace(
        url="https://example.com",
        engine="http",
        proxy=None,
        headers='{"Accept": "application/json"}',
        cookies_file=None,
        timeout=3.0,
        output=None,
        transform="raw",
    )

    options = _build_options(args)

    assert isinstance(options, ScrapeOptions)
    assert options.headers == {"Accept": "application/json"}
47
+
48
+
49
def test_cookie_file_is_optional():
    """Omitting --cookies-file leaves options.cookies unset (None).

    Fix: the original declared the ``tmp_path`` fixture but never used it,
    which needlessly requests a temporary directory from pytest; the unused
    fixture parameter has been removed.
    """
    args = Namespace(
        url="https://example.com",
        engine="http",
        proxy=None,
        headers="default",
        cookies_file=None,
        timeout=3.0,
        output=None,
        transform="raw",
    )

    options = _build_options(args)

    assert options.cookies is None
64
+
65
+
66
def test_cookie_file_is_loaded(tmp_path):
    """A JSON cookies file given on the CLI is parsed into options.cookies."""
    cookie_path = tmp_path / "cookies.json"
    cookie_path.write_text('{"sessionid": "abc123"}', encoding="utf-8")

    options = _build_options(
        Namespace(
            url="https://example.com",
            engine="http",
            proxy=None,
            headers="default",
            cookies_file=cookie_path,
            timeout=3.0,
            output=None,
            transform="raw",
        )
    )

    assert options.cookies == {"sessionid": "abc123"}
83
+
84
+
85
def test_cookie_file_invalid_json_fails_fast(tmp_path):
    """Malformed JSON in the cookies file aborts option building immediately."""
    cookie_path = tmp_path / "cookies.json"
    cookie_path.write_text("{not json}", encoding="utf-8")

    bad_args = Namespace(
        url="https://example.com",
        engine="http",
        proxy=None,
        headers="default",
        cookies_file=cookie_path,
        timeout=3.0,
        output=None,
        transform="raw",
    )

    with pytest.raises(InvalidCookiesError, match="valid JSON"):
        _build_options(bad_args)
@@ -0,0 +1,54 @@
1
+ from pathlib import Path
2
+
3
+ import pytest
4
+
5
+ from nscraper import InvalidCookiesError
6
+ from nscraper.utils import load_cookies_file
7
+
8
+
9
def test_load_cookies_file_returns_none_when_missing():
    """No cookies path means no cookies: the loader returns None."""
    result = load_cookies_file(None)
    assert result is None
11
+
12
+
13
def test_load_cookies_file_reads_json(tmp_path):
    """A well-formed JSON object file round-trips into a cookie dict."""
    path = tmp_path / "cookies.json"
    path.write_text('{"sessionid": "abc123"}', encoding="utf-8")

    cookies = load_cookies_file(path)

    assert cookies == {"sessionid": "abc123"}
18
+
19
+
20
def test_load_cookies_file_rejects_missing_file(tmp_path):
    """Pointing at a nonexistent file raises InvalidCookiesError."""
    missing = tmp_path / "missing.json"
    with pytest.raises(InvalidCookiesError, match="cookies file not found"):
        load_cookies_file(missing)
23
+
24
+
25
def test_load_cookies_file_rejects_invalid_json(tmp_path):
    """Content that is not parseable JSON is reported as InvalidCookiesError."""
    path = tmp_path / "cookies.json"
    path.write_text("{not json}", encoding="utf-8")

    with pytest.raises(InvalidCookiesError, match="valid JSON"):
        load_cookies_file(path)
31
+
32
+
33
def test_load_cookies_file_rejects_non_object_json(tmp_path):
    """Valid JSON that is not an object (e.g. a list) is rejected."""
    path = tmp_path / "cookies.json"
    path.write_text('["sessionid", "abc123"]', encoding="utf-8")

    with pytest.raises(InvalidCookiesError, match="non-empty JSON object"):
        load_cookies_file(path)
39
+
40
+
41
def test_load_cookies_file_rejects_empty_object(tmp_path):
    """An empty JSON object carries no cookies and is rejected."""
    path = tmp_path / "cookies.json"
    path.write_text("{}", encoding="utf-8")

    with pytest.raises(InvalidCookiesError, match="non-empty JSON object"):
        load_cookies_file(path)
47
+
48
+
49
def test_load_cookies_file_rejects_non_string_values(tmp_path):
    """Cookie values must be strings; numeric values are rejected."""
    path = tmp_path / "cookies.json"
    path.write_text('{"sessionid": 123}', encoding="utf-8")

    with pytest.raises(InvalidCookiesError, match="cookie values must be strings"):
        load_cookies_file(path)