lncrawl-scraper 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. lncrawl_scraper-0.1.0/.github/workflows/bump.yml +39 -0
  2. lncrawl_scraper-0.1.0/.github/workflows/ci.yml +78 -0
  3. lncrawl_scraper-0.1.0/.github/workflows/publish.yml +28 -0
  4. lncrawl_scraper-0.1.0/.gitignore +218 -0
  5. lncrawl_scraper-0.1.0/.python-version +1 -0
  6. lncrawl_scraper-0.1.0/CHANGELOG.md +33 -0
  7. lncrawl_scraper-0.1.0/CLAUDE.md +172 -0
  8. lncrawl_scraper-0.1.0/LICENSE +201 -0
  9. lncrawl_scraper-0.1.0/PKG-INFO +478 -0
  10. lncrawl_scraper-0.1.0/README.md +242 -0
  11. lncrawl_scraper-0.1.0/examples/01_basic_html.py +32 -0
  12. lncrawl_scraper-0.1.0/examples/02_pagesoup_parsing.py +69 -0
  13. lncrawl_scraper-0.1.0/examples/03_json_api.py +30 -0
  14. lncrawl_scraper-0.1.0/examples/04_files_and_images.py +39 -0
  15. lncrawl_scraper-0.1.0/examples/05_forms_cookies_headers.py +48 -0
  16. lncrawl_scraper-0.1.0/examples/06_configuration.py +63 -0
  17. lncrawl_scraper-0.1.0/examples/07_impersonation.py +35 -0
  18. lncrawl_scraper-0.1.0/examples/08_browser_clearance.py +58 -0
  19. lncrawl_scraper-0.1.0/examples/09_proxies_and_tor.py +52 -0
  20. lncrawl_scraper-0.1.0/examples/10_concurrency_and_abort.py +63 -0
  21. lncrawl_scraper-0.1.0/examples/11_error_handling.py +39 -0
  22. lncrawl_scraper-0.1.0/examples/README.md +34 -0
  23. lncrawl_scraper-0.1.0/pyproject.toml +113 -0
  24. lncrawl_scraper-0.1.0/src/scraper/__init__.py +46 -0
  25. lncrawl_scraper-0.1.0/src/scraper/_engine/__init__.py +398 -0
  26. lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/__init__.py +13 -0
  27. lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/base.py +42 -0
  28. lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/cloudflare_v1.py +243 -0
  29. lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/cloudflare_v2.py +158 -0
  30. lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/cloudflare_v3.py +199 -0
  31. lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/interpreter.py +98 -0
  32. lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/turnstile.py +55 -0
  33. lncrawl_scraper-0.1.0/src/scraper/_engine/config.py +113 -0
  34. lncrawl_scraper-0.1.0/src/scraper/_engine/exceptions.py +41 -0
  35. lncrawl_scraper-0.1.0/src/scraper/_engine/impersonate.py +107 -0
  36. lncrawl_scraper-0.1.0/src/scraper/_engine/proxy_manager.py +83 -0
  37. lncrawl_scraper-0.1.0/src/scraper/_engine/session.py +69 -0
  38. lncrawl_scraper-0.1.0/src/scraper/_engine/stealth.py +164 -0
  39. lncrawl_scraper-0.1.0/src/scraper/_engine/tls.py +100 -0
  40. lncrawl_scraper-0.1.0/src/scraper/_engine/user_agent.py +406 -0
  41. lncrawl_scraper-0.1.0/src/scraper/_utils/__init__.py +0 -0
  42. lncrawl_scraper-0.1.0/src/scraper/_utils/event_lock.py +38 -0
  43. lncrawl_scraper-0.1.0/src/scraper/_utils/file_tools.py +31 -0
  44. lncrawl_scraper-0.1.0/src/scraper/_utils/url_tools.py +33 -0
  45. lncrawl_scraper-0.1.0/src/scraper/config.py +60 -0
  46. lncrawl_scraper-0.1.0/src/scraper/py.typed +0 -0
  47. lncrawl_scraper-0.1.0/src/scraper/session.py +237 -0
  48. lncrawl_scraper-0.1.0/src/scraper/soup.py +512 -0
  49. lncrawl_scraper-0.1.0/tests/__init__.py +0 -0
  50. lncrawl_scraper-0.1.0/tests/conftest.py +45 -0
  51. lncrawl_scraper-0.1.0/tests/test_clearance.py +61 -0
  52. lncrawl_scraper-0.1.0/tests/test_config.py +49 -0
  53. lncrawl_scraper-0.1.0/tests/test_event_lock.py +72 -0
  54. lncrawl_scraper-0.1.0/tests/test_impersonate.py +110 -0
  55. lncrawl_scraper-0.1.0/tests/test_scraper.py +244 -0
  56. lncrawl_scraper-0.1.0/tests/test_soup.py +230 -0
  57. lncrawl_scraper-0.1.0/tests/test_soup_edge.py +173 -0
  58. lncrawl_scraper-0.1.0/tests/test_stealth.py +65 -0
  59. lncrawl_scraper-0.1.0/tests/test_user_agent.py +73 -0
  60. lncrawl_scraper-0.1.0/tests/test_utils.py +95 -0
  61. lncrawl_scraper-0.1.0/uv.lock +1578 -0
@@ -0,0 +1,39 @@
1
+ name: Bump Version
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ inputs:
6
+ bump:
7
+ description: Version part to bump
8
+ required: true
9
+ default: patch
10
+ type: choice
11
+ options: [patch, minor, major]
12
+
13
+ jobs:
14
+ ci:
15
+ uses: ./.github/workflows/ci.yml
16
+
17
+ bump:
18
+ needs: ci
19
+ name: Bump version and push tag
20
+ runs-on: ubuntu-latest
21
+ permissions:
22
+ contents: write
23
+ steps:
24
+ - uses: actions/checkout@v6
25
+ - uses: astral-sh/setup-uv@v7
26
+
27
+ - name: Bump version
28
+ run: uv version --bump '${{ inputs.bump }}'
29
+
30
+ - name: Read new version
31
+ id: version
32
+ run: echo "value=$(uv version | awk '{print $NF}')" >> $GITHUB_OUTPUT
33
+
34
+ - name: Commit and tag
35
+ uses: EndBug/add-and-commit@v9
36
+ with:
37
+ add: "pyproject.toml uv.lock"
38
+ tag: "v${{ steps.version.outputs.value }}"
39
+ message: "Bump version to v${{ steps.version.outputs.value }}"
@@ -0,0 +1,78 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ workflow_call:
8
+
9
+ jobs:
10
+ lint:
11
+ name: Lint
12
+ runs-on: ubuntu-latest
13
+ steps:
14
+ - uses: actions/checkout@v6
15
+ - uses: astral-sh/setup-uv@v7
16
+ with:
17
+ python-version: "3.9"
18
+ - run: uv sync --all-groups --all-extras
19
+ - run: uv run ruff check
20
+ - run: uv run ruff format --check
21
+ - run: uv run pyright
22
+
23
+ coverage:
24
+ name: Coverage
25
+ runs-on: ubuntu-latest
26
+ needs: lint
27
+ permissions:
28
+ contents: write
29
+ pull-requests: write
30
+ steps:
31
+ - uses: actions/checkout@v6
32
+ - uses: astral-sh/setup-uv@v7
33
+ with:
34
+ python-version: "3.12"
35
+ - run: uv sync --all-groups --all-extras
36
+
37
+ - name: Run tests with coverage
38
+ run: uv run pytest --cov --cov-report=xml --cov-report=term-missing
39
+
40
+ - name: Write coverage to job summary
41
+ if: always()
42
+ run: |
43
+ {
44
+ echo '## Coverage'
45
+ uv run coverage report --format=markdown
46
+ } >> "$GITHUB_STEP_SUMMARY"
47
+
48
+ - name: Coverage PR comment + badge
49
+ uses: py-cov-action/python-coverage-comment-action@v3
50
+ with:
51
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
52
+
53
+ - name: Upload coverage.xml
54
+ uses: actions/upload-artifact@v4
55
+ with:
56
+ name: coverage-xml
57
+ path: coverage.xml
58
+
59
+ build:
60
+ name: Build (Python ${{ matrix.python-version }})
61
+ runs-on: ubuntu-latest
62
+ needs: lint
63
+ strategy:
64
+ fail-fast: false
65
+ matrix:
66
+ python-version: ["3.9", "3.10", "3.11", "3.12", "3.13", "3.14"]
67
+ steps:
68
+ - uses: actions/checkout@v6
69
+ - uses: astral-sh/setup-uv@v7
70
+ with:
71
+ python-version: ${{ matrix.python-version }}
72
+ - run: uv sync --all-groups --all-extras
73
+ - run: uv run pytest
74
+ - run: uv build
75
+ - uses: actions/upload-artifact@v4
76
+ with:
77
+ name: dist-${{ matrix.python-version }}
78
+ path: dist/
@@ -0,0 +1,28 @@
1
+ name: Publish
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ publish:
9
+ name: Publish to PyPI
10
+ runs-on: ubuntu-latest
11
+ environment: pypi
12
+ permissions:
13
+ id-token: write
14
+ contents: read
15
+ steps:
16
+ - uses: actions/checkout@v6
17
+ - uses: astral-sh/setup-uv@v7
18
+ with:
19
+ python-version: "3.9"
20
+
21
+ - name: Build distributions
22
+ run: uv build
23
+
24
+ - name: Check metadata renders on PyPI
25
+ run: uvx twine check dist/*
26
+
27
+ - name: Publish to PyPI (trusted publishing)
28
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,218 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ # .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ # .vscode/
203
+ # Temporary file for partial code execution
204
+ tempCodeRunnerFile.py
205
+
206
+ # Ruff stuff:
207
+ .ruff_cache/
208
+
209
+ # PyPI configuration file
210
+ .pypirc
211
+
212
+ # Marimo
213
+ marimo/_static/
214
+ marimo/_lsp/
215
+ __marimo__/
216
+
217
+ # Streamlit
218
+ .streamlit/secrets.toml
@@ -0,0 +1 @@
1
+ 3.12
@@ -0,0 +1,33 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format is based on
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres
5
+ to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
6
+
7
+ ## [0.1.0] - 2026-06-04
8
+
9
+ Initial public release of `lncrawl-scraper`, extracted from
10
+ [lightnovel-crawler](https://github.com/lncrawl/lightnovel-crawler).
11
+
12
+ ### Added
13
+
14
+ - `Scraper` — a `requests.Session` subclass with transparent Cloudflare
15
+ challenge handling (v1, v2, v3, Turnstile) and helpers: `get_soup`,
16
+ `post_soup`, `get_json`, `post_json`, `get_file`, `get_image`, `submit_form`,
17
+ `ping`.
18
+ - `PageSoup` — a null-safe BeautifulSoup wrapper; selection methods never return
19
+ `None` and text/HTML accessors always return `str`.
20
+ - Typed configuration: `ScraperConfig`, `StealthConfig`, `ProxyConfig`,
21
+ `BrowserConfig`, plus the `default_config()` factory.
22
+ - **Browser fingerprint impersonation** (`impersonate` extra): route requests
23
+ through `curl_cffi` for a real Chrome/Firefox TLS (JA3/JA4) and HTTP/2
24
+ fingerprint, with the spoofed User-Agent family aligned to the target.
25
+ - **Browser-assisted clearance**: `apply_browser_clearance()` to reuse a
26
+ `cf_clearance` cookie + User-Agent solved by an external real browser.
27
+ - **Accurate Client Hints**: `sec-ch-ua` / platform / mobile derived from the
28
+ chosen User-Agent (Chromium only) instead of hardcoded values.
29
+ - Stealth mode, proxy rotation with Tor identity refresh, TLS cipher rotation,
30
+ rate limiting, and cooperative `abort()`.
31
+ - `py.typed` marker (PEP 561) and full type coverage.
32
+
33
+ [0.1.0]: https://github.com/lncrawl/scraper/releases/tag/v0.1.0
@@ -0,0 +1,172 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## What this is
6
+
7
+ `lncrawl-scraper` (import name `scraper`) is a standalone HTTP scraping library
8
+ extracted from [lightnovel-crawler](https://github.com/lncrawl/lightnovel-crawler).
9
+ It is a `requests.Session` subclass that transparently handles Cloudflare
10
+ challenges, plus a null-safe BeautifulSoup wrapper and a set of HTTP helpers.
11
+
12
+ Published to PyPI as `lncrawl-scraper`; imported as `scraper`. Targets Python
13
+ **3.9+**.
14
+
15
+ ## Commands
16
+
17
+ Tooling is driven by [uv](https://docs.astral.sh/uv/) + [poethepoet](https://poethepoet.natn.io/).
18
+
19
+ ```bash
20
+ uv sync # install deps + editable package into .venv
21
+ uv run poe lint # ruff check + ruff format --check + pyright
22
+ uv run poe lint-fix # ruff check --fix + ruff format
23
+ uv run poe test # pytest
24
+ uv run poe cov # pytest with coverage (term-missing + html + xml)
25
+ uv run poe build # lint + test + uv build (wheel/sdist)
26
+ uv run poe publish # build + uv publish
27
+ ```
28
+
29
+ Always run `uv run poe lint` before considering a change done. CI
30
+ (`.github/workflows/ci.yml`) runs three jobs: `lint` (ruff + pyright), a
31
+ `build` matrix testing on Python 3.9–3.14, and `coverage` (which posts a PR
32
+ comment + badge via `python-coverage-comment-action` and a job-summary table).
33
+
34
+ ## Architecture
35
+
36
+ The package is a thin, ergonomic layer over an in-house Cloudflare-bypass engine.
37
+
38
+ ```text
39
+ src/scraper/
40
+ ├── __init__.py # public API + __version__ (via importlib.metadata)
41
+ ├── session.py # Scraper — the main class (subclasses ScraperEngine)
42
+ ├── soup.py # PageSoup — null-safe BeautifulSoup wrapper
43
+ ├── config.py # public config surface + default_config() factory
44
+ ├── py.typed # PEP 561 marker
45
+ ├── _utils/ # internal helpers (event_lock, url_tools, file_tools)
46
+ └── _engine/ # internal Cloudflare-bypass engine (private)
47
+ ```
48
+
49
+ ### Layers
50
+
51
+ - **`Scraper`** ([session.py](src/scraper/session.py)) — the public entry point.
52
+ Adds Origin/Referer injection, default timeouts, and helpers: `get_soup`,
53
+ `post_soup`, `get_json`, `post_json`, `get_image` (returns a PIL Image),
54
+ `get_file` (streamed, abortable), `submit_form`, `ping`. Subclasses
55
+ `ScraperEngine`, so all of `requests.Session` is available too.
56
+ - **`PageSoup`** ([soup.py](src/scraper/soup.py)) — wraps a BeautifulSoup `Tag`.
57
+ Selection methods (`select`, `select_one`, `find`, `xpath`, `closest`, …)
58
+ always return `PageSoup`/`list`, never `None`; text/HTML accessors always
59
+ return `str`. An empty `PageSoup` is falsy. Reach the raw tag via `.tag`.
60
+ - **`_engine/`** — the private engine: `ScraperEngine` (the `requests.Session`
61
+ subclass with the full request pipeline) in `_engine/__init__.py`, plus CF
62
+ challenge handlers v1/v2/v3 + Turnstile, TLS cipher rotation, stealth mode,
63
+ proxy/Tor manager, and UA selection. It is implementation detail — nothing
64
+ here is part of the public API except what `config.py`/`__init__.py`
65
+ re-export.
66
+
67
+ ### Cloudflare-bypass surface
68
+
69
+ The realistic ceiling of a `requests`-based engine is its TLS (JA3/JA4) and
70
+ HTTP/1.1 fingerprint — `set_ciphers()` in [tls.py](src/scraper/_engine/tls.py)
71
+ only reorders ciphers, so the ClientHello still reads as Python. Three features
72
+ push past that:
73
+
74
+ - **Impersonation transport** ([_engine/impersonate.py](src/scraper/_engine/impersonate.py)):
75
+ when `ScraperConfig.impersonate` is set (e.g. `"chrome"`), `ScraperEngine.perform_request`
76
+ routes through `curl_cffi` (curl-impersonate) for a real browser TLS + HTTP/2
77
+ fingerprint, and adapts the result back into a `requests.Response`. The
78
+ curl_cffi session is the cookie authority and is mirrored into `self.cookies`
79
+ after each request (`_mirror_transport_cookies`). Cipher rotation is skipped
80
+ while impersonating. Requires the `impersonate` extra (`curl_cffi`).
81
+ - **Client Hints** are derived from the actual UA in
82
+ `UserAgent._client_hints` (Chromium only; Firefox sends none) so `sec-ch-ua`
83
+ version/platform always match the User-Agent. `stealth.py` no longer hardcodes
84
+ them — it only defaults the non-version-specific `Sec-Fetch-*` nav hints.
85
+ - **`apply_browser_clearance(domain, cf_clearance=, user_agent=, cookies=)`**
86
+ injects a clearance solved by an external real browser; the UA must match the
87
+ one that obtained it. `put_cookie` keeps the requests jar and the impersonation
88
+ jar in sync.
89
+
90
+ ### Configuration
91
+
92
+ All config flows through `ScraperConfig` (a dataclass with nested
93
+ `StealthConfig`, `ProxyConfig`, `BrowserConfig`). The public surface is
94
+ [config.py](src/scraper/config.py), which re-exports the dataclasses from
95
+ `_engine.config` and adds the `default_config()` factory:
96
+
97
+ ```python
98
+ from scraper import Scraper, default_config
99
+ from scraper.config import BrowserConfig, StealthConfig
100
+
101
+ cfg = default_config() # fresh, fully-populated defaults
102
+ cfg.browser = BrowserConfig(browser="chrome", platform="darwin")
103
+ s = Scraper(origin="https://site.com", config=cfg)
104
+ ```
105
+
106
+ - **`default_config()` returns a fresh instance every call.** Never reintroduce
107
+ a shared module-level config singleton — `ScraperEngine` hands the nested
108
+ `proxy`/`stealth` objects to managers that may mutate them, so sharing would
109
+ leak state across `Scraper` instances.
110
+ - `ScraperConfig.browser` accepts `BrowserConfig | dict | None`; the dict form
111
+ is accepted as a convenience and normalized via `asdict` in `UserAgent.load`.
112
+
113
+ ## Conventions
114
+
115
+ - **Python 3.9 compatibility is mandatory.** Bare `X | Y` unions must not be
116
+ *evaluated at runtime* — only use them inside annotations, in files that have
117
+ `from __future__ import annotations`, or inside quoted (string) annotations. Prefer
118
+ `typing.Optional/Union` in new non-future-annotated modules. `importlib`,
119
+ dataclasses, etc. must all work on 3.9.
120
+ - **Keep the public surface in public modules.** `_engine/` and `_utils/` are
121
+ private; user-facing names live in `__init__.py`/`config.py` and are listed
122
+ in `__all__`. Update `__all__` and the README when changing that surface.
123
+ - **`ruff`**: line-length 100, double quotes, `force-sort-within-sections`,
124
+ combine-as-imports. **`pyright`** runs in `standard` mode over `src` + `tests`
125
+ — keep it clean (use real `isinstance` narrowing rather than `is_dataclass`,
126
+ which pyright doesn't narrow on).
127
+ - **Dependencies**: core runtime deps live in `[project.dependencies]`. Optional
128
+ extras: `image` (`Pillow`, for `get_image`) and `impersonate` (`curl_cffi`,
129
+ for `ScraperConfig.impersonate`) — both imported lazily so the package works
130
+ without them. Add deps via `uv add` / `uv add --dev`.
131
+ - **Public API** is whatever `src/scraper/__init__.py` exports in `__all__`.
132
+ Update it (and the README) when adding user-facing surface.
133
+
134
+ ## Commit messages
135
+
136
+ Match the existing history (`git log`):
137
+
138
+ - **No type prefix.** Do NOT use Conventional Commits (`feat:`, `fix:`,
139
+ `docs:`, …) — subjects are plain capitalized text.
140
+ - **Imperative mood**, capitalized first word, no trailing period, subject
141
+ ≤ ~60 chars (e.g. `Add coverage reporting to CI`, `Restructure into src layout`).
142
+ - **Body only for non-trivial changes**: a blank line, then a short rationale
143
+ paragraph and/or `-` bullets covering *what* changed and *why* (wrap at ~72
144
+ chars). Small changes are subject-only.
145
+ - **Do NOT append a `Co-Authored-By` trailer** — this overrides the default
146
+ Claude Code behaviour; the maintainer's commits never carry it.
147
+
148
+ ## Testing
149
+
150
+ `pytest` under [tests/](tests/). The src/ layout means tests import the
151
+ *installed* package, so run them via `uv run poe test` / `uv run poe cov` (which
152
+ use the editable install).
153
+
154
+ - **Tests must be offline and fast.** [conftest.py](tests/conftest.py) provides
155
+ an autouse fixture that stubs `scraper._engine.user_agent._load_ua_data` to
156
+ `None` (forces the deterministic embedded UA generator, no network), plus
157
+ `fast_config` / `make_fast_config()` which disable stealth delays, throttling,
158
+ and session refresh. Use these in any test that constructs a `Scraper`.
159
+ - **Mock HTTP with `responses`** (`responses.RequestsMock()`), never real
160
+ requests. It patches `HTTPAdapter.send`, so it intercepts the mounted TLS
161
+ adapter too. Note: a set abort signal trips the pre-send check, so the request
162
+ never fires — use `assert_all_requests_are_fired=False` in that case.
163
+ - **UA-family gotcha**: the offline generator can pick iOS, where Chrome's UA is
164
+ `CriOS/…` and Firefox's is `FxiOS/…` (neither contains `Chrome/` / `Firefox/`).
165
+ When asserting on UA family, pin a desktop platform
166
+ (`BrowserConfig(platform="windows", mobile=False)`).
167
+ - `curl_cffi`-dependent tests use `pytest.importorskip("curl_cffi")`.
168
+ - **Coverage** config is in `pyproject.toml` (`[tool.coverage]`, `source =
169
+ ["scraper"]`, `relative_files = true`). `uv run poe cov` writes `htmlcov/`,
170
+ `coverage.xml`, and a terminal report (all coverage artifacts are gitignored).
171
+ The deep CF challenge solvers (`cloudflare_v1/v2/v3`, `interpreter`) are
172
+ integration-only and stay low-coverage without live Cloudflare traffic.