lncrawl-scraper 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lncrawl_scraper-0.1.0/.github/workflows/bump.yml +39 -0
- lncrawl_scraper-0.1.0/.github/workflows/ci.yml +78 -0
- lncrawl_scraper-0.1.0/.github/workflows/publish.yml +28 -0
- lncrawl_scraper-0.1.0/.gitignore +218 -0
- lncrawl_scraper-0.1.0/.python-version +1 -0
- lncrawl_scraper-0.1.0/CHANGELOG.md +33 -0
- lncrawl_scraper-0.1.0/CLAUDE.md +172 -0
- lncrawl_scraper-0.1.0/LICENSE +201 -0
- lncrawl_scraper-0.1.0/PKG-INFO +478 -0
- lncrawl_scraper-0.1.0/README.md +242 -0
- lncrawl_scraper-0.1.0/examples/01_basic_html.py +32 -0
- lncrawl_scraper-0.1.0/examples/02_pagesoup_parsing.py +69 -0
- lncrawl_scraper-0.1.0/examples/03_json_api.py +30 -0
- lncrawl_scraper-0.1.0/examples/04_files_and_images.py +39 -0
- lncrawl_scraper-0.1.0/examples/05_forms_cookies_headers.py +48 -0
- lncrawl_scraper-0.1.0/examples/06_configuration.py +63 -0
- lncrawl_scraper-0.1.0/examples/07_impersonation.py +35 -0
- lncrawl_scraper-0.1.0/examples/08_browser_clearance.py +58 -0
- lncrawl_scraper-0.1.0/examples/09_proxies_and_tor.py +52 -0
- lncrawl_scraper-0.1.0/examples/10_concurrency_and_abort.py +63 -0
- lncrawl_scraper-0.1.0/examples/11_error_handling.py +39 -0
- lncrawl_scraper-0.1.0/examples/README.md +34 -0
- lncrawl_scraper-0.1.0/pyproject.toml +113 -0
- lncrawl_scraper-0.1.0/src/scraper/__init__.py +46 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/__init__.py +398 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/__init__.py +13 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/base.py +42 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/cloudflare_v1.py +243 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/cloudflare_v2.py +158 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/cloudflare_v3.py +199 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/interpreter.py +98 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/challenges/turnstile.py +55 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/config.py +113 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/exceptions.py +41 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/impersonate.py +107 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/proxy_manager.py +83 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/session.py +69 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/stealth.py +164 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/tls.py +100 -0
- lncrawl_scraper-0.1.0/src/scraper/_engine/user_agent.py +406 -0
- lncrawl_scraper-0.1.0/src/scraper/_utils/__init__.py +0 -0
- lncrawl_scraper-0.1.0/src/scraper/_utils/event_lock.py +38 -0
- lncrawl_scraper-0.1.0/src/scraper/_utils/file_tools.py +31 -0
- lncrawl_scraper-0.1.0/src/scraper/_utils/url_tools.py +33 -0
- lncrawl_scraper-0.1.0/src/scraper/config.py +60 -0
- lncrawl_scraper-0.1.0/src/scraper/py.typed +0 -0
- lncrawl_scraper-0.1.0/src/scraper/session.py +237 -0
- lncrawl_scraper-0.1.0/src/scraper/soup.py +512 -0
- lncrawl_scraper-0.1.0/tests/__init__.py +0 -0
- lncrawl_scraper-0.1.0/tests/conftest.py +45 -0
- lncrawl_scraper-0.1.0/tests/test_clearance.py +61 -0
- lncrawl_scraper-0.1.0/tests/test_config.py +49 -0
- lncrawl_scraper-0.1.0/tests/test_event_lock.py +72 -0
- lncrawl_scraper-0.1.0/tests/test_impersonate.py +110 -0
- lncrawl_scraper-0.1.0/tests/test_scraper.py +244 -0
- lncrawl_scraper-0.1.0/tests/test_soup.py +230 -0
- lncrawl_scraper-0.1.0/tests/test_soup_edge.py +173 -0
- lncrawl_scraper-0.1.0/tests/test_stealth.py +65 -0
- lncrawl_scraper-0.1.0/tests/test_user_agent.py +73 -0
- lncrawl_scraper-0.1.0/tests/test_utils.py +95 -0
- lncrawl_scraper-0.1.0/uv.lock +1578 -0
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Manually triggered release helper: runs the CI workflow, then bumps the
# project version with uv and commits/tags the result.
name: Bump Version

on:
  workflow_dispatch:
    inputs:
      bump:
        description: Version part to bump
        required: true
        default: patch
        type: choice
        options: [patch, minor, major]

jobs:
  # Reuse the CI workflow so the bump job only runs after CI succeeds.
  ci:
    uses: ./.github/workflows/ci.yml

  bump:
    needs: ci
    name: Bump version and push tag
    runs-on: ubuntu-latest
    permissions:
      contents: write  # required to push the bump commit and tag
    steps:
      - uses: actions/checkout@v6
      - uses: astral-sh/setup-uv@v7

      - name: Bump version
        # Pass the input through the environment rather than interpolating it
        # into the script text, so it can never be parsed as shell code.
        env:
          BUMP: ${{ inputs.bump }}
        run: uv version --bump "$BUMP"

      - name: Read new version
        id: version
        # awk keeps the last whitespace-separated field of the `uv version`
        # output; the result is exposed as `steps.version.outputs.value`.
        run: echo "value=$(uv version | awk '{print $NF}')" >> "$GITHUB_OUTPUT"

      - name: Commit and tag
        uses: EndBug/add-and-commit@v9
        with:
          add: "pyproject.toml uv.lock"
          tag: "v${{ steps.version.outputs.value }}"
          message: "Bump version to v${{ steps.version.outputs.value }}"
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# Continuous integration: lint first, then coverage and a per-Python build
# matrix (both gated on lint). Also callable from other workflows.
name: CI

on:
  push:
    branches:
      - main
  pull_request:
  workflow_call:

jobs:
  # Static checks: ruff lint, ruff format check, and pyright.
  lint:
    name: Lint
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v6
      - uses: astral-sh/setup-uv@v7
        with:
          python-version: "3.9"
      - run: uv sync --all-groups --all-extras
      - run: uv run ruff check
      - run: uv run ruff format --check
      - run: uv run pyright

  # Test run with coverage; publishes the report to the job summary, the
  # coverage comment action, and a `coverage-xml` artifact.
  coverage:
    name: Coverage
    needs: lint
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - uses: actions/checkout@v6
      - uses: astral-sh/setup-uv@v7
        with:
          python-version: "3.12"
      - run: uv sync --all-groups --all-extras

      - name: Run tests with coverage
        run: uv run pytest --cov --cov-report=xml --cov-report=term-missing

      - name: Write coverage to job summary
        # Runs even when an earlier step failed.
        if: always()
        run: |
          {
            echo '## Coverage'
            uv run coverage report --format=markdown
          } >> "$GITHUB_STEP_SUMMARY"

      - name: Coverage PR comment + badge
        uses: py-cov-action/python-coverage-comment-action@v3
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload coverage.xml
        uses: actions/upload-artifact@v4
        with:
          name: coverage-xml
          path: coverage.xml

  # Test and build on every listed Python version; one failing version does
  # not cancel the others (fail-fast is off).
  build:
    name: Build (Python ${{ matrix.python-version }})
    needs: lint
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version:
          - "3.9"
          - "3.10"
          - "3.11"
          - "3.12"
          - "3.13"
          - "3.14"
    steps:
      - uses: actions/checkout@v6
      - uses: astral-sh/setup-uv@v7
        with:
          python-version: ${{ matrix.python-version }}
      - run: uv sync --all-groups --all-extras
      - run: uv run pytest
      - run: uv build
      - uses: actions/upload-artifact@v4
        with:
          name: dist-${{ matrix.python-version }}
          path: dist/
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Release pipeline: when a GitHub release is published, build the
# distributions, check their metadata, and upload them to PyPI.
name: Publish

on:
  release:
    types:
      - published

jobs:
  publish:
    name: Publish to PyPI
    environment: pypi
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write  # used by the trusted-publishing step below
    steps:
      - uses: actions/checkout@v6
      - uses: astral-sh/setup-uv@v7
        with:
          python-version: "3.9"

      - name: Build distributions
        run: uv build

      - name: Check metadata renders on PyPI
        run: uvx twine check dist/*

      - name: Publish to PyPI (trusted publishing)
        uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
# Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
# uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
# poetry.lock
|
|
109
|
+
# poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
# pdm.lock
|
|
116
|
+
# pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
# pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# Redis
|
|
135
|
+
*.rdb
|
|
136
|
+
*.aof
|
|
137
|
+
*.pid
|
|
138
|
+
|
|
139
|
+
# RabbitMQ
|
|
140
|
+
mnesia/
|
|
141
|
+
rabbitmq/
|
|
142
|
+
rabbitmq-data/
|
|
143
|
+
|
|
144
|
+
# ActiveMQ
|
|
145
|
+
activemq-data/
|
|
146
|
+
|
|
147
|
+
# SageMath parsed files
|
|
148
|
+
*.sage.py
|
|
149
|
+
|
|
150
|
+
# Environments
|
|
151
|
+
.env
|
|
152
|
+
.envrc
|
|
153
|
+
.venv
|
|
154
|
+
env/
|
|
155
|
+
venv/
|
|
156
|
+
ENV/
|
|
157
|
+
env.bak/
|
|
158
|
+
venv.bak/
|
|
159
|
+
|
|
160
|
+
# Spyder project settings
|
|
161
|
+
.spyderproject
|
|
162
|
+
.spyproject
|
|
163
|
+
|
|
164
|
+
# Rope project settings
|
|
165
|
+
.ropeproject
|
|
166
|
+
|
|
167
|
+
# mkdocs documentation
|
|
168
|
+
/site
|
|
169
|
+
|
|
170
|
+
# mypy
|
|
171
|
+
.mypy_cache/
|
|
172
|
+
.dmypy.json
|
|
173
|
+
dmypy.json
|
|
174
|
+
|
|
175
|
+
# Pyre type checker
|
|
176
|
+
.pyre/
|
|
177
|
+
|
|
178
|
+
# pytype static type analyzer
|
|
179
|
+
.pytype/
|
|
180
|
+
|
|
181
|
+
# Cython debug symbols
|
|
182
|
+
cython_debug/
|
|
183
|
+
|
|
184
|
+
# PyCharm
|
|
185
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
186
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
188
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
189
|
+
# .idea/
|
|
190
|
+
|
|
191
|
+
# Abstra
|
|
192
|
+
# Abstra is an AI-powered process automation framework.
|
|
193
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
194
|
+
# Learn more at https://abstra.io/docs
|
|
195
|
+
.abstra/
|
|
196
|
+
|
|
197
|
+
# Visual Studio Code
|
|
198
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
199
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
200
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
201
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
202
|
+
# .vscode/
|
|
203
|
+
# Temporary file for partial code execution
|
|
204
|
+
tempCodeRunnerFile.py
|
|
205
|
+
|
|
206
|
+
# Ruff stuff:
|
|
207
|
+
.ruff_cache/
|
|
208
|
+
|
|
209
|
+
# PyPI configuration file
|
|
210
|
+
.pypirc
|
|
211
|
+
|
|
212
|
+
# Marimo
|
|
213
|
+
marimo/_static/
|
|
214
|
+
marimo/_lsp/
|
|
215
|
+
__marimo__/
|
|
216
|
+
|
|
217
|
+
# Streamlit
|
|
218
|
+
.streamlit/secrets.toml
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format is based on
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres
|
|
5
|
+
to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
6
|
+
|
|
7
|
+
## [0.1.0] - 2026-06-04
|
|
8
|
+
|
|
9
|
+
Initial public release of `lncrawl-scraper`, extracted from
|
|
10
|
+
[lightnovel-crawler](https://github.com/lncrawl/lightnovel-crawler).
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- `Scraper` — a `requests.Session` subclass with transparent Cloudflare
|
|
15
|
+
challenge handling (v1, v2, v3, Turnstile) and helpers: `get_soup`,
|
|
16
|
+
`post_soup`, `get_json`, `post_json`, `get_file`, `get_image`, `submit_form`,
|
|
17
|
+
`ping`.
|
|
18
|
+
- `PageSoup` — a null-safe BeautifulSoup wrapper; selection methods never return
|
|
19
|
+
`None` and text/HTML accessors always return `str`.
|
|
20
|
+
- Typed configuration: `ScraperConfig`, `StealthConfig`, `ProxyConfig`,
|
|
21
|
+
`BrowserConfig`, plus the `default_config()` factory.
|
|
22
|
+
- **Browser fingerprint impersonation** (`impersonate` extra): route requests
|
|
23
|
+
through `curl_cffi` for a real Chrome/Firefox TLS (JA3/JA4) and HTTP/2
|
|
24
|
+
fingerprint, with the spoofed User-Agent family aligned to the target.
|
|
25
|
+
- **Browser-assisted clearance**: `apply_browser_clearance()` to reuse a
|
|
26
|
+
`cf_clearance` cookie + User-Agent solved by an external real browser.
|
|
27
|
+
- **Accurate Client Hints**: `sec-ch-ua` / platform / mobile derived from the
|
|
28
|
+
chosen User-Agent (Chromium only) instead of hardcoded values.
|
|
29
|
+
- Stealth mode, proxy rotation with Tor identity refresh, TLS cipher rotation,
|
|
30
|
+
rate limiting, and cooperative `abort()`.
|
|
31
|
+
- `py.typed` marker (PEP 561) and full type coverage.
|
|
32
|
+
|
|
33
|
+
[0.1.0]: https://github.com/lncrawl/scraper/releases/tag/v0.1.0
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## What this is
|
|
6
|
+
|
|
7
|
+
`lncrawl-scraper` (import name `scraper`) is a standalone HTTP scraping library
|
|
8
|
+
extracted from [lightnovel-crawler](https://github.com/lncrawl/lightnovel-crawler).
|
|
9
|
+
It is a `requests.Session` subclass that transparently handles Cloudflare
|
|
10
|
+
challenges, plus a null-safe BeautifulSoup wrapper and a set of HTTP helpers.
|
|
11
|
+
|
|
12
|
+
Published to PyPI as `lncrawl-scraper`; imported as `scraper`. Targets Python
|
|
13
|
+
**3.9+**.
|
|
14
|
+
|
|
15
|
+
## Commands
|
|
16
|
+
|
|
17
|
+
Tooling is driven by [uv](https://docs.astral.sh/uv/) + [poethepoet](https://poethepoet.natn.io/).
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
uv sync # install deps + editable package into .venv
|
|
21
|
+
uv run poe lint # ruff check + ruff format --check + pyright
|
|
22
|
+
uv run poe lint-fix # ruff check --fix + ruff format
|
|
23
|
+
uv run poe test # pytest
|
|
24
|
+
uv run poe cov # pytest with coverage (term-missing + html + xml)
|
|
25
|
+
uv run poe build # lint + test + uv build (wheel/sdist)
|
|
26
|
+
uv run poe publish # build + uv publish
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
Always run `uv run poe lint` before considering a change done. CI
|
|
30
|
+
(`.github/workflows/ci.yml`) runs three jobs: `lint` (ruff + pyright), a
|
|
31
|
+
`build` matrix testing on Python 3.9–3.14, and `coverage` (which posts a PR
|
|
32
|
+
comment + badge via `python-coverage-comment-action` and a job-summary table).
|
|
33
|
+
|
|
34
|
+
## Architecture
|
|
35
|
+
|
|
36
|
+
The package is a thin, ergonomic layer over an in-house Cloudflare-bypass engine.
|
|
37
|
+
|
|
38
|
+
```text
|
|
39
|
+
src/scraper/
|
|
40
|
+
├── __init__.py # public API + __version__ (via importlib.metadata)
|
|
41
|
+
├── session.py # Scraper — the main class (subclasses ScraperEngine)
|
|
42
|
+
├── soup.py # PageSoup — null-safe BeautifulSoup wrapper
|
|
43
|
+
├── config.py # public config surface + default_config() factory
|
|
44
|
+
├── py.typed # PEP 561 marker
|
|
45
|
+
├── _utils/ # internal helpers (event_lock, url_tools, file_tools)
|
|
46
|
+
└── _engine/ # internal Cloudflare-bypass engine (private)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Layers
|
|
50
|
+
|
|
51
|
+
- **`Scraper`** ([session.py](src/scraper/session.py)) — the public entry point.
|
|
52
|
+
Adds Origin/Referer injection, default timeouts, and helpers: `get_soup`,
|
|
53
|
+
`post_soup`, `get_json`, `post_json`, `get_image` (returns a PIL Image),
|
|
54
|
+
`get_file` (streamed, abortable), `submit_form`, `ping`. Subclasses
|
|
55
|
+
`ScraperEngine`, so all of `requests.Session` is available too.
|
|
56
|
+
- **`PageSoup`** ([soup.py](src/scraper/soup.py)) — wraps a BeautifulSoup `Tag`.
|
|
57
|
+
Selection methods (`select`, `select_one`, `find`, `xpath`, `closest`, …)
|
|
58
|
+
always return `PageSoup`/`list`, never `None`; text/HTML accessors always
|
|
59
|
+
return `str`. An empty `PageSoup` is falsy. Reach the raw tag via `.tag`.
|
|
60
|
+
- **`_engine/`** — the private engine: `ScraperEngine` (the `requests.Session`
|
|
61
|
+
subclass with the full request pipeline) in `_engine/__init__.py`, plus CF
|
|
62
|
+
challenge handlers v1/v2/v3 + Turnstile, TLS cipher rotation, stealth mode,
|
|
63
|
+
proxy/Tor manager, and UA selection. It is implementation detail — nothing
|
|
64
|
+
here is part of the public API except what `config.py`/`__init__.py`
|
|
65
|
+
re-export.
|
|
66
|
+
|
|
67
|
+
### Cloudflare-bypass surface
|
|
68
|
+
|
|
69
|
+
The realistic ceiling of a `requests`-based engine is its TLS (JA3/JA4) and
|
|
70
|
+
HTTP/1.1 fingerprint — `set_ciphers()` in [tls.py](src/scraper/_engine/tls.py)
|
|
71
|
+
only reorders ciphers, so the ClientHello still reads as Python. Three features
|
|
72
|
+
push past that:
|
|
73
|
+
|
|
74
|
+
- **Impersonation transport** ([_engine/impersonate.py](src/scraper/_engine/impersonate.py)):
|
|
75
|
+
when `ScraperConfig.impersonate` is set (e.g. `"chrome"`), `ScraperEngine.perform_request`
|
|
76
|
+
routes through `curl_cffi` (curl-impersonate) for a real browser TLS + HTTP/2
|
|
77
|
+
fingerprint, and adapts the result back into a `requests.Response`. The
|
|
78
|
+
curl_cffi session is the cookie authority and is mirrored into `self.cookies`
|
|
79
|
+
after each request (`_mirror_transport_cookies`). Cipher rotation is skipped
|
|
80
|
+
while impersonating. Requires the `impersonate` extra (`curl_cffi`).
|
|
81
|
+
- **Client Hints** are derived from the actual UA in
|
|
82
|
+
`UserAgent._client_hints` (Chromium only; Firefox sends none) so `sec-ch-ua`
|
|
83
|
+
version/platform always match the User-Agent. `stealth.py` no longer hardcodes
|
|
84
|
+
them — it only defaults the non-version-specific `Sec-Fetch-*` nav hints.
|
|
85
|
+
- **`apply_browser_clearance(domain, cf_clearance=, user_agent=, cookies=)`**
|
|
86
|
+
injects a clearance solved by an external real browser; the UA must match the
|
|
87
|
+
one that obtained it. `put_cookie` keeps the requests jar and the impersonation
|
|
88
|
+
jar in sync.
|
|
89
|
+
|
|
90
|
+
### Configuration
|
|
91
|
+
|
|
92
|
+
All config flows through `ScraperConfig` (a dataclass with nested
|
|
93
|
+
`StealthConfig`, `ProxyConfig`, `BrowserConfig`). The public surface is
|
|
94
|
+
[config.py](src/scraper/config.py), which re-exports the dataclasses from
|
|
95
|
+
`_engine.config` and adds the `default_config()` factory:
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from scraper import Scraper, default_config
|
|
99
|
+
from scraper.config import BrowserConfig, StealthConfig
|
|
100
|
+
|
|
101
|
+
cfg = default_config() # fresh, fully-populated defaults
|
|
102
|
+
cfg.browser = BrowserConfig(browser="chrome", platform="darwin")
|
|
103
|
+
s = Scraper(origin="https://site.com", config=cfg)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
- **`default_config()` returns a fresh instance every call.** Never reintroduce
|
|
107
|
+
a shared module-level config singleton — `ScraperEngine` hands the nested
|
|
108
|
+
`proxy`/`stealth` objects to managers that may mutate them, so sharing would
|
|
109
|
+
leak state across `Scraper` instances.
|
|
110
|
+
- `ScraperConfig.browser` accepts `BrowserConfig | dict | None`; the dict form
|
|
111
|
+
is accepted as a convenience and normalized via `asdict` in `UserAgent.load`.
|
|
112
|
+
|
|
113
|
+
## Conventions
|
|
114
|
+
|
|
115
|
+
- **Python 3.9 compatibility is mandatory.** Bare `X | Y` unions must not be
|
|
116
|
+
*evaluated at runtime* — only use them in files that have
|
|
117
|
+
`from __future__ import annotations`, and there only inside annotations. Prefer
|
|
118
|
+
`typing.Optional/Union` in new non-future-annotated modules. `importlib`,
|
|
119
|
+
dataclasses, etc. must all work on 3.9.
|
|
120
|
+
- **Keep the public surface in public modules.** `_engine/` and `_utils/` are
|
|
121
|
+
private; user-facing names live in `__init__.py`/`config.py` and are listed
|
|
122
|
+
in `__all__`. Update `__all__` and the README when changing that surface.
|
|
123
|
+
- **`ruff`**: line-length 100, double quotes, `force-sort-within-sections`,
|
|
124
|
+
combine-as-imports. **`pyright`** runs in `standard` mode over `src` + `tests`
|
|
125
|
+
— keep it clean (use real `isinstance` narrowing rather than `is_dataclass`,
|
|
126
|
+
which pyright doesn't narrow on).
|
|
127
|
+
- **Dependencies**: core runtime deps live in `[project.dependencies]`. Optional
|
|
128
|
+
extras: `image` (`Pillow`, for `get_image`) and `impersonate` (`curl_cffi`,
|
|
129
|
+
for `ScraperConfig.impersonate`) — both imported lazily so the package works
|
|
130
|
+
without them. Add deps via `uv add` / `uv add --dev`.
|
|
131
|
+
- **Public API** is whatever `src/scraper/__init__.py` exports in `__all__`.
|
|
132
|
+
Update it (and the README) when adding user-facing surface.
|
|
133
|
+
|
|
134
|
+
## Commit messages
|
|
135
|
+
|
|
136
|
+
Match the existing history (`git log`):
|
|
137
|
+
|
|
138
|
+
- **No type prefix.** Do NOT use Conventional Commits (`feat:`, `fix:`,
|
|
139
|
+
`docs:`, …) — subjects are plain capitalized text.
|
|
140
|
+
- **Imperative mood**, capitalized first word, no trailing period, subject
|
|
141
|
+
≤ ~60 chars (e.g. `Add coverage reporting to CI`, `Restructure into src layout`).
|
|
142
|
+
- **Body only for non-trivial changes**: a blank line, then a short rationale
|
|
143
|
+
paragraph and/or `-` bullets covering *what* changed and *why* (wrap at ~72
|
|
144
|
+
chars). Small changes are subject-only.
|
|
145
|
+
- **Do NOT append a `Co-Authored-By` trailer** — this overrides the default
|
|
146
|
+
Claude Code behaviour; the maintainer's commits never carry it.
|
|
147
|
+
|
|
148
|
+
## Testing
|
|
149
|
+
|
|
150
|
+
Tests use `pytest` and live under [tests/](tests/). The src/ layout means tests import the
|
|
151
|
+
*installed* package, so run them via `uv run poe test` / `uv run poe cov` (which
|
|
152
|
+
use the editable install).
|
|
153
|
+
|
|
154
|
+
- **Tests must be offline and fast.** [conftest.py](tests/conftest.py) provides
|
|
155
|
+
an autouse fixture that stubs `scraper._engine.user_agent._load_ua_data` to
|
|
156
|
+
`None` (forces the deterministic embedded UA generator, no network), plus
|
|
157
|
+
`fast_config` / `make_fast_config()` which disable stealth delays, throttling,
|
|
158
|
+
and session refresh. Use these in any test that constructs a `Scraper`.
|
|
159
|
+
- **Mock HTTP with `responses`** (`responses.RequestsMock()`), never real
|
|
160
|
+
requests. It patches `HTTPAdapter.send`, so it intercepts the mounted TLS
|
|
161
|
+
adapter too. Note: a set abort signal trips the pre-send check, so the request
|
|
162
|
+
never fires — use `assert_all_requests_are_fired=False` in that case.
|
|
163
|
+
- **UA-family gotcha**: the offline generator can pick iOS, where Chrome's UA is
|
|
164
|
+
`CriOS/…` and Firefox's is `FxiOS/…` (neither contains `Chrome/` / `Firefox/`).
|
|
165
|
+
When asserting on UA family, pin a desktop platform
|
|
166
|
+
(`BrowserConfig(platform="windows", mobile=False)`).
|
|
167
|
+
- `curl_cffi`-dependent tests use `pytest.importorskip("curl_cffi")`.
|
|
168
|
+
- **Coverage** config is in `pyproject.toml` (`[tool.coverage]`, `source =
|
|
169
|
+
["scraper"]`, `relative_files = true`). `uv run poe cov` writes `htmlcov/`,
|
|
170
|
+
`coverage.xml`, and a terminal report (all coverage artifacts are gitignored).
|
|
171
|
+
The deep CF challenge solvers (`cloudflare_v1/v2/v3`, `interpreter`) are
|
|
172
|
+
integration-only and stay low-coverage without live Cloudflare traffic.
|