scwrap 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scwrap-0.1.0/.gitignore +207 -0
- scwrap-0.1.0/.python-version +1 -0
- scwrap-0.1.0/LICENSE +21 -0
- scwrap-0.1.0/PKG-INFO +202 -0
- scwrap-0.1.0/README.md +186 -0
- scwrap-0.1.0/main.py +6 -0
- scwrap-0.1.0/pyproject.toml +15 -0
- scwrap-0.1.0/scwrap/__init__.py +15 -0
- scwrap-0.1.0/scwrap/browser.py +33 -0
- scwrap-0.1.0/scwrap/scwrap.py +232 -0
- scwrap-0.1.0/scwrap/utils.py +70 -0
scwrap-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[codz]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# PyInstaller
|
|
30
|
+
# Usually these files are written by a python script from a template
|
|
31
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
32
|
+
*.manifest
|
|
33
|
+
*.spec
|
|
34
|
+
|
|
35
|
+
# Installer logs
|
|
36
|
+
pip-log.txt
|
|
37
|
+
pip-delete-this-directory.txt
|
|
38
|
+
|
|
39
|
+
# Unit test / coverage reports
|
|
40
|
+
htmlcov/
|
|
41
|
+
.tox/
|
|
42
|
+
.nox/
|
|
43
|
+
.coverage
|
|
44
|
+
.coverage.*
|
|
45
|
+
.cache
|
|
46
|
+
nosetests.xml
|
|
47
|
+
coverage.xml
|
|
48
|
+
*.cover
|
|
49
|
+
*.py.cover
|
|
50
|
+
.hypothesis/
|
|
51
|
+
.pytest_cache/
|
|
52
|
+
cover/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
.pybuilder/
|
|
76
|
+
target/
|
|
77
|
+
|
|
78
|
+
# Jupyter Notebook
|
|
79
|
+
.ipynb_checkpoints
|
|
80
|
+
|
|
81
|
+
# IPython
|
|
82
|
+
profile_default/
|
|
83
|
+
ipython_config.py
|
|
84
|
+
|
|
85
|
+
# pyenv
|
|
86
|
+
# For a library or package, you might want to ignore these files since the code is
|
|
87
|
+
# intended to run in multiple environments; otherwise, check them in:
|
|
88
|
+
# .python-version
|
|
89
|
+
|
|
90
|
+
# pipenv
|
|
91
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
92
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
93
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
94
|
+
# install all needed dependencies.
|
|
95
|
+
#Pipfile.lock
|
|
96
|
+
|
|
97
|
+
# UV
|
|
98
|
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
|
99
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
100
|
+
# commonly ignored for libraries.
|
|
101
|
+
#uv.lock
|
|
102
|
+
|
|
103
|
+
# poetry
|
|
104
|
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
|
105
|
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
|
106
|
+
# commonly ignored for libraries.
|
|
107
|
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
|
108
|
+
#poetry.lock
|
|
109
|
+
#poetry.toml
|
|
110
|
+
|
|
111
|
+
# pdm
|
|
112
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
113
|
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
|
114
|
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
|
115
|
+
#pdm.lock
|
|
116
|
+
#pdm.toml
|
|
117
|
+
.pdm-python
|
|
118
|
+
.pdm-build/
|
|
119
|
+
|
|
120
|
+
# pixi
|
|
121
|
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
|
122
|
+
#pixi.lock
|
|
123
|
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
|
124
|
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
|
125
|
+
.pixi
|
|
126
|
+
|
|
127
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
|
128
|
+
__pypackages__/
|
|
129
|
+
|
|
130
|
+
# Celery stuff
|
|
131
|
+
celerybeat-schedule
|
|
132
|
+
celerybeat.pid
|
|
133
|
+
|
|
134
|
+
# SageMath parsed files
|
|
135
|
+
*.sage.py
|
|
136
|
+
|
|
137
|
+
# Environments
|
|
138
|
+
.env
|
|
139
|
+
.envrc
|
|
140
|
+
.venv
|
|
141
|
+
env/
|
|
142
|
+
venv/
|
|
143
|
+
ENV/
|
|
144
|
+
env.bak/
|
|
145
|
+
venv.bak/
|
|
146
|
+
|
|
147
|
+
# Spyder project settings
|
|
148
|
+
.spyderproject
|
|
149
|
+
.spyproject
|
|
150
|
+
|
|
151
|
+
# Rope project settings
|
|
152
|
+
.ropeproject
|
|
153
|
+
|
|
154
|
+
# mkdocs documentation
|
|
155
|
+
/site
|
|
156
|
+
|
|
157
|
+
# mypy
|
|
158
|
+
.mypy_cache/
|
|
159
|
+
.dmypy.json
|
|
160
|
+
dmypy.json
|
|
161
|
+
|
|
162
|
+
# Pyre type checker
|
|
163
|
+
.pyre/
|
|
164
|
+
|
|
165
|
+
# pytype static type analyzer
|
|
166
|
+
.pytype/
|
|
167
|
+
|
|
168
|
+
# Cython debug symbols
|
|
169
|
+
cython_debug/
|
|
170
|
+
|
|
171
|
+
# PyCharm
|
|
172
|
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
|
173
|
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
|
174
|
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
175
|
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
176
|
+
#.idea/
|
|
177
|
+
|
|
178
|
+
# Abstra
|
|
179
|
+
# Abstra is an AI-powered process automation framework.
|
|
180
|
+
# Ignore directories containing user credentials, local state, and settings.
|
|
181
|
+
# Learn more at https://abstra.io/docs
|
|
182
|
+
.abstra/
|
|
183
|
+
|
|
184
|
+
# Visual Studio Code
|
|
185
|
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
|
186
|
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
|
187
|
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
|
188
|
+
# you could uncomment the following to ignore the entire vscode folder
|
|
189
|
+
# .vscode/
|
|
190
|
+
|
|
191
|
+
# Ruff stuff:
|
|
192
|
+
.ruff_cache/
|
|
193
|
+
|
|
194
|
+
# PyPI configuration file
|
|
195
|
+
.pypirc
|
|
196
|
+
|
|
197
|
+
# Cursor
|
|
198
|
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
|
199
|
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
|
200
|
+
# refer to https://docs.cursor.com/context/ignore-files
|
|
201
|
+
.cursorignore
|
|
202
|
+
.cursorindexingignore
|
|
203
|
+
|
|
204
|
+
# Marimo
|
|
205
|
+
marimo/_static/
|
|
206
|
+
marimo/_lsp/
|
|
207
|
+
__marimo__/
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
scwrap-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Nishizawa Takamasa
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
scwrap-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: scwrap
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight scraping helpers: wrapped Page/parser APIs (Patchright, Playwright, selectolax), browser presets, CSV/Parquet and logging utilities.
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: patchright>=1.40
|
|
9
|
+
Requires-Dist: playwright>=1.40
|
|
10
|
+
Requires-Dist: selectolax>=0.3
|
|
11
|
+
Requires-Dist: pandas>=2.0
|
|
12
|
+
Requires-Dist: pyarrow>=14.0
|
|
13
|
+
Requires-Dist: camoufox>=0.4
|
|
14
|
+
Requires-Dist: loguru>=0.7
|
|
15
|
+
|
|
16
|
+
# scwrap
|
|
17
|
+
|
|
18
|
+
## Overview - 概要
|
|
19
|
+
|
|
20
|
+
scwrap is a scraping utility library built on Patchright, Playwright, and selectolax.
|
|
21
|
+
scwrap は Patchright / Playwright(`Page` API)と selectolax をベースにしたスクレイピングユーティリティライブラリです。**細かい挙動はプリミティブの組み合わせで組み立てる**前提の薄いラッパーです(「よしなに」な自動修復は置かない方針)。
|
|
22
|
+
|
|
23
|
+
DOM・パーサのラッパーは **`scwrap`**(`wrap_page` / `wrap_parser` などのファクトリー)から、ブラウザ起動は **`scwrap.browser`**、CSV やログなどの周辺は **`scwrap.utils`** から import します。
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
## Requirements - 必要条件
|
|
27
|
+
|
|
28
|
+
- Python 3.12 or higher(`requires-python` は `pyproject.toml` 参照)
|
|
29
|
+
- 主要依存: patchright, playwright, selectolax, pandas, pyarrow, camoufox, loguru(一覧・下限は `pyproject.toml` の `[project.dependencies]`)
|
|
30
|
+
- `write_parquet` は **pandas + pyarrow**(`pyarrow` は依存に含まれる)。別エンジンに切り替える場合のみ `fastparquet` などが必要になることがあります。
|
|
31
|
+
- ブラウザ: **Patchright / Playwright 用の取得**と、下記のとおり **`patchright_page` は Google Chrome 前提**です。
|
|
32
|
+
|
|
33
|
+
## Installation - インストール
|
|
34
|
+
|
|
35
|
+
### pip
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
pip install scwrap
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### uv (推奨)
|
|
42
|
+
|
|
43
|
+
```
|
|
44
|
+
uv add scwrap
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Playwright / Patchright が使うブラウザバイナリは別途取得してください。
|
|
48
|
+
加えて **`patchright_page()` は `channel='chrome'` で起動するため、マシンに [Google Chrome](https://www.google.com/chrome/) がインストールされている必要があります**(Chromium のみの環境では起動に失敗することがあります)。
|
|
49
|
+
|
|
50
|
+
### Patchright(Chromium 等)
|
|
51
|
+
|
|
52
|
+
#### pip
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
python -m patchright install chromium
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
#### uv (推奨)
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
uv run patchright install chromium
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Camoufox(Firefox)
|
|
65
|
+
|
|
66
|
+
#### pip
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
camoufox fetch
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
#### uv (推奨)
|
|
73
|
+
|
|
74
|
+
```
|
|
75
|
+
uv run camoufox fetch
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## メソッド
|
|
79
|
+
|
|
80
|
+
### `scwrap`(ラッパー)
|
|
81
|
+
|
|
82
|
+
ブラウザ側は `wrap_page(page)` が起点です。`goto`・`wait`・`css` などはこの戻り値に対して呼びます。要素が複数なら `css(...)` はグループを返し、先頭だけなら `.first`、正規表現で絞り込みは `.grep(pattern)`、相対 URL の解決には `.urls`(単一は `.url`)を使います。テキストや生の要素は `.text` / `.raw` プロパティです。
|
|
83
|
+
|
|
84
|
+
静的 HTML(selectolax)側は `wrap_parser(parser)` から `css` / `grep` / `text` など(ノードは `wrap_node` 系)。クラス実装は非公開で、**コンストラクトは常にこれらのファクトリー経由**にしてください。
|
|
85
|
+
|
|
86
|
+
### `scwrap.browser`
|
|
87
|
+
|
|
88
|
+
- **`patchright_page()`** … コンテキストマネージャ。Patchright で **Google Chrome**(`channel='chrome'`)を起動し、**毎回クリーンな `BrowserContext`** の `Page` を `with` に渡す(永続プロファイルは使わない)。`headless=False`・`no_viewport=True` などは固定。
|
|
89
|
+
|
|
90
|
+
- **`camoufox_page(locale=...)`** … Camoufox(Firefox)で `Page` を開く。
|
|
91
|
+
_例:_ `with camoufox_page(locale='en-US,en') as page:`
|
|
92
|
+
デフォルトの `locale` は `'ja-JP,ja'`。`headless=False`・`humanize=True` は固定。
|
|
93
|
+
|
|
94
|
+
ウィンドウ最大化が必要なら、コードではなく **ブラウザ上で手動**してください(起動引数に依存させない)。
|
|
95
|
+
|
|
96
|
+
### `scwrap.utils`
|
|
97
|
+
|
|
98
|
+
`log_to_file`・`from_here`・`parse_html`・`append_csv`・`write_parquet`・`save_html`・`hash_name`・`random_sleep` など(各関数は `scwrap/utils.py` を参照)。`log_to_file` はログファイルの **親ディレクトリが無いと失敗**するので、必要なら先に `Path.mkdir` するか、`save_html` のように親を作る処理を挟んでください。
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
## Basic Usage - 基本的な使い方
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from scwrap import wrap_page
|
|
105
|
+
from scwrap.browser import patchright_page
|
|
106
|
+
from scwrap.utils import log_to_file, append_csv, from_here, random_sleep
|
|
107
|
+
|
|
108
|
+
fh = from_here(__file__)
|
|
109
|
+
log_to_file(fh('log/scraping.log'))
|
|
110
|
+
|
|
111
|
+
with patchright_page() as page:
|
|
112
|
+
p = wrap_page(page)
|
|
113
|
+
p.goto('https://www.foobarbaz1.jp')
|
|
114
|
+
|
|
115
|
+
pref_urls = p.css('li.item > ul > li > a').urls
|
|
116
|
+
|
|
117
|
+
classroom_urls = []
|
|
118
|
+
for i, url in enumerate(pref_urls, 1):
|
|
119
|
+
print(f'pref_urls {i}/{len(pref_urls)}')
|
|
120
|
+
if not url or not p.goto(url):
|
|
121
|
+
continue
|
|
122
|
+
random_sleep(1, 2)
|
|
123
|
+
classroom_urls.extend(p.css('.school-area h4 a').urls)
|
|
124
|
+
|
|
125
|
+
for i, url in enumerate(classroom_urls, 1):
|
|
126
|
+
print(f'classroom_urls {i}/{len(classroom_urls)}')
|
|
127
|
+
if not p.goto(url):
|
|
128
|
+
continue
|
|
129
|
+
random_sleep(1, 2)
|
|
130
|
+
append_csv(fh('csv/out.csv'), {
|
|
131
|
+
'URL': page.url,
|
|
132
|
+
'教室名': p.css('h1 .text01').first.text,
|
|
133
|
+
'住所': p.css('.item .mapText').first.text,
|
|
134
|
+
'電話番号': p.css('.item .phoneNumber').first.text,
|
|
135
|
+
'HP': p.css('th').grep('ホームページ').first.next('td').css('a').first.url,
|
|
136
|
+
})
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
## Save HTML while scraping - スクレイピングしながらHTMLを保存する
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from scwrap import wrap_page
|
|
143
|
+
from scwrap.browser import camoufox_page
|
|
144
|
+
from scwrap.utils import log_to_file, append_csv, from_here, hash_name, random_sleep, save_html
|
|
145
|
+
|
|
146
|
+
fh = from_here(__file__)
|
|
147
|
+
log_to_file(fh('log/scraping.log'))
|
|
148
|
+
|
|
149
|
+
with camoufox_page() as page:
|
|
150
|
+
ctx = {}
|
|
151
|
+
p = wrap_page(page)
|
|
152
|
+
p.goto('https://www.foobarbaz1.jp')
|
|
153
|
+
|
|
154
|
+
ctx['アイテムURLs'] = p.css('ul.items > li > a').urls
|
|
155
|
+
|
|
156
|
+
for i, url in enumerate(ctx['アイテムURLs'], 1):
|
|
157
|
+
print(f"アイテムURLs {i}/{len(ctx['アイテムURLs'])}")
|
|
158
|
+
if not p.goto(url):
|
|
159
|
+
continue
|
|
160
|
+
random_sleep(1, 2)
|
|
161
|
+
if p.wait('#logo', timeout=10000).raw is None:
|
|
162
|
+
continue
|
|
163
|
+
file_name = f'{hash_name(url)}.html'
|
|
164
|
+
if not save_html(fh('html') / file_name, page.content()):
|
|
165
|
+
continue
|
|
166
|
+
append_csv(fh('outurlhtml.csv'), {
|
|
167
|
+
'URL': url,
|
|
168
|
+
'HTML': file_name,
|
|
169
|
+
})
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Scrape from local HTML files - 保存済みHTMLからスクレイピングしてParquetに出力する
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
import pandas as pd
|
|
176
|
+
|
|
177
|
+
from scwrap import wrap_parser
|
|
178
|
+
from scwrap.utils import log_to_file, from_here, parse_html, write_parquet
|
|
179
|
+
|
|
180
|
+
fh = from_here(__file__)
|
|
181
|
+
log_to_file(fh('log/scraping.log'))
|
|
182
|
+
|
|
183
|
+
df = pd.read_csv(fh('outurlhtml.csv'))
|
|
184
|
+
results = []
|
|
185
|
+
for i, (url, path) in enumerate(zip(df['URL'], df['HTML']), 1):
|
|
186
|
+
print(f'outhtml {i}/{len(df)}')
|
|
187
|
+
if not (parser := parse_html(fh('html') / path)):
|
|
188
|
+
continue
|
|
189
|
+
p = wrap_parser(parser)
|
|
190
|
+
results.append({
|
|
191
|
+
'URL': url,
|
|
192
|
+
'教室名': p.css('h1 .text02').first.text,
|
|
193
|
+
'住所': p.css('.item .mapText').first.text,
|
|
194
|
+
'所在地': p.css('dt').grep(r'所在地').first.next('dd').text,
|
|
195
|
+
})
|
|
196
|
+
write_parquet(fh('outhtml.parquet'), results)
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## License - ライセンス
|
|
200
|
+
|
|
201
|
+
[MIT](./LICENSE)
|
|
202
|
+
|
scwrap-0.1.0/README.md
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# scwrap
|
|
2
|
+
|
|
3
|
+
## Overview - 概要
|
|
4
|
+
|
|
5
|
+
scwrap is a scraping utility library built on Patchright, Playwright, and selectolax.
|
|
6
|
+
scwrap は Patchright / Playwright(`Page` API)と selectolax をベースにしたスクレイピングユーティリティライブラリです。**細かい挙動はプリミティブの組み合わせで組み立てる**前提の薄いラッパーです(「よしなに」な自動修復は置かない方針)。
|
|
7
|
+
|
|
8
|
+
DOM・パーサのラッパーは **`scwrap`**(`wrap_page` / `wrap_parser` などのファクトリー)から、ブラウザ起動は **`scwrap.browser`**、CSV やログなどの周辺は **`scwrap.utils`** から import します。
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
## Requirements - 必要条件
|
|
12
|
+
|
|
13
|
+
- Python 3.12 or higher(`requires-python` は `pyproject.toml` 参照)
|
|
14
|
+
- 主要依存: patchright, playwright, selectolax, pandas, pyarrow, camoufox, loguru(一覧・下限は `pyproject.toml` の `[project.dependencies]`)
|
|
15
|
+
- `write_parquet` は **pandas + pyarrow**(`pyarrow` は依存に含まれる)。別エンジンに切り替える場合のみ `fastparquet` などが必要になることがあります。
|
|
16
|
+
- ブラウザ: **Patchright / Playwright 用の取得**と、下記のとおり **`patchright_page` は Google Chrome 前提**です。
|
|
17
|
+
|
|
18
|
+
## Installation - インストール
|
|
19
|
+
|
|
20
|
+
### pip
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
pip install scwrap
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### uv (推奨)
|
|
27
|
+
|
|
28
|
+
```
|
|
29
|
+
uv add scwrap
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Playwright / Patchright が使うブラウザバイナリは別途取得してください。
|
|
33
|
+
加えて **`patchright_page()` は `channel='chrome'` で起動するため、マシンに [Google Chrome](https://www.google.com/chrome/) がインストールされている必要があります**(Chromium のみの環境では起動に失敗することがあります)。
|
|
34
|
+
|
|
35
|
+
### Patchright(Chromium 等)
|
|
36
|
+
|
|
37
|
+
#### pip
|
|
38
|
+
|
|
39
|
+
```
|
|
40
|
+
python -m patchright install chromium
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
#### uv (推奨)
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
uv run patchright install chromium
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Camoufox(Firefox)
|
|
50
|
+
|
|
51
|
+
#### pip
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
camoufox fetch
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
#### uv (推奨)
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
uv run camoufox fetch
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## メソッド
|
|
64
|
+
|
|
65
|
+
### `scwrap`(ラッパー)
|
|
66
|
+
|
|
67
|
+
ブラウザ側は `wrap_page(page)` が起点です。`goto`・`wait`・`css` などはこの戻り値に対して呼びます。要素が複数なら `css(...)` はグループを返し、先頭だけなら `.first`、正規表現で絞り込みは `.grep(pattern)`、相対 URL の解決には `.urls`(単一は `.url`)を使います。テキストや生の要素は `.text` / `.raw` プロパティです。
|
|
68
|
+
|
|
69
|
+
静的 HTML(selectolax)側は `wrap_parser(parser)` から `css` / `grep` / `text` など(ノードは `wrap_node` 系)。クラス実装は非公開で、**コンストラクトは常にこれらのファクトリー経由**にしてください。
|
|
70
|
+
|
|
71
|
+
### `scwrap.browser`
|
|
72
|
+
|
|
73
|
+
- **`patchright_page()`** … コンテキストマネージャ。Patchright で **Google Chrome**(`channel='chrome'`)を起動し、**毎回クリーンな `BrowserContext`** の `Page` を `with` に渡す(永続プロファイルは使わない)。`headless=False`・`no_viewport=True` などは固定。
|
|
74
|
+
|
|
75
|
+
- **`camoufox_page(locale=...)`** … Camoufox(Firefox)で `Page` を開く。
|
|
76
|
+
_例:_ `with camoufox_page(locale='en-US,en') as page:`
|
|
77
|
+
デフォルトの `locale` は `'ja-JP,ja'`。`headless=False`・`humanize=True` は固定。
|
|
78
|
+
|
|
79
|
+
ウィンドウ最大化が必要なら、コードではなく **ブラウザ上で手動**してください(起動引数に依存させない)。
|
|
80
|
+
|
|
81
|
+
### `scwrap.utils`
|
|
82
|
+
|
|
83
|
+
`log_to_file`・`from_here`・`parse_html`・`append_csv`・`write_parquet`・`save_html`・`hash_name`・`random_sleep` など(各関数は `scwrap/utils.py` を参照)。`log_to_file` はログファイルの **親ディレクトリが無いと失敗**するので、必要なら先に `Path.mkdir` するか、`save_html` のように親を作る処理を挟んでください。
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
## Basic Usage - 基本的な使い方
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from scwrap import wrap_page
|
|
90
|
+
from scwrap.browser import patchright_page
|
|
91
|
+
from scwrap.utils import log_to_file, append_csv, from_here, random_sleep
|
|
92
|
+
|
|
93
|
+
fh = from_here(__file__)
|
|
94
|
+
log_to_file(fh('log/scraping.log'))
|
|
95
|
+
|
|
96
|
+
with patchright_page() as page:
|
|
97
|
+
p = wrap_page(page)
|
|
98
|
+
p.goto('https://www.foobarbaz1.jp')
|
|
99
|
+
|
|
100
|
+
pref_urls = p.css('li.item > ul > li > a').urls
|
|
101
|
+
|
|
102
|
+
classroom_urls = []
|
|
103
|
+
for i, url in enumerate(pref_urls, 1):
|
|
104
|
+
print(f'pref_urls {i}/{len(pref_urls)}')
|
|
105
|
+
if not url or not p.goto(url):
|
|
106
|
+
continue
|
|
107
|
+
random_sleep(1, 2)
|
|
108
|
+
classroom_urls.extend(p.css('.school-area h4 a').urls)
|
|
109
|
+
|
|
110
|
+
for i, url in enumerate(classroom_urls, 1):
|
|
111
|
+
print(f'classroom_urls {i}/{len(classroom_urls)}')
|
|
112
|
+
if not p.goto(url):
|
|
113
|
+
continue
|
|
114
|
+
random_sleep(1, 2)
|
|
115
|
+
append_csv(fh('csv/out.csv'), {
|
|
116
|
+
'URL': page.url,
|
|
117
|
+
'教室名': p.css('h1 .text01').first.text,
|
|
118
|
+
'住所': p.css('.item .mapText').first.text,
|
|
119
|
+
'電話番号': p.css('.item .phoneNumber').first.text,
|
|
120
|
+
'HP': p.css('th').grep('ホームページ').first.next('td').css('a').first.url,
|
|
121
|
+
})
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Save HTML while scraping - スクレイピングしながらHTMLを保存する
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from scwrap import wrap_page
|
|
128
|
+
from scwrap.browser import camoufox_page
|
|
129
|
+
from scwrap.utils import log_to_file, append_csv, from_here, hash_name, random_sleep, save_html
|
|
130
|
+
|
|
131
|
+
fh = from_here(__file__)
|
|
132
|
+
log_to_file(fh('log/scraping.log'))
|
|
133
|
+
|
|
134
|
+
with camoufox_page() as page:
|
|
135
|
+
ctx = {}
|
|
136
|
+
p = wrap_page(page)
|
|
137
|
+
p.goto('https://www.foobarbaz1.jp')
|
|
138
|
+
|
|
139
|
+
ctx['アイテムURLs'] = p.css('ul.items > li > a').urls
|
|
140
|
+
|
|
141
|
+
for i, url in enumerate(ctx['アイテムURLs'], 1):
|
|
142
|
+
print(f"アイテムURLs {i}/{len(ctx['アイテムURLs'])}")
|
|
143
|
+
if not p.goto(url):
|
|
144
|
+
continue
|
|
145
|
+
random_sleep(1, 2)
|
|
146
|
+
if p.wait('#logo', timeout=10000).raw is None:
|
|
147
|
+
continue
|
|
148
|
+
file_name = f'{hash_name(url)}.html'
|
|
149
|
+
if not save_html(fh('html') / file_name, page.content()):
|
|
150
|
+
continue
|
|
151
|
+
append_csv(fh('outurlhtml.csv'), {
|
|
152
|
+
'URL': url,
|
|
153
|
+
'HTML': file_name,
|
|
154
|
+
})
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Scrape from local HTML files - 保存済みHTMLからスクレイピングしてParquetに出力する
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
import pandas as pd
|
|
161
|
+
|
|
162
|
+
from scwrap import wrap_parser
|
|
163
|
+
from scwrap.utils import log_to_file, from_here, parse_html, write_parquet
|
|
164
|
+
|
|
165
|
+
fh = from_here(__file__)
|
|
166
|
+
log_to_file(fh('log/scraping.log'))
|
|
167
|
+
|
|
168
|
+
df = pd.read_csv(fh('outurlhtml.csv'))
|
|
169
|
+
results = []
|
|
170
|
+
for i, (url, path) in enumerate(zip(df['URL'], df['HTML']), 1):
|
|
171
|
+
print(f'outhtml {i}/{len(df)}')
|
|
172
|
+
if not (parser := parse_html(fh('html') / path)):
|
|
173
|
+
continue
|
|
174
|
+
p = wrap_parser(parser)
|
|
175
|
+
results.append({
|
|
176
|
+
'URL': url,
|
|
177
|
+
'教室名': p.css('h1 .text02').first.text,
|
|
178
|
+
'住所': p.css('.item .mapText').first.text,
|
|
179
|
+
'所在地': p.css('dt').grep(r'所在地').first.next('dd').text,
|
|
180
|
+
})
|
|
181
|
+
write_parquet(fh('outhtml.parquet'), results)
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## License - ライセンス
|
|
185
|
+
|
|
186
|
+
[MIT](./LICENSE)
|
scwrap-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "scwrap"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Lightweight scraping helpers: wrapped Page/parser APIs (Patchright, Playwright, selectolax), browser presets, CSV/Parquet and logging utilities."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"patchright>=1.40",
|
|
9
|
+
"playwright>=1.40",
|
|
10
|
+
"selectolax>=0.3",
|
|
11
|
+
"pandas>=2.0",
|
|
12
|
+
"pyarrow>=14.0",
|
|
13
|
+
"camoufox>=0.4",
|
|
14
|
+
"loguru>=0.7",
|
|
15
|
+
]
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
|
|
6
|
+
from camoufox.sync_api import Camoufox
|
|
7
|
+
from patchright.sync_api import Page as PatchrightPage, sync_playwright
|
|
8
|
+
from playwright.sync_api import Page as PlaywrightPage
|
|
9
|
+
|
|
10
|
+
Page = PatchrightPage | PlaywrightPage
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@contextmanager
def patchright_page() -> Iterator[Page]:
    """Launch Google Chrome via Patchright and yield a Page.

    Uses a fresh, non-persistent ``BrowserContext`` each time; headless is
    disabled and the viewport is unconstrained. All resources are torn down
    when the ``with`` block exits.
    """
    with (
        sync_playwright() as pw,
        pw.chromium.launch(channel='chrome', headless=False) as browser,
        browser.new_context(no_viewport=True) as context,
    ):
        yield context.new_page()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@contextmanager
def camoufox_page(locale: str | list[str] = 'ja-JP,ja') -> Iterator[Page]:
    """Open a Camoufox (Firefox) browser and yield a fresh Page.

    ``headless=False`` and ``humanize=True`` are fixed; only *locale* is
    configurable (defaults to Japanese). The browser is closed when the
    ``with`` block exits.
    """
    launcher = Camoufox(headless=False, humanize=True, locale=locale)
    with launcher as browser:
        fresh_page = browser.new_page()
        yield fresh_page
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
import re
|
|
5
|
+
import time
|
|
6
|
+
import unicodedata as ud
|
|
7
|
+
from urllib.parse import urljoin
|
|
8
|
+
|
|
9
|
+
from loguru import logger
|
|
10
|
+
from patchright.sync_api import Page as PatchrightPage, ElementHandle as PatchrightElementHandle
|
|
11
|
+
from playwright.sync_api import Page as PlaywrightPage, ElementHandle as PlaywrightElementHandle
|
|
12
|
+
from selectolax.lexbor import LexborHTMLParser, LexborNode
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
Page = PatchrightPage | PlaywrightPage
|
|
16
|
+
ElementHandle = PatchrightElementHandle | PlaywrightElementHandle
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def wrap_page(page: Page) -> _WrappedPage:
    """Factory: wrap a live Patchright/Playwright Page in the chainable API.

    Always construct wrappers through this function; the wrapper classes
    themselves are private.
    """
    wrapped = _WrappedPage(page)
    return wrapped
|
|
21
|
+
|
|
22
|
+
class _PageScoped:
    """Mixin for wrappers bound to a Page.

    Provides factory helpers so subclasses can produce element wrappers
    that carry the same owning page (needed e.g. for URL resolution).
    """

    # Owning Page; assigned by each subclass's __init__.
    _page: Page

    def wrap_element(self, elem: ElementHandle | None) -> _WrappedElement:
        """Wrap a single (possibly absent) element handle against this page."""
        return _WrappedElement(self._page, elem)

    def wrap_element_group(self, elems: list[_WrappedElement]) -> _WrappedElementGroup:
        """Wrap an already-wrapped element list as a group against this page."""
        return _WrappedElementGroup(self._page, elems)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def wrap_parser(parser: LexborHTMLParser) -> _WrappedParser:
    """Factory: wrap a selectolax parser in the chainable static-HTML API."""
    wrapped = _WrappedParser(parser)
    return wrapped
|
|
34
|
+
|
|
35
|
+
def wrap_node(node: LexborNode | None) -> _WrappedNode:
    """Factory: wrap a single (possibly absent) selectolax node."""
    wrapped = _WrappedNode(node)
    return wrapped
|
|
37
|
+
|
|
38
|
+
def wrap_node_group(nodes: list[_WrappedNode]) -> _WrappedNodeGroup:
    """Factory: wrap a list of wrapped nodes as a group."""
    grouped = _WrappedNodeGroup(nodes)
    return grouped
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class _WrappedPage(_PageScoped):
    """Chainable wrapper over a live browser Page.

    Construct via ``wrap_page``. Selection methods never raise on missing
    elements; they return empty groups or None-wrappers instead.
    """

    def __init__(self, page: Page) -> None:
        self._page = page

    def css(self, selector: str) -> _WrappedElementGroup:
        """Select every element matching *selector* as a wrapped group."""
        handles = self._page.query_selector_all(selector)
        wrapped = [self.wrap_element(handle) for handle in handles]
        return self.wrap_element_group(wrapped)

    def goto(self, url: str | None, try_cnt: int = 3, wait_range: tuple[float, float] = (3, 5)) -> bool:
        """Navigate to *url* with retries.

        Returns True once navigation yields a non-None response; returns
        False for a falsy *url* or after *try_cnt* failed attempts. Between
        attempts, sleeps a random duration drawn from *wait_range* seconds.
        """
        if not url:
            return False
        for attempt in range(try_cnt):
            try:
                response = self._page.goto(url)
            except Exception as e:  # navigation/timeout errors are expected here
                reason = f"{type(e).__name__}: {e}"
            else:
                if response is not None:
                    return True
                reason = "response is None"
            logger.warning(f"[goto] {url} ({attempt+1}/{try_cnt}) {reason}")
            if attempt + 1 < try_cnt:
                time.sleep(random.uniform(*wait_range))
        logger.error(f"[goto] giving up: {url}")
        return False

    def wait(self, selector: str, timeout: int = 15000) -> _WrappedElement:
        """Wait for *selector* to appear; on timeout/error wrap None instead."""
        try:
            found = self._page.wait_for_selector(selector, timeout=timeout)
        except Exception as e:
            logger.warning(f"[wait] {type(e).__name__}: {e} | selector={selector!r} | url={self._page.url}")
            return self.wrap_element(None)
        return self.wrap_element(found)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class _WrappedElement(_PageScoped):
    """Null-safe wrapper around a single ElementHandle.

    The wrapped handle may be None (failed lookup); every accessor then
    degrades to None / an empty group rather than raising.
    """

    def __init__(self, page: Page, elem: ElementHandle | None) -> None:
        self._page = page
        self._elem = elem

    @property
    def raw(self) -> ElementHandle | None:
        """The underlying handle, or None when the lookup found nothing."""
        return self._elem

    def css(self, selector: str) -> _WrappedElementGroup:
        """Select descendants matching *selector*; empty group when unset."""
        if self._elem is None:
            return self.wrap_element_group([])
        handles = self._elem.query_selector_all(selector)
        return self.wrap_element_group([self.wrap_element(h) for h in handles])

    def next(self, selector: str) -> _WrappedElement:
        """Return the first following sibling matching *selector*.

        The sibling walk runs in the page (JS) so the match uses live DOM
        state; wraps None on failure or when no sibling matches.
        """
        if self._elem is None:
            return self.wrap_element(None)
        try:
            handle = self._elem.evaluate_handle(
                """(el, sel) => {
                let cur = el.nextElementSibling;
                while (cur) {
                    if (cur.matches(sel)) return cur;
                    cur = cur.nextElementSibling;
                }
                return null;
            }""",
                selector,
            )
            return self.wrap_element(handle.as_element())
        except Exception as e:
            logger.error(f"[next] {self._elem} {type(e).__name__}: {e}")
            return self.wrap_element(None)

    @property
    def text(self) -> str | None:
        """Stripped text content; None when unset, empty, or whitespace-only."""
        if self._elem is None:
            return None
        content = self._elem.text_content()
        if not content:
            return None
        stripped = content.strip()
        return stripped or None

    def attr(self, attr_name: str) -> str | None:
        """Stripped attribute value, or None when the attribute is absent/empty."""
        if self._elem is None:
            return None
        value = self._elem.get_attribute(attr_name)
        return value.strip() if value else None

    @property
    def url(self) -> str | None:
        """href resolved against the page URL.

        Returns None for a missing href or a non-navigable scheme
        (fragment, javascript:, mailto:, tel:, data:).
        """
        href = self.attr('href')
        if not href:
            return None
        if re.search(r'(?i)^(?:#|javascript:|mailto:|tel:|data:)', href):
            return None
        return urljoin(self._page.url, href)
|
|
131
|
+
|
|
132
|
+
class _WrappedElementGroup(_PageScoped):
    """Ordered collection of wrapped elements with bulk accessors."""

    def __init__(self, page: Page, elems: list[_WrappedElement]) -> None:
        self._page = page
        self._elems = elems

    @property
    def raw(self) -> list[_WrappedElement]:
        """The wrapped elements as a plain list."""
        return self._elems

    @property
    def first(self) -> _WrappedElement:
        """The first element, or a None-wrapper when the group is empty."""
        if self._elems:
            return self._elems[0]
        return self.wrap_element(None)

    def grep(self, pattern: str) -> _WrappedElementGroup:
        """Keep elements whose NFKC-normalized text matches *pattern* (regex search)."""
        matcher = re.compile(pattern)
        kept: list[_WrappedElement] = []
        for elem in self._elems:
            body = elem.text
            # .text is None for missing/blank nodes, so no empty-string case here.
            if body is not None and matcher.search(ud.normalize('NFKC', body)):
                kept.append(elem)
        return self.wrap_element_group(kept)

    @property
    def texts(self) -> list[str | None]:
        """Text of each element (None where missing or blank)."""
        return [elem.text for elem in self._elems]

    def attrs(self, attr_name: str) -> list[str | None]:
        """Value of *attr_name* for each element."""
        return [elem.attr(attr_name) for elem in self._elems]

    @property
    def urls(self) -> list[str | None]:
        """Resolved href of each element (None for non-navigable links)."""
        return [elem.url for elem in self._elems]
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class _WrappedParser:
    """Thin wrapper around a LexborHTMLParser exposing wrapped CSS queries."""

    def __init__(self, parser: LexborHTMLParser) -> None:
        self._parser = parser

    def css(self, selector: str) -> _WrappedNodeGroup:
        """Run a CSS selector over the document and wrap every match."""
        matches = self._parser.css(selector)
        wrapped = [wrap_node(node) for node in matches]
        return wrap_node_group(wrapped)
|
|
172
|
+
|
|
173
|
+
class _WrappedNode:
|
|
174
|
+
def __init__(self, node: LexborNode | None) -> None:
|
|
175
|
+
self._node = node
|
|
176
|
+
|
|
177
|
+
@property
|
|
178
|
+
def raw(self) -> LexborNode | None:
|
|
179
|
+
return self._node
|
|
180
|
+
|
|
181
|
+
def css(self, selector: str) -> _WrappedNodeGroup:
|
|
182
|
+
nodes = self._node.css(selector) if self._node else []
|
|
183
|
+
return wrap_node_group([wrap_node(n) for n in nodes])
|
|
184
|
+
|
|
185
|
+
def next(self, selector: str) -> _WrappedNode:
|
|
186
|
+
if self._node is None:
|
|
187
|
+
return wrap_node(None)
|
|
188
|
+
cur = self._node.next
|
|
189
|
+
while cur is not None:
|
|
190
|
+
if cur.is_element_node and cur.css_matches(selector):
|
|
191
|
+
return wrap_node(cur)
|
|
192
|
+
cur = cur.next
|
|
193
|
+
return wrap_node(None)
|
|
194
|
+
|
|
195
|
+
@property
|
|
196
|
+
def text(self) -> str | None:
|
|
197
|
+
if self._node is None:
|
|
198
|
+
return None
|
|
199
|
+
return t if (t := self._node.text(strip=True)) else None
|
|
200
|
+
|
|
201
|
+
def attr(self, attr_name: str) -> str | None:
|
|
202
|
+
if self._node is None:
|
|
203
|
+
return None
|
|
204
|
+
return a.strip() if (a := self._node.attributes.get(attr_name)) else None
|
|
205
|
+
|
|
206
|
+
class _WrappedNodeGroup:
|
|
207
|
+
def __init__(self, nodes: list[_WrappedNode]) -> None:
|
|
208
|
+
self._nodes = nodes
|
|
209
|
+
|
|
210
|
+
@property
|
|
211
|
+
def raw(self) -> list[_WrappedNode]:
|
|
212
|
+
return self._nodes
|
|
213
|
+
|
|
214
|
+
@property
|
|
215
|
+
def first(self) -> _WrappedNode:
|
|
216
|
+
return self._nodes[0] if self._nodes else wrap_node(None)
|
|
217
|
+
|
|
218
|
+
def grep(self, pattern: str) -> _WrappedNodeGroup:
|
|
219
|
+
prog = re.compile(pattern)
|
|
220
|
+
filtered = [
|
|
221
|
+
n for n in self._nodes
|
|
222
|
+
if (t := n.text) and prog.search(ud.normalize('NFKC', t))
|
|
223
|
+
]
|
|
224
|
+
return wrap_node_group(filtered)
|
|
225
|
+
|
|
226
|
+
@property
|
|
227
|
+
def texts(self) -> list[str | None]:
|
|
228
|
+
return [n.text for n in self._nodes]
|
|
229
|
+
|
|
230
|
+
def attrs(self, attr_name: str) -> list[str | None]:
|
|
231
|
+
return [n.attr(attr_name) for n in self._nodes]
|
|
232
|
+
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import random
|
|
5
|
+
import time
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Callable
|
|
8
|
+
|
|
9
|
+
import pandas as pd
|
|
10
|
+
from loguru import logger
|
|
11
|
+
from selectolax.lexbor import LexborHTMLParser
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def parse_html(path: Path | str) -> LexborHTMLParser | None:
    """Read *path* as UTF-8 and parse it into a LexborHTMLParser.

    Returns None (after logging the error) when the file cannot be read
    or parsed.
    """
    try:
        html = Path(path).read_text(encoding='utf-8')
        return LexborHTMLParser(html)
    except Exception as e:
        logger.error(f"[parse_html] {path} {type(e).__name__}: {e}")
        return None
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def from_here(file: str) -> Callable[[str], Path]:
    """Return a resolver that maps relative paths to *file*'s directory.

    Typical use: ``here = from_here(__file__); here('data/out.csv')``.
    """
    directory = Path(file).resolve().parent

    def _resolve(path: str) -> Path:
        return directory / path

    return _resolve
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def random_sleep(a: float, b: float) -> None:
    """Sleep for a duration drawn uniformly between *a* and *b* seconds."""
    delay = random.uniform(a, b)
    time.sleep(delay)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def append_csv(path: Path | str, row: dict) -> None:
    """Append a single *row* to the CSV file at *path*.

    The header line (and the UTF-8 BOM that Excel expects) is written only
    when the file does not exist yet or is empty, so repeated calls build
    one well-formed CSV.  Errors are logged and swallowed (best-effort
    persistence).
    """
    p = Path(path)
    try:
        # Header + BOM belong only at the very start of the file.
        is_new = not p.exists() or p.stat().st_size == 0
        pd.DataFrame([row]).to_csv(
            p,
            mode='a',
            index=False,
            header=is_new,
            # BUG FIX: the 'utf-8-sig' codec emits a BOM on every open, so
            # appending with it injects U+FEFF into the middle of the file
            # on every call after the first.  Use it only for the initial
            # write; later appends use plain UTF-8.
            encoding='utf-8-sig' if is_new else 'utf-8',
        )
    except Exception as e:
        logger.error(f"[append_csv] {path} {row} {type(e).__name__}: {e}")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def write_parquet(path: Path | str, rows: list[dict]) -> None:
    """Write *rows* to a Parquet file at *path*.

    Errors are logged and swallowed (best-effort persistence).
    """
    try:
        frame = pd.DataFrame(rows)
        frame.to_parquet(Path(path), index=False)
    except Exception as e:
        logger.error(f"[write_parquet] {path} {type(e).__name__}: {e}")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def hash_name(key: str) -> str:
    """Return the 32-character MD5 hex digest of *key*.

    NOTE(review): MD5 here looks like a stable-naming device, not a
    security mechanism — do not use it for anything security-sensitive.
    """
    digest = hashlib.md5(key.encode())
    return digest.hexdigest()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def save_html(filepath: Path, html: str) -> bool:
    """Write *html* to *filepath* as UTF-8, creating parent directories.

    Returns True on success; logs the error and returns False on failure.
    Unencodable characters are replaced rather than raising.
    """
    try:
        filepath.parent.mkdir(parents=True, exist_ok=True)
        filepath.write_text(html, encoding="utf-8", errors="replace")
    except Exception as e:
        logger.error(f"[save_html] {filepath} {type(e).__name__}: {e}")
        return False
    return True
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def log_to_file(path: Path | str) -> None:
    """Add a file sink that captures WARNING-and-above log records."""
    sink = Path(path)
    logger.add(sink, level="WARNING", encoding="utf-8")
|