scwrap 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,207 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
@@ -0,0 +1 @@
1
+ 3.12
scwrap-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nishizawa Takamasa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
scwrap-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,202 @@
1
+ Metadata-Version: 2.4
2
+ Name: scwrap
3
+ Version: 0.1.0
4
+ Summary: Lightweight scraping helpers: wrapped Page/parser APIs (Patchright, Playwright, selectolax), browser presets, CSV/Parquet and logging utilities.
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: patchright>=1.40
9
+ Requires-Dist: playwright>=1.40
10
+ Requires-Dist: selectolax>=0.3
11
+ Requires-Dist: pandas>=2.0
12
+ Requires-Dist: pyarrow>=14.0
13
+ Requires-Dist: camoufox>=0.4
14
+ Requires-Dist: loguru>=0.7
15
+
16
+ # scwrap
17
+
18
+ ## Overview - 概要
19
+
20
+ scwrap is a scraping utility library built on Patchright, Playwright, and selectolax.
21
+ scwrap は Patchright / Playwright(`Page` API)と selectolax をベースにしたスクレイピングユーティリティライブラリです。**細かい挙動はプリミティブの組み合わせで組み立てる**前提の薄いラッパーです(「よしなに」な自動修復は置かない方針)。
22
+
23
+ DOM・パーサのラッパーは **`scwrap`**(`wrap_page` / `wrap_parser` などのファクトリー)から、ブラウザ起動は **`scwrap.browser`**、CSV やログなどの周辺は **`scwrap.utils`** から import します。
24
+
25
+
26
+ ## Requirements - 必要条件
27
+
28
+ - Python 3.12 or higher(`requires-python` は `pyproject.toml` 参照)
29
+ - 主要依存: patchright, playwright, selectolax, pandas, pyarrow, camoufox, loguru(一覧・下限は `pyproject.toml` の `[project.dependencies]`)
30
+ - `write_parquet` は **pandas + pyarrow**(`pyarrow` は依存に含まれる)。別エンジンに切り替える場合のみ `fastparquet` などが必要になることがあります。
31
+ - ブラウザ: **Patchright / Playwright 用の取得**と、下記のとおり **`patchright_page` は Google Chrome 前提**です。
32
+
33
+ ## Installation - インストール
34
+
35
+ ### pip
36
+
37
+ ```
38
+ pip install scwrap
39
+ ```
40
+
41
+ ### uv (推奨)
42
+
43
+ ```
44
+ uv add scwrap
45
+ ```
46
+
47
+ Playwright / Patchright が使うブラウザバイナリは別途取得してください。
48
+ 加えて **`patchright_page()` は `channel='chrome'` で起動するため、マシンに [Google Chrome](https://www.google.com/chrome/) がインストールされている必要があります**(Chromium のみの環境では起動に失敗することがあります)。
49
+
50
+ ### Patchright(Chromium 等)
51
+
52
+ #### pip
53
+
54
+ ```
55
+ python -m patchright install chromium
56
+ ```
57
+
58
+ #### uv (推奨)
59
+
60
+ ```
61
+ uv run patchright install chromium
62
+ ```
63
+
64
+ ### Camoufox(Firefox)
65
+
66
+ #### pip
67
+
68
+ ```
69
+ camoufox fetch
70
+ ```
71
+
72
+ #### uv (推奨)
73
+
74
+ ```
75
+ uv run camoufox fetch
76
+ ```
77
+
78
+ ## メソッド
79
+
80
+ ### `scwrap`(ラッパー)
81
+
82
+ ブラウザ側は `wrap_page(page)` が起点です。`goto`・`wait`・`css` などはこの戻り値に対して呼びます。要素が複数なら `css(...)` はグループを返し、先頭だけなら `.first`、正規表現で絞り込みは `.grep(pattern)`、相対 URL の解決には `.urls`(単一は `.url`)を使います。テキストや生の要素は `.text` / `.raw` プロパティです。
83
+
84
+ 静的 HTML(selectolax)側は `wrap_parser(parser)` から `css` / `grep` / `text` など(ノードは `wrap_node` 系)。クラス実装は非公開で、**コンストラクトは常にこれらのファクトリー経由**にしてください。
85
+
86
+ ### `scwrap.browser`
87
+
88
+ - **`patchright_page()`** … コンテキストマネージャ。Patchright で **Google Chrome**(`channel='chrome'`)を起動し、**毎回クリーンな `BrowserContext`** の `Page` を `with` に渡す(永続プロファイルは使わない)。`headless=False`・`no_viewport=True` などは固定。
89
+
90
+ - **`camoufox_page(locale=...)`** … Camoufox(Firefox)で `Page` を開く。
91
+ _例:_ `with camoufox_page(locale='en-US,en') as page:`
92
+ デフォルトの `locale` は `'ja-JP,ja'`。`headless=False`・`humanize=True` は固定。
93
+
94
+ ウィンドウ最大化が必要なら、コードではなく **ブラウザ上で手動**してください(起動引数に依存させない)。
95
+
96
+ ### `scwrap.utils`
97
+
98
+ `log_to_file`・`from_here`・`parse_html`・`append_csv`・`write_parquet`・`save_html`・`hash_name`・`random_sleep` など(各関数は `scwrap/utils.py` を参照)。`log_to_file` はログファイルの **親ディレクトリが無いと失敗**するので、必要なら先に `Path.mkdir` するか、`save_html` のように親を作る処理を挟んでください。
99
+
100
+
101
+ ## Basic Usage - 基本的な使い方
102
+
103
+ ```python
104
+ from scwrap import wrap_page
105
+ from scwrap.browser import patchright_page
106
+ from scwrap.utils import log_to_file, append_csv, from_here, random_sleep
107
+
108
+ fh = from_here(__file__)
109
+ log_to_file(fh('log/scraping.log'))
110
+
111
+ with patchright_page() as page:
112
+ p = wrap_page(page)
113
+ p.goto('https://www.foobarbaz1.jp')
114
+
115
+ pref_urls = p.css('li.item > ul > li > a').urls
116
+
117
+ classroom_urls = []
118
+ for i, url in enumerate(pref_urls, 1):
119
+ print(f'pref_urls {i}/{len(pref_urls)}')
120
+ if not url or not p.goto(url):
121
+ continue
122
+ random_sleep(1, 2)
123
+ classroom_urls.extend(p.css('.school-area h4 a').urls)
124
+
125
+ for i, url in enumerate(classroom_urls, 1):
126
+ print(f'classroom_urls {i}/{len(classroom_urls)}')
127
+ if not p.goto(url):
128
+ continue
129
+ random_sleep(1, 2)
130
+ append_csv(fh('csv/out.csv'), {
131
+ 'URL': page.url,
132
+ '教室名': p.css('h1 .text01').first.text,
133
+ '住所': p.css('.item .mapText').first.text,
134
+ '電話番号': p.css('.item .phoneNumber').first.text,
135
+ 'HP': p.css('th').grep('ホームページ').first.next('td').css('a').first.url,
136
+ })
137
+ ```
138
+
139
+ ## Save HTML while scraping - スクレイピングしながらHTMLを保存する
140
+
141
+ ```python
142
+ from scwrap import wrap_page
143
+ from scwrap.browser import camoufox_page
144
+ from scwrap.utils import log_to_file, append_csv, from_here, hash_name, random_sleep, save_html
145
+
146
+ fh = from_here(__file__)
147
+ log_to_file(fh('log/scraping.log'))
148
+
149
+ with camoufox_page() as page:
150
+ ctx = {}
151
+ p = wrap_page(page)
152
+ p.goto('https://www.foobarbaz1.jp')
153
+
154
+ ctx['アイテムURLs'] = p.css('ul.items > li > a').urls
155
+
156
+ for i, url in enumerate(ctx['アイテムURLs'], 1):
157
+ print(f"アイテムURLs {i}/{len(ctx['アイテムURLs'])}")
158
+ if not p.goto(url):
159
+ continue
160
+ random_sleep(1, 2)
161
+ if p.wait('#logo', timeout=10000).raw is None:
162
+ continue
163
+ file_name = f'{hash_name(url)}.html'
164
+ if not save_html(fh('html') / file_name, page.content()):
165
+ continue
166
+ append_csv(fh('outurlhtml.csv'), {
167
+ 'URL': url,
168
+ 'HTML': file_name,
169
+ })
170
+ ```
171
+
172
+ ## Scrape from local HTML files - 保存済みHTMLからスクレイピングしてParquetに出力する
173
+
174
+ ```python
175
+ import pandas as pd
176
+
177
+ from scwrap import wrap_parser
178
+ from scwrap.utils import log_to_file, from_here, parse_html, write_parquet
179
+
180
+ fh = from_here(__file__)
181
+ log_to_file(fh('log/scraping.log'))
182
+
183
+ df = pd.read_csv(fh('outurlhtml.csv'))
184
+ results = []
185
+ for i, (url, path) in enumerate(zip(df['URL'], df['HTML']), 1):
186
+ print(f'outhtml {i}/{len(df)}')
187
+ if not (parser := parse_html(fh('html') / path)):
188
+ continue
189
+ p = wrap_parser(parser)
190
+ results.append({
191
+ 'URL': url,
192
+ '教室名': p.css('h1 .text02').first.text,
193
+ '住所': p.css('.item .mapText').first.text,
194
+ '所在地': p.css('dt').grep(r'所在地').first.next('dd').text,
195
+ })
196
+ write_parquet(fh('outhtml.parquet'), results)
197
+ ```
198
+
199
+ ## License - ライセンス
200
+
201
+ [MIT](./LICENSE)
202
+
scwrap-0.1.0/README.md ADDED
@@ -0,0 +1,186 @@
1
+ # scwrap
2
+
3
+ ## Overview - 概要
4
+
5
+ scwrap is a scraping utility library built on Patchright, Playwright, and selectolax.
6
+ scwrap は Patchright / Playwright(`Page` API)と selectolax をベースにしたスクレイピングユーティリティライブラリです。**細かい挙動はプリミティブの組み合わせで組み立てる**前提の薄いラッパーです(「よしなに」な自動修復は置かない方針)。
7
+
8
+ DOM・パーサのラッパーは **`scwrap`**(`wrap_page` / `wrap_parser` などのファクトリー)から、ブラウザ起動は **`scwrap.browser`**、CSV やログなどの周辺は **`scwrap.utils`** から import します。
9
+
10
+
11
+ ## Requirements - 必要条件
12
+
13
+ - Python 3.12 or higher(`requires-python` は `pyproject.toml` 参照)
14
+ - 主要依存: patchright, playwright, selectolax, pandas, pyarrow, camoufox, loguru(一覧・下限は `pyproject.toml` の `[project.dependencies]`)
15
+ - `write_parquet` は **pandas + pyarrow**(`pyarrow` は依存に含まれる)。別エンジンに切り替える場合のみ `fastparquet` などが必要になることがあります。
16
+ - ブラウザ: **Patchright / Playwright 用の取得**と、下記のとおり **`patchright_page` は Google Chrome 前提**です。
17
+
18
+ ## Installation - インストール
19
+
20
+ ### pip
21
+
22
+ ```
23
+ pip install scwrap
24
+ ```
25
+
26
+ ### uv (推奨)
27
+
28
+ ```
29
+ uv add scwrap
30
+ ```
31
+
32
+ Playwright / Patchright が使うブラウザバイナリは別途取得してください。
33
+ 加えて **`patchright_page()` は `channel='chrome'` で起動するため、マシンに [Google Chrome](https://www.google.com/chrome/) がインストールされている必要があります**(Chromium のみの環境では起動に失敗することがあります)。
34
+
35
+ ### Patchright(Chromium 等)
36
+
37
+ #### pip
38
+
39
+ ```
40
+ python -m patchright install chromium
41
+ ```
42
+
43
+ #### uv (推奨)
44
+
45
+ ```
46
+ uv run patchright install chromium
47
+ ```
48
+
49
+ ### Camoufox(Firefox)
50
+
51
+ #### pip
52
+
53
+ ```
54
+ camoufox fetch
55
+ ```
56
+
57
+ #### uv (推奨)
58
+
59
+ ```
60
+ uv run camoufox fetch
61
+ ```
62
+
63
+ ## メソッド
64
+
65
+ ### `scwrap`(ラッパー)
66
+
67
+ ブラウザ側は `wrap_page(page)` が起点です。`goto`・`wait`・`css` などはこの戻り値に対して呼びます。要素が複数なら `css(...)` はグループを返し、先頭だけなら `.first`、正規表現で絞り込みは `.grep(pattern)`、相対 URL の解決には `.urls`(単一は `.url`)を使います。テキストや生の要素は `.text` / `.raw` プロパティです。
68
+
69
+ 静的 HTML(selectolax)側は `wrap_parser(parser)` から `css` / `grep` / `text` など(ノードは `wrap_node` 系)。クラス実装は非公開で、**コンストラクトは常にこれらのファクトリー経由**にしてください。
70
+
71
+ ### `scwrap.browser`
72
+
73
+ - **`patchright_page()`** … コンテキストマネージャ。Patchright で **Google Chrome**(`channel='chrome'`)を起動し、**毎回クリーンな `BrowserContext`** の `Page` を `with` に渡す(永続プロファイルは使わない)。`headless=False`・`no_viewport=True` などは固定。
74
+
75
+ - **`camoufox_page(locale=...)`** … Camoufox(Firefox)で `Page` を開く。
76
+ _例:_ `with camoufox_page(locale='en-US,en') as page:`
77
+ デフォルトの `locale` は `'ja-JP,ja'`。`headless=False`・`humanize=True` は固定。
78
+
79
+ ウィンドウ最大化が必要なら、コードではなく **ブラウザ上で手動**してください(起動引数に依存させない)。
80
+
81
+ ### `scwrap.utils`
82
+
83
+ `log_to_file`・`from_here`・`parse_html`・`append_csv`・`write_parquet`・`save_html`・`hash_name`・`random_sleep` など(各関数は `scwrap/utils.py` を参照)。`log_to_file` はログファイルの **親ディレクトリが無いと失敗**するので、必要なら先に `Path.mkdir` するか、`save_html` のように親を作る処理を挟んでください。
84
+
85
+
86
+ ## Basic Usage - 基本的な使い方
87
+
88
+ ```python
89
+ from scwrap import wrap_page
90
+ from scwrap.browser import patchright_page
91
+ from scwrap.utils import log_to_file, append_csv, from_here, random_sleep
92
+
93
+ fh = from_here(__file__)
94
+ log_to_file(fh('log/scraping.log'))
95
+
96
+ with patchright_page() as page:
97
+ p = wrap_page(page)
98
+ p.goto('https://www.foobarbaz1.jp')
99
+
100
+ pref_urls = p.css('li.item > ul > li > a').urls
101
+
102
+ classroom_urls = []
103
+ for i, url in enumerate(pref_urls, 1):
104
+ print(f'pref_urls {i}/{len(pref_urls)}')
105
+ if not url or not p.goto(url):
106
+ continue
107
+ random_sleep(1, 2)
108
+ classroom_urls.extend(p.css('.school-area h4 a').urls)
109
+
110
+ for i, url in enumerate(classroom_urls, 1):
111
+ print(f'classroom_urls {i}/{len(classroom_urls)}')
112
+ if not p.goto(url):
113
+ continue
114
+ random_sleep(1, 2)
115
+ append_csv(fh('csv/out.csv'), {
116
+ 'URL': page.url,
117
+ '教室名': p.css('h1 .text01').first.text,
118
+ '住所': p.css('.item .mapText').first.text,
119
+ '電話番号': p.css('.item .phoneNumber').first.text,
120
+ 'HP': p.css('th').grep('ホームページ').first.next('td').css('a').first.url,
121
+ })
122
+ ```
123
+
124
+ ## Save HTML while scraping - スクレイピングしながらHTMLを保存する
125
+
126
+ ```python
127
+ from scwrap import wrap_page
128
+ from scwrap.browser import camoufox_page
129
+ from scwrap.utils import log_to_file, append_csv, from_here, hash_name, random_sleep, save_html
130
+
131
+ fh = from_here(__file__)
132
+ log_to_file(fh('log/scraping.log'))
133
+
134
+ with camoufox_page() as page:
135
+ ctx = {}
136
+ p = wrap_page(page)
137
+ p.goto('https://www.foobarbaz1.jp')
138
+
139
+ ctx['アイテムURLs'] = p.css('ul.items > li > a').urls
140
+
141
+ for i, url in enumerate(ctx['アイテムURLs'], 1):
142
+ print(f"アイテムURLs {i}/{len(ctx['アイテムURLs'])}")
143
+ if not p.goto(url):
144
+ continue
145
+ random_sleep(1, 2)
146
+ if p.wait('#logo', timeout=10000).raw is None:
147
+ continue
148
+ file_name = f'{hash_name(url)}.html'
149
+ if not save_html(fh('html') / file_name, page.content()):
150
+ continue
151
+ append_csv(fh('outurlhtml.csv'), {
152
+ 'URL': url,
153
+ 'HTML': file_name,
154
+ })
155
+ ```
156
+
157
+ ## Scrape from local HTML files - 保存済みHTMLからスクレイピングしてParquetに出力する
158
+
159
+ ```python
160
+ import pandas as pd
161
+
162
+ from scwrap import wrap_parser
163
+ from scwrap.utils import log_to_file, from_here, parse_html, write_parquet
164
+
165
+ fh = from_here(__file__)
166
+ log_to_file(fh('log/scraping.log'))
167
+
168
+ df = pd.read_csv(fh('outurlhtml.csv'))
169
+ results = []
170
+ for i, (url, path) in enumerate(zip(df['URL'], df['HTML']), 1):
171
+ print(f'outhtml {i}/{len(df)}')
172
+ if not (parser := parse_html(fh('html') / path)):
173
+ continue
174
+ p = wrap_parser(parser)
175
+ results.append({
176
+ 'URL': url,
177
+ '教室名': p.css('h1 .text02').first.text,
178
+ '住所': p.css('.item .mapText').first.text,
179
+ '所在地': p.css('dt').grep(r'所在地').first.next('dd').text,
180
+ })
181
+ write_parquet(fh('outhtml.parquet'), results)
182
+ ```
183
+
184
+ ## License - ライセンス
185
+
186
+ [MIT](./LICENSE)
scwrap-0.1.0/main.py ADDED
@@ -0,0 +1,6 @@
1
def main() -> None:
    """Entry point for the demo script: print the package greeting."""
    greeting = "Hello from scwrap!"
    print(greeting)


if __name__ == "__main__":
    main()
@@ -0,0 +1,15 @@
1
+ [project]
2
+ name = "scwrap"
3
+ version = "0.1.0"
4
+ description = "Lightweight scraping helpers: wrapped Page/parser APIs (Patchright, Playwright, selectolax), browser presets, CSV/Parquet and logging utilities."
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ dependencies = [
8
+ "patchright>=1.40",
9
+ "playwright>=1.40",
10
+ "selectolax>=0.3",
11
+ "pandas>=2.0",
12
+ "pyarrow>=14.0",
13
+ "camoufox>=0.4",
14
+ "loguru>=0.7",
15
+ ]
@@ -0,0 +1,15 @@
1
"""Public API of scwrap: factory functions and the Page type alias.

Wrapper classes themselves are private; construct them only through
these factories (see README).
"""

from .scwrap import (
    Page,
    wrap_node,
    wrap_node_group,
    wrap_page,
    wrap_parser,
)

# Explicit public surface re-exported from scwrap.scwrap.
__all__ = [
    "Page",
    "wrap_node",
    "wrap_node_group",
    "wrap_page",
    "wrap_parser",
]
@@ -0,0 +1,33 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Iterator
4
+ from contextlib import contextmanager
5
+
6
+ from camoufox.sync_api import Camoufox
7
+ from patchright.sync_api import Page as PatchrightPage, sync_playwright
8
+ from playwright.sync_api import Page as PlaywrightPage
9
+
10
+ Page = PatchrightPage | PlaywrightPage
11
+
12
+
13
@contextmanager
def patchright_page() -> Iterator[Page]:
    """Yield a Page from Google Chrome launched through Patchright.

    Launches with channel='chrome' and headless=False, creates a fresh
    BrowserContext (no persistent profile, no_viewport=True) and hands
    its new Page to the `with` block; everything is torn down on exit.
    """
    with sync_playwright() as pw, pw.chromium.launch(
        channel='chrome',
        headless=False,
    ) as browser, browser.new_context(no_viewport=True) as context:
        yield context.new_page()
23
+
24
+
25
@contextmanager
def camoufox_page(locale: str | list[str] = 'ja-JP,ja') -> Iterator[Page]:
    """Yield a Page from a Camoufox (Firefox) browser.

    headless=False and humanize=True are fixed; only *locale* is
    configurable (default 'ja-JP,ja'). The browser is closed when the
    `with` block exits.
    """
    with Camoufox(
        headless=False,
        humanize=True,
        locale=locale,
    ) as browser:
        page = browser.new_page()
        yield page
@@ -0,0 +1,232 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ import re
5
+ import time
6
+ import unicodedata as ud
7
+ from urllib.parse import urljoin
8
+
9
+ from loguru import logger
10
+ from patchright.sync_api import Page as PatchrightPage, ElementHandle as PatchrightElementHandle
11
+ from playwright.sync_api import Page as PlaywrightPage, ElementHandle as PlaywrightElementHandle
12
+ from selectolax.lexbor import LexborHTMLParser, LexborNode
13
+
14
+
15
+ Page = PatchrightPage | PlaywrightPage
16
+ ElementHandle = PatchrightElementHandle | PlaywrightElementHandle
17
+
18
+
19
def wrap_page(page: Page) -> _WrappedPage:
    """Wrap a Patchright/Playwright Page in the scwrap page API."""
    return _WrappedPage(page)
21
+
22
class _PageScoped:
    """Mixin for wrappers bound to one Page; builds children sharing it."""

    # The Page that every wrapper created from this object is bound to;
    # assigned by the concrete subclass's __init__.
    _page: Page

    def wrap_element(self, elem: ElementHandle | None) -> _WrappedElement:
        """Wrap *elem* (possibly None) bound to this wrapper's page."""
        return _WrappedElement(self._page, elem)

    def wrap_element_group(self, elems: list[_WrappedElement]) -> _WrappedElementGroup:
        """Wrap a list of already-wrapped elements bound to this page."""
        return _WrappedElementGroup(self._page, elems)
30
+
31
+
32
def wrap_parser(parser: LexborHTMLParser) -> _WrappedParser:
    """Wrap a selectolax parser in the scwrap static-HTML API."""
    return _WrappedParser(parser)


def wrap_node(node: LexborNode | None) -> _WrappedNode:
    """Wrap a single selectolax node (possibly None)."""
    return _WrappedNode(node)


def wrap_node_group(nodes: list[_WrappedNode]) -> _WrappedNodeGroup:
    """Wrap a list of already-wrapped nodes as a group."""
    return _WrappedNodeGroup(nodes)
40
+
41
+
42
class _WrappedPage(_PageScoped):
    """Wrapper around a live Page: css queries, resilient goto, wait."""

    def __init__(self, page: Page) -> None:
        self._page = page

    def css(self, selector: str) -> _WrappedElementGroup:
        """Return all elements in the page matching *selector* as a group."""
        elems = self._page.query_selector_all(selector)
        return self.wrap_element_group([self.wrap_element(e) for e in elems])

    def goto(self, url: str | None, try_cnt: int = 3, wait_range: tuple[float, float] = (3, 5)) -> bool:
        """Navigate to *url* with up to *try_cnt* attempts.

        Returns True as soon as navigation yields a response object;
        False for a falsy *url* or after every attempt fails. Between
        attempts it sleeps a random duration drawn from *wait_range*
        (seconds). All failures are logged, never raised.
        """
        if not url:
            return False
        for i in range(try_cnt):
            try:
                if self._page.goto(url) is not None:
                    return True
                else:
                    # Navigation completed but produced no response object.
                    reason = "response is None"
            except Exception as e:
                reason = f"{type(e).__name__}: {e}"
            logger.warning(f"[goto] {url} ({i+1}/{try_cnt}) {reason}")
            if i + 1 < try_cnt:
                # Back off randomly before retrying; no sleep after the last try.
                time.sleep(random.uniform(*wait_range))
        logger.error(f"[goto] giving up: {url}")
        return False

    def wait(self, selector: str, timeout: int = 15000) -> _WrappedElement:
        """Wait up to *timeout* ms for *selector*.

        Returns the wrapped element, or a wrapped None on timeout or any
        other error (logged as a warning).
        """
        try:
            elem = self._page.wait_for_selector(selector, timeout=timeout)
            return self.wrap_element(elem)
        except Exception as e:
            logger.warning(f"[wait] {type(e).__name__}: {e} | selector={selector!r} | url={self._page.url}")
            return self.wrap_element(None)
74
+
75
+
76
class _WrappedElement(_PageScoped):
    """Wrapper around an ElementHandle (possibly None) with safe accessors."""

    def __init__(self, page: Page, elem: ElementHandle | None) -> None:
        self._page = page
        self._elem = elem

    @property
    def raw(self) -> ElementHandle | None:
        """The underlying ElementHandle, or None when empty."""
        return self._elem

    def css(self, selector: str) -> _WrappedElementGroup:
        """Return descendants matching *selector*; empty group when element is None."""
        elems = self._elem.query_selector_all(selector) if self._elem else []
        return self.wrap_element_group([self.wrap_element(e) for e in elems])

    def next(self, selector: str) -> _WrappedElement:
        """Return the first following sibling element matching *selector*.

        Walks nextElementSibling in the page's JS context; returns a
        wrapped None when this element is empty, no sibling matches, or
        evaluation fails (logged).
        """
        if self._elem is None:
            return self.wrap_element(None)
        try:
            elem = self._elem.evaluate_handle(
                """(el, sel) => {
                    let cur = el.nextElementSibling;
                    while (cur) {
                        if (cur.matches(sel)) return cur;
                        cur = cur.nextElementSibling;
                    }
                    return null;
                }""",
                selector,
            ).as_element()
            return self.wrap_element(elem)
        except Exception as e:
            logger.error(f"[next] {self._elem} {type(e).__name__}: {e}")
            return self.wrap_element(None)

    @property
    def text(self) -> str | None:
        """Stripped text content, or None when empty element / no text / blank."""
        if self._elem is None:
            return None
        if not (text := self._elem.text_content()):
            return None
        if not (t := text.strip()):
            return None
        return t

    def attr(self, attr_name: str) -> str | None:
        """Stripped value of *attr_name*, or None when absent/blank/element is None."""
        if self._elem is None:
            return None
        return a.strip() if (a := self._elem.get_attribute(attr_name)) else None

    @property
    def url(self) -> str | None:
        """href resolved against the page URL.

        Returns None when there is no href or it is a non-navigable
        scheme/fragment (#…, javascript:, mailto:, tel:, data:).
        """
        if not (href := self.attr('href')):
            return None
        if re.search(r'(?i)^(?:#|javascript:|mailto:|tel:|data:)', href):
            return None
        return urljoin(self._page.url, href)
131
+
132
class _WrappedElementGroup(_PageScoped):
    """A list of wrapped elements sharing one Page, with bulk accessors."""

    def __init__(self, page: Page, elems: list[_WrappedElement]) -> None:
        self._page = page
        self._elems = elems

    @property
    def raw(self) -> list[_WrappedElement]:
        """Underlying list of wrapped elements."""
        return self._elems

    @property
    def first(self) -> _WrappedElement:
        """First element, or a wrapped None when the group is empty."""
        if not self._elems:
            return self.wrap_element(None)
        return self._elems[0]

    def grep(self, pattern: str) -> _WrappedElementGroup:
        """Keep only elements whose NFKC-normalized text matches *pattern*."""
        matcher = re.compile(pattern)
        kept: list[_WrappedElement] = []
        for elem in self._elems:
            text = elem.text
            if text and matcher.search(ud.normalize('NFKC', text)):
                kept.append(elem)
        return self.wrap_element_group(kept)

    @property
    def texts(self) -> list[str | None]:
        """Text of every element (None where missing/blank)."""
        return [elem.text for elem in self._elems]

    def attrs(self, attr_name: str) -> list[str | None]:
        """Value of *attr_name* for every element."""
        return [elem.attr(attr_name) for elem in self._elems]

    @property
    def urls(self) -> list[str | None]:
        """Resolved href of every element (None where absent/non-navigable)."""
        return [elem.url for elem in self._elems]
163
+
164
+
165
class _WrappedParser:
    """Wrapper around a LexborHTMLParser for whole-document CSS queries."""

    def __init__(self, parser: LexborHTMLParser) -> None:
        self._parser = parser

    def css(self, selector: str) -> _WrappedNodeGroup:
        """Return all nodes in the document matching *selector* as a group."""
        nodes = self._parser.css(selector)
        return wrap_node_group([wrap_node(n) for n in nodes])
172
+
173
class _WrappedNode:
    """Wrapper around a LexborNode (possibly None) with safe accessors."""

    def __init__(self, node: LexborNode | None) -> None:
        self._node = node

    @property
    def raw(self) -> LexborNode | None:
        """The underlying node, or None when empty."""
        return self._node

    def css(self, selector: str) -> _WrappedNodeGroup:
        """Return descendants matching *selector*; empty group when node is None."""
        nodes = self._node.css(selector) if self._node else []
        return wrap_node_group([wrap_node(n) for n in nodes])

    def next(self, selector: str) -> _WrappedNode:
        """Return the first following sibling element matching *selector*.

        Skips non-element siblings (checked via is_element_node); returns
        a wrapped None when this node is empty or nothing matches.
        """
        if self._node is None:
            return wrap_node(None)
        cur = self._node.next
        while cur is not None:
            if cur.is_element_node and cur.css_matches(selector):
                return wrap_node(cur)
            cur = cur.next
        return wrap_node(None)

    @property
    def text(self) -> str | None:
        """Stripped text content, or None when the node is empty or blank."""
        if self._node is None:
            return None
        return t if (t := self._node.text(strip=True)) else None

    def attr(self, attr_name: str) -> str | None:
        """Stripped value of *attr_name*, or None when absent/blank/node is None."""
        if self._node is None:
            return None
        return a.strip() if (a := self._node.attributes.get(attr_name)) else None
205
+
206
class _WrappedNodeGroup:
    """A list of wrapped selectolax nodes with bulk accessors."""

    def __init__(self, nodes: list[_WrappedNode]) -> None:
        self._nodes = nodes

    @property
    def raw(self) -> list[_WrappedNode]:
        """Underlying list of wrapped nodes."""
        return self._nodes

    @property
    def first(self) -> _WrappedNode:
        """First node, or a wrapped None when the group is empty."""
        if not self._nodes:
            return wrap_node(None)
        return self._nodes[0]

    def grep(self, pattern: str) -> _WrappedNodeGroup:
        """Keep only nodes whose NFKC-normalized text matches *pattern*."""
        matcher = re.compile(pattern)
        kept: list[_WrappedNode] = []
        for node in self._nodes:
            text = node.text
            if text and matcher.search(ud.normalize('NFKC', text)):
                kept.append(node)
        return wrap_node_group(kept)

    @property
    def texts(self) -> list[str | None]:
        """Text of every node (None where missing/blank)."""
        return [node.text for node in self._nodes]

    def attrs(self, attr_name: str) -> list[str | None]:
        """Value of *attr_name* for every node."""
        return [node.attr(attr_name) for node in self._nodes]
232
+
@@ -0,0 +1,70 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import random
5
+ import time
6
+ from pathlib import Path
7
+ from typing import Callable
8
+
9
+ import pandas as pd
10
+ from loguru import logger
11
+ from selectolax.lexbor import LexborHTMLParser
12
+
13
+
14
+ def parse_html(path: Path | str) -> LexborHTMLParser | None:
15
+ try:
16
+ return LexborHTMLParser(Path(path).read_text(encoding='utf-8'))
17
+ except Exception as e:
18
+ logger.error(f"[parse_html] {path} {type(e).__name__}: {e}")
19
+ return None
20
+
21
+
22
+ def from_here(file: str) -> Callable[[str], Path]:
23
+ base = Path(file).resolve().parent
24
+ return lambda path: base / path
25
+
26
+
27
+ def random_sleep(a: float, b: float) -> None:
28
+ time.sleep(random.uniform(a, b))
29
+
30
+
31
+ def append_csv(path: Path | str, row: dict) -> None:
32
+ p = Path(path)
33
+ try:
34
+ pd.DataFrame([row]).to_csv(
35
+ p,
36
+ mode='a',
37
+ index=False,
38
+ header=True if not p.exists() else p.stat().st_size == 0,
39
+ encoding='utf-8-sig',
40
+ )
41
+ except Exception as e:
42
+ logger.error(f"[append_csv] {path} {row} {type(e).__name__}: {e}")
43
+
44
+
45
def write_parquet(path: Path | str, rows: list[dict]) -> None:
    """Write *rows* to a Parquet file at *path* (index dropped).

    Failures are logged, never raised.
    """
    try:
        pd.DataFrame(rows).to_parquet(Path(path), index=False)
    except Exception as exc:
        logger.error(f"[write_parquet] {path} {type(exc).__name__}: {exc}")
53
+
54
+
55
def hash_name(key: str) -> str:
    """Return a stable MD5 hex digest of *key* (e.g. for file names)."""
    digest = hashlib.md5(key.encode())
    return digest.hexdigest()
57
+
58
+
59
def save_html(filepath: Path | str, html: str) -> bool:
    """Write *html* to *filepath* as UTF-8, creating parent directories.

    Generalized to accept either Path or str, consistent with the other
    helpers in this module. Returns True on success; on failure the
    error is logged and False is returned.
    """
    p = Path(filepath)
    try:
        p.parent.mkdir(parents=True, exist_ok=True)
        # errors="replace" keeps the write from failing on unencodable text.
        p.write_text(html, encoding="utf-8", errors="replace")
        return True
    except Exception as e:
        logger.error(f"[save_html] {filepath} {type(e).__name__}: {e}")
        return False
67
+
68
+
69
def log_to_file(path: Path | str) -> None:
    """Attach a WARNING-level loguru file sink writing UTF-8 to *path*.

    Creates the parent directory first so logging cannot fail merely
    because the log folder does not exist yet (consistent with
    save_html's behavior).
    """
    p = Path(path)
    p.parent.mkdir(parents=True, exist_ok=True)
    logger.add(p, level="WARNING", encoding="utf-8")