domx 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
domx-0.1.1/.gitignore ADDED
@@ -0,0 +1,218 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ # Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ # poetry.lock
109
+ # poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ # pdm.lock
116
+ # pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ # pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # Redis
135
+ *.rdb
136
+ *.aof
137
+ *.pid
138
+
139
+ # RabbitMQ
140
+ mnesia/
141
+ rabbitmq/
142
+ rabbitmq-data/
143
+
144
+ # ActiveMQ
145
+ activemq-data/
146
+
147
+ # SageMath parsed files
148
+ *.sage.py
149
+
150
+ # Environments
151
+ .env
152
+ .envrc
153
+ .venv
154
+ env/
155
+ venv/
156
+ ENV/
157
+ env.bak/
158
+ venv.bak/
159
+
160
+ # Spyder project settings
161
+ .spyderproject
162
+ .spyproject
163
+
164
+ # Rope project settings
165
+ .ropeproject
166
+
167
+ # mkdocs documentation
168
+ /site
169
+
170
+ # mypy
171
+ .mypy_cache/
172
+ .dmypy.json
173
+ dmypy.json
174
+
175
+ # Pyre type checker
176
+ .pyre/
177
+
178
+ # pytype static type analyzer
179
+ .pytype/
180
+
181
+ # Cython debug symbols
182
+ cython_debug/
183
+
184
+ # PyCharm
185
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
186
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
187
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
188
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
189
+ # .idea/
190
+
191
+ # Abstra
192
+ # Abstra is an AI-powered process automation framework.
193
+ # Ignore directories containing user credentials, local state, and settings.
194
+ # Learn more at https://abstra.io/docs
195
+ .abstra/
196
+
197
+ # Visual Studio Code
198
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
199
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
200
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
201
+ # you could uncomment the following to ignore the entire vscode folder
202
+ # .vscode/
203
+ # Temporary file for partial code execution
204
+ tempCodeRunnerFile.py
205
+
206
+ # Ruff stuff:
207
+ .ruff_cache/
208
+
209
+ # PyPI configuration file
210
+ .pypirc
211
+
212
+ # Marimo
213
+ marimo/_static/
214
+ marimo/_lsp/
215
+ __marimo__/
216
+
217
+ # Streamlit
218
+ .streamlit/secrets.toml
@@ -0,0 +1 @@
1
+ 3.12
domx-0.1.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nishizawa Takamasa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
domx-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,239 @@
1
+ Metadata-Version: 2.4
2
+ Name: domx
3
+ Version: 0.1.1
4
+ Summary: 自分用・非汎用
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: patchright>=1.40
9
+ Requires-Dist: playwright>=1.40
10
+ Requires-Dist: selectolax>=0.3
11
+ Requires-Dist: pyarrow>=14.0
12
+ Requires-Dist: camoufox>=0.4
13
+ Requires-Dist: loguru>=0.7
14
+ Requires-Dist: tqdm>=4.66
15
+
16
+ # domx
17
+
18
+ 自分用・非汎用
19
+
20
+ ## インストール
21
+
22
+ 1. `uv add domx`
23
+ 2. `uv run patchright install chromium`
24
+ 3. `uv run camoufox fetch`
25
+
26
+ ## 使用例
27
+
28
+ ### スクレイピング
29
+
30
+ ```python
31
+ from domx import wrap_page
32
+ from domx.browser import patchright_page
33
+ from domx.utils import append_csv, from_here, save_log, write_bytes
34
+
35
+ here = from_here(__file__)
36
+ save_log(here('log/scraping.log'))
37
+
38
+ with patchright_page() as page:
39
+ p = wrap_page(page)
40
+
41
+ p.goto('https://www.foobarbaz1.jp')
42
+ pref_urls = p.ss('li.item > ul > li > a').urls
43
+
44
+ classroom_urls = []
45
+ for i, url in enumerate(pref_urls, 1):
46
+ print(f'pref_urls {i}/{len(pref_urls)}')
47
+ if not p.goto(url):
48
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
49
+ continue
50
+ classroom_urls.extend(p.ss('.school-area h4 a').urls)
51
+
52
+ for i, url in enumerate(classroom_urls, 1):
53
+ print(f'classroom_urls {i}/{len(classroom_urls)}')
54
+ if not p.goto(url):
55
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
56
+ continue
57
+ th_grep = p.ss('th').re
58
+ append_csv(here('csv/scrape.csv'), {
59
+ 'id': i,
60
+ 'URL': page.url,
61
+ '教室名': p.s('h1 .text01').text,
62
+ '住所': p.s('.item .mapText').text,
63
+ '電話番号': p.s('.item .phoneNumber').text,
64
+ 'HP': th_grep.s(r'ホームページ').next('td').s('a').url,
65
+ '営業時間': th_grep.s(r'営業時間').next('td').text,
66
+ '定休日': th_grep.s(r'定休日').next('td').text,
67
+ })
68
+ p.s('.school-map').screenshot(here(f'media/{i}-screenshot.png'))
69
+ if (img_url := p.s('.school-area img').src):
70
+ if (res := p.goto(img_url)) and res.ok:
71
+ write_bytes(here(f'media/{i}-img.jpg'), res.body())
72
+ ```
73
+
74
+ ### スクレイピング(スクショと画像も保存)
75
+
76
+ ```python
77
+ import time
78
+ from urllib.parse import urlencode
79
+
80
+ from domx import wrap_page
81
+ from domx.browser import patchright_page
82
+ from domx.utils import save_log, append_csv, from_here, write_bytes
83
+
84
+ here = from_here(__file__)
85
+ save_log(here('log/scraping.log'))
86
+
87
+ with patchright_page() as page:
88
+ p = wrap_page(page)
89
+
90
+ p.goto('https://example.com/demo/search')
91
+ prefecture_urls = p.ss('li > a[href^="https://example.com/demo/search/area/"]').urls
92
+
93
+ bukken_urls = []
94
+ for i, prefecture_url in enumerate(prefecture_urls, 1):
95
+ print(f'{i}/{len(prefecture_urls)} エリア一覧ページ')
96
+ page_num = 1
97
+ while True:
98
+ if not p.goto(f'{prefecture_url}?{urlencode({"page": page_num})}'):
99
+ break
100
+ if not (bukken_elems := p.ss('ul li div a[href^="https://example.com"]:has(p)')):
101
+ break
102
+ bukken_urls.extend(bukken_elems.urls)
103
+ page_num += 1
104
+
105
+ for i, url in enumerate(bukken_urls, 1):
106
+ print(f'{i}/{len(bukken_urls)} 詳細ページ {url}')
107
+ if not p.goto(url):
108
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
109
+ continue
110
+
111
+ dt_grep = p.ss('h4').re.s(r'概要').next('div:has(dl)').ss('dt').re
112
+ dd_text = lambda pattern: dt_grep.s(pattern).next('dd').text
113
+
114
+ append_csv(here('csv/scrape.csv'), {
115
+ 'id': i,
116
+ 'URL': page.url,
117
+ '価格': dd_text(r'価格'),
118
+ '所在地': dd_text(r'所在地'),
119
+ '交通': dd_text(r'交通'),
120
+ '駐車場': dd_text(r'駐車場'),
121
+ '備考': dd_text(r'備考'),
122
+ '情報更新日': dd_text(r'情報更新日'),
123
+ })
124
+
125
+ page.add_style_tag(content='header, footer.site-footer { visibility: hidden !important; }')
126
+
127
+ p.ss('h4').re.s(r'概要').next('div:has(dl)').screenshot(path=here(f'media/{i}-summary.png'))
128
+
129
+ elem_iframe = p.s('iframe[src^="https://example.com"]')
130
+ elem_iframe.scroll_into_view()
131
+ time.sleep(3)
132
+ elem_iframe.screenshot(path=here(f'media/{i}-iframe.png'))
133
+
134
+ main_img_url = p.s('img.w-full.object-contain').src
135
+
136
+ img_desc_grep = p.ss('p.text-left').re.s(r'画像をクリック').next('ul').ss('li p').re
137
+ img_desc = img_desc_grep.s(r'表紙') or img_desc_grep.s(r'^(?!.*裏面).*')
138
+ img_url = img_desc.parent('li').s('a').url
139
+
140
+ if main_img_url and (res := p.goto(main_img_url)) and res.ok:
141
+ write_bytes(here(f'media/{i}-main-img.jpg'), res.body())
142
+ if img_url and (res := p.goto(img_url)) and res.ok:
143
+ write_bytes(here(f'media/{i}-img-desc.jpg'), res.body())
144
+ ```
145
+
146
+ ### スクレイピング(HTML丸ごと保存)
147
+
148
+ ```python
149
+ from domx import wrap_page
150
+ from domx.browser import camoufox_page
151
+ from domx.utils import append_csv, from_here, hash_name, save_log, write_text
152
+
153
+ here = from_here(__file__)
154
+ save_log(here('log/scraping.log'))
155
+
156
+ with camoufox_page() as page:
157
+ p = wrap_page(page)
158
+
159
+ p.goto('https://www.foobarbaz1.jp')
160
+ item_urls = p.ss('ul.items > li > a').urls
161
+
162
+ for i, url in enumerate(item_urls, 1):
163
+ print(f'item_urls {i}/{len(item_urls)}')
164
+ if not p.goto(url):
165
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
166
+ continue
167
+ file_name = f'{hash_name(url)}.html'
168
+ if not write_text(here('html') / file_name, p.html(with_url=True, with_saved_at=True)):
169
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'write_text'})
170
+ continue
171
+ ```
172
+
173
+ ### ローカルHTMLからデータ抽出&Parquet出力
174
+
175
+ ```python
176
+ from domx import wrap_parser
177
+ from domx.utils import from_here, parse_html, save_log, write_parquet
178
+
179
+ here = from_here(__file__)
180
+ save_log(here('log/scraping.log'))
181
+
182
+ results = []
183
+ for i, file_path in enumerate(here('html').glob('*.html'),1):
184
+ print(f'html {i}')
185
+ if not (parser := parse_html(file_path)):
186
+ continue
187
+ p = wrap_parser(parser)
188
+ dts = p.ss('dt').re
189
+ results.append({
190
+ 'URL': p.url,
191
+ 'file_name': file_path.name,
192
+ '教室名': p.s('h1 .text02').text,
193
+ '住所': p.s('.item .mapText').text,
194
+ '所在地': dts.s(r'所在地').next('dd').text,
195
+ '交通': dts.s(r'交通').next('dd').text,
196
+ '物件番号': dts.s(r'物件番号').next('dd').text,
197
+ })
198
+ write_parquet(here('parquet/extract.parquet'), results)
199
+ ```
200
+
201
+ ### ローカルHTMLからデータ抽出&Parquet出力(並列処理)
202
+
203
+ ```python
204
+ from pathlib import Path
205
+
206
+ from domx import wrap_parser
207
+ from domx.utils import from_here, glob_paths, parse_html, pool_map, write_parquet
208
+
209
+ def main():
210
+ here = from_here(__file__)
211
+ html_paths = glob_paths(here('html'), '*.html')
212
+ results = [r for r in pool_map(extract, html_paths) if r]
213
+ write_parquet(here('parquet/extract.parquet'), results)
214
+
215
+ def extract(file_path: str) -> dict | None:
216
+ if not (parser := parse_html(Path(file_path))):
217
+ return None
218
+ p = wrap_parser(parser)
219
+ dts = p.ss('dt').re
220
+ return {
221
+ 'URL': p.url,
222
+ 'file_path': file_path,
223
+ '教室名': p.s('h1 .text02').text,
224
+ '住所': p.s('.item .mapText').text,
225
+ '所在地': dts.s(r'所在地').next('dd').text,
226
+ '交通': dts.s(r'交通').next('dd').text,
227
+ '価格': dts.s(r'価格').next('dd').text,
228
+ '設備・条件': dts.s(r'設備').next('dd').text,
229
+ '備考': dts.s(r'備考').next('dd').text,
230
+ }
231
+
232
+ if __name__ == '__main__':
233
+ main()
234
+ ```
235
+
236
+ ## License - ライセンス
237
+
238
+ [MIT](./LICENSE)
239
+
domx-0.1.1/README.md ADDED
@@ -0,0 +1,223 @@
1
+ # domx
2
+
3
+ 自分用・非汎用
4
+
5
+ ## インストール
6
+
7
+ 1. `uv add domx`
8
+ 2. `uv run patchright install chromium`
9
+ 3. `uv run camoufox fetch`
10
+
11
+ ## 使用例
12
+
13
+ ### スクレイピング
14
+
15
+ ```python
16
+ from domx import wrap_page
17
+ from domx.browser import patchright_page
18
+ from domx.utils import append_csv, from_here, save_log, write_bytes
19
+
20
+ here = from_here(__file__)
21
+ save_log(here('log/scraping.log'))
22
+
23
+ with patchright_page() as page:
24
+ p = wrap_page(page)
25
+
26
+ p.goto('https://www.foobarbaz1.jp')
27
+ pref_urls = p.ss('li.item > ul > li > a').urls
28
+
29
+ classroom_urls = []
30
+ for i, url in enumerate(pref_urls, 1):
31
+ print(f'pref_urls {i}/{len(pref_urls)}')
32
+ if not p.goto(url):
33
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
34
+ continue
35
+ classroom_urls.extend(p.ss('.school-area h4 a').urls)
36
+
37
+ for i, url in enumerate(classroom_urls, 1):
38
+ print(f'classroom_urls {i}/{len(classroom_urls)}')
39
+ if not p.goto(url):
40
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
41
+ continue
42
+ th_grep = p.ss('th').re
43
+ append_csv(here('csv/scrape.csv'), {
44
+ 'id': i,
45
+ 'URL': page.url,
46
+ '教室名': p.s('h1 .text01').text,
47
+ '住所': p.s('.item .mapText').text,
48
+ '電話番号': p.s('.item .phoneNumber').text,
49
+ 'HP': th_grep.s(r'ホームページ').next('td').s('a').url,
50
+ '営業時間': th_grep.s(r'営業時間').next('td').text,
51
+ '定休日': th_grep.s(r'定休日').next('td').text,
52
+ })
53
+ p.s('.school-map').screenshot(here(f'media/{i}-screenshot.png'))
54
+ if (img_url := p.s('.school-area img').src):
55
+ if (res := p.goto(img_url)) and res.ok:
56
+ write_bytes(here(f'media/{i}-img.jpg'), res.body())
57
+ ```
58
+
59
+ ### スクレイピング(スクショと画像も保存)
60
+
61
+ ```python
62
+ import time
63
+ from urllib.parse import urlencode
64
+
65
+ from domx import wrap_page
66
+ from domx.browser import patchright_page
67
+ from domx.utils import save_log, append_csv, from_here, write_bytes
68
+
69
+ here = from_here(__file__)
70
+ save_log(here('log/scraping.log'))
71
+
72
+ with patchright_page() as page:
73
+ p = wrap_page(page)
74
+
75
+ p.goto('https://example.com/demo/search')
76
+ prefecture_urls = p.ss('li > a[href^="https://example.com/demo/search/area/"]').urls
77
+
78
+ bukken_urls = []
79
+ for i, prefecture_url in enumerate(prefecture_urls, 1):
80
+ print(f'{i}/{len(prefecture_urls)} エリア一覧ページ')
81
+ page_num = 1
82
+ while True:
83
+ if not p.goto(f'{prefecture_url}?{urlencode({"page": page_num})}'):
84
+ break
85
+ if not (bukken_elems := p.ss('ul li div a[href^="https://example.com"]:has(p)')):
86
+ break
87
+ bukken_urls.extend(bukken_elems.urls)
88
+ page_num += 1
89
+
90
+ for i, url in enumerate(bukken_urls, 1):
91
+ print(f'{i}/{len(bukken_urls)} 詳細ページ {url}')
92
+ if not p.goto(url):
93
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
94
+ continue
95
+
96
+ dt_grep = p.ss('h4').re.s(r'概要').next('div:has(dl)').ss('dt').re
97
+ dd_text = lambda pattern: dt_grep.s(pattern).next('dd').text
98
+
99
+ append_csv(here('csv/scrape.csv'), {
100
+ 'id': i,
101
+ 'URL': page.url,
102
+ '価格': dd_text(r'価格'),
103
+ '所在地': dd_text(r'所在地'),
104
+ '交通': dd_text(r'交通'),
105
+ '駐車場': dd_text(r'駐車場'),
106
+ '備考': dd_text(r'備考'),
107
+ '情報更新日': dd_text(r'情報更新日'),
108
+ })
109
+
110
+ page.add_style_tag(content='header, footer.site-footer { visibility: hidden !important; }')
111
+
112
+ p.ss('h4').re.s(r'概要').next('div:has(dl)').screenshot(path=here(f'media/{i}-summary.png'))
113
+
114
+ elem_iframe = p.s('iframe[src^="https://example.com"]')
115
+ elem_iframe.scroll_into_view()
116
+ time.sleep(3)
117
+ elem_iframe.screenshot(path=here(f'media/{i}-iframe.png'))
118
+
119
+ main_img_url = p.s('img.w-full.object-contain').src
120
+
121
+ img_desc_grep = p.ss('p.text-left').re.s(r'画像をクリック').next('ul').ss('li p').re
122
+ img_desc = img_desc_grep.s(r'表紙') or img_desc_grep.s(r'^(?!.*裏面).*')
123
+ img_url = img_desc.parent('li').s('a').url
124
+
125
+ if main_img_url and (res := p.goto(main_img_url)) and res.ok:
126
+ write_bytes(here(f'media/{i}-main-img.jpg'), res.body())
127
+ if img_url and (res := p.goto(img_url)) and res.ok:
128
+ write_bytes(here(f'media/{i}-img-desc.jpg'), res.body())
129
+ ```
130
+
131
+ ### スクレイピング(HTML丸ごと保存)
132
+
133
+ ```python
134
+ from domx import wrap_page
135
+ from domx.browser import camoufox_page
136
+ from domx.utils import append_csv, from_here, hash_name, save_log, write_text
137
+
138
+ here = from_here(__file__)
139
+ save_log(here('log/scraping.log'))
140
+
141
+ with camoufox_page() as page:
142
+ p = wrap_page(page)
143
+
144
+ p.goto('https://www.foobarbaz1.jp')
145
+ item_urls = p.ss('ul.items > li > a').urls
146
+
147
+ for i, url in enumerate(item_urls, 1):
148
+ print(f'item_urls {i}/{len(item_urls)}')
149
+ if not p.goto(url):
150
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
151
+ continue
152
+ file_name = f'{hash_name(url)}.html'
153
+ if not write_text(here('html') / file_name, p.html(with_url=True, with_saved_at=True)):
154
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'write_text'})
155
+ continue
156
+ ```
157
+
158
+ ### ローカルHTMLからデータ抽出&Parquet出力
159
+
160
+ ```python
161
+ from domx import wrap_parser
162
+ from domx.utils import from_here, parse_html, save_log, write_parquet
163
+
164
+ here = from_here(__file__)
165
+ save_log(here('log/scraping.log'))
166
+
167
+ results = []
168
+ for i, file_path in enumerate(here('html').glob('*.html'),1):
169
+ print(f'html {i}')
170
+ if not (parser := parse_html(file_path)):
171
+ continue
172
+ p = wrap_parser(parser)
173
+ dts = p.ss('dt').re
174
+ results.append({
175
+ 'URL': p.url,
176
+ 'file_name': file_path.name,
177
+ '教室名': p.s('h1 .text02').text,
178
+ '住所': p.s('.item .mapText').text,
179
+ '所在地': dts.s(r'所在地').next('dd').text,
180
+ '交通': dts.s(r'交通').next('dd').text,
181
+ '物件番号': dts.s(r'物件番号').next('dd').text,
182
+ })
183
+ write_parquet(here('parquet/extract.parquet'), results)
184
+ ```
185
+
186
+ ### ローカルHTMLからデータ抽出&Parquet出力(並列処理)
187
+
188
+ ```python
189
+ from pathlib import Path
190
+
191
+ from domx import wrap_parser
192
+ from domx.utils import from_here, glob_paths, parse_html, pool_map, write_parquet
193
+
194
+ def main():
195
+ here = from_here(__file__)
196
+ html_paths = glob_paths(here('html'), '*.html')
197
+ results = [r for r in pool_map(extract, html_paths) if r]
198
+ write_parquet(here('parquet/extract.parquet'), results)
199
+
200
+ def extract(file_path: str) -> dict | None:
201
+ if not (parser := parse_html(Path(file_path))):
202
+ return None
203
+ p = wrap_parser(parser)
204
+ dts = p.ss('dt').re
205
+ return {
206
+ 'URL': p.url,
207
+ 'file_path': file_path,
208
+ '教室名': p.s('h1 .text02').text,
209
+ '住所': p.s('.item .mapText').text,
210
+ '所在地': dts.s(r'所在地').next('dd').text,
211
+ '交通': dts.s(r'交通').next('dd').text,
212
+ '価格': dts.s(r'価格').next('dd').text,
213
+ '設備・条件': dts.s(r'設備').next('dd').text,
214
+ '備考': dts.s(r'備考').next('dd').text,
215
+ }
216
+
217
+ if __name__ == '__main__':
218
+ main()
219
+ ```
220
+
221
+ ## License - ライセンス
222
+
223
+ [MIT](./LICENSE)