domx 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
domx/__init__.py ADDED
@@ -0,0 +1,39 @@
1
+ from .domx import (
2
+ ElementGrep,
3
+ ElementHandle,
4
+ Frame,
5
+ NodeGrep,
6
+ Page,
7
+ Response,
8
+ WrappedElement,
9
+ WrappedElementGroup,
10
+ WrappedFrame,
11
+ WrappedNode,
12
+ WrappedNodeGroup,
13
+ WrappedPage,
14
+ WrappedParser,
15
+ wrap_node,
16
+ wrap_node_group,
17
+ wrap_page,
18
+ wrap_parser,
19
+ )
20
+
21
+ __all__ = [
22
+ "Page",
23
+ "ElementHandle",
24
+ "Frame",
25
+ "Response",
26
+ "wrap_page",
27
+ "wrap_parser",
28
+ "wrap_node",
29
+ "wrap_node_group",
30
+ "WrappedPage",
31
+ "WrappedFrame",
32
+ "WrappedElement",
33
+ "WrappedElementGroup",
34
+ "ElementGrep",
35
+ "WrappedParser",
36
+ "WrappedNode",
37
+ "WrappedNodeGroup",
38
+ "NodeGrep",
39
+ ]
domx/browser.py ADDED
@@ -0,0 +1,39 @@
1
+ from collections.abc import Iterator
2
+ from contextlib import contextmanager
3
+
4
+ from camoufox.sync_api import Camoufox
5
+ from patchright.sync_api import Page as PatchrightPage, sync_playwright
6
+ from playwright.sync_api import Page as PlaywrightPage
7
+
8
# A page object from either patchright or playwright (sync API).
Page = PatchrightPage | PlaywrightPage


# Viewport passed to the browser when a caller asks for ``large_viewport``.
_VIEWPORT_FULL_HD: dict[str, int] = {'width': 1920, 'height': 1080}
12
+
13
+
14
@contextmanager
def patchright_page(*, large_viewport: bool = False) -> Iterator[Page]:
    '''Yield a new page from a headed Chrome launched through patchright.

    Args:
        large_viewport: When true, the browser context is created with the
            ``_VIEWPORT_FULL_HD`` viewport; otherwise no viewport option is
            passed at all.

    Yields:
        A page opened in a fresh browser context. The context, the browser
        and the driver are released by their own context managers when the
        caller's ``with`` block finishes.
    '''
    context_options: dict = {'viewport': _VIEWPORT_FULL_HD} if large_viewport else {}
    with (
        sync_playwright() as pw,
        pw.chromium.launch(channel='chrome', headless=False) as browser,
        browser.new_context(**context_options) as context,
    ):
        yield context.new_page()
27
+
28
+
29
@contextmanager
def camoufox_page(*, large_viewport: bool = False) -> Iterator[Page]:
    '''Yield a new page from a headed Camoufox browser with ``humanize`` on.

    Args:
        large_viewport: When true, the page is opened with the
            ``_VIEWPORT_FULL_HD`` viewport; otherwise ``new_page`` is called
            without arguments.

    Yields:
        The newly opened page. The browser is released by the ``Camoufox``
        context manager when the caller's ``with`` block finishes.
    '''
    with Camoufox(headless=False, humanize=True) as browser:
        page_options = {'viewport': _VIEWPORT_FULL_HD} if large_viewport else {}
        yield browser.new_page(**page_options)
domx/domx.py ADDED
@@ -0,0 +1,515 @@
1
+ from __future__ import annotations
2
+
3
+ import html
4
+ from collections.abc import Iterator
5
+ import random
6
+ import re
7
+ import time
8
+ import unicodedata as ud
9
+ from datetime import datetime, timezone
10
+ from pathlib import Path
11
+ from typing import Literal
12
+ from urllib.parse import urljoin
13
+
14
+ from loguru import logger
15
+ from patchright.sync_api import Frame as PatchFrame, Page as PatchPage, ElementHandle as PatchElementHandle, Response as PatchResponse
16
+ from playwright.sync_api import Frame as PlayFrame, Page as PlayPage, ElementHandle as PlayElementHandle, Response as PlayResponse
17
+ from selectolax.lexbor import LexborHTMLParser, LexborNode
18
+
19
+
20
# Union aliases so the wrappers accept objects coming from either patchright
# or playwright (sync API).
Page = PatchPage | PlayPage
ElementHandle = PatchElementHandle | PlayElementHandle
Response = PatchResponse | PlayResponse
Frame = PatchFrame | PlayFrame

# ``name`` values of the <meta> tags that WrappedPage.html() can prepend and
# that WrappedParser.url / WrappedParser.saved_at read back.
_DOMX_META_URL = 'domx:url'
_DOMX_META_SAVED_AT = 'domx:saved_at'

# Attribute values starting with one of these prefixes (case-insensitive) are
# not resolved to a URL by WrappedElement.
_UNUSABLE_INLINE_URL = re.compile(r'(?i)^(?:#|javascript:|mailto:|tel:|data:)')

# DOM property names followed by WrappedElement when walking to relatives.
_ELEMENT_NEXT = 'nextElementSibling'
_ELEMENT_PREV = 'previousElementSibling'
_ELEMENT_PARENT = 'parentElement'

# Node attribute names followed by WrappedNode when walking to relatives.
_NODE_NEXT = 'next'
_NODE_PREV = 'prev'
_NODE_PARENT = 'parent'
37
+
38
+
39
def wrap_page(page: Page) -> WrappedPage:
    '''Return ``page`` wrapped in a ``WrappedPage``.'''
    wrapped = WrappedPage(page)
    return wrapped
41
+
42
+
43
class _PageScoped:
    '''Mixin for wrappers that keep their owning page in ``self._page``.

    The factory methods below build sibling wrappers that share that page.
    '''

    # Page handed on to every wrapper created by the factories below.
    _page: Page

    def wrap_element(self, elem: ElementHandle | None) -> WrappedElement:
        '''Wrap an element handle (or ``None``) together with this page.'''
        owner = self._page
        return WrappedElement(owner, elem)

    def wrap_element_group(self, elems: list[WrappedElement]) -> WrappedElementGroup:
        '''Wrap a list of already wrapped elements together with this page.'''
        owner = self._page
        return WrappedElementGroup(owner, elems)

    def wrap_frame(self, frame: Frame | None) -> WrappedFrame:
        '''Wrap a frame (or ``None``) together with this page.'''
        owner = self._page
        return WrappedFrame(owner, frame)
54
+
55
+
56
def wrap_parser(parser: LexborHTMLParser) -> WrappedParser:
    '''Return ``parser`` wrapped in a ``WrappedParser``.'''
    wrapped = WrappedParser(parser)
    return wrapped
58
+
59
+
60
def wrap_node(node: LexborNode | None) -> WrappedNode:
    '''Return ``node`` (which may be ``None``) wrapped in a ``WrappedNode``.'''
    wrapped = WrappedNode(node)
    return wrapped
62
+
63
+
64
def wrap_node_group(nodes: list[WrappedNode]) -> WrappedNodeGroup:
    '''Return the already wrapped ``nodes`` bundled in a ``WrappedNodeGroup``.'''
    group = WrappedNodeGroup(nodes)
    return group
66
+
67
+
68
class WrappedFrame(_PageScoped):
    '''Null-safe wrapper around a frame.

    The wrapped frame may be ``None``; queries on such a wrapper return empty
    element wrappers instead of raising.
    '''

    def __init__(self, page: Page, frame: Frame | None) -> None:
        self._page = page
        self._frame = frame

    def __bool__(self) -> bool:
        '''Return True when a real frame is wrapped.'''
        has_frame = self._frame is not None
        return has_frame

    @property
    def raw(self) -> Frame | None:
        '''The underlying frame, or ``None``.'''
        return self._frame

    def s(self, selector: str) -> WrappedElement:
        '''Return the wrapped result of ``query_selector(selector)`` on the frame.'''
        frame = self._frame
        found = None if frame is None else frame.query_selector(selector)
        return self.wrap_element(found)

    def ss(self, selector: str) -> WrappedElementGroup:
        '''Return every ``query_selector_all(selector)`` result as a group.'''
        frame = self._frame
        if frame is None:
            return self.wrap_element_group([])
        handles = frame.query_selector_all(selector)
        return self.wrap_element_group(list(map(self.wrap_element, handles)))

    def wait(self, selector: str, state: str = 'attached', timeout: int = 15000) -> WrappedElement:
        '''Wait for ``selector`` inside the frame and wrap the result.

        ``state`` and ``timeout`` are forwarded to ``wait_for_selector``.
        Any exception raised while waiting is logged as a warning and an
        empty element wrapper is returned instead.
        '''
        found = None
        if self._frame is not None:
            try:
                found = self._frame.wait_for_selector(selector, state=state, timeout=timeout)
            except Exception as exc:
                logger.warning(
                    f'[wait] {type(exc).__name__}: {exc} | selector={selector!r} | url={self._page.url}'
                )
        return self.wrap_element(found)
103
+
104
+
105
class WrappedPage(_PageScoped):
    '''Convenience wrapper around a browser page.

    Lookups return null-safe element / frame wrappers, and navigation and
    waiting log failures instead of raising.
    '''

    def __init__(self, page: Page) -> None:
        self._page = page

    @property
    def raw(self) -> Page:
        '''The underlying page object.'''
        return self._page

    def s(self, selector: str) -> WrappedElement:
        '''Return the wrapped result of ``query_selector(selector)`` on the page.'''
        return self.wrap_element(self._page.query_selector(selector))

    def ss(self, selector: str) -> WrappedElementGroup:
        '''Return every ``query_selector_all(selector)`` result as a group.'''
        handles = self._page.query_selector_all(selector)
        return self.wrap_element_group(list(map(self.wrap_element, handles)))

    def frame(self, iframe_selector: str) -> WrappedFrame:
        '''Return the content frame of the element matching ``iframe_selector``.

        An empty frame wrapper is returned when nothing matches, or when
        ``content_frame()`` raises (the error is logged).
        '''
        host = self._page.query_selector(iframe_selector)
        content = None
        if host is not None:
            try:
                content = host.content_frame()
            except Exception as exc:
                logger.error(f'[frame] {type(exc).__name__}: {exc} | iframe_selector={iframe_selector!r}')
        return self.wrap_frame(content)

    def goto(
        self,
        url: str | None,
        try_cnt: int = 3,
        wait_range: tuple[float, float] = (3, 5),
        sleep_after: tuple[float, float] | None = (1, 2),
    ) -> Response | None:
        '''Navigate to ``url``, retrying on failure.

        Args:
            url: Target URL. A falsy value returns ``None`` immediately.
            try_cnt: Maximum number of attempts.
            wait_range: Bounds (seconds) of the random pause between failed
                attempts; no pause follows the last attempt.
            sleep_after: Bounds (seconds) of the random pause after a
                successful navigation, or ``None`` to skip it.

        Returns:
            The first non-``None`` response, or ``None`` once every attempt
            has failed (each failure is logged, then a final error).
        '''
        if not url:
            return None
        for attempt in range(1, try_cnt + 1):
            try:
                response = self._page.goto(url)
                if response is None:
                    reason = 'response is None'
                else:
                    if sleep_after is not None:
                        time.sleep(random.uniform(*sleep_after))
                    return response
            except Exception as exc:
                reason = f'{type(exc).__name__}: {exc}'
            logger.warning(f'[goto] {url} ({attempt}/{try_cnt}) {reason}')
            if attempt < try_cnt:
                time.sleep(random.uniform(*wait_range))
        logger.error(f'[goto] giving up: {url}')
        return None

    def wait(self, selector: str, state: str = 'attached', timeout: int = 15000) -> WrappedElement:
        '''Wait for ``selector`` on the page and wrap the result.

        ``state`` and ``timeout`` are forwarded to ``wait_for_selector``.
        Any exception raised while waiting is logged as a warning and an
        empty element wrapper is returned instead.
        '''
        found = None
        try:
            found = self._page.wait_for_selector(selector, state=state, timeout=timeout)
        except Exception as exc:
            logger.warning(f'[wait] {type(exc).__name__}: {exc} | selector={selector!r} | url={self._page.url}')
        return self.wrap_element(found)

    def html(self, with_url: bool = False, with_saved_at: bool = False) -> str:
        '''Return the page content, optionally prefixed with domx <meta> tags.

        Args:
            with_url: Prepend a ``domx:url`` meta tag holding the HTML-escaped
                current page URL.
            with_saved_at: Prepend a ``domx:saved_at`` meta tag holding the
                current UTC time in ISO format.
        '''
        content = self._page.content()
        prefix = ''
        if with_url:
            prefix += f'<meta name="{_DOMX_META_URL}" content="{html.escape(self._page.url)}">'
        if with_saved_at:
            ts = datetime.now(timezone.utc).isoformat()
            prefix += f'<meta name="{_DOMX_META_SAVED_AT}" content="{ts}">'
        return prefix + content
176
+
177
+
178
class WrappedElement(_PageScoped):
    '''Null-safe wrapper around an element handle.

    The wrapped handle may be ``None``. In that case queries return empty
    wrappers and value accessors return ``None``, so lookups can be chained
    without intermediate checks.
    '''

    def __init__(self, page: Page, elem: ElementHandle | None) -> None:
        self._page = page
        self._elem = elem

    def __bool__(self) -> bool:
        '''Return True when a real element handle is wrapped.'''
        has_elem = self._elem is not None
        return has_elem

    @property
    def raw(self) -> ElementHandle | None:
        '''The underlying element handle, or ``None``.'''
        return self._elem

    def s(self, selector: str) -> WrappedElement:
        '''Return the first descendant matching ``selector``, wrapped.'''
        if self._elem:
            found = self._elem.query_selector(selector)
        else:
            found = None
        return self.wrap_element(found)

    def ss(self, selector: str) -> WrappedElementGroup:
        '''Return all descendants matching ``selector`` as a group.'''
        if self._elem:
            handles = self._elem.query_selector_all(selector)
        else:
            handles = []
        return self.wrap_element_group(list(map(self.wrap_element, handles)))

    def frame(self, iframe_selector: str | None = None) -> WrappedFrame:
        '''Return the content frame of this element or of a descendant.

        With ``iframe_selector`` omitted, ``content_frame()`` is called on
        this element itself; otherwise on the first descendant matching the
        selector. An empty frame wrapper is returned when there is no element
        to ask, or when an exception occurs (it is logged).
        '''
        if self._elem is None:
            return self.wrap_frame(None)
        try:
            if iframe_selector is None:
                host = self._elem
            else:
                host = self._elem.query_selector(iframe_selector)
            if host is None:
                return self.wrap_frame(None)
            return self.wrap_frame(host.content_frame())
        except Exception as exc:
            logger.error(
                f'[frame] {type(exc).__name__}: {exc} | iframe_selector={iframe_selector!r}'
            )
            return self.wrap_frame(None)

    def _walk_relative(self, selector: str, axis: str, label: str) -> WrappedElement:
        '''Follow the DOM property ``axis`` until an element matches ``selector``.

        The walk runs in the page as JavaScript, starting from the element
        reached through ``axis`` (never from this element itself). ``label``
        only tags the log line written when evaluation fails.
        '''
        if self._elem is None:
            return self.wrap_element(None)
        script = '''(el, args) => {
            const [sel, axis] = args;
            let cur = el[axis];
            while (cur) {
                if (cur.matches(sel)) return cur;
                cur = cur[axis];
            }
            return null;
        }'''
        try:
            handle = self._elem.evaluate_handle(script, [selector, axis])
            return self.wrap_element(handle.as_element())
        except Exception as exc:
            logger.error(f'[{label}] {self._elem} {type(exc).__name__}: {exc}')
            return self.wrap_element(None)

    def next(self, selector: str) -> WrappedElement:
        '''Return the nearest following element sibling matching ``selector``.'''
        return self._walk_relative(selector, _ELEMENT_NEXT, 'next')

    def prev(self, selector: str) -> WrappedElement:
        '''Return the nearest preceding element sibling matching ``selector``.'''
        return self._walk_relative(selector, _ELEMENT_PREV, 'prev')

    def parent(self, selector: str) -> WrappedElement:
        '''Return the nearest ancestor element matching ``selector``.'''
        return self._walk_relative(selector, _ELEMENT_PARENT, 'parent')

    @property
    def text(self) -> str | None:
        '''The element's ``text_content()``, or ``None`` when absent or empty.'''
        if self._elem is None:
            return None
        return self._elem.text_content() or None

    def attr(self, attr_name: str) -> str | None:
        '''Return attribute ``attr_name``, or ``None`` when absent or empty.'''
        if self._elem is None:
            return None
        return self._elem.get_attribute(attr_name) or None

    def _resolved_url_from_attr(self, attr_name: str) -> str | None:
        '''Resolve attribute ``attr_name`` against the page URL.

        Returns ``None`` when there is no element, the attribute is missing
        or blank, or the stripped value matches ``_UNUSABLE_INLINE_URL``.
        '''
        if self._elem is None:
            return None
        raw_value = self._elem.get_attribute(attr_name)
        if not raw_value:
            return None
        candidate = raw_value.strip()
        if not candidate or _UNUSABLE_INLINE_URL.search(candidate):
            return None
        return urljoin(self._page.url, candidate)

    @property
    def url(self) -> str | None:
        '''The ``href`` attribute resolved to an absolute URL, or ``None``.'''
        return self._resolved_url_from_attr('href')

    @property
    def src(self) -> str | None:
        '''The ``src`` attribute resolved to an absolute URL, or ``None``.'''
        return self._resolved_url_from_attr('src')

    def scroll_into_view(self) -> None:
        '''Smooth-scroll the element to the viewport centre and wait until stable.

        Does nothing but log a warning when no element is wrapped; any
        exception during scrolling is logged as a warning as well.
        '''
        if self._elem is None:
            logger.warning('[scroll_into_view] element is None')
            return
        script = '''(el) => el.scrollIntoView({ behavior: "smooth", block: "center", inline: "nearest" });'''
        try:
            self._elem.evaluate(script)
            self._elem.wait_for_element_state('stable')
        except Exception as exc:
            logger.warning(f'[scroll_into_view] {type(exc).__name__}: {exc} | url={self._page.url!r}')

    def screenshot(
        self,
        path: Path,
        image_type: Literal['png', 'jpeg'] = 'png',
    ) -> bool:
        '''Save a screenshot of the element to ``path``.

        The parent directory is created first and animations are disabled.

        Returns:
            True on success; False when no element is wrapped or when an
            exception occurred (it is logged as a warning).
        '''
        if self._elem is None:
            logger.warning('[screenshot] element is None')
            return False
        try:
            path.parent.mkdir(parents=True, exist_ok=True)
            self._elem.screenshot(path=path, type=image_type, animations='disabled')
        except Exception as exc:
            logger.warning(f'[screenshot] {type(exc).__name__}: {exc} | url={self._page.url!r}')
            return False
        return True
307
+
308
+
309
class WrappedElementGroup(_PageScoped):
    '''List-like collection of ``WrappedElement`` objects tied to one page.'''

    def __init__(self, page: Page, elems: list[WrappedElement]) -> None:
        self._page = page
        self._elems = elems

    def __iter__(self) -> Iterator[WrappedElement]:
        return iter(self._elems)

    def __len__(self) -> int:
        return len(self._elems)

    def __getitem__(self, key: int | slice) -> WrappedElement | WrappedElementGroup:
        '''Return one element for an index, or a new group for a slice.'''
        picked = self._elems[key]
        if isinstance(key, slice):
            return WrappedElementGroup(self._page, picked)
        return picked

    def __add__(self, other: WrappedElementGroup) -> WrappedElementGroup:
        '''Concatenate two groups that belong to the same page.

        Raises:
            TypeError: ``other`` is not a ``WrappedElementGroup``.
            ValueError: the two groups are bound to different page objects.
        '''
        if not isinstance(other, WrappedElementGroup):
            raise TypeError(
                'WrappedElementGroup 同士のみ + で結合できます '
                f'(右辺は {type(other).__name__})'
            )
        if self._page is not other._page:
            raise ValueError('異なる Page に紐づいた WrappedElementGroup は結合できません')
        combined = self._elems + other._elems
        return WrappedElementGroup(self._page, combined)

    @property
    def raw(self) -> list[WrappedElement]:
        '''The underlying list of wrapped elements.'''
        return self._elems

    @property
    def re(self) -> ElementGrep:
        '''An ``ElementGrep`` over the NFKC-normalised text of each element.

        Elements whose ``text`` is ``None`` are left out.
        '''
        pairs: list[tuple[str, WrappedElement]] = [
            (ud.normalize('NFKC', content), elem)
            for elem in self._elems
            if (content := elem.text)
        ]
        return ElementGrep(self._page, pairs)

    @property
    def urls(self) -> list[str]:
        '''The resolved ``url`` of every element that has one, in order.'''
        found: list[str] = []
        for elem in self._elems:
            link = elem.url
            if link:
                found.append(link)
        return found
350
+
351
+
352
class ElementGrep(_PageScoped):
    '''Regex search over ``(text, element)`` pairs.'''

    def __init__(self, page: Page, pairs: list[tuple[str, WrappedElement]]) -> None:
        self._page = page
        self._pairs = pairs

    def s(self, pattern: str) -> WrappedElement:
        '''Return the first element whose text matches ``pattern``.

        ``re.search`` semantics are used. When nothing matches, or when an
        exception occurs (for example an invalid pattern, which is logged),
        an empty element wrapper is returned.
        '''
        try:
            matches = re.compile(pattern).search
            for text, elem in self._pairs:
                if matches(text):
                    return elem
        except Exception as exc:
            logger.warning(f'[grep] {type(exc).__name__}: {exc} | pattern={pattern!r}')
        return self.wrap_element(None)

    def ss(self, pattern: str) -> WrappedElementGroup:
        '''Return every element whose text matches ``pattern`` as a group.

        An exception (for example an invalid pattern) is logged and yields
        an empty group.
        '''
        try:
            matches = re.compile(pattern).search
            hits = []
            for text, elem in self._pairs:
                if matches(text):
                    hits.append(elem)
            return self.wrap_element_group(hits)
        except Exception as exc:
            logger.warning(f'[grep] {type(exc).__name__}: {exc} | pattern={pattern!r}')
            return self.wrap_element_group([])
375
+
376
+
377
class WrappedParser:
    '''Wrapper around a ``LexborHTMLParser`` with null-safe CSS lookups.'''

    def __init__(self, parser: LexborHTMLParser) -> None:
        self._parser = parser

    @property
    def raw(self) -> LexborHTMLParser:
        '''The underlying parser.'''
        return self._parser

    def s(self, selector: str) -> WrappedNode:
        '''Return the ``css_first(selector)`` result, wrapped.'''
        return wrap_node(self._parser.css_first(selector))

    def ss(self, selector: str) -> WrappedNodeGroup:
        '''Return every ``css(selector)`` result as a node group.'''
        wrapped = list(map(wrap_node, self._parser.css(selector)))
        return wrap_node_group(wrapped)

    def _meta_content(self, meta_name: str) -> str | None:
        '''Return the ``content`` of ``<meta name=meta_name>``, or ``None``.'''
        node = self._parser.css_first(f'meta[name="{meta_name}"]')
        if node is None:
            return None
        return node.attributes.get('content') or None

    @property
    def url(self) -> str | None:
        '''Content of the ``domx:url`` meta tag, or ``None`` if missing or empty.'''
        return self._meta_content(_DOMX_META_URL)

    @property
    def saved_at(self) -> str | None:
        '''Content of the ``domx:saved_at`` meta tag, or ``None`` if missing or empty.'''
        return self._meta_content(_DOMX_META_SAVED_AT)
406
+
407
+
408
class WrappedNode:
    '''Null-safe wrapper around a ``LexborNode``.

    The wrapped node may be ``None``; lookups then return empty wrappers and
    value accessors return ``None``.
    '''

    def __init__(self, node: LexborNode | None) -> None:
        self._node = node

    def __bool__(self) -> bool:
        '''Return True when a real node is wrapped.'''
        has_node = self._node is not None
        return has_node

    @property
    def raw(self) -> LexborNode | None:
        '''The underlying node, or ``None``.'''
        return self._node

    def s(self, selector: str) -> WrappedNode:
        '''Return the first descendant matching ``selector``, wrapped.'''
        if self._node:
            found = self._node.css_first(selector)
        else:
            found = None
        return wrap_node(found)

    def ss(self, selector: str) -> WrappedNodeGroup:
        '''Return all descendants matching ``selector`` as a node group.'''
        if self._node:
            found = self._node.css(selector)
        else:
            found = []
        return wrap_node_group(list(map(wrap_node, found)))

    def _walk_relative(self, selector: str, axis: str) -> WrappedNode:
        '''Follow attribute ``axis`` until an element node matches ``selector``.

        The walk starts at the node reached through ``axis`` (never at this
        node itself). Nodes that are not element nodes are skipped without
        being matched against the selector.
        '''
        cursor = None if self._node is None else getattr(self._node, axis)
        while cursor is not None and not (cursor.is_element_node and cursor.css_matches(selector)):
            cursor = getattr(cursor, axis)
        return wrap_node(cursor)

    def next(self, selector: str) -> WrappedNode:
        '''Return the nearest following sibling element matching ``selector``.'''
        return self._walk_relative(selector, _NODE_NEXT)

    def prev(self, selector: str) -> WrappedNode:
        '''Return the nearest preceding sibling element matching ``selector``.'''
        return self._walk_relative(selector, _NODE_PREV)

    def parent(self, selector: str) -> WrappedNode:
        '''Return the nearest ancestor element matching ``selector``.'''
        return self._walk_relative(selector, _NODE_PARENT)

    @property
    def text(self) -> str | None:
        '''The node's ``text()``, or ``None`` when absent or empty.'''
        if self._node is None:
            return None
        return self._node.text() or None

    def attr(self, attr_name: str) -> str | None:
        '''Return attribute ``attr_name``, or ``None`` when absent or empty.'''
        if self._node is None:
            return None
        return self._node.attributes.get(attr_name) or None
456
+
457
+
458
class WrappedNodeGroup:
    '''List-like collection of ``WrappedNode`` objects.'''

    def __init__(self, nodes: list[WrappedNode]) -> None:
        self._nodes = nodes

    def __iter__(self) -> Iterator[WrappedNode]:
        return iter(self._nodes)

    def __len__(self) -> int:
        return len(self._nodes)

    def __getitem__(self, key: int | slice) -> WrappedNode | WrappedNodeGroup:
        '''Return one node for an index, or a new group for a slice.'''
        picked = self._nodes[key]
        if isinstance(key, slice):
            return WrappedNodeGroup(picked)
        return picked

    def __add__(self, other: WrappedNodeGroup) -> WrappedNodeGroup:
        '''Concatenate two groups.

        Raises:
            TypeError: ``other`` is not a ``WrappedNodeGroup``.
        '''
        if not isinstance(other, WrappedNodeGroup):
            raise TypeError(
                'WrappedNodeGroup 同士のみ + で結合できます '
                f'(右辺は {type(other).__name__})'
            )
        combined = self._nodes + other._nodes
        return WrappedNodeGroup(combined)

    @property
    def raw(self) -> list[WrappedNode]:
        '''The underlying list of wrapped nodes.'''
        return self._nodes

    @property
    def re(self) -> NodeGrep:
        '''A ``NodeGrep`` over the NFKC-normalised text of each node.

        Nodes whose ``text`` is ``None`` are left out.
        '''
        pairs: list[tuple[str, WrappedNode]] = [
            (ud.normalize('NFKC', content), node)
            for node in self._nodes
            if (content := node.text)
        ]
        return NodeGrep(pairs)
492
+
493
+
494
class NodeGrep:
    '''Regex search over ``(text, node)`` pairs.'''

    def __init__(self, pairs: list[tuple[str, WrappedNode]]) -> None:
        self._pairs = pairs

    def s(self, pattern: str) -> WrappedNode:
        '''Return the first node whose text matches ``pattern``.

        ``re.search`` semantics are used. When nothing matches, or when an
        exception occurs (for example an invalid pattern, which is logged),
        an empty node wrapper is returned.
        '''
        try:
            matches = re.compile(pattern).search
            for text, node in self._pairs:
                if matches(text):
                    return node
        except Exception as exc:
            logger.warning(f'[grep] {type(exc).__name__}: {exc} | pattern={pattern!r}')
        return wrap_node(None)

    def ss(self, pattern: str) -> WrappedNodeGroup:
        '''Return every node whose text matches ``pattern`` as a group.

        An exception (for example an invalid pattern) is logged and yields
        an empty group.
        '''
        try:
            matches = re.compile(pattern).search
            hits = []
            for text, node in self._pairs:
                if matches(text):
                    hits.append(node)
            return wrap_node_group(hits)
        except Exception as exc:
            logger.warning(f'[grep] {type(exc).__name__}: {exc} | pattern={pattern!r}')
            return wrap_node_group([])
domx/utils.py ADDED
@@ -0,0 +1,162 @@
1
+ import csv
2
+ import hashlib
3
+ import os
4
+ from concurrent.futures import ProcessPoolExecutor
5
+ from pathlib import Path
6
+ from typing import Callable, Iterable
7
+
8
+ from loguru import logger
9
+ from selectolax.lexbor import LexborHTMLParser
10
+ from tqdm import tqdm
11
+
12
+
13
+ def _ensure_parent(path: Path) -> None:
14
+ path.parent.mkdir(parents=True, exist_ok=True)
15
+
16
+
17
def parse_html(path: Path) -> LexborHTMLParser | None:
    '''Read ``path`` as bytes and parse it with ``LexborHTMLParser``.

    Returns:
        The parser, or ``None`` when reading or parsing raised (the error is
        logged).
    '''
    try:
        raw = path.read_bytes()
        return LexborHTMLParser(raw)
    except Exception as exc:
        logger.error(f'[parse_html] {path} {type(exc).__name__}: {exc}')
    return None
23
+
24
def from_here(file: str) -> Callable[[str], Path]:
    '''Build a path resolver rooted at the directory that contains ``file``.

    Typically called as ``from_here(__file__)``. The returned callable joins
    its argument onto the resolved parent directory of ``file``.
    '''
    base = Path(file).resolve().parent

    def _join(path):
        return base / path

    return _join
27
+
28
def append_csv(path: Path, row: dict) -> None:
    '''Append a single ``row`` to a CSV file, creating the file if needed.

    For Excel compatibility a UTF-8 BOM is written only when the file is new
    or empty (it is then opened with ``utf-8-sig``); later appends use plain
    ``utf-8`` so that no BOM ends up in the middle of the file. A header row
    is written in that same new/empty case.

    Column order follows ``row.keys()``. Rows whose keys differ from the
    header already in the file are not detected.

    Any exception is logged together with the path and the row; nothing is
    raised to the caller.
    '''
    try:
        _ensure_parent(path)
        is_fresh = (not path.exists()) or path.stat().st_size == 0
        if is_fresh:
            encoding = 'utf-8-sig'
        else:
            encoding = 'utf-8'
        with open(path, mode='a', newline='', encoding=encoding) as fh:
            writer = csv.DictWriter(fh, fieldnames=list(row.keys()))
            if is_fresh:
                writer.writeheader()
            writer.writerow(row)
    except Exception as exc:
        logger.error(f'[append_csv] {path} {row} {type(exc).__name__}: {exc}')
48
+
49
def write_parquet(path: Path, rows: list[dict]) -> None:
    '''Write ``rows`` to ``path`` as a Parquet file.

    Uses pyarrow directly (no pandas). An empty ``rows`` is skipped with a
    warning. The column schema is inferred by pyarrow from the row values, so
    mixing value types under the same key may fail. The original note says
    inference uses the first non-None value of each column; that is not
    verified here.

    Any exception is logged; nothing is raised to the caller.
    '''
    import pyarrow as pa
    import pyarrow.parquet as pq

    try:
        if rows:
            _ensure_parent(path)
            table = pa.Table.from_pylist(rows)
            pq.write_table(table, path)
        else:
            logger.warning(f'[write_parquet] {path} no rows, skipped')
    except Exception as exc:
        logger.error(f'[write_parquet] {path} {type(exc).__name__}: {exc}')
67
+
68
def hash_name(key: str) -> str:
    '''Return the hexadecimal MD5 digest of ``key`` (encoded as UTF-8).

    The digest is meant as a stable, filesystem-friendly name derived from a
    key such as a URL, not as a security measure. ``usedforsecurity=False``
    says so explicitly, so the call is not refused on Python builds that
    restrict MD5 (for example FIPS mode). The resulting digest is unchanged.
    '''
    digest = hashlib.md5(key.encode(), usedforsecurity=False)
    return digest.hexdigest()
70
+
71
def write_text(path: Path, data: str) -> bool:
    '''Write ``data`` to ``path`` as UTF-8, creating parent directories.

    Characters that cannot be encoded are replaced (``errors='replace'``).

    Returns:
        True on success, False when an exception occurred (it is logged).
    '''
    try:
        _ensure_parent(path)
        path.write_text(data, encoding='utf-8', errors='replace')
    except Exception as exc:
        logger.error(f'[write_text] {path} {type(exc).__name__}: {exc}')
        return False
    return True
79
+
80
def write_bytes(path: Path, data: bytes) -> bool:
    '''Write ``data`` to ``path``, creating parent directories.

    Returns:
        True on success, False when an exception occurred (it is logged).
    '''
    try:
        _ensure_parent(path)
        path.write_bytes(data)
    except Exception as exc:
        logger.error(f'[write_bytes] {path} {type(exc).__name__}: {exc}')
        return False
    return True
88
+
89
def save_log(path: Path, level: str = 'WARNING') -> None:
    '''Also record log messages in the file at ``path``.

    The parent directory is created if needed, then ``path`` is registered
    as an additional loguru sink (UTF-8). Only messages at ``level`` or above
    are written to the file.
    '''
    _ensure_parent(path)
    sink_options = {'level': level, 'encoding': 'utf-8'}
    logger.add(path, **sink_options)
93
+
94
+
95
+ class _SafeWorker:
96
+ def __init__(self, fn: Callable) -> None:
97
+ self.fn = fn
98
+
99
+ def __call__(self, x):
100
+ try:
101
+ return self.fn(x)
102
+ except Exception as e:
103
+ logger.error(f'[pool_map] {type(e).__name__}: {e}')
104
+ return None
105
+
106
+
107
+ def _auto_chunksize(n: int, workers: int | None) -> int:
108
+ '''``chunksize`` を自動で決める(``pool_map`` で未指定のとき)。
109
+
110
+ 子プロセスへは 1 件ずつより、まとめて送った方が速くなりやすい。そのまとめ数。
111
+
112
+ ``w`` は並列数。引数で決まっていなければ ``os.cpu_count()``、それも無ければ 4。
113
+ この **4** は「CPU が分からないときの仮の並列数」。式 ``n // (w * 4)`` の **4** とは別物。
114
+
115
+ ``n // (w * 4)`` の方の **4** は経験則の係数。ざっくり言うとチャンクの個数が
116
+ ``w * 4`` 前後になりやすく、負荷が均等ならワーカーあたりだいたい **4 回分の塊**
117
+ を処理するイメージ(厳密ではない)。
118
+
119
+ 例: ``n=200``, ``w=5`` なら ``200 // 20 = 10`` が chunksize。全体は 20 チャンク、
120
+ 5 人で割ると 1 人あたり平均 4 チャンク(各 10 件)。
121
+
122
+ 結果は ``min(64, …)`` で上限。塊が大きすぎると **負荷が偏りやすい**。
123
+ タスクの重さがバラバラなとき、太い塊の中に遅いのが多く入ったワーカーだけが
124
+ 長引き、他は先に終わって手待ちしがち(終盤のムラ)。塊を細かくすると配り直しの
125
+ 機会が増えて和らぎやすい。進捗バーも細かく動きやすい。
126
+
127
+ ``max(1, …)`` で下限。割り算で 0 になっても最低 1 件は送る。
128
+ '''
129
+ w = workers or os.cpu_count() or 4
130
+ return max(1, min(64, n // (w * 4)))
131
+
132
+
133
+ def pool_map[T, R](
134
+ worker: Callable[[T], R],
135
+ items: Iterable[T],
136
+ workers: int | None = None,
137
+ *,
138
+ chunksize: int | None = None,
139
+ ) -> list[R | None]:
140
+ '''``ProcessPoolExecutor`` で ``worker`` を並列実行する。
141
+
142
+ 子プロセスで例外が出た分は ``None`` で返す。全体は止めない。
143
+ 進捗バーは常に tqdm。
144
+
145
+ ``chunksize`` は子へまとめて送る件数。省略なら自動。
146
+ 進捗を細かくしたい・タスクの重さがバラバラで末尾に重いのが残る、なら ``chunksize=1``。
147
+ '''
148
+ safe = _SafeWorker(worker)
149
+ item_list = list(items)
150
+ cs = chunksize if chunksize is not None else _auto_chunksize(len(item_list), workers)
151
+ with ProcessPoolExecutor(max_workers=workers) as ex:
152
+ return list(
153
+ tqdm(ex.map(safe, item_list, chunksize=cs), total=len(item_list), unit='file')
154
+ )
155
+
156
def glob_paths(dir_path: Path, pattern: str = '*.html') -> list[str]:
    '''Return the paths directly under ``dir_path`` that match ``pattern``.

    The paths are returned as ``str`` so they are easy to pickle when handed
    to a process pool such as ``pool_map``.
    '''
    matches = dir_path.glob(pattern)
    return list(map(str, matches))
@@ -0,0 +1,239 @@
1
+ Metadata-Version: 2.4
2
+ Name: domx
3
+ Version: 0.1.1
4
+ Summary: 自分用・非汎用
5
+ Requires-Python: >=3.12
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: patchright>=1.40
9
+ Requires-Dist: playwright>=1.40
10
+ Requires-Dist: selectolax>=0.3
11
+ Requires-Dist: pyarrow>=14.0
12
+ Requires-Dist: camoufox>=0.4
13
+ Requires-Dist: loguru>=0.7
14
+ Requires-Dist: tqdm>=4.66
15
+
16
+ # domx
17
+
18
+ 自分用・非汎用
19
+
20
+ ## インストール
21
+
22
+ `uv add domx`
23
+ `uv run patchright install chrome`
24
+ `uv run camoufox fetch`
25
+
26
+ ## 使用例
27
+
28
+ ### スクレイピング
29
+
30
+ ```python
31
+ from domx import wrap_page
32
+ from domx.browser import patchright_page
33
+ from domx.utils import append_csv, from_here, save_log, write_bytes
34
+
35
+ here = from_here(__file__)
36
+ save_log(here('log/scraping.log'))
37
+
38
+ with patchright_page() as page:
39
+ p = wrap_page(page)
40
+
41
+ p.goto('https://www.foobarbaz1.jp')
42
+ pref_urls = p.ss('li.item > ul > li > a').urls
43
+
44
+ classroom_urls = []
45
+ for i, url in enumerate(pref_urls, 1):
46
+ print(f'pref_urls {i}/{len(pref_urls)}')
47
+ if not p.goto(url):
48
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
49
+ continue
50
+ classroom_urls.extend(p.ss('.school-area h4 a').urls)
51
+
52
+ for i, url in enumerate(classroom_urls, 1):
53
+ print(f'classroom_urls {i}/{len(classroom_urls)}')
54
+ if not p.goto(url):
55
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
56
+ continue
57
+ th_grep = p.ss('th').re
58
+ append_csv(here('csv/scrape.csv'), {
59
+ 'id': i,
60
+ 'URL': page.url,
61
+ '教室名': p.s('h1 .text01').text,
62
+ '住所': p.s('.item .mapText').text,
63
+ '電話番号': p.s('.item .phoneNumber').text,
64
+ 'HP': th_grep.s(r'ホームページ').next('td').s('a').url,
65
+ '営業時間': th_grep.s(r'営業時間').next('td').text,
66
+ '定休日': th_grep.s(r'定休日').next('td').text,
67
+ })
68
+ p.s('.school-map').screenshot(here(f'media/{i}-screenshot.png'))
69
+ if (img_url := p.s('.school-area img').src):
70
+ if (res := p.goto(img_url)) and res.ok:
71
+ write_bytes(here(f'media/{i}-img.jpg'), res.body())
72
+ ```
73
+
74
+ ### スクレイピング(スクショと画像も保存)
75
+
76
+ ```python
77
+ import time
78
+ from urllib.parse import urlencode
79
+
80
+ from domx import wrap_page
81
+ from domx.browser import patchright_page
82
+ from domx.utils import save_log, append_csv, from_here, write_bytes
83
+
84
+ here = from_here(__file__)
85
+ save_log(here('log/scraping.log'))
86
+
87
+ with patchright_page() as page:
88
+ p = wrap_page(page)
89
+
90
+ p.goto('https://example.com/demo/search')
91
+ prefecture_urls = p.ss('li > a[href^="https://example.com/demo/search/area/"]').urls
92
+
93
+ bukken_urls = []
94
+ for i, prefecture_url in enumerate(prefecture_urls, 1):
95
+ print(f'{i}/{len(prefecture_urls)} エリア一覧ページ')
96
+ page_num = 1
97
+ while True:
98
+ if not p.goto(f'{prefecture_url}?{urlencode({"page": page_num})}'):
99
+ break
100
+ if not (bukken_elems := p.ss('ul li div a[href^="https://example.com"]:has(p)')):
101
+ break
102
+ bukken_urls.extend(bukken_elems.urls)
103
+ page_num += 1
104
+
105
+ for i, url in enumerate(bukken_urls, 1):
106
+ print(f'{i}/{len(bukken_urls)} 詳細ページ {url}')
107
+ if not p.goto(url):
108
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
109
+ continue
110
+
111
+ dt_grep = p.ss('h4').re.s(r'概要').next('div:has(dl)').ss('dt').re
112
+ dd_text = lambda pattern: dt_grep.s(pattern).next('dd').text
113
+
114
+ append_csv(here('csv/scrape.csv'), {
115
+ 'id': i,
116
+ 'URL': page.url,
117
+ '価格': dd_text(r'価格'),
118
+ '所在地': dd_text(r'所在地'),
119
+ '交通': dd_text(r'交通'),
120
+ '駐車場': dd_text(r'駐車場'),
121
+ '備考': dd_text(r'備考'),
122
+ '情報更新日': dd_text(r'情報更新日'),
123
+ })
124
+
125
+ page.add_style_tag(content='header, footer.site-footer { visibility: hidden !important; }')
126
+
127
+ p.ss('h4').re.s(r'概要').next('div:has(dl)').screenshot(path=here(f'media/{i}-summary.png'))
128
+
129
+ elem_iframe = p.s('iframe[src^="https://example.com"]')
130
+ elem_iframe.scroll_into_view()
131
+ time.sleep(3)
132
+ elem_iframe.screenshot(path=here(f'media/{i}-iframe.png'))
133
+
134
+ main_img_url = p.s('img.w-full.object-contain').src
135
+
136
+ img_desc_grep = p.ss('p.text-left').re.s(r'画像をクリック').next('ul').ss('li p').re
137
+ img_desc = img_desc_grep.s(r'表紙') or img_desc_grep.s(r'^(?!.*裏面).*')
138
+ img_url = img_desc.parent('li').s('a').url
139
+
140
+ if main_img_url and (res := p.goto(main_img_url)) and res.ok:
141
+ write_bytes(here(f'media/{i}-main-img.jpg'), res.body())
142
+ if img_url and (res := p.goto(img_url)) and res.ok:
143
+ write_bytes(here(f'media/{i}-img-desc.jpg'), res.body())
144
+ ```
145
+
146
+ ### スクレイピング(HTML丸ごと保存)
147
+
148
+ ```python
149
+ from domx import wrap_page
150
+ from domx.browser import camoufox_page
151
+ from domx.utils import append_csv, from_here, hash_name, save_log, write_text
152
+
153
+ here = from_here(__file__)
154
+ save_log(here('log/scraping.log'))
155
+
156
+ with camoufox_page() as page:
157
+ p = wrap_page(page)
158
+
159
+ p.goto('https://www.foobarbaz1.jp')
160
+ item_urls = p.ss('ul.items > li > a').urls
161
+
162
+ for i, url in enumerate(item_urls, 1):
163
+ print(f'item_urls {i}/{len(item_urls)}')
164
+ if not p.goto(url):
165
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
166
+ continue
167
+ file_name = f'{hash_name(url)}.html'
168
+ if not write_text(here('html') / file_name, p.html(with_url=True, with_saved_at=True)):
169
+ append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'write_text'})
170
+ continue
171
+ ```
172
+
173
+ ### ローカルHTMLからデータ抽出&Parquet出力
174
+
175
+ ```python
176
+ from domx import wrap_parser
177
+ from domx.utils import from_here, parse_html, save_log, write_parquet
178
+
179
+ here = from_here(__file__)
180
+ save_log(here('log/scraping.log'))
181
+
182
+ results = []
183
+ for i, file_path in enumerate(here('html').glob('*.html'),1):
184
+ print(f'html {i}')
185
+ if not (parser := parse_html(file_path)):
186
+ continue
187
+ p = wrap_parser(parser)
188
+ dts = p.ss('dt').re
189
+ results.append({
190
+ 'URL': p.url,
191
+ 'file_name': file_path.name,
192
+ '教室名': p.s('h1 .text02').text,
193
+ '住所': p.s('.item .mapText').text,
194
+ '所在地': dts.s(r'所在地').next('dd').text,
195
+ '交通': dts.s(r'交通').next('dd').text,
196
+ '物件番号': dts.s(r'物件番号').next('dd').text,
197
+ })
198
+ write_parquet(here('parquet/extract.parquet'), results)
199
+ ```
200
+
201
+ ### ローカルHTMLからデータ抽出&Parquet出力(並列処理)
202
+
203
+ ```python
204
+ from pathlib import Path
205
+
206
+ from domx import wrap_parser
207
+ from domx.utils import from_here, glob_paths, parse_html, pool_map, write_parquet
208
+
209
+ def main():
210
+ here = from_here(__file__)
211
+ html_paths = glob_paths(here('html'), '*.html')
212
+ results = [r for r in pool_map(extract, html_paths) if r]
213
+ write_parquet(here('parquet/extract.parquet'), results)
214
+
215
+ def extract(file_path: str) -> dict | None:
216
+ if not (parser := parse_html(Path(file_path))):
217
+ return None
218
+ p = wrap_parser(parser)
219
+ dts = p.ss('dt').re
220
+ return {
221
+ 'URL': p.url,
222
+ 'file_path': file_path,
223
+ '教室名': p.s('h1 .text02').text,
224
+ '住所': p.s('.item .mapText').text,
225
+ '所在地': dts.s(r'所在地').next('dd').text,
226
+ '交通': dts.s(r'交通').next('dd').text,
227
+ '価格': dts.s(r'価格').next('dd').text,
228
+ '設備・条件': dts.s(r'設備').next('dd').text,
229
+ '備考': dts.s(r'備考').next('dd').text,
230
+ }
231
+
232
+ if __name__ == '__main__':
233
+ main()
234
+ ```
235
+
236
+ ## License - ライセンス
237
+
238
+ [MIT](./LICENSE)
239
+
@@ -0,0 +1,8 @@
1
+ domx/__init__.py,sha256=F2csRoeqAMIz_PlKzNBZLC75GZ6Bhypj1y8oaglaVkI,692
2
+ domx/browser.py,sha256=nhhabsYOed42va8b5GMt598fG0aOkVIQtwKVPwWph8g,1206
3
+ domx/domx.py,sha256=QjgDTtTDRD0PpIL_2bUfe9abgnFaxGebYcGgyby901c,18219
4
+ domx/utils.py,sha256=gnp15-2Et6rKs7ULgFAGZmQnMvkYZ4VnxN-a1gksPEU,6676
5
+ domx-0.1.1.dist-info/licenses/LICENSE,sha256=q8ED812OTMMCwQSdHvtx6PSnmtRIotcIjKPHMmVseQI,1096
6
+ domx-0.1.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
7
+ domx-0.1.1.dist-info/METADATA,sha256=D1bZj6wntjgx5E0YuDv_d7nSox1f0zXJ-iMSbCPSLow,7876
8
+ domx-0.1.1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: flit 3.12.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nishizawa Takamasa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.