domx 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- domx/__init__.py +39 -0
- domx/browser.py +39 -0
- domx/domx.py +515 -0
- domx/utils.py +162 -0
- domx-0.1.1.dist-info/METADATA +239 -0
- domx-0.1.1.dist-info/RECORD +8 -0
- domx-0.1.1.dist-info/WHEEL +4 -0
- domx-0.1.1.dist-info/licenses/LICENSE +21 -0
domx/__init__.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from .domx import (
|
|
2
|
+
ElementGrep,
|
|
3
|
+
ElementHandle,
|
|
4
|
+
Frame,
|
|
5
|
+
NodeGrep,
|
|
6
|
+
Page,
|
|
7
|
+
Response,
|
|
8
|
+
WrappedElement,
|
|
9
|
+
WrappedElementGroup,
|
|
10
|
+
WrappedFrame,
|
|
11
|
+
WrappedNode,
|
|
12
|
+
WrappedNodeGroup,
|
|
13
|
+
WrappedPage,
|
|
14
|
+
WrappedParser,
|
|
15
|
+
wrap_node,
|
|
16
|
+
wrap_node_group,
|
|
17
|
+
wrap_page,
|
|
18
|
+
wrap_parser,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"Page",
|
|
23
|
+
"ElementHandle",
|
|
24
|
+
"Frame",
|
|
25
|
+
"Response",
|
|
26
|
+
"wrap_page",
|
|
27
|
+
"wrap_parser",
|
|
28
|
+
"wrap_node",
|
|
29
|
+
"wrap_node_group",
|
|
30
|
+
"WrappedPage",
|
|
31
|
+
"WrappedFrame",
|
|
32
|
+
"WrappedElement",
|
|
33
|
+
"WrappedElementGroup",
|
|
34
|
+
"ElementGrep",
|
|
35
|
+
"WrappedParser",
|
|
36
|
+
"WrappedNode",
|
|
37
|
+
"WrappedNodeGroup",
|
|
38
|
+
"NodeGrep",
|
|
39
|
+
]
|
domx/browser.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from collections.abc import Iterator
|
|
2
|
+
from contextlib import contextmanager
|
|
3
|
+
|
|
4
|
+
from camoufox.sync_api import Camoufox
|
|
5
|
+
from patchright.sync_api import Page as PatchrightPage, sync_playwright
|
|
6
|
+
from playwright.sync_api import Page as PlaywrightPage
|
|
7
|
+
|
|
8
|
+
Page = PatchrightPage | PlaywrightPage
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
_VIEWPORT_FULL_HD: dict[str, int] = {'width': 1920, 'height': 1080}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@contextmanager
def patchright_page(*, large_viewport: bool = False) -> Iterator[Page]:
    '''Yield a new page from a headed Chrome browser launched through patchright.

    Args:
        large_viewport: When true, the browser context is created with the
            1920x1080 viewport stored in ``_VIEWPORT_FULL_HD``; otherwise the
            context is created with no explicit options.

    Yields:
        The page opened in a freshly created browser context. The context,
        the browser and the driver are all closed when the ``with`` block
        around this helper exits.
    '''
    context_options: dict = {'viewport': _VIEWPORT_FULL_HD} if large_viewport else {}
    with (
        sync_playwright() as pw,
        pw.chromium.launch(channel='chrome', headless=False) as browser,
        browser.new_context(**context_options) as context,
    ):
        yield context.new_page()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@contextmanager
def camoufox_page(*, large_viewport: bool = False) -> Iterator[Page]:
    '''Yield a new page from a headed Camoufox browser with ``humanize`` enabled.

    Args:
        large_viewport: When true, the page is opened with the 1920x1080
            viewport stored in ``_VIEWPORT_FULL_HD``; otherwise ``new_page``
            is called without arguments.

    Yields:
        The newly opened page. The browser is closed when the ``with`` block
        around this helper exits.
    '''
    page_options: dict = {'viewport': _VIEWPORT_FULL_HD} if large_viewport else {}
    with Camoufox(headless=False, humanize=True) as browser:
        yield browser.new_page(**page_options)
|
domx/domx.py
ADDED
|
@@ -0,0 +1,515 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import html
|
|
4
|
+
from collections.abc import Iterator
|
|
5
|
+
import random
|
|
6
|
+
import re
|
|
7
|
+
import time
|
|
8
|
+
import unicodedata as ud
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Literal
|
|
12
|
+
from urllib.parse import urljoin
|
|
13
|
+
|
|
14
|
+
from loguru import logger
|
|
15
|
+
from patchright.sync_api import Frame as PatchFrame, Page as PatchPage, ElementHandle as PatchElementHandle, Response as PatchResponse
|
|
16
|
+
from playwright.sync_api import Frame as PlayFrame, Page as PlayPage, ElementHandle as PlayElementHandle, Response as PlayResponse
|
|
17
|
+
from selectolax.lexbor import LexborHTMLParser, LexborNode
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
Page = PatchPage | PlayPage
|
|
21
|
+
ElementHandle = PatchElementHandle | PlayElementHandle
|
|
22
|
+
Response = PatchResponse | PlayResponse
|
|
23
|
+
Frame = PatchFrame | PlayFrame
|
|
24
|
+
|
|
25
|
+
_DOMX_META_URL = 'domx:url'
|
|
26
|
+
_DOMX_META_SAVED_AT = 'domx:saved_at'
|
|
27
|
+
|
|
28
|
+
_UNUSABLE_INLINE_URL = re.compile(r'(?i)^(?:#|javascript:|mailto:|tel:|data:)')
|
|
29
|
+
|
|
30
|
+
_ELEMENT_NEXT = 'nextElementSibling'
|
|
31
|
+
_ELEMENT_PREV = 'previousElementSibling'
|
|
32
|
+
_ELEMENT_PARENT = 'parentElement'
|
|
33
|
+
|
|
34
|
+
_NODE_NEXT = 'next'
|
|
35
|
+
_NODE_PREV = 'prev'
|
|
36
|
+
_NODE_PARENT = 'parent'
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def wrap_page(page: Page) -> WrappedPage:
    '''Return ``page`` wrapped in a :class:`WrappedPage`.'''
    wrapped = WrappedPage(page)
    return wrapped
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class _PageScoped:
    '''Mixin for wrappers that are bound to a page.

    Subclasses assign ``self._page`` in their ``__init__``; the factory
    methods below hand that same page to every wrapper they create.
    '''

    # Page that every wrapper built through this mixin is associated with.
    _page: Page

    def wrap_element(self, elem: ElementHandle | None) -> WrappedElement:
        '''Wrap an element handle (or ``None``) together with this object's page.'''
        page = self._page
        return WrappedElement(page, elem)

    def wrap_element_group(self, elems: list[WrappedElement]) -> WrappedElementGroup:
        '''Wrap a list of already-wrapped elements together with this object's page.'''
        page = self._page
        return WrappedElementGroup(page, elems)

    def wrap_frame(self, frame: Frame | None) -> WrappedFrame:
        '''Wrap a frame (or ``None``) together with this object's page.'''
        page = self._page
        return WrappedFrame(page, frame)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def wrap_parser(parser: LexborHTMLParser) -> WrappedParser:
    '''Return ``parser`` wrapped in a :class:`WrappedParser`.'''
    wrapped = WrappedParser(parser)
    return wrapped
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def wrap_node(node: LexborNode | None) -> WrappedNode:
    '''Return ``node`` (which may be ``None``) wrapped in a :class:`WrappedNode`.'''
    wrapped = WrappedNode(node)
    return wrapped
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def wrap_node_group(nodes: list[WrappedNode]) -> WrappedNodeGroup:
    '''Return the list of wrapped nodes bundled into a :class:`WrappedNodeGroup`.'''
    group = WrappedNodeGroup(nodes)
    return group
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class WrappedFrame(_PageScoped):
    '''Wrapper around a frame that may be absent.

    When the wrapped frame is ``None`` every query returns an empty wrapper
    instead of raising, so lookups can be chained without ``None`` checks.
    '''

    def __init__(self, page: Page, frame: Frame | None) -> None:
        self._page = page
        self._frame = frame

    def __bool__(self) -> bool:
        '''Return ``True`` only when a frame is actually wrapped.'''
        return self._frame is not None

    @property
    def raw(self) -> Frame | None:
        '''The underlying frame object, or ``None``.'''
        return self._frame

    def s(self, selector: str) -> WrappedElement:
        '''Return the first element in the frame matching ``selector``.

        The result wraps ``None`` when there is no frame or no match.
        '''
        frame = self._frame
        if frame is None:
            return self.wrap_element(None)
        return self.wrap_element(frame.query_selector(selector))

    def ss(self, selector: str) -> WrappedElementGroup:
        '''Return every element in the frame matching ``selector`` as a group.

        The group is empty when there is no frame.
        '''
        frame = self._frame
        if frame is None:
            return self.wrap_element_group([])
        wrapped = [self.wrap_element(handle) for handle in frame.query_selector_all(selector)]
        return self.wrap_element_group(wrapped)

    def wait(self, selector: str, state: str = 'attached', timeout: int = 15000) -> WrappedElement:
        '''Wait for ``selector`` inside the frame and return the element.

        Args:
            selector: Selector passed to ``wait_for_selector``.
            state: Element state to wait for; forwarded unchanged.
            timeout: Timeout forwarded unchanged to ``wait_for_selector``.

        Returns:
            The wrapped element. Any exception raised while waiting is logged
            as a warning and an empty wrapper is returned instead.
        '''
        frame = self._frame
        if frame is None:
            return self.wrap_element(None)
        try:
            found = frame.wait_for_selector(selector, state=state, timeout=timeout)
            return self.wrap_element(found)
        except Exception as exc:
            logger.warning(
                f'[wait] {type(exc).__name__}: {exc} | selector={selector!r} | url={self._page.url}'
            )
            return self.wrap_element(None)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class WrappedPage(_PageScoped):
    '''Convenience wrapper around a browser page.

    Lookups return wrapper objects that tolerate missing elements, and
    navigation / waiting failures are logged rather than raised.
    '''

    def __init__(self, page: Page) -> None:
        self._page = page

    @property
    def raw(self) -> Page:
        '''The underlying page object.'''
        return self._page

    def s(self, selector: str) -> WrappedElement:
        '''Return the first element on the page matching ``selector`` (possibly empty).'''
        return self.wrap_element(self._page.query_selector(selector))

    def ss(self, selector: str) -> WrappedElementGroup:
        '''Return every element on the page matching ``selector`` as a group.'''
        wrapped = [self.wrap_element(handle) for handle in self._page.query_selector_all(selector)]
        return self.wrap_element_group(wrapped)

    def frame(self, iframe_selector: str) -> WrappedFrame:
        '''Return the content frame of the iframe element matching ``iframe_selector``.

        An empty frame wrapper is returned when no element matches or when
        resolving the content frame raises (the error is logged).
        '''
        iframe = self._page.query_selector(iframe_selector)
        if iframe is None:
            return self.wrap_frame(None)
        try:
            return self.wrap_frame(iframe.content_frame())
        except Exception as exc:
            logger.error(f'[frame] {type(exc).__name__}: {exc} | iframe_selector={iframe_selector!r}')
            return self.wrap_frame(None)

    def goto(
        self,
        url: str | None,
        try_cnt: int = 3,
        wait_range: tuple[float, float] = (3, 5),
        sleep_after: tuple[float, float] | None = (1, 2),
    ) -> Response | None:
        '''Navigate to ``url``, retrying when navigation fails.

        Args:
            url: Destination. A falsy value returns ``None`` immediately.
            try_cnt: Maximum number of navigation attempts.
            wait_range: Bounds of the random pause (seconds) between attempts.
            sleep_after: Bounds of the random pause (seconds) after a
                successful navigation; ``None`` disables that pause.

        Returns:
            The navigation response, or ``None`` when every attempt either
            raised or produced no response.
        '''
        if not url:
            return None
        for attempt in range(1, try_cnt + 1):
            try:
                response = self._page.goto(url)
                if response is not None:
                    if sleep_after is not None:
                        time.sleep(random.uniform(*sleep_after))
                    return response
                reason = 'response is None'
            except Exception as exc:
                reason = f'{type(exc).__name__}: {exc}'
            logger.warning(f'[goto] {url} ({attempt}/{try_cnt}) {reason}')
            # No point pausing after the final attempt.
            if attempt < try_cnt:
                time.sleep(random.uniform(*wait_range))
        logger.error(f'[goto] giving up: {url}')
        return None

    def wait(self, selector: str, state: str = 'attached', timeout: int = 15000) -> WrappedElement:
        '''Wait for ``selector`` on the page and return the element.

        Any exception raised while waiting is logged as a warning and an
        empty wrapper is returned instead.
        '''
        try:
            found = self._page.wait_for_selector(selector, state=state, timeout=timeout)
            return self.wrap_element(found)
        except Exception as exc:
            logger.warning(f'[wait] {type(exc).__name__}: {exc} | selector={selector!r} | url={self._page.url}')
            return self.wrap_element(None)

    def html(self, with_url: bool = False, with_saved_at: bool = False) -> str:
        '''Return the page's HTML, optionally prefixed with domx ``<meta>`` tags.

        Args:
            with_url: Prepend a meta tag holding the HTML-escaped page URL.
            with_saved_at: Prepend a meta tag holding the current UTC time in
                ISO 8601 format.

        Returns:
            The requested meta tags (URL first) followed by the page content.
        '''
        markup = self._page.content()
        prefix = ''
        if with_url:
            prefix += f'<meta name="{_DOMX_META_URL}" content="{html.escape(self._page.url)}">'
        if with_saved_at:
            stamp = datetime.now(timezone.utc).isoformat()
            prefix += f'<meta name="{_DOMX_META_SAVED_AT}" content="{stamp}">'
        return prefix + markup
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class WrappedElement(_PageScoped):
    '''Wrapper around an element handle that may be absent.

    Every accessor returns ``None`` (or an empty wrapper) when no element is
    wrapped, so lookups can be chained without intermediate ``None`` checks.
    '''

    def __init__(self, page: Page, elem: ElementHandle | None) -> None:
        self._page = page
        self._elem = elem

    def __bool__(self) -> bool:
        '''Return ``True`` only when an element is actually wrapped.'''
        return self._elem is not None

    @property
    def raw(self) -> ElementHandle | None:
        '''The underlying element handle, or ``None``.'''
        return self._elem

    def s(self, selector: str) -> WrappedElement:
        '''Return the first descendant matching ``selector`` (possibly empty).'''
        if self._elem:
            return self.wrap_element(self._elem.query_selector(selector))
        return self.wrap_element(None)

    def ss(self, selector: str) -> WrappedElementGroup:
        '''Return every descendant matching ``selector`` as a group.'''
        handles = self._elem.query_selector_all(selector) if self._elem else []
        wrapped = [self.wrap_element(handle) for handle in handles]
        return self.wrap_element_group(wrapped)

    def frame(self, iframe_selector: str | None = None) -> WrappedFrame:
        '''Return a content frame reached from this element.

        Args:
            iframe_selector: When ``None`` the content frame of this element
                itself is used; otherwise the content frame of the first
                descendant matching the selector.

        Returns:
            The wrapped frame, or an empty frame wrapper when no element is
            wrapped, the selector matches nothing, or an exception is raised
            (which is logged).
        '''
        if self._elem is None:
            return self.wrap_frame(None)
        try:
            host = self._elem
            if iframe_selector is not None:
                host = self._elem.query_selector(iframe_selector)
                if host is None:
                    return self.wrap_frame(None)
            return self.wrap_frame(host.content_frame())
        except Exception as exc:
            logger.error(
                f'[frame] {type(exc).__name__}: {exc} | iframe_selector={iframe_selector!r}'
            )
            return self.wrap_frame(None)

    def _walk_relative(self, selector: str, axis: str, label: str) -> WrappedElement:
        '''Follow the DOM property ``axis`` until an element matches ``selector``.

        The walk runs in the page: starting from ``el[axis]`` it keeps reading
        the same property until an element satisfies ``matches(selector)`` or
        the chain ends. ``label`` is only used as the log prefix on failure.
        '''
        if self._elem is None:
            return self.wrap_element(None)
        try:
            handle = self._elem.evaluate_handle(
                '''(el, args) => {
                    const [sel, axis] = args;
                    let cur = el[axis];
                    while (cur) {
                        if (cur.matches(sel)) return cur;
                        cur = cur[axis];
                    }
                    return null;
                }''',
                [selector, axis],
            )
            return self.wrap_element(handle.as_element())
        except Exception as exc:
            logger.error(f'[{label}] {self._elem} {type(exc).__name__}: {exc}')
            return self.wrap_element(None)

    def next(self, selector: str) -> WrappedElement:
        '''Return the nearest following sibling element matching ``selector``.'''
        return self._walk_relative(selector, _ELEMENT_NEXT, 'next')

    def prev(self, selector: str) -> WrappedElement:
        '''Return the nearest preceding sibling element matching ``selector``.'''
        return self._walk_relative(selector, _ELEMENT_PREV, 'prev')

    def parent(self, selector: str) -> WrappedElement:
        '''Return the nearest ancestor element matching ``selector``.'''
        return self._walk_relative(selector, _ELEMENT_PARENT, 'parent')

    @property
    def text(self) -> str | None:
        '''The element's text content, or ``None`` when missing or empty.'''
        if self._elem is None:
            return None
        return self._elem.text_content() or None

    def attr(self, attr_name: str) -> str | None:
        '''Return the value of attribute ``attr_name``, or ``None`` when missing or empty.'''
        if self._elem is None:
            return None
        return self._elem.get_attribute(attr_name) or None

    def _resolved_url_from_attr(self, attr_name: str) -> str | None:
        '''Read a URL-bearing attribute and resolve it against the page URL.

        Returns ``None`` when the element or attribute is missing, the value
        is blank, or it starts with ``#``, ``javascript:``, ``mailto:``,
        ``tel:`` or ``data:`` (case-insensitive).
        '''
        if self._elem is None:
            return None
        raw_value = self._elem.get_attribute(attr_name)
        candidate = raw_value.strip() if raw_value else ''
        if not candidate or _UNUSABLE_INLINE_URL.search(candidate):
            return None
        return urljoin(self._page.url, candidate)

    @property
    def url(self) -> str | None:
        '''Absolute URL built from the ``href`` attribute, or ``None``.'''
        return self._resolved_url_from_attr('href')

    @property
    def src(self) -> str | None:
        '''Absolute URL built from the ``src`` attribute, or ``None``.'''
        return self._resolved_url_from_attr('src')

    def scroll_into_view(self) -> None:
        '''Smooth-scroll the element to the viewport centre and wait until it is stable.

        A missing element or any raised exception only produces a warning.
        '''
        if self._elem is None:
            logger.warning('[scroll_into_view] element is None')
            return
        try:
            self._elem.evaluate(
                '''(el) => el.scrollIntoView({ behavior: "smooth", block: "center", inline: "nearest" });'''
            )
            self._elem.wait_for_element_state('stable')
        except Exception as exc:
            logger.warning(f'[scroll_into_view] {type(exc).__name__}: {exc} | url={self._page.url!r}')

    def screenshot(
        self,
        path: Path,
        image_type: Literal['png', 'jpeg'] = 'png',
    ) -> bool:
        '''Save a screenshot of the element to ``path``.

        Args:
            path: Destination file; missing parent directories are created.
            image_type: Image format passed to the screenshot call.

        Returns:
            ``True`` on success; ``False`` when no element is wrapped or the
            capture raised (a warning is logged in both cases).
        '''
        if self._elem is None:
            logger.warning('[screenshot] element is None')
            return False
        try:
            path.parent.mkdir(parents=True, exist_ok=True)
            self._elem.screenshot(path=path, type=image_type, animations='disabled')
            return True
        except Exception as exc:
            logger.warning(f'[screenshot] {type(exc).__name__}: {exc} | url={self._page.url!r}')
            return False
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
class WrappedElementGroup(_PageScoped):
    '''Ordered collection of :class:`WrappedElement` objects bound to one page.

    Supports iteration, ``len()``, indexing, slicing (which yields another
    group) and concatenation with ``+``.
    '''

    def __init__(self, page: Page, elems: list[WrappedElement]) -> None:
        self._page = page
        self._elems = elems

    def __iter__(self) -> Iterator[WrappedElement]:
        return iter(self._elems)

    def __len__(self) -> int:
        return len(self._elems)

    def __getitem__(self, key: int | slice) -> WrappedElement | WrappedElementGroup:
        '''Return one element for an index, or a new group for a slice.'''
        picked = self._elems[key]
        if isinstance(key, slice):
            return WrappedElementGroup(self._page, picked)
        return picked

    def __add__(self, other: WrappedElementGroup) -> WrappedElementGroup:
        '''Concatenate two groups that belong to the same page.

        Raises:
            TypeError: ``other`` is not a ``WrappedElementGroup``.
            ValueError: ``other`` is bound to a different page object.
        '''
        if not isinstance(other, WrappedElementGroup):
            raise TypeError(
                'WrappedElementGroup 同士のみ + で結合できます '
                f'(右辺は {type(other).__name__})'
            )
        if self._page is not other._page:
            raise ValueError('異なる Page に紐づいた WrappedElementGroup は結合できません')
        combined = self._elems + other._elems
        return WrappedElementGroup(self._page, combined)

    @property
    def raw(self) -> list[WrappedElement]:
        '''The underlying list of wrapped elements.'''
        return self._elems

    @property
    def re(self) -> ElementGrep:
        '''Build a regex searcher over the elements' text.

        Elements without text are left out; the remaining texts are
        NFKC-normalised before being paired with their element.
        '''
        pairs: list[tuple[str, WrappedElement]] = [
            (ud.normalize('NFKC', content), elem)
            for elem in self._elems
            if (content := elem.text)
        ]
        return ElementGrep(self._page, pairs)

    @property
    def urls(self) -> list[str]:
        '''Resolved ``href`` URLs of the elements, skipping those without one.'''
        collected: list[str] = []
        for elem in self._elems:
            link = elem.url
            if link:
                collected.append(link)
        return collected
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
class ElementGrep(_PageScoped):
    '''Regex search over ``(text, element)`` pairs.'''

    def __init__(self, page: Page, pairs: list[tuple[str, WrappedElement]]) -> None:
        self._page = page
        self._pairs = pairs

    def s(self, pattern: str) -> WrappedElement:
        '''Return the first element whose text matches ``pattern``.

        ``pattern`` is compiled with :func:`re.compile` and applied with
        ``search``. An empty wrapper is returned when nothing matches or when
        an exception is raised (for example an invalid pattern), which is
        logged as a warning.
        '''
        try:
            matcher = re.compile(pattern)
            for content, elem in self._pairs:
                if matcher.search(content):
                    return elem
        except Exception as exc:
            logger.warning(f'[grep] {type(exc).__name__}: {exc} | pattern={pattern!r}')
        return self.wrap_element(None)

    def ss(self, pattern: str) -> WrappedElementGroup:
        '''Return every element whose text matches ``pattern`` as a group.

        An empty group is returned when an exception is raised (for example
        an invalid pattern), which is logged as a warning.
        '''
        try:
            matcher = re.compile(pattern)
            hits = [elem for content, elem in self._pairs if matcher.search(content)]
            return self.wrap_element_group(hits)
        except Exception as exc:
            logger.warning(f'[grep] {type(exc).__name__}: {exc} | pattern={pattern!r}')
            return self.wrap_element_group([])
|
|
375
|
+
|
|
376
|
+
|
|
377
|
+
class WrappedParser:
    '''Wrapper around a parsed HTML document.

    Provides the same ``s`` / ``ss`` lookups as the page wrappers plus access
    to the ``<meta>`` tags named by ``_DOMX_META_URL`` and
    ``_DOMX_META_SAVED_AT``.
    '''

    def __init__(self, parser: LexborHTMLParser) -> None:
        self._parser = parser

    @property
    def raw(self) -> LexborHTMLParser:
        '''The underlying parser object.'''
        return self._parser

    def s(self, selector: str) -> WrappedNode:
        '''Return the first node matching ``selector`` (possibly empty).'''
        return wrap_node(self._parser.css_first(selector))

    def ss(self, selector: str) -> WrappedNodeGroup:
        '''Return every node matching ``selector`` as a group.'''
        wrapped = [wrap_node(found) for found in self._parser.css(selector)]
        return wrap_node_group(wrapped)

    def _meta_content(self, meta_name: str) -> str | None:
        '''Return the ``content`` of ``<meta name=meta_name>``, or ``None`` if absent or empty.'''
        meta = self._parser.css_first(f'meta[name="{meta_name}"]')
        if meta is None:
            return None
        return meta.attributes.get('content') or None

    @property
    def url(self) -> str | None:
        '''Content of the meta tag named ``_DOMX_META_URL``, or ``None``.'''
        return self._meta_content(_DOMX_META_URL)

    @property
    def saved_at(self) -> str | None:
        '''Content of the meta tag named ``_DOMX_META_SAVED_AT``, or ``None``.'''
        return self._meta_content(_DOMX_META_SAVED_AT)
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
class WrappedNode:
    '''Wrapper around a parsed HTML node that may be absent.

    Every accessor returns ``None`` (or an empty wrapper) when no node is
    wrapped, so lookups can be chained without intermediate ``None`` checks.
    '''

    def __init__(self, node: LexborNode | None) -> None:
        self._node = node

    def __bool__(self) -> bool:
        '''Return ``True`` only when a node is actually wrapped.'''
        return self._node is not None

    @property
    def raw(self) -> LexborNode | None:
        '''The underlying node, or ``None``.'''
        return self._node

    def s(self, selector: str) -> WrappedNode:
        '''Return the first descendant matching ``selector`` (possibly empty).'''
        if self._node:
            return wrap_node(self._node.css_first(selector))
        return wrap_node(None)

    def ss(self, selector: str) -> WrappedNodeGroup:
        '''Return every descendant matching ``selector`` as a group.'''
        found = self._node.css(selector) if self._node else []
        return wrap_node_group([wrap_node(item) for item in found])

    def _walk_relative(self, selector: str, axis: str) -> WrappedNode:
        '''Follow the node attribute ``axis`` until an element matches ``selector``.

        Nodes that are not element nodes are stepped over. An empty wrapper
        is returned when the chain ends without a match.
        '''
        if self._node is None:
            return wrap_node(None)
        candidate = getattr(self._node, axis)
        while candidate is not None:
            if candidate.is_element_node and candidate.css_matches(selector):
                return wrap_node(candidate)
            candidate = getattr(candidate, axis)
        return wrap_node(None)

    def next(self, selector: str) -> WrappedNode:
        '''Return the nearest following sibling element matching ``selector``.'''
        return self._walk_relative(selector, _NODE_NEXT)

    def prev(self, selector: str) -> WrappedNode:
        '''Return the nearest preceding sibling element matching ``selector``.'''
        return self._walk_relative(selector, _NODE_PREV)

    def parent(self, selector: str) -> WrappedNode:
        '''Return the nearest ancestor element matching ``selector``.'''
        return self._walk_relative(selector, _NODE_PARENT)

    @property
    def text(self) -> str | None:
        '''The node's text, or ``None`` when missing or empty.'''
        if self._node is None:
            return None
        return self._node.text() or None

    def attr(self, attr_name: str) -> str | None:
        '''Return the value of attribute ``attr_name``, or ``None`` when missing or empty.'''
        if self._node is None:
            return None
        return self._node.attributes.get(attr_name) or None
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
class WrappedNodeGroup:
    '''Ordered collection of :class:`WrappedNode` objects.

    Supports iteration, ``len()``, indexing, slicing (which yields another
    group) and concatenation with ``+``.
    '''

    def __init__(self, nodes: list[WrappedNode]) -> None:
        self._nodes = nodes

    def __iter__(self) -> Iterator[WrappedNode]:
        return iter(self._nodes)

    def __len__(self) -> int:
        return len(self._nodes)

    def __getitem__(self, key: int | slice) -> WrappedNode | WrappedNodeGroup:
        '''Return one node for an index, or a new group for a slice.'''
        picked = self._nodes[key]
        if isinstance(key, slice):
            return WrappedNodeGroup(picked)
        return picked

    def __add__(self, other: WrappedNodeGroup) -> WrappedNodeGroup:
        '''Concatenate two groups into a new one.

        Raises:
            TypeError: ``other`` is not a ``WrappedNodeGroup``.
        '''
        if not isinstance(other, WrappedNodeGroup):
            raise TypeError(
                'WrappedNodeGroup 同士のみ + で結合できます '
                f'(右辺は {type(other).__name__})'
            )
        combined = self._nodes + other._nodes
        return WrappedNodeGroup(combined)

    @property
    def raw(self) -> list[WrappedNode]:
        '''The underlying list of wrapped nodes.'''
        return self._nodes

    @property
    def re(self) -> NodeGrep:
        '''Build a regex searcher over the nodes' text.

        Nodes without text are left out; the remaining texts are
        NFKC-normalised before being paired with their node.
        '''
        pairs: list[tuple[str, WrappedNode]] = [
            (ud.normalize('NFKC', content), node)
            for node in self._nodes
            if (content := node.text)
        ]
        return NodeGrep(pairs)
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
class NodeGrep:
    '''Regex search over ``(text, node)`` pairs.'''

    def __init__(self, pairs: list[tuple[str, WrappedNode]]) -> None:
        self._pairs = pairs

    def s(self, pattern: str) -> WrappedNode:
        '''Return the first node whose text matches ``pattern``.

        ``pattern`` is compiled with :func:`re.compile` and applied with
        ``search``. An empty wrapper is returned when nothing matches or when
        an exception is raised (for example an invalid pattern), which is
        logged as a warning.
        '''
        try:
            matcher = re.compile(pattern)
            for content, node in self._pairs:
                if matcher.search(content):
                    return node
        except Exception as exc:
            logger.warning(f'[grep] {type(exc).__name__}: {exc} | pattern={pattern!r}')
        return wrap_node(None)

    def ss(self, pattern: str) -> WrappedNodeGroup:
        '''Return every node whose text matches ``pattern`` as a group.

        An empty group is returned when an exception is raised (for example
        an invalid pattern), which is logged as a warning.
        '''
        try:
            matcher = re.compile(pattern)
            hits = [node for content, node in self._pairs if matcher.search(content)]
            return wrap_node_group(hits)
        except Exception as exc:
            logger.warning(f'[grep] {type(exc).__name__}: {exc} | pattern={pattern!r}')
            return wrap_node_group([])
|
domx/utils.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import hashlib
|
|
3
|
+
import os
|
|
4
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Iterable
|
|
7
|
+
|
|
8
|
+
from loguru import logger
|
|
9
|
+
from selectolax.lexbor import LexborHTMLParser
|
|
10
|
+
from tqdm import tqdm
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _ensure_parent(path: Path) -> None:
|
|
14
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def parse_html(path: Path) -> LexborHTMLParser | None:
    '''Read the file at ``path`` as bytes and parse it as HTML.

    Returns:
        The parser built from the file's bytes, or ``None`` when reading or
        parsing raised (the error is logged).
    '''
    try:
        payload = path.read_bytes()
        return LexborHTMLParser(payload)
    except Exception as exc:
        logger.error(f'[parse_html] {path} {type(exc).__name__}: {exc}')
        return None
|
|
23
|
+
|
|
24
|
+
def from_here(file: str) -> Callable[[str], Path]:
    '''Return a function that resolves paths relative to the directory of ``file``.

    Args:
        file: Path of a file, typically the caller's ``__file__``.

    Returns:
        A callable mapping a relative path to ``<resolved directory of file> / path``.
    '''
    base = Path(file).resolve().parent

    def resolve_from_base(path: str) -> Path:
        return base / path

    return resolve_from_base
|
|
27
|
+
|
|
28
|
+
def append_csv(path: Path, row: dict) -> None:
    '''Append a single ``row`` to a CSV file, creating the file if needed.

    For Excel compatibility a UTF-8 BOM is written only when the file is
    being started (it is opened with ``utf-8-sig``); appends to a file that
    already has content use plain ``utf-8``, because a BOM in the middle of a
    file would be invalid. A header row is written when the file is new or
    empty. Column order follows ``row.keys()``; a key mismatch on later
    calls is not detected (same behaviour as the pandas-based version).

    Any exception is logged as an error and not re-raised.
    '''
    try:
        _ensure_parent(path)
        is_fresh = not path.exists() or path.stat().st_size == 0
        if is_fresh:
            encoding = 'utf-8-sig'
        else:
            encoding = 'utf-8'
        with open(path, mode='a', newline='', encoding=encoding) as handle:
            writer = csv.DictWriter(handle, fieldnames=list(row.keys()))
            if is_fresh:
                writer.writeheader()
            writer.writerow(row)
    except Exception as exc:
        logger.error(f'[append_csv] {path} {row} {type(exc).__name__}: {exc}')
|
|
48
|
+
|
|
49
|
+
def write_parquet(path: Path, rows: list[dict]) -> None:
    '''Write ``rows`` to ``path`` as a Parquet file.

    pyarrow is used directly (no pandas dependency). When ``rows`` is empty
    the write is skipped with a warning. The column schema is inferred from
    the first non-None value of each column, so mixing types under the same
    key can raise an error.

    Any exception is logged as an error and not re-raised.
    '''
    import pyarrow as pa
    import pyarrow.parquet as pq

    try:
        if rows:
            _ensure_parent(path)
            table = pa.Table.from_pylist(rows)
            pq.write_table(table, path)
        else:
            logger.warning(f'[write_parquet] {path} no rows, skipped')
    except Exception as exc:
        logger.error(f'[write_parquet] {path} {type(exc).__name__}: {exc}')
|
|
67
|
+
|
|
68
|
+
def hash_name(key: str) -> str:
    '''Return the hexadecimal MD5 digest of ``key`` encoded as UTF-8.

    The digest is meant as a stable, compact name derived from ``key``; it
    is not a security measure. ``usedforsecurity=False`` states that
    explicitly, so the call also works on Python builds that restrict MD5
    for security purposes (for example FIPS-enabled systems).

    Args:
        key: Arbitrary text to derive the name from.

    Returns:
        A 32-character lowercase hexadecimal string.
    '''
    return hashlib.md5(key.encode(), usedforsecurity=False).hexdigest()
|
|
70
|
+
|
|
71
|
+
def write_text(path: Path, data: str) -> bool:
    '''Write ``data`` to ``path`` as UTF-8, creating parent directories.

    Characters that cannot be encoded are replaced (``errors='replace'``).

    Returns:
        ``True`` on success, ``False`` when an exception was raised (logged
        as an error).
    '''
    succeeded = False
    try:
        _ensure_parent(path)
        path.write_text(data, encoding='utf-8', errors='replace')
        succeeded = True
    except Exception as exc:
        logger.error(f'[write_text] {path} {type(exc).__name__}: {exc}')
    return succeeded
|
|
79
|
+
|
|
80
|
+
def write_bytes(path: Path, data: bytes) -> bool:
    '''Write ``data`` to ``path``, creating parent directories.

    Returns:
        ``True`` on success, ``False`` when an exception was raised (logged
        as an error).
    '''
    succeeded = False
    try:
        _ensure_parent(path)
        path.write_bytes(data)
        succeeded = True
    except Exception as exc:
        logger.error(f'[write_bytes] {path} {type(exc).__name__}: {exc}')
    return succeeded
|
|
88
|
+
|
|
89
|
+
def save_log(path: Path, level: str = 'WARNING') -> None:
    '''Also record log messages in the file at ``path``.

    Args:
        path: Log file; missing parent directories are created.
        level: Minimum level passed to ``logger.add`` for the file sink.
    '''
    _ensure_parent(path)
    sink_options = {'level': level, 'encoding': 'utf-8'}
    logger.add(path, **sink_options)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class _SafeWorker:
|
|
96
|
+
def __init__(self, fn: Callable) -> None:
|
|
97
|
+
self.fn = fn
|
|
98
|
+
|
|
99
|
+
def __call__(self, x):
|
|
100
|
+
try:
|
|
101
|
+
return self.fn(x)
|
|
102
|
+
except Exception as e:
|
|
103
|
+
logger.error(f'[pool_map] {type(e).__name__}: {e}')
|
|
104
|
+
return None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _auto_chunksize(n: int, workers: int | None) -> int:
|
|
108
|
+
'''``chunksize`` を自動で決める(``pool_map`` で未指定のとき)。
|
|
109
|
+
|
|
110
|
+
子プロセスへは 1 件ずつより、まとめて送った方が速くなりやすい。そのまとめ数。
|
|
111
|
+
|
|
112
|
+
``w`` は並列数。引数で決まっていなければ ``os.cpu_count()``、それも無ければ 4。
|
|
113
|
+
この **4** は「CPU が分からないときの仮の並列数」。式 ``n // (w * 4)`` の **4** とは別物。
|
|
114
|
+
|
|
115
|
+
``n // (w * 4)`` の方の **4** は経験則の係数。ざっくり言うとチャンクの個数が
|
|
116
|
+
``w * 4`` 前後になりやすく、負荷が均等ならワーカーあたりだいたい **4 回分の塊**
|
|
117
|
+
を処理するイメージ(厳密ではない)。
|
|
118
|
+
|
|
119
|
+
例: ``n=200``, ``w=5`` なら ``200 // 20 = 10`` が chunksize。全体は 20 チャンク、
|
|
120
|
+
5 人で割ると 1 人あたり平均 4 チャンク(各 10 件)。
|
|
121
|
+
|
|
122
|
+
結果は ``min(64, …)`` で上限。塊が大きすぎると **負荷が偏りやすい**。
|
|
123
|
+
タスクの重さがバラバラなとき、太い塊の中に遅いのが多く入ったワーカーだけが
|
|
124
|
+
長引き、他は先に終わって手待ちしがち(終盤のムラ)。塊を細かくすると配り直しの
|
|
125
|
+
機会が増えて和らぎやすい。進捗バーも細かく動きやすい。
|
|
126
|
+
|
|
127
|
+
``max(1, …)`` で下限。割り算で 0 になっても最低 1 件は送る。
|
|
128
|
+
'''
|
|
129
|
+
w = workers or os.cpu_count() or 4
|
|
130
|
+
return max(1, min(64, n // (w * 4)))
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def pool_map[T, R](
|
|
134
|
+
worker: Callable[[T], R],
|
|
135
|
+
items: Iterable[T],
|
|
136
|
+
workers: int | None = None,
|
|
137
|
+
*,
|
|
138
|
+
chunksize: int | None = None,
|
|
139
|
+
) -> list[R | None]:
|
|
140
|
+
'''``ProcessPoolExecutor`` で ``worker`` を並列実行する。
|
|
141
|
+
|
|
142
|
+
子プロセスで例外が出た分は ``None`` で返す。全体は止めない。
|
|
143
|
+
進捗バーは常に tqdm。
|
|
144
|
+
|
|
145
|
+
``chunksize`` は子へまとめて送る件数。省略なら自動。
|
|
146
|
+
進捗を細かくしたい・タスクの重さがバラバラで末尾に重いのが残る、なら ``chunksize=1``。
|
|
147
|
+
'''
|
|
148
|
+
safe = _SafeWorker(worker)
|
|
149
|
+
item_list = list(items)
|
|
150
|
+
cs = chunksize if chunksize is not None else _auto_chunksize(len(item_list), workers)
|
|
151
|
+
with ProcessPoolExecutor(max_workers=workers) as ex:
|
|
152
|
+
return list(
|
|
153
|
+
tqdm(ex.map(safe, item_list, chunksize=cs), total=len(item_list), unit='file')
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
def glob_paths(dir_path: Path, pattern: str = '*.html') -> list[str]:
    '''Return the paths under ``dir_path`` that match ``pattern``, as sorted strings.

    Paths are returned as ``str`` so they are easy to pickle when handed to
    a process pool such as ``pool_map``.

    The list is sorted because glob results come back in a
    filesystem-dependent order; sorting makes repeated runs process files
    in the same sequence.

    Args:
        dir_path: Directory to search.
        pattern: Glob pattern relative to ``dir_path``.

    Returns:
        Matching paths as strings in ascending order; empty when nothing
        matches.
    '''
    return sorted(str(found) for found in dir_path.glob(pattern))
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: domx
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: 自分用・非汎用
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: patchright>=1.40
|
|
9
|
+
Requires-Dist: playwright>=1.40
|
|
10
|
+
Requires-Dist: selectolax>=0.3
|
|
11
|
+
Requires-Dist: pyarrow>=14.0
|
|
12
|
+
Requires-Dist: camoufox>=0.4
|
|
13
|
+
Requires-Dist: loguru>=0.7
|
|
14
|
+
Requires-Dist: tqdm>=4.66
|
|
15
|
+
|
|
16
|
+
# domx
|
|
17
|
+
|
|
18
|
+
自分用・非汎用
|
|
19
|
+
|
|
20
|
+
## インストール
|
|
21
|
+
|
|
22
|
+
`uv add domx`
|
|
23
|
+
`uv run patchright install chromium`
|
|
24
|
+
`uv run camoufox fetch`
|
|
25
|
+
|
|
26
|
+
## 使用例
|
|
27
|
+
|
|
28
|
+
### スクレイピング
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
from domx import wrap_page
|
|
32
|
+
from domx.browser import patchright_page
|
|
33
|
+
from domx.utils import append_csv, from_here, save_log, write_bytes
|
|
34
|
+
|
|
35
|
+
here = from_here(__file__)
|
|
36
|
+
save_log(here('log/scraping.log'))
|
|
37
|
+
|
|
38
|
+
with patchright_page() as page:
|
|
39
|
+
p = wrap_page(page)
|
|
40
|
+
|
|
41
|
+
p.goto('https://www.foobarbaz1.jp')
|
|
42
|
+
pref_urls = p.ss('li.item > ul > li > a').urls
|
|
43
|
+
|
|
44
|
+
classroom_urls = []
|
|
45
|
+
for i, url in enumerate(pref_urls, 1):
|
|
46
|
+
print(f'pref_urls {i}/{len(pref_urls)}')
|
|
47
|
+
if not p.goto(url):
|
|
48
|
+
append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
|
|
49
|
+
continue
|
|
50
|
+
classroom_urls.extend(p.ss('.school-area h4 a').urls)
|
|
51
|
+
|
|
52
|
+
for i, url in enumerate(classroom_urls, 1):
|
|
53
|
+
print(f'classroom_urls {i}/{len(classroom_urls)}')
|
|
54
|
+
if not p.goto(url):
|
|
55
|
+
append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
|
|
56
|
+
continue
|
|
57
|
+
th_grep = p.ss('th').re
|
|
58
|
+
append_csv(here('csv/scrape.csv'), {
|
|
59
|
+
'id': i,
|
|
60
|
+
'URL': page.url,
|
|
61
|
+
'教室名': p.s('h1 .text01').text,
|
|
62
|
+
'住所': p.s('.item .mapText').text,
|
|
63
|
+
'電話番号': p.s('.item .phoneNumber').text,
|
|
64
|
+
'HP': th_grep.s(r'ホームページ').next('td').s('a').url,
|
|
65
|
+
'営業時間': th_grep.s(r'営業時間').next('td').text,
|
|
66
|
+
'定休日': th_grep.s(r'定休日').next('td').text,
|
|
67
|
+
})
|
|
68
|
+
p.s('.school-map').screenshot(here(f'media/{i}-screenshot.png'))
|
|
69
|
+
if (img_url := p.s('.school-area img').src):
|
|
70
|
+
if (res := p.goto(img_url)) and res.ok:
|
|
71
|
+
write_bytes(here(f'media/{i}-img.jpg'), res.body())
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### スクレイピング(スクショと画像も保存)
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import time
|
|
78
|
+
from urllib.parse import urlencode
|
|
79
|
+
|
|
80
|
+
from domx import wrap_page
|
|
81
|
+
from domx.browser import patchright_page
|
|
82
|
+
from domx.utils import save_log, append_csv, from_here, write_bytes
|
|
83
|
+
|
|
84
|
+
here = from_here(__file__)
|
|
85
|
+
save_log(here('log/scraping.log'))
|
|
86
|
+
|
|
87
|
+
with patchright_page() as page:
|
|
88
|
+
p = wrap_page(page)
|
|
89
|
+
|
|
90
|
+
p.goto('https://example.com/demo/search')
|
|
91
|
+
prefecture_urls = p.ss('li > a[href^="https://example.com/demo/search/area/"]').urls
|
|
92
|
+
|
|
93
|
+
bukken_urls = []
|
|
94
|
+
for i, prefecture_url in enumerate(prefecture_urls, 1):
|
|
95
|
+
print(f'{i}/{len(prefecture_urls)} エリア一覧ページ')
|
|
96
|
+
page_num = 1
|
|
97
|
+
while True:
|
|
98
|
+
if not p.goto(f'{prefecture_url}?{urlencode({"page": page_num})}'):
|
|
99
|
+
break
|
|
100
|
+
if not (bukken_elems := p.ss('ul li div a[href^="https://example.com"]:has(p)')):
|
|
101
|
+
break
|
|
102
|
+
bukken_urls.extend(bukken_elems.urls)
|
|
103
|
+
page_num += 1
|
|
104
|
+
|
|
105
|
+
for i, url in enumerate(bukken_urls, 1):
|
|
106
|
+
print(f'{i}/{len(bukken_urls)} 詳細ページ {url}')
|
|
107
|
+
if not p.goto(url):
|
|
108
|
+
append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
|
|
109
|
+
continue
|
|
110
|
+
|
|
111
|
+
dt_grep = p.ss('h4').re.s(r'概要').next('div:has(dl)').ss('dt').re
|
|
112
|
+
dd_text = lambda pattern: dt_grep.s(pattern).next('dd').text
|
|
113
|
+
|
|
114
|
+
append_csv(here('csv/scrape.csv'), {
|
|
115
|
+
'id': i,
|
|
116
|
+
'URL': page.url,
|
|
117
|
+
'価格': dd_text(r'価格'),
|
|
118
|
+
'所在地': dd_text(r'所在地'),
|
|
119
|
+
'交通': dd_text(r'交通'),
|
|
120
|
+
'駐車場': dd_text(r'駐車場'),
|
|
121
|
+
'備考': dd_text(r'備考'),
|
|
122
|
+
'情報更新日': dd_text(r'情報更新日'),
|
|
123
|
+
})
|
|
124
|
+
|
|
125
|
+
page.add_style_tag(content='header, footer.site-footer { visibility: hidden !important; }')
|
|
126
|
+
|
|
127
|
+
p.ss('h4').re.s(r'概要').next('div:has(dl)').screenshot(path=here(f'media/{i}-summary.png'))
|
|
128
|
+
|
|
129
|
+
elem_iframe = p.s('iframe[src^="https://example.com"]')
|
|
130
|
+
elem_iframe.scroll_into_view()
|
|
131
|
+
time.sleep(3)
|
|
132
|
+
elem_iframe.screenshot(path=here(f'media/{i}-iframe.png'))
|
|
133
|
+
|
|
134
|
+
main_img_url = p.s('img.w-full.object-contain').src
|
|
135
|
+
|
|
136
|
+
img_desc_grep = p.ss('p.text-left').re.s(r'画像をクリック').next('ul').ss('li p').re
|
|
137
|
+
img_desc = img_desc_grep.s(r'表紙') or img_desc_grep.s(r'^(?!.*裏面).*')
|
|
138
|
+
img_url = img_desc.parent('li').s('a').url
|
|
139
|
+
|
|
140
|
+
if main_img_url and (res := p.goto(main_img_url)) and res.ok:
|
|
141
|
+
write_bytes(here(f'media/{i}-main-img.jpg'), res.body())
|
|
142
|
+
if img_url and (res := p.goto(img_url)) and res.ok:
|
|
143
|
+
write_bytes(here(f'media/{i}-img-desc.jpg'), res.body())
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### スクレイピング(HTML丸ごと保存)
|
|
147
|
+
|
|
148
|
+
```python
|
|
149
|
+
from domx import wrap_page
|
|
150
|
+
from domx.browser import camoufox_page
|
|
151
|
+
from domx.utils import append_csv, from_here, hash_name, save_log, write_text
|
|
152
|
+
|
|
153
|
+
here = from_here(__file__)
|
|
154
|
+
save_log(here('log/scraping.log'))
|
|
155
|
+
|
|
156
|
+
with camoufox_page() as page:
|
|
157
|
+
p = wrap_page(page)
|
|
158
|
+
|
|
159
|
+
p.goto('https://www.foobarbaz1.jp')
|
|
160
|
+
item_urls = p.ss('ul.items > li > a').urls
|
|
161
|
+
|
|
162
|
+
for i, url in enumerate(item_urls, 1):
|
|
163
|
+
print(f'item_urls {i}/{len(item_urls)}')
|
|
164
|
+
if not p.goto(url):
|
|
165
|
+
append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'goto'})
|
|
166
|
+
continue
|
|
167
|
+
file_name = f'{hash_name(url)}.html'
|
|
168
|
+
if not write_text(here('html') / file_name, p.html(with_url=True, with_saved_at=True)):
|
|
169
|
+
append_csv(here('csv/failed.csv'), {'url': url, 'reason': 'write_text'})
|
|
170
|
+
continue
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### ローカルHTMLからデータ抽出&Parquet出力
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
from domx import wrap_parser
|
|
177
|
+
from domx.utils import from_here, parse_html, save_log, write_parquet
|
|
178
|
+
|
|
179
|
+
here = from_here(__file__)
|
|
180
|
+
save_log(here('log/scraping.log'))
|
|
181
|
+
|
|
182
|
+
results = []
|
|
183
|
+
for i, file_path in enumerate(here('html').glob('*.html'), 1):
|
|
184
|
+
print(f'html {i}')
|
|
185
|
+
if not (parser := parse_html(file_path)):
|
|
186
|
+
continue
|
|
187
|
+
p = wrap_parser(parser)
|
|
188
|
+
dts = p.ss('dt').re
|
|
189
|
+
results.append({
|
|
190
|
+
'URL': p.url,
|
|
191
|
+
'file_name': file_path.name,
|
|
192
|
+
'教室名': p.s('h1 .text02').text,
|
|
193
|
+
'住所': p.s('.item .mapText').text,
|
|
194
|
+
'所在地': dts.s(r'所在地').next('dd').text,
|
|
195
|
+
'交通': dts.s(r'交通').next('dd').text,
|
|
196
|
+
'物件番号': dts.s(r'物件番号').next('dd').text,
|
|
197
|
+
})
|
|
198
|
+
write_parquet(here('parquet/extract.parquet'), results)
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### ローカルHTMLからデータ抽出&Parquet出力(並列処理)
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
from pathlib import Path
|
|
205
|
+
|
|
206
|
+
from domx import wrap_parser
|
|
207
|
+
from domx.utils import from_here, glob_paths, parse_html, pool_map, write_parquet
|
|
208
|
+
|
|
209
|
+
def main():
|
|
210
|
+
here = from_here(__file__)
|
|
211
|
+
html_paths = glob_paths(here('html'), '*.html')
|
|
212
|
+
results = [r for r in pool_map(extract, html_paths) if r]
|
|
213
|
+
write_parquet(here('parquet/extract.parquet'), results)
|
|
214
|
+
|
|
215
|
+
def extract(file_path: str) -> dict | None:
|
|
216
|
+
if not (parser := parse_html(Path(file_path))):
|
|
217
|
+
return None
|
|
218
|
+
p = wrap_parser(parser)
|
|
219
|
+
dts = p.ss('dt').re
|
|
220
|
+
return {
|
|
221
|
+
'URL': p.url,
|
|
222
|
+
'file_path': file_path,
|
|
223
|
+
'教室名': p.s('h1 .text02').text,
|
|
224
|
+
'住所': p.s('.item .mapText').text,
|
|
225
|
+
'所在地': dts.s(r'所在地').next('dd').text,
|
|
226
|
+
'交通': dts.s(r'交通').next('dd').text,
|
|
227
|
+
'価格': dts.s(r'価格').next('dd').text,
|
|
228
|
+
'設備・条件': dts.s(r'設備').next('dd').text,
|
|
229
|
+
'備考': dts.s(r'備考').next('dd').text,
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
if __name__ == '__main__':
|
|
233
|
+
main()
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
## License - ライセンス
|
|
237
|
+
|
|
238
|
+
[MIT](./LICENSE)
|
|
239
|
+
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
domx/__init__.py,sha256=F2csRoeqAMIz_PlKzNBZLC75GZ6Bhypj1y8oaglaVkI,692
|
|
2
|
+
domx/browser.py,sha256=nhhabsYOed42va8b5GMt598fG0aOkVIQtwKVPwWph8g,1206
|
|
3
|
+
domx/domx.py,sha256=QjgDTtTDRD0PpIL_2bUfe9abgnFaxGebYcGgyby901c,18219
|
|
4
|
+
domx/utils.py,sha256=gnp15-2Et6rKs7ULgFAGZmQnMvkYZ4VnxN-a1gksPEU,6676
|
|
5
|
+
domx-0.1.1.dist-info/licenses/LICENSE,sha256=q8ED812OTMMCwQSdHvtx6PSnmtRIotcIjKPHMmVseQI,1096
|
|
6
|
+
domx-0.1.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
|
|
7
|
+
domx-0.1.1.dist-info/METADATA,sha256=D1bZj6wntjgx5E0YuDv_d7nSox1f0zXJ-iMSbCPSLow,7876
|
|
8
|
+
domx-0.1.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Nishizawa Takamasa
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|