quickquery 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
quickquery/core.py ADDED
@@ -0,0 +1,688 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable, Iterator
4
+ import random
5
+ import re
6
+ import time
7
+ import unicodedata as ud
8
+ from pathlib import Path
9
+ from typing import Literal
10
+ from urllib.parse import urljoin
11
+
12
+ from loguru import logger
13
+ from patchright.sync_api import Frame as PatchFrame, Page as PatchPage, ElementHandle as PatchElementHandle, Response as PatchResponse
14
+ from playwright.sync_api import Frame as PlayFrame, Page as PlayPage, ElementHandle as PlayElementHandle, Response as PlayResponse
15
+ from selectolax.lexbor import LexborHTMLParser, LexborNode
16
+
17
+
18
# Union aliases: every wrapper below accepts the patchright or the playwright
# flavour of the same object.
Frame = PatchFrame | PlayFrame
Page = PatchPage | PlayPage
ElementHandle = PatchElementHandle | PlayElementHandle
Response = PatchResponse | PlayResponse

# Attribute values starting with one of these prefixes are rejected by
# QuickElement._resolved_url_from_attr instead of being joined to the page URL.
_UNUSABLE_INLINE_URL = re.compile(r'(?i)^(?:#|javascript:|mailto:|tel:|data:)')

# DOM property names followed, browser-side, by QuickElement._walk_relative.
_ELEMENT_NEXT, _ELEMENT_PREV, _ELEMENT_PARENT = (
    'nextElementSibling',
    'previousElementSibling',
    'parentElement',
)

# Node attribute names followed by QuickNode._walk_relative.
_NODE_NEXT, _NODE_PREV, _NODE_PARENT = 'next', 'prev', 'parent'
32
+
33
+
34
+ def _collect_str[T](items: list[T], getter: Callable[[T], str | None]) -> list[str]:
35
+ return [v for item in items if (v := getter(item))]
36
+
37
+
38
def quick_page(page: Page) -> QuickPage:
    '''Wrap ``page`` in a QuickPage.'''
    wrapper = QuickPage(page)
    return wrapper
40
+
41
+
42
def quick_element(page: Page, elem: ElementHandle | None) -> QuickElement:
    '''Wrap ``elem`` (which may be ``None``) in a QuickElement bound to ``page``.'''
    wrapper = QuickElement(page, elem)
    return wrapper
44
+
45
+
46
def quick_element_group(page: Page, elems: list[QuickElement]) -> QuickElementGroup:
    '''Bundle already-wrapped elements into a QuickElementGroup bound to ``page``.'''
    group = QuickElementGroup(page, elems)
    return group
48
+
49
+
50
def quick_frame(page: Page, frame: Frame | None) -> QuickFrame:
    '''Wrap ``frame`` (which may be ``None``) in a QuickFrame bound to ``page``.'''
    wrapper = QuickFrame(page, frame)
    return wrapper
52
+
53
+
54
def quick_shadow_root(page: Page, host: ElementHandle | None) -> QuickShadowRoot:
    '''Wrap the shadow host ``host`` (which may be ``None``) in a QuickShadowRoot.'''
    wrapper = QuickShadowRoot(page, host)
    return wrapper
56
+
57
+
58
class _PageScoped:
    '''Mixin for page-bound wrappers: builds sibling wrappers on the same ``_page``.'''

    # Assigned by each subclass in its ``__init__``.
    _page: Page

    def quick_element(self, elem: ElementHandle | None) -> QuickElement:
        '''Wrap ``elem`` (may be ``None``) as a QuickElement on this page.'''
        return QuickElement(self._page, elem)

    def quick_element_group(self, elems: list[QuickElement]) -> QuickElementGroup:
        '''Bundle wrapped elements into a QuickElementGroup on this page.'''
        return QuickElementGroup(self._page, elems)

    def quick_frame(self, frame: Frame | None) -> QuickFrame:
        '''Wrap ``frame`` (may be ``None``) as a QuickFrame on this page.'''
        return QuickFrame(self._page, frame)

    def quick_shadow_root(self, host: ElementHandle | None) -> QuickShadowRoot:
        '''Wrap the shadow host ``host`` (may be ``None``) as a QuickShadowRoot on this page.'''
        return QuickShadowRoot(self._page, host)
72
+
73
+
74
def quick_parser(parser: LexborHTMLParser) -> QuickParser:
    '''Wrap a parsed document in a QuickParser.'''
    wrapper = QuickParser(parser)
    return wrapper
76
+
77
+
78
def quick_node(node: LexborNode | None) -> QuickNode:
    '''Wrap ``node`` (which may be ``None``) in a QuickNode.'''
    wrapper = QuickNode(node)
    return wrapper
80
+
81
+
82
def quick_node_group(nodes: list[QuickNode]) -> QuickNodeGroup:
    '''Bundle already-wrapped nodes into a QuickNodeGroup.'''
    group = QuickNodeGroup(nodes)
    return group
84
+
85
+
86
class QuickPage(_PageScoped):
    '''Thin wrapper around a ``Page`` with terse query, wait and navigation helpers.

    Lookups never return ``None``: a miss is reported as a QuickElement that
    wraps ``None`` (falsy) or as an empty QuickElementGroup.
    '''

    def __init__(self, page: Page) -> None:
        self._page = page

    @property
    def raw(self) -> Page:
        '''The wrapped page object.'''
        return self._page

    def i(self, selector: str) -> QuickElement:
        '''in: first element on the page matching ``selector``.'''
        elem = self._page.query_selector(selector)
        return self.quick_element(elem)

    def ii(self, selector: str) -> QuickElementGroup:
        '''in all: every element on the page matching ``selector``.'''
        elems = self._page.query_selector_all(selector)
        return self.quick_element_group([self.quick_element(e) for e in elems])

    def goto(
        self,
        url: str | None,
        try_cnt: int = 3,
        wait_range: tuple[float, float] = (3, 5),
        sleep_after: tuple[float, float] | None = (1, 2),
    ) -> Response | None:
        '''Navigate to ``url``, retrying when the attempt fails.

        An attempt fails when ``page.goto`` raises or returns ``None``; each
        failure is logged as a warning.

        Args:
            url: Target URL. A falsy value short-circuits to ``None``.
            try_cnt: Maximum number of attempts.
            wait_range: ``(low, high)`` seconds; a uniformly random pause from
                this range is taken between failed attempts (not after the last).
            sleep_after: ``(low, high)`` seconds to pause after a successful
                navigation, or ``None`` to return immediately.

        Returns:
            The response of the first successful attempt, or ``None`` when
            ``url`` is empty or every attempt failed.
        '''
        if not url:
            return None
        for i in range(try_cnt):
            try:
                response = self._page.goto(url)
                if response is not None:
                    if sleep_after is not None:
                        time.sleep(random.uniform(*sleep_after))
                    return response
                reason = 'response is None'
            except Exception as e:
                reason = f'{type(e).__name__}: {e}'
            logger.warning(f'[goto] retry ({i+1}/{try_cnt}) {reason}: {url!r}')
            if i + 1 < try_cnt:
                time.sleep(random.uniform(*wait_range))
        logger.error(f'[goto] retries exhausted ({try_cnt}): {url!r}')
        return None

    def bytes_at(self, url: str | None) -> bytes | None:
        '''Fetch ``url`` in a temporary page of the same context and return the body.

        Best effort: every failure (page creation, navigation, non-OK status,
        body read, page close) is logged as a warning and never raised.

        Returns:
            The response body when the response is OK, otherwise ``None``.
        '''
        if not url:
            return None
        new_page = None
        try:
            # Created inside the try so a failure here is reported like any
            # other fetch failure instead of escaping to the caller.
            new_page = self._page.context.new_page()
            res = QuickPage(new_page).goto(url)
            if not res:
                return None
            if res.ok:
                return res.body()
            logger.warning(
                f'[bytes_at] HTTP {res.status} {res.status_text!r} | url={url!r} | response_url={res.url!r}'
            )
            return None
        except Exception as e:
            logger.warning(f'[bytes_at] {type(e).__name__}: {e} | url={url!r}')
            return None
        finally:
            if new_page is not None:
                try:
                    new_page.close()
                except Exception as e:
                    # A failing close must not replace the value being returned.
                    logger.warning(f'[bytes_at] close failed: {type(e).__name__}: {e} | url={url!r}')

    def w(self, selector: str, state: str = 'attached', timeout: int = 15000) -> QuickElement:
        '''wait: block until ``selector`` reaches ``state`` on the page.

        ``state`` and ``timeout`` are forwarded unchanged to
        ``page.wait_for_selector``. Any exception (including a timeout) is
        logged and an empty QuickElement is returned instead.
        '''
        try:
            elem = self._page.wait_for_selector(selector, state=state, timeout=timeout)
            return self.quick_element(elem)
        except Exception as e:
            logger.warning(f'[wait] {type(e).__name__}: {e} | selector={selector!r} | url={self._page.url!r}')
            return self.quick_element(None)
157
+
158
+
159
class QuickElement(_PageScoped):
    '''Null-safe wrapper around an ``ElementHandle``.

    Wrapping ``None`` is allowed: such an instance is falsy, and its lookups
    and accessors hand back empty wrappers or ``None`` instead of raising.
    '''

    def __init__(self, page: Page, elem: ElementHandle | None) -> None:
        self._page = page
        self._elem = elem

    def __bool__(self) -> bool:
        '''True when a handle is actually wrapped.'''
        return self._elem is not None

    @property
    def raw(self) -> ElementHandle | None:
        '''The wrapped handle, or ``None``.'''
        return self._elem

    def i(self, selector: str) -> QuickElement:
        '''in: first descendant matching ``selector``.'''
        handle = None if self._elem is None else self._elem.query_selector(selector)
        return self.quick_element(handle)

    def ii(self, selector: str) -> QuickElementGroup:
        '''in all: every descendant matching ``selector``.'''
        if self._elem is None:
            handles = []
        else:
            handles = self._elem.query_selector_all(selector)
        return self.quick_element_group([self.quick_element(h) for h in handles])

    @property
    def frame(self) -> QuickFrame:
        '''Content frame of this element; an empty QuickFrame on miss or error.'''
        content = None
        if self._elem is not None:
            try:
                content = self._elem.content_frame()
            except Exception as e:
                logger.error(f'[frame] {type(e).__name__}: {e}')
        return self.quick_frame(content)

    @property
    def shadow(self) -> QuickShadowRoot:
        '''Shadow-root view that uses this element as the host.'''
        return self.quick_shadow_root(self._elem)

    def _walk_relative(self, selector: str, axis: str, label: str) -> QuickElement:
        '''Follow the DOM property ``axis`` until an element matches ``selector``.

        The walk runs in the browser and starts at the first hop (this element
        itself is never tested). ``label`` only tags the error log line.
        '''
        found = None
        if self._elem is not None:
            script = '''(el, args) => {
                const [sel, axis] = args;
                let cur = el[axis];
                while (cur) {
                    if (cur.matches(sel)) return cur;
                    cur = cur[axis];
                }
                return null;
            }'''
            try:
                js_handle = self._elem.evaluate_handle(script, [selector, axis])
                found = js_handle.as_element()
            except Exception as e:
                logger.error(f'[{label}] {self._elem} {type(e).__name__}: {e}')
        return self.quick_element(found)

    def n(self, selector: str) -> QuickElement:
        '''next: nearest following sibling element matching ``selector``.'''
        return self._walk_relative(selector, _ELEMENT_NEXT, 'n')

    def p(self, selector: str) -> QuickElement:
        '''prev: nearest preceding sibling element matching ``selector``.'''
        return self._walk_relative(selector, _ELEMENT_PREV, 'p')

    def o(self, selector: str) -> QuickElement:
        '''out: nearest ancestor element matching ``selector``.'''
        return self._walk_relative(selector, _ELEMENT_PARENT, 'o')

    @property
    def text(self) -> str | None:
        '''``text_content()`` of the element; ``None`` when absent or empty.'''
        if self._elem is None:
            return None
        content = self._elem.text_content()
        return content or None

    def attr(self, attr_name: str) -> str | None:
        '''Value of attribute ``attr_name``; ``None`` when absent or empty.'''
        if self._elem is None:
            return None
        value = self._elem.get_attribute(attr_name)
        return value or None

    def _resolved_url_from_attr(self, attr_name: str) -> str | None:
        '''Read a URL-bearing attribute and resolve it against the page URL.

        Returns ``None`` for a missing element, a missing or blank attribute,
        or a value rejected by ``_UNUSABLE_INLINE_URL``.
        '''
        if self._elem is None:
            return None
        raw_value = self._elem.get_attribute(attr_name)
        candidate = raw_value.strip() if raw_value else ''
        if not candidate or _UNUSABLE_INLINE_URL.search(candidate):
            return None
        return urljoin(self._page.url, candidate)

    @property
    def url(self) -> str | None:
        '''Resolved ``href`` of the element, if usable.'''
        return self._resolved_url_from_attr('href')

    @property
    def src(self) -> str | None:
        '''Resolved ``src`` of the element, if usable.'''
        return self._resolved_url_from_attr('src')

    def scroll_into_view(self) -> None:
        '''Smooth-scroll the element to the viewport centre and wait until it is stable.

        Failures are logged as warnings, never raised.
        '''
        target = self._elem
        if target is None:
            logger.warning('[scroll_into_view] element is None')
            return
        try:
            target.evaluate(
                '''(el) => el.scrollIntoView({ behavior: "smooth", block: "center", inline: "nearest" });'''
            )
            target.wait_for_element_state('stable')
        except Exception as e:
            logger.warning(f'[scroll_into_view] {type(e).__name__}: {e} | url={self._page.url!r}')

    @staticmethod
    def _isolate_visibility_css(scope: str, attr: str) -> str:
        '''CSS hiding everything under ``scope`` except nodes marked with ``attr``.'''
        rules = [
            f'{scope} * {{',
            ' visibility: hidden !important;',
            '}',
            f'[{attr}],',
            f'[{attr}] * {{',
            ' visibility: visible !important;',
            '}',
        ]
        return '\n'.join(rules) + '\n'

    def _isolate_apply(self, attr: str, css: str, style_id: str) -> None:
        '''Mark the element with ``attr`` and inject ``css`` as a ``<style id=style_id>``.'''
        script = '''(el, args) => {
            const [attr, css, styleId] = args;
            el.setAttribute(attr, '');
            const s = document.createElement('style');
            s.id = styleId;
            s.textContent = css;
            (document.head || document.documentElement).appendChild(s);
        }'''
        self._elem.evaluate(script, [attr, css, style_id])

    def _isolate_remove(self, attr: str, style_id: str) -> None:
        '''Undo ``_isolate_apply``; a failure is only logged.'''
        script = '''(el, args) => {
            const [attr, styleId] = args;
            el.removeAttribute(attr);
            const node = document.getElementById(styleId);
            if (node) node.remove();
        }'''
        try:
            self._elem.evaluate(script, [attr, style_id])
        except Exception as e:
            logger.warning(
                f'[screenshot isolate cleanup] {type(e).__name__}: {e} | url={self._page.url!r}'
            )

    def screenshot(
        self,
        path: Path,
        image_type: Literal['png', 'jpeg'] = 'png',
        *,
        isolate: bool = False,
        isolate_scope: str = 'body',
        isolate_attr: str = 'data-quickquery-screenshot-root',
        isolate_style_id: str = 'quickquery-screenshot-isolate',
    ) -> bool:
        '''Save a screenshot of the element to ``path``.

        Args:
            path: Destination file; missing parent directories are created.
            image_type: Image format passed to the handle's ``screenshot``.
            isolate: When true, temporarily hide everything under
                ``isolate_scope`` except this element while capturing.
            isolate_scope: CSS selector whose descendants get hidden.
            isolate_attr: Marker attribute put on the element during capture.
            isolate_style_id: Prefix of the temporary ``<style>`` id; a
                nanosecond timestamp is appended.

        Returns:
            ``True`` on success; ``False`` when no element is wrapped or any
            step raised (the error is logged as a warning).
        '''
        if self._elem is None:
            logger.warning('[screenshot] element is None')
            return False
        # The id is only needed (and only generated) when isolating.
        style_id = f'{isolate_style_id}-{time.time_ns()}' if isolate else None
        try:
            path.parent.mkdir(parents=True, exist_ok=True)
            if isolate:
                self._isolate_apply(
                    isolate_attr,
                    self._isolate_visibility_css(isolate_scope, isolate_attr),
                    style_id,
                )
            self._elem.screenshot(path=path, type=image_type, animations='disabled')
        except Exception as e:
            logger.warning(f'[screenshot] {type(e).__name__}: {e} | url={self._page.url!r}')
            return False
        else:
            return True
        finally:
            # Always attempt cleanup so a failed capture cannot leave the page hidden.
            if isolate:
                self._isolate_remove(isolate_attr, style_id)
347
+
348
+
349
class QuickElementGroup(_PageScoped):
    '''Ordered collection of QuickElement objects that share one page.'''

    def __init__(self, page: Page, elems: list[QuickElement]) -> None:
        self._page = page
        self._elems = elems

    def __iter__(self) -> Iterator[QuickElement]:
        return iter(self._elems)

    def __len__(self) -> int:
        return len(self._elems)

    def __getitem__(self, key: int | slice) -> QuickElement | QuickElementGroup:
        '''Index returns one element; a slice returns a new group on the same page.'''
        picked = self._elems[key]
        if isinstance(key, slice):
            return QuickElementGroup(self._page, picked)
        return picked

    def __add__(self, other: QuickElementGroup) -> QuickElementGroup:
        '''Concatenate two groups bound to the same page.

        Raises:
            TypeError: ``other`` is not a QuickElementGroup.
            ValueError: the two groups are bound to different page objects.
        '''
        if not isinstance(other, QuickElementGroup):
            message = (
                'QuickElementGroup 同士のみ + で結合できます '
                f'(右辺は {type(other).__name__})'
            )
            raise TypeError(message)
        if self._page is not other._page:
            raise ValueError('異なる Page に紐づいた QuickElementGroup は結合できません')
        combined = self._elems + other._elems
        return QuickElementGroup(self._page, combined)

    @property
    def raw(self) -> list[QuickElement]:
        '''The underlying list (not a copy).'''
        return self._elems

    @property
    def scan(self) -> ElementScan:
        '''Regex-searchable view pairing each element with its NFKC-normalized text.

        Elements whose text is ``None`` or empty are left out.
        '''
        pairs: list[tuple[str, QuickElement]] = [
            (ud.normalize('NFKC', t), elem) for elem in self._elems if (t := elem.text)
        ]
        return ElementScan(self._page, pairs)

    @property
    def texts(self) -> list[str]:
        '''Non-empty texts of the elements, in order.'''
        return [t for elem in self._elems if (t := elem.text)]

    def attrs(self, attr_name: str) -> list[str]:
        '''Non-empty values of ``attr_name`` across the elements, in order.'''
        return [v for elem in self._elems if (v := elem.attr(attr_name))]

    @property
    def urls(self) -> list[str]:
        '''Usable resolved ``href`` values, in order.'''
        return [u for elem in self._elems if (u := elem.url)]

    @property
    def srcs(self) -> list[str]:
        '''Usable resolved ``src`` values, in order.'''
        return [s for elem in self._elems if (s := elem.src)]
401
+
402
+
403
class ElementScan(_PageScoped):
    '''Regex search over ``(text, element)`` pairs built by ``QuickElementGroup.scan``.'''

    def __init__(self, page: Page, pairs: list[tuple[str, QuickElement]]) -> None:
        self._page = page
        self._pairs = pairs

    def m(self, pattern: str) -> QuickElement:
        '''match: first element whose text is found by ``re.search(pattern, text)``.

        Returns an empty QuickElement when nothing matches or the search
        raises (the error is logged as a warning).
        '''
        hit = None
        try:
            prog = re.compile(pattern)
            hit = next((elem for text, elem in self._pairs if prog.search(text)), None)
        except Exception as e:
            logger.warning(f'[scan] {type(e).__name__}: {e} | pattern={pattern!r}')
        # Compare against None explicitly: QuickElement defines its own truthiness.
        if hit is not None:
            return hit
        return self.quick_element(None)

    def mm(self, pattern: str) -> QuickElementGroup:
        '''match all: every element whose text is found by ``pattern``, in order.

        Returns an empty group when the search raises (logged as a warning).
        '''
        hits: list[QuickElement] = []
        try:
            prog = re.compile(pattern)
            hits = [elem for text, elem in self._pairs if prog.search(text)]
        except Exception as e:
            logger.warning(f'[scan] {type(e).__name__}: {e} | pattern={pattern!r}')
            hits = []
        return self.quick_element_group(hits)
428
+
429
+
430
class QuickFrame(_PageScoped):
    '''Null-safe wrapper around a ``Frame``; wrapping ``None`` gives a falsy instance.'''

    def __init__(self, page: Page, frame: Frame | None) -> None:
        self._page = page
        self._frame = frame

    def __bool__(self) -> bool:
        '''True when a frame is actually wrapped.'''
        return self._frame is not None

    @property
    def raw(self) -> Frame | None:
        '''The wrapped frame, or ``None``.'''
        return self._frame

    def i(self, selector: str) -> QuickElement:
        '''in: first element in the frame matching ``selector``.'''
        handle = None if self._frame is None else self._frame.query_selector(selector)
        return self.quick_element(handle)

    def ii(self, selector: str) -> QuickElementGroup:
        '''in all: every element in the frame matching ``selector``.'''
        if self._frame is None:
            handles = []
        else:
            handles = self._frame.query_selector_all(selector)
        return self.quick_element_group([self.quick_element(h) for h in handles])

    def w(self, selector: str, state: str = 'attached', timeout: int = 15000) -> QuickElement:
        '''wait: block until ``selector`` reaches ``state`` inside the frame.

        ``state`` and ``timeout`` are forwarded unchanged to
        ``frame.wait_for_selector``. Any exception is logged as a warning and
        an empty QuickElement is returned.
        '''
        if self._frame is None:
            return self.quick_element(None)
        waited = None
        try:
            waited = self._frame.wait_for_selector(selector, state=state, timeout=timeout)
        except Exception as e:
            logger.warning(
                f'[wait] {type(e).__name__}: {e} | selector={selector!r} | url={self._page.url!r}'
            )
        return self.quick_element(waited)
468
+
469
+
470
class QuickShadowRoot(_PageScoped):
    '''Query helper scoped to the shadow root of a host element.

    The host handle may be ``None``. All lookups are best effort: errors are
    logged and an empty wrapper is returned.
    '''

    def __init__(self, page: Page, host: ElementHandle | None) -> None:
        self._page = page
        self._host = host

    def __bool__(self) -> bool:
        '''True when a host is wrapped and ``el.shadowRoot`` is currently set.

        This evaluates a script in the browser on every call.
        '''
        if self._host is None:
            return False
        try:
            return bool(self._host.evaluate('el => Boolean(el.shadowRoot)'))
        except Exception as e:
            logger.error(f'[shadow] {type(e).__name__}: {e}')
            return False

    def i(self, selector: str) -> QuickElement:
        '''in: first element inside the shadow root matching ``selector``.'''
        if not self:
            return self.quick_element(None)
        try:
            elem = self._host.evaluate_handle(
                '(el, sel) => el.shadowRoot?.querySelector(sel) ?? null',
                selector,
            ).as_element()
            return self.quick_element(elem)
        except Exception as e:
            logger.error(f'[shadow i] {type(e).__name__}: {e} | selector={selector!r}')
            return self.quick_element(None)

    def ii(self, selector: str) -> QuickElementGroup:
        '''in all: every element inside the shadow root matching ``selector``.

        All matches are captured by a single browser-side ``querySelectorAll``
        so the group is one consistent snapshot in document order.
        '''
        if not self:
            return self.quick_element_group([])
        try:
            matches = self._host.evaluate_handle(
                '(el, sel) => Array.from(el.shadowRoot?.querySelectorAll(sel) ?? [])',
                selector,
            )
            try:
                # The array's properties are keyed by index strings ('0', '1', ...);
                # sort numerically so the result order does not depend on dict order.
                indexed = sorted(
                    (
                        (int(key), prop)
                        for key, prop in matches.get_properties().items()
                        if key.isdigit()
                    ),
                    key=lambda pair: pair[0],
                )
                elems = [
                    self.quick_element(handle)
                    for _, prop in indexed
                    if (handle := prop.as_element()) is not None
                ]
            finally:
                # Release the temporary array handle; element handles stay valid.
                matches.dispose()
            return self.quick_element_group(elems)
        except Exception as e:
            logger.error(f'[shadow ii] {type(e).__name__}: {e} | selector={selector!r}')
            return self.quick_element_group([])

    def w(self, selector: str, timeout: int = 15000) -> QuickElement:
        '''wait (attached in shadow root only)

        Polls, via the host's owner frame, until ``selector`` matches inside
        the shadow root, then returns that element. ``timeout`` is forwarded
        unchanged to ``frame.wait_for_function``. Any failure is logged as a
        warning and an empty QuickElement is returned.
        '''
        if not self:
            return self.quick_element(None)
        try:
            # owner_frame() talks to the browser too, so it is inside the try.
            frame = self._host.owner_frame()
            if frame is None:
                logger.warning('[shadow wait] owner_frame is None')
                return self.quick_element(None)
            frame.wait_for_function(
                '([el, sel]) => Boolean(el.shadowRoot?.querySelector(sel))',
                [self._host, selector],
                timeout=timeout,
            )
            return self.i(selector)
        except Exception as e:
            logger.warning(
                f'[shadow wait] {type(e).__name__}: {e} | selector={selector!r} | url={self._page.url!r}'
            )
            return self.quick_element(None)
542
+
543
+
544
class QuickParser:
    '''Terse CSS-query wrapper around a ``LexborHTMLParser`` document.'''

    def __init__(self, parser: LexborHTMLParser) -> None:
        self._parser = parser

    @property
    def raw(self) -> LexborHTMLParser:
        '''The wrapped parser.'''
        return self._parser

    def i(self, selector: str) -> QuickNode:
        '''in: first node in the document matching ``selector``.'''
        first = self._parser.css_first(selector)
        return QuickNode(first)

    def ii(self, selector: str) -> QuickNodeGroup:
        '''in all: every node in the document matching ``selector``.'''
        wrapped = [QuickNode(found) for found in self._parser.css(selector)]
        return QuickNodeGroup(wrapped)
561
+
562
+
563
class QuickNode:
    '''Null-safe wrapper around a ``LexborNode``.

    Wrapping ``None`` is allowed: such an instance is falsy, and its lookups
    and accessors return empty wrappers or ``None``.
    '''

    def __init__(self, node: LexborNode | None) -> None:
        self._node = node

    def __bool__(self) -> bool:
        '''True when a node is actually wrapped.'''
        return self._node is not None

    @property
    def raw(self) -> LexborNode | None:
        '''The wrapped node, or ``None``.'''
        return self._node

    def i(self, selector: str) -> QuickNode:
        '''in: first descendant matching ``selector``.'''
        first = None if self._node is None else self._node.css_first(selector)
        return QuickNode(first)

    def ii(self, selector: str) -> QuickNodeGroup:
        '''in all: every descendant matching ``selector``.'''
        found = [] if self._node is None else self._node.css(selector)
        return QuickNodeGroup([QuickNode(item) for item in found])

    def _walk_relative(self, selector: str, axis: str) -> QuickNode:
        '''Follow attribute ``axis`` node by node until an element matches ``selector``.

        The walk starts at the first hop (this node itself is never tested);
        nodes that are not element nodes are skipped.
        '''
        if self._node is None:
            return QuickNode(None)
        candidate = getattr(self._node, axis)
        while candidate is not None and not (
            candidate.is_element_node and candidate.css_matches(selector)
        ):
            candidate = getattr(candidate, axis)
        return QuickNode(candidate)

    def n(self, selector: str) -> QuickNode:
        '''next: nearest following sibling matching ``selector``.'''
        return self._walk_relative(selector, _NODE_NEXT)

    def p(self, selector: str) -> QuickNode:
        '''prev: nearest preceding sibling matching ``selector``.'''
        return self._walk_relative(selector, _NODE_PREV)

    def o(self, selector: str) -> QuickNode:
        '''out: nearest ancestor matching ``selector``.'''
        return self._walk_relative(selector, _NODE_PARENT)

    @property
    def text(self) -> str | None:
        '''``node.text()``; ``None`` when no node is wrapped or the text is empty.'''
        if self._node is None:
            return None
        content = self._node.text()
        return content or None

    def attr(self, attr_name: str) -> str | None:
        '''Value of attribute ``attr_name``; ``None`` when absent or empty.'''
        if self._node is None:
            return None
        value = self._node.attributes.get(attr_name)
        return value or None
620
+
621
+
622
class QuickNodeGroup:
    '''Ordered collection of QuickNode objects.'''

    def __init__(self, nodes: list[QuickNode]) -> None:
        self._nodes = nodes

    def __iter__(self) -> Iterator[QuickNode]:
        return iter(self._nodes)

    def __len__(self) -> int:
        return len(self._nodes)

    def __getitem__(self, key: int | slice) -> QuickNode | QuickNodeGroup:
        '''Index returns one node; a slice returns a new group.'''
        picked = self._nodes[key]
        if isinstance(key, slice):
            return QuickNodeGroup(picked)
        return picked

    def __add__(self, other: QuickNodeGroup) -> QuickNodeGroup:
        '''Concatenate two groups.

        Raises:
            TypeError: ``other`` is not a QuickNodeGroup.
        '''
        if not isinstance(other, QuickNodeGroup):
            message = (
                'QuickNodeGroup 同士のみ + で結合できます '
                f'(右辺は {type(other).__name__})'
            )
            raise TypeError(message)
        combined = self._nodes + other._nodes
        return QuickNodeGroup(combined)

    @property
    def raw(self) -> list[QuickNode]:
        '''The underlying list (not a copy).'''
        return self._nodes

    @property
    def scan(self) -> NodeScan:
        '''Regex-searchable view pairing each node with its NFKC-normalized text.

        Nodes whose text is ``None`` or empty are left out.
        '''
        pairs: list[tuple[str, QuickNode]] = [
            (ud.normalize('NFKC', t), node) for node in self._nodes if (t := node.text)
        ]
        return NodeScan(pairs)

    @property
    def texts(self) -> list[str]:
        '''Non-empty texts of the nodes, in order.'''
        return [t for node in self._nodes if (t := node.text)]

    def attrs(self, attr_name: str) -> list[str]:
        '''Non-empty values of ``attr_name`` across the nodes, in order.'''
        return [v for node in self._nodes if (v := node.attr(attr_name))]
663
+
664
+
665
class NodeScan:
    '''Regex search over ``(text, node)`` pairs built by ``QuickNodeGroup.scan``.'''

    def __init__(self, pairs: list[tuple[str, QuickNode]]) -> None:
        self._pairs = pairs

    def m(self, pattern: str) -> QuickNode:
        '''match: first node whose text is found by ``re.search(pattern, text)``.

        Returns an empty QuickNode when nothing matches or the search raises
        (the error is logged as a warning).
        '''
        hit = None
        try:
            prog = re.compile(pattern)
            hit = next((node for text, node in self._pairs if prog.search(text)), None)
        except Exception as e:
            logger.warning(f'[scan] {type(e).__name__}: {e} | pattern={pattern!r}')
        # Compare against None explicitly: QuickNode defines its own truthiness.
        if hit is not None:
            return hit
        return quick_node(None)

    def mm(self, pattern: str) -> QuickNodeGroup:
        '''match all: every node whose text is found by ``pattern``, in order.

        Returns an empty group when the search raises (logged as a warning).
        '''
        hits: list[QuickNode] = []
        try:
            prog = re.compile(pattern)
            hits = [node for text, node in self._pairs if prog.search(text)]
        except Exception as e:
            logger.warning(f'[scan] {type(e).__name__}: {e} | pattern={pattern!r}')
            hits = []
        return quick_node_group(hits)