quickquery 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quickquery/__init__.py +49 -0
- quickquery/core.py +688 -0
- quickquery/live.py +195 -0
- quickquery/utils.py +199 -0
- quickquery-0.1.1.dist-info/METADATA +338 -0
- quickquery-0.1.1.dist-info/RECORD +8 -0
- quickquery-0.1.1.dist-info/WHEEL +4 -0
- quickquery-0.1.1.dist-info/licenses/LICENSE +21 -0
quickquery/live.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
from contextlib import ExitStack
|
|
2
|
+
from dataclasses import dataclass, fields
|
|
3
|
+
from types import TracebackType
|
|
4
|
+
from typing import Any, Self
|
|
5
|
+
|
|
6
|
+
from camoufox.sync_api import Camoufox
|
|
7
|
+
from patchright.sync_api import (
|
|
8
|
+
Page as PatchrightPage,
|
|
9
|
+
Playwright,
|
|
10
|
+
sync_playwright,
|
|
11
|
+
)
|
|
12
|
+
from playwright.sync_api import Page as PlaywrightPage
|
|
13
|
+
|
|
14
|
+
# Union of the Patchright and Playwright sync ``Page`` classes; used below to
# annotate the page object a session stores and returns from ``page()``.
Page = PatchrightPage | PlaywrightPage
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True, slots=True)
|
|
18
|
+
class RecycleEvery:
|
|
19
|
+
browser: int | None = None
|
|
20
|
+
context: int | None = None
|
|
21
|
+
page: int | None = None
|
|
22
|
+
|
|
23
|
+
def __post_init__(self) -> None:
|
|
24
|
+
for f in fields(self):
|
|
25
|
+
value = getattr(self, f.name)
|
|
26
|
+
if value is not None and value < 1:
|
|
27
|
+
raise ValueError(f'{f.name} は 1 以上で指定してください (got {value})')
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class _SessionBase:
    """Shared page/context lifecycle for the concrete browser sessions.

    Subclasses provide ``__enter__``/``__exit__`` and the browser layer
    (``_open_browser``/``_close_browser``). This base class owns the context
    and page layers and decides, on every ``page()`` call, which layer (if
    any) has to be recycled.
    """

    def __init__(
        self,
        *,
        browser_options: dict[str, Any] | None = None,
        context_options: dict[str, Any] | None = None,
        recycle: RecycleEvery | None = None,
    ) -> None:
        """Store the configuration; nothing is opened here.

        Args:
            browser_options: keyword arguments for the subclass's browser launch.
            context_options: keyword arguments for ``browser.new_context``.
            recycle: recycling intervals; defaults to "never recycle".
        """
        self._recycle = recycle or RecycleEvery()
        # Copy so that later mutation of the caller's dicts does not leak in.
        self._browser_options = dict(browser_options or {})
        self._context_options = dict(context_options or {})
        self._browser = None
        self._context = None
        self._page: Page | None = None
        self._page_calls = 0
        self._entered = False

    def page(self) -> Page:
        """Return the page to use, opening or recycling layers as configured.

        ``_page_calls`` counts the pages handed out so far. When it is a
        multiple of a configured interval, the matching layer is closed and
        reopened before the page is returned. Only the outermost matching
        layer is recycled (browser, then context, then page), because
        reopening an outer layer reopens the inner ones as well.

        Raises:
            RuntimeError: if called outside the ``with`` block.
        """
        if not self._entered:
            raise RuntimeError('with ブロックの外で page() を呼べません')
        if self._page is None:
            # Either the very first call or a previous open/recycle failed
            # part-way. Release whatever is still held before reopening so a
            # live browser is never overwritten and leaked (no-op when
            # nothing is open).
            self._close_browser()
            self._open_browser()
        elif (b := self._recycle.browser) and self._page_calls % b == 0:
            self._close_browser()
            self._open_browser()
        elif (c := self._recycle.context) and self._page_calls % c == 0:
            self._close_context()
            self._open_context()
        elif (p := self._recycle.page) and self._page_calls % p == 0:
            self._close_page()
            self._open_page()
        self._page_calls += 1
        return self._page

    def _open_browser(self) -> None:
        """Open the browser layer and everything below it (subclass hook)."""
        raise NotImplementedError

    def _close_browser(self) -> None:
        """Close the browser layer and everything below it (subclass hook)."""
        raise NotImplementedError

    def _open_page(self) -> None:
        """Create a new page in the current context."""
        self._page = self._context.new_page()

    def _close_page(self) -> None:
        """Close the current page, if any, and forget it."""
        # Drop the reference first so a failing close() cannot leave a stale
        # page behind for the next page() call.
        page, self._page = self._page, None
        if page is not None:
            page.close()

    def _open_context(self) -> None:
        """Create a new context on the current browser, then a page in it."""
        self._context = self._browser.new_context(**self._context_options)
        self._open_page()

    def _close_context(self) -> None:
        """Close the current page and context, if any, and forget them."""
        try:
            self._close_page()
        finally:
            # The context must be closed even when closing the page raised.
            context, self._context = self._context, None
            if context is not None:
                context.close()
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class PatchrightSession(_SessionBase):
    """Session that drives Chromium through Patchright's sync API.

    The Playwright driver is started on ``__enter__``; the browser itself is
    launched lazily by the first ``page()`` call.
    """

    def __init__(
        self,
        *,
        browser_options: dict[str, Any] | None = None,
        context_options: dict[str, Any] | None = None,
        recycle: RecycleEvery | None = None,
    ) -> None:
        """Store the options; see ``_SessionBase.__init__`` for their meaning."""
        super().__init__(
            browser_options=browser_options,
            context_options=context_options,
            recycle=recycle,
        )
        self._pw: Playwright | None = None

    def __enter__(self) -> Self:
        """Start the Playwright driver and mark the session as usable."""
        self._pw = sync_playwright().start()
        self._entered = True
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        """Close the browser, stop the driver and reset the session state."""
        if not self._entered:
            return
        try:
            self._close_browser()
        finally:
            # Stop the driver and reset state even when closing the browser
            # raised; otherwise the driver process would be leaked and the
            # session would still look "entered".
            pw, self._pw = self._pw, None
            self._entered = False
            self._page_calls = 0
            if pw is not None:
                pw.stop()

    def _open_browser(self) -> None:
        """Launch Chromium, then open a context and page on it."""
        self._browser = self._pw.chromium.launch(**self._browser_options)
        try:
            self._open_context()
        except BaseException:
            # Do not leave a launched browser behind when the context or
            # page could not be created.
            self._close_browser()
            raise

    def _close_browser(self) -> None:
        """Close page, context and browser (whatever is open) and forget them."""
        try:
            self._close_context()
        finally:
            # The browser must be closed even when closing the context raised.
            browser, self._browser = self._browser, None
            if browser is not None:
                browser.close()
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class CamoufoxSession(_SessionBase):
    """Session that drives a Camoufox browser.

    ``Camoufox(...)`` is a context manager, so each browser instance is
    entered through its own ``ExitStack``. Closing that stack is what shuts
    the browser down. The browser is launched lazily by the first ``page()``
    call.
    """

    def __init__(
        self,
        *,
        browser_options: dict[str, Any] | None = None,
        context_options: dict[str, Any] | None = None,
        recycle: RecycleEvery | None = None,
    ) -> None:
        """Store the options; see ``_SessionBase.__init__`` for their meaning."""
        super().__init__(
            browser_options=browser_options,
            context_options=context_options,
            recycle=recycle,
        )
        self._stack: ExitStack | None = None

    def __enter__(self) -> Self:
        """Mark the session as usable; nothing is launched yet."""
        self._entered = True
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        """Close the browser and reset the session state."""
        if not self._entered:
            return
        try:
            self._close_browser()
        finally:
            # Reset state even when closing raised, so the session does not
            # stay "entered" after the with block has ended.
            self._entered = False
            self._page_calls = 0

    def _open_browser(self) -> None:
        """Enter a new Camoufox instance, then open a context and page on it."""
        self._stack = ExitStack()
        self._browser = self._stack.enter_context(Camoufox(**self._browser_options))
        try:
            self._open_context()
        except BaseException:
            # Do not leave a running browser behind when the context or page
            # could not be created.
            self._close_browser()
            raise

    def _close_browser(self) -> None:
        """Close page, context and browser (whatever is open) and forget them."""
        try:
            self._close_context()
        finally:
            # The stack (and with it the browser) must be closed even when
            # closing the context raised.
            stack, self._stack = self._stack, None
            self._browser = None
            if stack is not None:
                stack.close()
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def open_patchright(
    *,
    browser_options: dict[str, Any] | None = None,
    context_options: dict[str, Any] | None = None,
    recycle: RecycleEvery | None = None,
) -> PatchrightSession:
    """Create (but do not enter) a ``PatchrightSession``.

    Args:
        browser_options: forwarded unchanged to ``PatchrightSession``.
        context_options: forwarded unchanged to ``PatchrightSession``.
        recycle: forwarded unchanged to ``PatchrightSession``.

    Returns:
        The new session, intended to be used in a ``with`` statement.
    """
    session = PatchrightSession(
        recycle=recycle,
        context_options=context_options,
        browser_options=browser_options,
    )
    return session
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def open_camoufox(
    *,
    browser_options: dict[str, Any] | None = None,
    context_options: dict[str, Any] | None = None,
    recycle: RecycleEvery | None = None,
) -> CamoufoxSession:
    """Create (but do not enter) a ``CamoufoxSession``.

    Args:
        browser_options: forwarded unchanged to ``CamoufoxSession``.
        context_options: forwarded unchanged to ``CamoufoxSession``.
        recycle: forwarded unchanged to ``CamoufoxSession``.

    Returns:
        The new session, intended to be used in a ``with`` statement.
    """
    session = CamoufoxSession(
        recycle=recycle,
        context_options=context_options,
        browser_options=browser_options,
    )
    return session
|
quickquery/utils.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import hashlib
|
|
3
|
+
import html
|
|
4
|
+
import os
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Callable, Iterable, Iterator
|
|
9
|
+
|
|
10
|
+
from loguru import logger
|
|
11
|
+
from selectolax.lexbor import LexborHTMLParser
|
|
12
|
+
from tqdm import tqdm
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _ensure_parent(path: Path) -> None:
|
|
16
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def parse_html(path: Path) -> LexborHTMLParser | None:
    """Read ``path`` as bytes and parse it with ``LexborHTMLParser``.

    Returns:
        The parser object, or ``None`` if reading or parsing raised. The
        error is logged rather than propagated.
    """
    try:
        raw = path.read_bytes()
        tree = LexborHTMLParser(raw)
    except Exception as exc:
        logger.error(f'[parse_html] {path} {type(exc).__name__}: {exc}')
        return None
    return tree
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def meta_html(meta: Mapping[str, object | None]) -> str:
|
|
28
|
+
return ''.join(
|
|
29
|
+
f'<meta name="{html.escape(name)}" content="{html.escape(str(content))}">'
|
|
30
|
+
for name, content in meta.items()
|
|
31
|
+
if content is not None
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def from_here(file: str) -> Callable[[str], Path]:
    """Return a function that resolves paths relative to ``file``'s directory.

    Args:
        file: a file path (typically the caller's ``__file__``); it is
            resolved to an absolute path once, when this function is called.

    Returns:
        A callable mapping a relative path string to ``<dir of file>/<path>``.
    """
    here = Path(file).resolve().parent

    def resolve_from_here(path: str) -> Path:
        return here / path

    return resolve_from_here
|
|
38
|
+
|
|
39
|
+
def append_csv(path: Path, row: dict) -> None:
    """Append a single ``row`` to a CSV file, creating the file if needed.

    For Excel compatibility a UTF-8 BOM is written only when the file is new
    or empty (it is then opened with ``utf-8-sig``). Appends to an existing
    file use plain ``utf-8``, because a BOM in the middle of a file would be
    invalid. A header row is written in the same new-or-empty case.

    The column order is that of ``row.keys()``. Whether those keys match the
    header already present in the file is not checked.

    Any exception is logged and swallowed; nothing is raised to the caller.
    """
    try:
        _ensure_parent(path)
        is_fresh = not (path.exists() and path.stat().st_size != 0)
        codec = 'utf-8-sig' if is_fresh else 'utf-8'
        with open(path, mode='a', newline='', encoding=codec) as out:
            writer = csv.DictWriter(out, fieldnames=list(row.keys()))
            if is_fresh:
                writer.writeheader()
            writer.writerow(row)
    except Exception as exc:
        logger.error(f'[append_csv] {path} {row} {type(exc).__name__}: {exc}')
|
|
59
|
+
|
|
60
|
+
def write_csv(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to a CSV file, overwriting any existing file.

    For Excel compatibility the file is written with a UTF-8 BOM
    (``utf-8-sig``) and a header row. If ``rows`` is empty, nothing is
    written and only a warning is logged.

    The column order is that of the first row's ``keys()``. Later rows are
    not checked against it up front.

    Any exception is logged and swallowed; nothing is raised to the caller.
    """
    try:
        if not rows:
            logger.warning(f'[write_csv] {path} no rows, skipped')
            return
        _ensure_parent(path)
        columns = list(rows[0].keys())
        with open(path, mode='w', newline='', encoding='utf-8-sig') as out:
            writer = csv.DictWriter(out, fieldnames=columns)
            writer.writeheader()
            writer.writerows(rows)
    except Exception as exc:
        logger.error(f'[write_csv] {path} {type(exc).__name__}: {exc}')
|
|
78
|
+
|
|
79
|
+
def write_parquet(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to a Parquet file.

    pyarrow is used directly (no pandas dependency) and is imported only
    when this function is called. If ``rows`` is empty, nothing is written
    and only a warning is logged.

    The column schema is inferred by pyarrow from the values, so mixing
    types under the same key can fail.

    Exceptions raised while writing are logged and swallowed. A failing
    pyarrow import is not caught.
    """
    import pyarrow as pa
    import pyarrow.parquet as pq

    try:
        if not rows:
            logger.warning(f'[write_parquet] {path} no rows, skipped')
            return
        _ensure_parent(path)
        table = pa.Table.from_pylist(rows)
        pq.write_table(table, path)
    except Exception as exc:
        logger.error(f'[write_parquet] {path} {type(exc).__name__}: {exc}')
|
|
97
|
+
|
|
98
|
+
def hash_name(key: str) -> str:
    """Return the 32-character hexadecimal MD5 digest of ``key``.

    ``key`` is encoded as UTF-8 before hashing. The digest is only a stable
    name derived from ``key``; nothing in this function uses it for a
    security decision. ``usedforsecurity=False`` says so explicitly, which
    keeps the call working on builds where MD5 is otherwise restricted
    (e.g. FIPS mode). The resulting digest is unchanged.
    """
    return hashlib.md5(key.encode('utf-8'), usedforsecurity=False).hexdigest()
|
|
100
|
+
|
|
101
|
+
def write_text(path: Path, data: str) -> bool:
    """Write ``data`` to ``path`` as UTF-8, creating parent directories.

    Characters that cannot be encoded are replaced (``errors='replace'``).

    Returns:
        ``True`` on success; ``False`` if anything raised (the error is
        logged, not propagated).
    """
    try:
        _ensure_parent(path)
        path.write_text(data, encoding='utf-8', errors='replace')
    except Exception as exc:
        logger.error(f'[write_text] {path} {type(exc).__name__}: {exc}')
        return False
    else:
        return True
|
|
109
|
+
|
|
110
|
+
def write_bytes(path: Path, data: bytes) -> bool:
    """Write ``data`` to ``path`` as raw bytes, creating parent directories.

    Returns:
        ``True`` on success; ``False`` if anything raised (the error is
        logged, not propagated).
    """
    try:
        _ensure_parent(path)
        path.write_bytes(data)
    except Exception as exc:
        logger.error(f'[write_bytes] {path} {type(exc).__name__}: {exc}')
        return False
    else:
        return True
|
|
118
|
+
|
|
119
|
+
def save_log(path: Path, level: str = 'WARNING') -> None:
    """Register ``path`` as an additional loguru sink.

    The parent directory is created if needed. The file is UTF-8 encoded and
    receives messages at ``level`` and above.
    """
    _ensure_parent(path)
    logger.add(path, encoding='utf-8', level=level)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
class _SafeWorker:
|
|
126
|
+
def __init__(self, fn: Callable) -> None:
|
|
127
|
+
self.fn = fn
|
|
128
|
+
|
|
129
|
+
def __call__(self, x):
|
|
130
|
+
try:
|
|
131
|
+
return self.fn(x)
|
|
132
|
+
except Exception as e:
|
|
133
|
+
logger.error(f'[process_map] {type(e).__name__}: {e}')
|
|
134
|
+
return None
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _auto_chunksize(n: int, workers: int | None) -> int:
|
|
138
|
+
'''``chunksize`` を自動で決める(``process_map`` で未指定のとき)。
|
|
139
|
+
|
|
140
|
+
子プロセスへは 1 件ずつより、まとめて送った方が速くなりやすい。そのまとめ数。
|
|
141
|
+
|
|
142
|
+
``w`` は並列数。引数で決まっていなければ ``os.cpu_count()``、それも無ければ 4。
|
|
143
|
+
この **4** は「CPU が分からないときの仮の並列数」。式 ``n // (w * 4)`` の **4** とは別物。
|
|
144
|
+
|
|
145
|
+
``n // (w * 4)`` の方の **4** は経験則の係数。ざっくり言うとチャンクの個数が
|
|
146
|
+
``w * 4`` 前後になりやすく、負荷が均等ならワーカーあたりだいたい **4 回分の塊**
|
|
147
|
+
を処理するイメージ(厳密ではない)。
|
|
148
|
+
|
|
149
|
+
例: ``n=200``, ``w=5`` なら ``200 // 20 = 10`` が chunksize。全体は 20 チャンク、
|
|
150
|
+
5 人で割ると 1 人あたり平均 4 チャンク(各 10 件)。
|
|
151
|
+
|
|
152
|
+
結果は ``min(64, …)`` で上限。塊が大きすぎると **負荷が偏りやすい**。
|
|
153
|
+
タスクの重さがバラバラなとき、太い塊の中に遅いのが多く入ったワーカーだけが
|
|
154
|
+
長引き、他は先に終わって手待ちしがち(終盤のムラ)。塊を細かくすると配り直しの
|
|
155
|
+
機会が増えて和らぎやすい。進捗バーも細かく動きやすい。
|
|
156
|
+
|
|
157
|
+
``max(1, …)`` で下限。割り算で 0 になっても最低 1 件は送る。
|
|
158
|
+
'''
|
|
159
|
+
w = workers or os.cpu_count() or 4
|
|
160
|
+
return max(1, min(64, n // (w * 4)))
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def process_map[T, R](
|
|
164
|
+
worker: Callable[[T], R],
|
|
165
|
+
items: Iterable[T],
|
|
166
|
+
workers: int | None = None,
|
|
167
|
+
*,
|
|
168
|
+
chunksize: int | None = None,
|
|
169
|
+
) -> list[R | None]:
|
|
170
|
+
'''``ProcessPoolExecutor`` で ``worker`` を並列実行する。
|
|
171
|
+
|
|
172
|
+
子プロセスで例外が出た分は ``None`` で返す。全体は止めない。
|
|
173
|
+
進捗バーは常に tqdm。
|
|
174
|
+
|
|
175
|
+
``chunksize`` は子へまとめて送る件数。省略なら自動。
|
|
176
|
+
進捗を細かくしたい・タスクの重さがバラバラで末尾に重いのが残る、なら ``chunksize=1``。
|
|
177
|
+
'''
|
|
178
|
+
safe = _SafeWorker(worker)
|
|
179
|
+
item_list = list(items)
|
|
180
|
+
cs = chunksize if chunksize is not None else _auto_chunksize(len(item_list), workers)
|
|
181
|
+
with ProcessPoolExecutor(max_workers=workers) as ex:
|
|
182
|
+
return list(
|
|
183
|
+
tqdm(ex.map(safe, item_list, chunksize=cs), total=len(item_list), unit='file')
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
def glob_paths(dir_path: Path, pattern: str = '*.html') -> list[str]:
    """Return the paths under ``dir_path`` matching ``pattern``, as strings.

    With the default pattern only entries directly inside ``dir_path`` are
    matched. Paths are returned as ``str`` rather than ``Path`` to make them
    easy to pickle when handed to a process pool such as ``process_map``.
    The order is whatever ``Path.glob`` yields.
    """
    matches = dir_path.glob(pattern)
    return list(map(str, matches))
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def counter(start: int = 1) -> Iterator[int]:
    """Yield ``start``, ``start + 1``, ``start + 2``, ... without end."""
    current = start
    while True:
        yield current
        current = current + 1
|