quickquery 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {quickquery-0.1.1 → quickquery-0.1.3}/PKG-INFO +7 -57
- {quickquery-0.1.1 → quickquery-0.1.3}/README.md +6 -56
- {quickquery-0.1.1 → quickquery-0.1.3}/pyproject.toml +1 -1
- {quickquery-0.1.1 → quickquery-0.1.3}/quickquery/live.py +10 -9
- {quickquery-0.1.1 → quickquery-0.1.3}/quickquery/utils.py +3 -3
- {quickquery-0.1.1 → quickquery-0.1.3}/.gitignore +0 -0
- {quickquery-0.1.1 → quickquery-0.1.3}/.python-version +0 -0
- {quickquery-0.1.1 → quickquery-0.1.3}/LICENSE +0 -0
- {quickquery-0.1.1 → quickquery-0.1.3}/quickquery/__init__.py +0 -0
- {quickquery-0.1.1 → quickquery-0.1.3}/quickquery/core.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: quickquery
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: 自分用・非汎用
|
|
5
5
|
Requires-Python: >=3.12
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -13,66 +13,14 @@ Requires-Dist: camoufox>=0.4
|
|
|
13
13
|
Requires-Dist: loguru>=0.7
|
|
14
14
|
Requires-Dist: tqdm>=4.66
|
|
15
15
|
|
|
16
|
-
# QuickQuery
|
|
17
|
-
|
|
18
16
|
自分用・非汎用
|
|
19
17
|
|
|
20
18
|
## インストール
|
|
21
19
|
`uv add quickquery`
|
|
22
20
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
## 実装機能
|
|
27
|
-
|
|
28
|
-
### quickquery
|
|
29
|
-
|
|
30
|
-
- `quick_page(page: Page) -> QuickPage`
|
|
31
|
-
- `quick_element(page: Page, elem: ElementHandle | None) -> QuickElement`
|
|
32
|
-
- `quick_element_group(page: Page, elems: list[QuickElement]) -> QuickElementGroup`
|
|
33
|
-
- `quick_frame(page: Page, frame: Frame | None) -> QuickFrame`
|
|
34
|
-
- `quick_shadow_root(page: Page, host: ElementHandle | None) -> QuickShadowRoot`
|
|
35
|
-
- `quick_parser(parser: LexborHTMLParser) -> QuickParser`
|
|
36
|
-
- `quick_node(node: LexborNode | None) -> QuickNode`
|
|
37
|
-
- `quick_node_group(nodes: list[QuickNode]) -> QuickNodeGroup`
|
|
38
|
-
- `QuickPage`
|
|
39
|
-
- `QuickElement`
|
|
40
|
-
- `QuickElementGroup`
|
|
41
|
-
- `ElementScan`
|
|
42
|
-
- `QuickFrame`
|
|
43
|
-
- `QuickShadowRoot`
|
|
44
|
-
- `QuickParser`
|
|
45
|
-
- `QuickNode`
|
|
46
|
-
- `QuickNodeGroup`
|
|
47
|
-
- `NodeScan`
|
|
48
|
-
|
|
49
|
-
### quickquery.utils
|
|
50
|
-
|
|
51
|
-
- `parse_html(path: Path) -> LexborHTMLParser | None`
|
|
52
|
-
- `meta_html(meta: Mapping[str, object | None]) -> str`
|
|
53
|
-
- `from_here(file: str) -> Callable[[str], Path]`
|
|
54
|
-
- `append_csv(path: Path, row: dict) -> None`
|
|
55
|
-
- `write_csv(path: Path, rows: list[dict]) -> None`
|
|
56
|
-
- `write_parquet(path: Path, rows: list[dict]) -> None`
|
|
57
|
-
- `hash_name(key: str) -> str`
|
|
58
|
-
- `write_text(path: Path, data: str) -> bool`
|
|
59
|
-
- `write_bytes(path: Path, data: bytes) -> bool`
|
|
60
|
-
- `save_log(path: Path, level: str = 'WARNING') -> None`
|
|
61
|
-
- `process_map[T, R](worker: Callable[[T], R], items: Iterable[T], workers: int | None = None, *, chunksize: int | None = None) -> list[R | None]`
|
|
62
|
-
- `glob_paths(dir_path: Path, pattern: str = '*.html') -> list[str]`
|
|
63
|
-
- `counter(start: int = 1) -> Iterator[int]`
|
|
64
|
-
|
|
65
|
-
### quickquery.live
|
|
66
|
-
|
|
67
|
-
- `RecycleEvery`
|
|
68
|
-
- `PatchrightSession`
|
|
69
|
-
- `CamoufoxSession`
|
|
70
|
-
- `open_patchright(*, browser_options: dict | None = None, context_options: dict | None = None, recycle: RecycleEvery | None = None) -> PatchrightSession`
|
|
71
|
-
- `open_camoufox(*, browser_options: dict | None = None, context_options: dict | None = None, recycle: RecycleEvery | None = None) -> CamoufoxSession`
|
|
72
|
-
- `PatchrightSession.page() -> Page`
|
|
73
|
-
- `CamoufoxSession.page() -> Page`
|
|
74
|
-
|
|
75
|
-
`browser_options` / `context_options` は Playwright へ渡す起動オプション。`recycle` は quickquery の再生成間隔(`page()` 呼び出し回数ごとに独立して効く。省略時は再生成しない)。`page()` を呼ぶたびに内部カウントが 1 進む。
|
|
21
|
+
`open_patchright` を使うとき:Google ChromeをPCにインストールしておく。
|
|
22
|
+
`open_camoufox` を使うとき:`uv run camoufox fetch`
|
|
23
|
+
|
|
76
24
|
|
|
77
25
|
## 使用例
|
|
78
26
|
|
|
@@ -163,8 +111,10 @@ with open_patchright(
|
|
|
163
111
|
append_csv(here('csv/failed.csv'), {
|
|
164
112
|
'url_index': url_index,
|
|
165
113
|
'request_url': request_url,
|
|
114
|
+
'final_url': page.url,
|
|
166
115
|
'reason': 'write_text',
|
|
167
116
|
})
|
|
117
|
+
continue
|
|
168
118
|
|
|
169
119
|
page.screenshot(path=here(f'media/{url_index}-full-page.png'), full_page=True)
|
|
170
120
|
|
|
@@ -198,7 +148,7 @@ def main():
|
|
|
198
148
|
write_parquet(here('parquet/extract.parquet'), results)
|
|
199
149
|
|
|
200
150
|
def extract(file_path: str) -> dict | None:
|
|
201
|
-
if not (parser := parse_html(Path(file_path))):
|
|
151
|
+
if not (parser := parse_html(Path(file_path).read_bytes())):
|
|
202
152
|
return None
|
|
203
153
|
p = quick_parser(parser)
|
|
204
154
|
dt_scan = p.ii('dt').scan
|
|
@@ -1,63 +1,11 @@
|
|
|
1
|
-
# QuickQuery
|
|
2
|
-
|
|
3
1
|
自分用・非汎用
|
|
4
2
|
|
|
5
3
|
## インストール
|
|
6
4
|
`uv add quickquery`
|
|
7
5
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
## 実装機能
|
|
12
|
-
|
|
13
|
-
### quickquery
|
|
14
|
-
|
|
15
|
-
- `quick_page(page: Page) -> QuickPage`
|
|
16
|
-
- `quick_element(page: Page, elem: ElementHandle | None) -> QuickElement`
|
|
17
|
-
- `quick_element_group(page: Page, elems: list[QuickElement]) -> QuickElementGroup`
|
|
18
|
-
- `quick_frame(page: Page, frame: Frame | None) -> QuickFrame`
|
|
19
|
-
- `quick_shadow_root(page: Page, host: ElementHandle | None) -> QuickShadowRoot`
|
|
20
|
-
- `quick_parser(parser: LexborHTMLParser) -> QuickParser`
|
|
21
|
-
- `quick_node(node: LexborNode | None) -> QuickNode`
|
|
22
|
-
- `quick_node_group(nodes: list[QuickNode]) -> QuickNodeGroup`
|
|
23
|
-
- `QuickPage`
|
|
24
|
-
- `QuickElement`
|
|
25
|
-
- `QuickElementGroup`
|
|
26
|
-
- `ElementScan`
|
|
27
|
-
- `QuickFrame`
|
|
28
|
-
- `QuickShadowRoot`
|
|
29
|
-
- `QuickParser`
|
|
30
|
-
- `QuickNode`
|
|
31
|
-
- `QuickNodeGroup`
|
|
32
|
-
- `NodeScan`
|
|
33
|
-
|
|
34
|
-
### quickquery.utils
|
|
35
|
-
|
|
36
|
-
- `parse_html(path: Path) -> LexborHTMLParser | None`
|
|
37
|
-
- `meta_html(meta: Mapping[str, object | None]) -> str`
|
|
38
|
-
- `from_here(file: str) -> Callable[[str], Path]`
|
|
39
|
-
- `append_csv(path: Path, row: dict) -> None`
|
|
40
|
-
- `write_csv(path: Path, rows: list[dict]) -> None`
|
|
41
|
-
- `write_parquet(path: Path, rows: list[dict]) -> None`
|
|
42
|
-
- `hash_name(key: str) -> str`
|
|
43
|
-
- `write_text(path: Path, data: str) -> bool`
|
|
44
|
-
- `write_bytes(path: Path, data: bytes) -> bool`
|
|
45
|
-
- `save_log(path: Path, level: str = 'WARNING') -> None`
|
|
46
|
-
- `process_map[T, R](worker: Callable[[T], R], items: Iterable[T], workers: int | None = None, *, chunksize: int | None = None) -> list[R | None]`
|
|
47
|
-
- `glob_paths(dir_path: Path, pattern: str = '*.html') -> list[str]`
|
|
48
|
-
- `counter(start: int = 1) -> Iterator[int]`
|
|
49
|
-
|
|
50
|
-
### quickquery.live
|
|
51
|
-
|
|
52
|
-
- `RecycleEvery`
|
|
53
|
-
- `PatchrightSession`
|
|
54
|
-
- `CamoufoxSession`
|
|
55
|
-
- `open_patchright(*, browser_options: dict | None = None, context_options: dict | None = None, recycle: RecycleEvery | None = None) -> PatchrightSession`
|
|
56
|
-
- `open_camoufox(*, browser_options: dict | None = None, context_options: dict | None = None, recycle: RecycleEvery | None = None) -> CamoufoxSession`
|
|
57
|
-
- `PatchrightSession.page() -> Page`
|
|
58
|
-
- `CamoufoxSession.page() -> Page`
|
|
59
|
-
|
|
60
|
-
`browser_options` / `context_options` は Playwright へ渡す起動オプション。`recycle` は quickquery の再生成間隔(`page()` 呼び出し回数ごとに独立して効く。省略時は再生成しない)。`page()` を呼ぶたびに内部カウントが 1 進む。
|
|
6
|
+
`open_patchright` を使うとき:Google ChromeをPCにインストールしておく。
|
|
7
|
+
`open_camoufox` を使うとき:`uv run camoufox fetch`
|
|
8
|
+
|
|
61
9
|
|
|
62
10
|
## 使用例
|
|
63
11
|
|
|
@@ -148,8 +96,10 @@ with open_patchright(
|
|
|
148
96
|
append_csv(here('csv/failed.csv'), {
|
|
149
97
|
'url_index': url_index,
|
|
150
98
|
'request_url': request_url,
|
|
99
|
+
'final_url': page.url,
|
|
151
100
|
'reason': 'write_text',
|
|
152
101
|
})
|
|
102
|
+
continue
|
|
153
103
|
|
|
154
104
|
page.screenshot(path=here(f'media/{url_index}-full-page.png'), full_page=True)
|
|
155
105
|
|
|
@@ -183,7 +133,7 @@ def main():
|
|
|
183
133
|
write_parquet(here('parquet/extract.parquet'), results)
|
|
184
134
|
|
|
185
135
|
def extract(file_path: str) -> dict | None:
|
|
186
|
-
if not (parser := parse_html(Path(file_path))):
|
|
136
|
+
if not (parser := parse_html(Path(file_path).read_bytes())):
|
|
187
137
|
return None
|
|
188
138
|
p = quick_parser(parser)
|
|
189
139
|
dt_scan = p.ii('dt').scan
|
|
@@ -36,8 +36,8 @@ class _SessionBase:
|
|
|
36
36
|
recycle: RecycleEvery | None = None,
|
|
37
37
|
) -> None:
|
|
38
38
|
self._recycle = recycle or RecycleEvery()
|
|
39
|
-
self._browser_options =
|
|
40
|
-
self._context_options =
|
|
39
|
+
self._browser_options = browser_options or {}
|
|
40
|
+
self._context_options = context_options or {}
|
|
41
41
|
self._browser = None
|
|
42
42
|
self._context = None
|
|
43
43
|
self._page: Page | None = None
|
|
@@ -47,7 +47,7 @@ class _SessionBase:
|
|
|
47
47
|
def page(self) -> Page:
|
|
48
48
|
if not self._entered:
|
|
49
49
|
raise RuntimeError('with ブロックの外で page() を呼べません')
|
|
50
|
-
if self.
|
|
50
|
+
if self._browser is None:
|
|
51
51
|
self._open_browser()
|
|
52
52
|
elif (b := self._recycle.browser) and self._page_calls % b == 0:
|
|
53
53
|
self._close_browser()
|
|
@@ -67,7 +67,7 @@ class _SessionBase:
|
|
|
67
67
|
def _close_page(self) -> None:
|
|
68
68
|
if self._page is not None:
|
|
69
69
|
self._page.close()
|
|
70
|
-
|
|
70
|
+
self._page = None
|
|
71
71
|
|
|
72
72
|
def _open_context(self) -> None:
|
|
73
73
|
self._context = self._browser.new_context(**self._context_options)
|
|
@@ -77,7 +77,7 @@ class _SessionBase:
|
|
|
77
77
|
self._close_page()
|
|
78
78
|
if self._context is not None:
|
|
79
79
|
self._context.close()
|
|
80
|
-
|
|
80
|
+
self._context = None
|
|
81
81
|
|
|
82
82
|
|
|
83
83
|
class PatchrightSession(_SessionBase):
|
|
@@ -111,8 +111,8 @@ class PatchrightSession(_SessionBase):
|
|
|
111
111
|
self._close_browser()
|
|
112
112
|
self._pw.stop()
|
|
113
113
|
self._pw = None
|
|
114
|
-
self._entered = False
|
|
115
114
|
self._page_calls = 0
|
|
115
|
+
self._entered = False
|
|
116
116
|
|
|
117
117
|
def _open_browser(self) -> None:
|
|
118
118
|
self._browser = self._pw.chromium.launch(**self._browser_options)
|
|
@@ -153,19 +153,20 @@ class CamoufoxSession(_SessionBase):
|
|
|
153
153
|
if not self._entered:
|
|
154
154
|
return
|
|
155
155
|
self._close_browser()
|
|
156
|
-
self._entered = False
|
|
157
156
|
self._page_calls = 0
|
|
157
|
+
self._entered = False
|
|
158
158
|
|
|
159
159
|
def _open_browser(self) -> None:
|
|
160
|
+
fox = Camoufox(**self._browser_options)
|
|
160
161
|
self._stack = ExitStack()
|
|
161
|
-
self._browser = self._stack.enter_context(
|
|
162
|
+
self._browser = self._stack.enter_context(fox)
|
|
162
163
|
self._open_context()
|
|
163
164
|
|
|
164
165
|
def _close_browser(self) -> None:
|
|
165
166
|
self._close_context()
|
|
166
167
|
if self._stack is not None:
|
|
167
168
|
self._stack.close()
|
|
168
|
-
|
|
169
|
+
self._stack = None
|
|
169
170
|
self._browser = None
|
|
170
171
|
|
|
171
172
|
|
|
@@ -16,11 +16,11 @@ def _ensure_parent(path: Path) -> None:
|
|
|
16
16
|
path.parent.mkdir(parents=True, exist_ok=True)
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
def parse_html(
|
|
19
|
+
def parse_html(html: str | bytes) -> LexborHTMLParser | None:
|
|
20
20
|
try:
|
|
21
|
-
return LexborHTMLParser(
|
|
21
|
+
return LexborHTMLParser(html)
|
|
22
22
|
except Exception as e:
|
|
23
|
-
logger.error(f'[parse_html] {
|
|
23
|
+
logger.error(f'[parse_html] {type(e).__name__}: {e}')
|
|
24
24
|
return None
|
|
25
25
|
|
|
26
26
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|