quickplay 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.12
|
quickplay-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Nishizawa Takamasa
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
quickplay-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# quickplay
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .quickpage import PlayPage, BasePaths, sleep_between, append_csv, run_scraper
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import re
|
|
3
|
+
import time
|
|
4
|
+
import unicodedata as ud
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from playwright.sync_api import sync_playwright, Page, ElementHandle
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PlayPage:
|
|
14
|
+
def __init__(self, page: Page) -> None:
|
|
15
|
+
self._page = page
|
|
16
|
+
|
|
17
|
+
def first(self, elems: list[ElementHandle]) -> ElementHandle | None:
|
|
18
|
+
return elems[0] if elems else None
|
|
19
|
+
|
|
20
|
+
def re_filter(self, pattern: str, elems: list[ElementHandle]) -> list[ElementHandle]:
|
|
21
|
+
return [elem for elem in elems if (text := self.text_c(elem)) is not None and re.search(pattern, ud.normalize("NFKC", text))]
|
|
22
|
+
|
|
23
|
+
def ss(self, selector: str) -> list[ElementHandle]:
|
|
24
|
+
return self._page.query_selector_all(selector)
|
|
25
|
+
|
|
26
|
+
def s(self, selector: str) -> ElementHandle | None:
|
|
27
|
+
return self.first(self.ss(selector))
|
|
28
|
+
|
|
29
|
+
def ss_re(self, selector: str, pattern: str) -> list[ElementHandle]:
|
|
30
|
+
return self.re_filter(pattern, self.ss(selector))
|
|
31
|
+
|
|
32
|
+
def s_re(self, selector: str, pattern: str) -> ElementHandle | None:
|
|
33
|
+
return self.first(self.ss_re(selector, pattern))
|
|
34
|
+
|
|
35
|
+
def ss_in(self, selector: str, from_: ElementHandle | None) -> list[ElementHandle]:
|
|
36
|
+
return [] if from_ is None else from_.query_selector_all(selector)
|
|
37
|
+
|
|
38
|
+
def s_in(self, selector: str, from_: ElementHandle | None) -> ElementHandle | None:
|
|
39
|
+
return self.first(self.ss_in(selector, from_))
|
|
40
|
+
|
|
41
|
+
def ss_re_in(self, selector: str, pattern: str, from_: ElementHandle | None) -> list[ElementHandle]:
|
|
42
|
+
return self.re_filter(pattern, self.ss_in(selector, from_))
|
|
43
|
+
|
|
44
|
+
def s_re_in(self, selector: str, pattern: str, from_: ElementHandle | None) -> ElementHandle | None:
|
|
45
|
+
return self.first(self.ss_re_in(selector, pattern, from_))
|
|
46
|
+
|
|
47
|
+
def next(self, elem: ElementHandle | None) -> ElementHandle | None:
|
|
48
|
+
return None if elem is None else elem.evaluate_handle("el => el.nextElementSibling").as_element()
|
|
49
|
+
|
|
50
|
+
def text_c(self, elem: ElementHandle | None) -> str | None:
|
|
51
|
+
if elem is None:
|
|
52
|
+
return None
|
|
53
|
+
return text.strip() if (text := elem.evaluate("el => el.textContent")) else text
|
|
54
|
+
|
|
55
|
+
def i_text(self, elem: ElementHandle | None) -> str | None:
|
|
56
|
+
if elem is None:
|
|
57
|
+
return None
|
|
58
|
+
return text.strip() if (text := elem.evaluate("el => el.innerText")) else text
|
|
59
|
+
|
|
60
|
+
def attr(self, attr_name: str, elem: ElementHandle | None) -> str | None:
|
|
61
|
+
if elem is None:
|
|
62
|
+
return None
|
|
63
|
+
return attr.strip() if (attr := elem.get_attribute(attr_name)) else attr
|
|
64
|
+
|
|
65
|
+
def goto(self, url: str | None) -> bool:
|
|
66
|
+
if not url:
|
|
67
|
+
return False
|
|
68
|
+
try:
|
|
69
|
+
self._page.goto(url, wait_until="domcontentloaded")
|
|
70
|
+
return True
|
|
71
|
+
except Exception as e:
|
|
72
|
+
print(f"{type(e).__name__}: {e}")
|
|
73
|
+
return False
|
|
74
|
+
|
|
75
|
+
def wait(self, selector: str, timeout: int = 15000) -> ElementHandle | None:
|
|
76
|
+
try:
|
|
77
|
+
return self._page.wait_for_selector(selector, timeout=timeout)
|
|
78
|
+
except Exception as e:
|
|
79
|
+
print(f"{type(e).__name__}: {e}")
|
|
80
|
+
return None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class BasePaths:
    """Resolve paths relative to the directory of the calling file.

    Usage:
        paths = BasePaths(__file__)
        csv_path = paths.from_here('data/out.csv')
    """

    def __init__(self, file: str) -> None:
        """Remember the resolved parent directory of ``file`` as the base."""
        anchor = Path(file).resolve()
        self._base = anchor.parent

    def from_here(self, path: str) -> Path:
        """Return ``path`` joined onto the base directory."""
        return self._base.joinpath(path)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def sleep_between(a: float, b: float) -> None:
    """Sleep for a random duration drawn uniformly between ``a`` and ``b`` seconds."""
    delay = random.uniform(a, b)
    time.sleep(delay)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def append_csv(path: Path | str, row: dict) -> None:
|
|
106
|
+
"""dictを1行としてCSVに追記する。ファイルがなければheaderも書く。"""
|
|
107
|
+
p = Path(path)
|
|
108
|
+
pd.DataFrame([row]).to_csv(
|
|
109
|
+
p,
|
|
110
|
+
mode='a',
|
|
111
|
+
index=False,
|
|
112
|
+
header=not p.exists(),
|
|
113
|
+
encoding='utf-8-sig',
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def run_scraper(
    fn: Callable[[Page], None],
    *,
    headless: bool = False,
    channel: str = "chrome",
    viewport: dict | None = {'width': 1920, 'height': 1080},
    user_agent: str | None = None,
    accept_language: str | None = "ja-JP,ja;q=0.9",
    timeout: int = 15000,
    block_resources: set[str] | None = None,
) -> None:
    """Runner that bundles the usual Playwright start-up boilerplate.

    Launches Chromium, opens a browser context and a page, applies the
    default timeout and optional resource blocking, then calls ``fn(page)``.

    Args:
        fn: Function to run against the page, e.g. ``scrape(page)``.
        headless: Whether to launch in headless mode.
        channel: Browser channel (e.g. "chrome").
        viewport: E.g. {'width': 1920, 'height': 1080}. Not set when None.
        user_agent: User-Agent string. Not set when None.
        accept_language: Accept-Language header. Not set when None.
        timeout: Default timeout in milliseconds.
        block_resources: Resource types to block, e.g. {'image', 'font', 'media'}.

    Usage:
        run_scraper(scrape, user_agent='Mozilla/5.0 ...', block_resources={'image', 'font'})
    """
    # NOTE: the default ``viewport`` dict is shared between calls; it is only
    # read here and handed to the browser context as-is.
    language_header = None if accept_language is None else {'Accept-Language': accept_language}
    candidates = {
        'viewport': viewport,
        'user_agent': user_agent,
        'extra_http_headers': language_header,
    }
    # Only options that were actually supplied are forwarded to the context.
    context_kwargs: dict = {key: value for key, value in candidates.items() if value is not None}

    with (
        sync_playwright() as pw,
        pw.chromium.launch(headless=headless, channel=channel) as browser,
        browser.new_context(**context_kwargs) as context,
    ):
        page = context.new_page()
        page.set_default_timeout(timeout)

        if block_resources:
            def _filter(route):
                # Let everything through except the blocked resource types.
                if route.request.resource_type not in block_resources:
                    route.continue_()
                    return
                route.abort()

            page.route('**/*', _filter)

        fn(page)
|