scraper2-hj3415 2.6.0__py3-none-any.whl → 2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraper2_hj3415/.DS_Store +0 -0
- scraper2_hj3415/app/adapters/out/.DS_Store +0 -0
- scraper2_hj3415/app/adapters/out/playwright/browser.py +22 -369
- scraper2_hj3415/app/adapters/out/playwright/browser_factory.py +2 -2
- scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py +18 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py +19 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py +37 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py +24 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py +84 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py +90 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py +25 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py +96 -0
- scraper2_hj3415/app/adapters/out/sinks/.DS_Store +0 -0
- scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +3 -3
- scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +11 -11
- scraper2_hj3415/app/adapters/site/wisereport_playwright.py +220 -9
- scraper2_hj3415/app/domain/constants.py +2 -2
- scraper2_hj3415/app/parsing/_tables/html_table.py +3 -2
- scraper2_hj3415/app/parsing/c103_parser.py +4 -1
- scraper2_hj3415/app/parsing/c104_parser.py +4 -1
- scraper2_hj3415/app/ports/browser/browser_port.py +25 -108
- scraper2_hj3415/app/ports/browser/capabilities/__init__.py +15 -0
- scraper2_hj3415/app/ports/browser/capabilities/interaction.py +27 -0
- scraper2_hj3415/app/ports/browser/capabilities/navigation.py +18 -0
- scraper2_hj3415/app/ports/browser/capabilities/scope.py +66 -0
- scraper2_hj3415/app/ports/browser/capabilities/table.py +28 -0
- scraper2_hj3415/app/ports/browser/capabilities/text.py +16 -0
- scraper2_hj3415/app/ports/browser/capabilities/wait.py +51 -0
- scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +3 -3
- scraper2_hj3415/app/ports/site/wisereport_port.py +20 -10
- scraper2_hj3415/app/services/fetch/fetch_c103.py +18 -32
- scraper2_hj3415/app/services/fetch/fetch_c104.py +28 -51
- scraper2_hj3415/app/services/nfs_doc_builders.py +21 -7
- scraper2_hj3415/app/usecases/ingest/ingest_c101.py +2 -2
- scraper2_hj3415/app/usecases/ingest/ingest_c103.py +2 -2
- scraper2_hj3415/app/usecases/ingest/ingest_c104.py +2 -2
- scraper2_hj3415/app/usecases/ingest/ingest_c106.py +2 -2
- scraper2_hj3415/app/usecases/ingest/ingest_c108.py +2 -2
- scraper2_hj3415/cli.py +10 -7
- {scraper2_hj3415-2.6.0.dist-info → scraper2_hj3415-2.7.0.dist-info}/METADATA +1 -1
- {scraper2_hj3415-2.6.0.dist-info → scraper2_hj3415-2.7.0.dist-info}/RECORD +44 -26
- {scraper2_hj3415-2.6.0.dist-info → scraper2_hj3415-2.7.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-2.6.0.dist-info → scraper2_hj3415-2.7.0.dist-info}/entry_points.txt +0 -0
- {scraper2_hj3415-2.6.0.dist-info → scraper2_hj3415-2.7.0.dist-info}/licenses/LICENSE +0 -0
|
Binary file
|
|
Binary file
|
|
@@ -1,373 +1,26 @@
|
|
|
1
1
|
# scraper2_hj3415/app/adapters/out/playwright/browser.py
|
|
2
2
|
from __future__ import annotations
|
|
3
3
|
|
|
4
|
-
from
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
4
|
+
from playwright.async_api import Page
|
|
5
|
+
|
|
6
|
+
from .capabilities import (
|
|
7
|
+
_PlaywrightBase,
|
|
8
|
+
PlaywrightNavigation,
|
|
9
|
+
PlaywrightWait,
|
|
10
|
+
PlaywrightInteraction,
|
|
11
|
+
PlaywrightText,
|
|
12
|
+
PlaywrightScope,
|
|
13
|
+
PlaywrightTable,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PlaywrightBrowser(
|
|
18
|
+
PlaywrightNavigation,
|
|
19
|
+
PlaywrightWait,
|
|
20
|
+
PlaywrightInteraction,
|
|
21
|
+
PlaywrightText,
|
|
22
|
+
PlaywrightScope,
|
|
23
|
+
PlaywrightTable,
|
|
24
|
+
):
|
|
14
25
|
def __init__(self, page: Page):
|
|
15
|
-
self
|
|
16
|
-
|
|
17
|
-
async def _wait_for_network_quiet(self, *, timeout_ms: int = 10_000) -> None:
|
|
18
|
-
# networkidle은 사이트에 따라 영원히 안 올 수도 있어서 try로 감싸는 게 안전
|
|
19
|
-
logger.debug("wait for network quiet")
|
|
20
|
-
try:
|
|
21
|
-
await self._page.wait_for_load_state("networkidle", timeout=timeout_ms)
|
|
22
|
-
except Exception:
|
|
23
|
-
# networkidle이 안 와도 다음 단계(앵커 wait)가 더 중요함
|
|
24
|
-
return
|
|
25
|
-
|
|
26
|
-
async def wait_table_nth_ready(
|
|
27
|
-
self,
|
|
28
|
-
table_selector: str,
|
|
29
|
-
*,
|
|
30
|
-
index: int,
|
|
31
|
-
min_rows: int = 1,
|
|
32
|
-
timeout_ms: int = 20_000,
|
|
33
|
-
poll_ms: int = 200,
|
|
34
|
-
) -> None:
|
|
35
|
-
logger.debug("wait for table nth_ready")
|
|
36
|
-
table = self._page.locator(table_selector).nth(index)
|
|
37
|
-
await table.wait_for(state="attached", timeout=timeout_ms)
|
|
38
|
-
|
|
39
|
-
# html = await table.evaluate("el => el.outerHTML")
|
|
40
|
-
# logger.debug(f"TABLE HTML:\n{html}")
|
|
41
|
-
|
|
42
|
-
rows = table.locator("tbody tr")
|
|
43
|
-
deadline = time.monotonic() + timeout_ms / 1000
|
|
44
|
-
|
|
45
|
-
cnt = 0
|
|
46
|
-
while time.monotonic() < deadline:
|
|
47
|
-
try:
|
|
48
|
-
cnt = await rows.count()
|
|
49
|
-
except Exception:
|
|
50
|
-
cnt = 0
|
|
51
|
-
|
|
52
|
-
if cnt >= min_rows:
|
|
53
|
-
return
|
|
54
|
-
|
|
55
|
-
await asyncio.sleep(poll_ms / 1000)
|
|
56
|
-
|
|
57
|
-
logger.warning(f"table rows timeout: last_cnt={cnt}, need>={min_rows}")
|
|
58
|
-
raise TimeoutError(f"nth table not ready: index={index}, rows<{min_rows}")
|
|
59
|
-
|
|
60
|
-
async def title(self) -> str:
|
|
61
|
-
return await self._page.title()
|
|
62
|
-
|
|
63
|
-
async def current_url(self) -> str:
|
|
64
|
-
return self._page.url
|
|
65
|
-
|
|
66
|
-
async def goto_and_wait_for_stable(
|
|
67
|
-
self, url: str, timeout_ms: int = 10_000
|
|
68
|
-
) -> None:
|
|
69
|
-
logger.info(f"goto: {url}")
|
|
70
|
-
await self._page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
|
|
71
|
-
await self._wait_for_network_quiet(timeout_ms=timeout_ms // 2)
|
|
72
|
-
|
|
73
|
-
async def reload(self, *, timeout_ms: int = 10_000) -> None:
|
|
74
|
-
logger.info("reload")
|
|
75
|
-
await self._page.reload(timeout=timeout_ms, wait_until="domcontentloaded")
|
|
76
|
-
|
|
77
|
-
async def sleep_ms(self, ms: int) -> None:
|
|
78
|
-
await asyncio.sleep(ms / 1000)
|
|
79
|
-
|
|
80
|
-
async def wait_attached(self, selector: str, *, timeout_ms: int = 10_000) -> None:
|
|
81
|
-
await self._page.locator(selector).first.wait_for(
|
|
82
|
-
state="attached", timeout=timeout_ms
|
|
83
|
-
)
|
|
84
|
-
|
|
85
|
-
async def wait_visible(self, selector: str, *, timeout_ms: int = 10_000) -> None:
|
|
86
|
-
await self._page.locator(selector).first.wait_for(
|
|
87
|
-
state="visible", timeout=timeout_ms
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
async def click(
|
|
91
|
-
self,
|
|
92
|
-
selector: str,
|
|
93
|
-
*,
|
|
94
|
-
index: int = 0,
|
|
95
|
-
timeout_ms: int = 4_000,
|
|
96
|
-
force: bool = False,
|
|
97
|
-
) -> None:
|
|
98
|
-
loc = self._page.locator(selector).nth(index)
|
|
99
|
-
await loc.click(timeout=timeout_ms, force=force)
|
|
100
|
-
|
|
101
|
-
async def try_click(
|
|
102
|
-
self,
|
|
103
|
-
selector: str,
|
|
104
|
-
*,
|
|
105
|
-
index: int = 0,
|
|
106
|
-
timeout_ms: int = 1_500,
|
|
107
|
-
force: bool = False,
|
|
108
|
-
) -> bool:
|
|
109
|
-
loc = self._page.locator(selector).nth(index)
|
|
110
|
-
try:
|
|
111
|
-
await loc.click(timeout=timeout_ms, trial=True, force=force)
|
|
112
|
-
return True
|
|
113
|
-
except PwTimeoutError:
|
|
114
|
-
return False
|
|
115
|
-
|
|
116
|
-
async def count(self, selector: str) -> int:
|
|
117
|
-
return await self._page.locator(selector).count()
|
|
118
|
-
|
|
119
|
-
async def scroll_into_view(self, selector: str, *, index: int = 0) -> None:
|
|
120
|
-
# 선택한 요소가 화면에 보이도록 스크롤을 자동으로 내려준다.
|
|
121
|
-
await self._page.locator(selector).nth(index).scroll_into_view_if_needed()
|
|
122
|
-
|
|
123
|
-
async def text_content_first(self, selector: str) -> str:
|
|
124
|
-
# selector 첫 번째 요소의 text_content()를 반환
|
|
125
|
-
return (await self._page.locator(selector).first.text_content()) or ""
|
|
126
|
-
|
|
127
|
-
async def all_texts(self, selector: str) -> list[str]:
|
|
128
|
-
# selector로 잡히는 모든 요소를 all_text_contents()로 가져옴
|
|
129
|
-
loc = self._page.locator(selector)
|
|
130
|
-
return await loc.all_text_contents()
|
|
131
|
-
|
|
132
|
-
async def get_text_by_text(self, needle: str) -> str:
|
|
133
|
-
"""
|
|
134
|
-
페이지에서 주어진 텍스트(needle)를 포함하는 요소 중
|
|
135
|
-
첫 번째 요소의 text_content를 반환한다.
|
|
136
|
-
|
|
137
|
-
- 요소가 없으면 빈 문자열 반환
|
|
138
|
-
- 부분 일치 기준
|
|
139
|
-
"""
|
|
140
|
-
return (await self._page.get_by_text(needle).first.text_content()) or ""
|
|
141
|
-
|
|
142
|
-
async def inner_text(self, selector: str) -> str:
|
|
143
|
-
"""
|
|
144
|
-
selector에 해당하는 첫 번째 요소의 innerText를 반환한다.
|
|
145
|
-
|
|
146
|
-
- 요소가 DOM에 attach될 때까지 대기
|
|
147
|
-
- 화면에 보이는 텍스트 기준(innerText)
|
|
148
|
-
"""
|
|
149
|
-
return await self._page.locator(selector).first.inner_text()
|
|
150
|
-
|
|
151
|
-
async def outer_html_nth(self, selector: str, index: int) -> str:
|
|
152
|
-
"""
|
|
153
|
-
selector로 매칭되는 요소 중 index번째 요소의 outerHTML을 반환한다.
|
|
154
|
-
|
|
155
|
-
- index는 0-based
|
|
156
|
-
- 요소가 없으면 playwright 예외 발생
|
|
157
|
-
"""
|
|
158
|
-
loc = self._page.locator(selector).nth(index)
|
|
159
|
-
# index가 범위를 벗어나면 playwright가 에러를 내는데,
|
|
160
|
-
# 필요하면 여기서 더 친절한 에러로 감싸도 됨.
|
|
161
|
-
return await loc.evaluate("el => el.outerHTML")
|
|
162
|
-
|
|
163
|
-
async def wait_table_text_changed(
|
|
164
|
-
self,
|
|
165
|
-
table_selector: str,
|
|
166
|
-
*,
|
|
167
|
-
index: int,
|
|
168
|
-
prev_text: str | None,
|
|
169
|
-
min_rows: int = 1,
|
|
170
|
-
min_lines: int = 50,
|
|
171
|
-
timeout_sec: float = 12.0,
|
|
172
|
-
poll_sec: float = 0.2,
|
|
173
|
-
) -> str:
|
|
174
|
-
"""
|
|
175
|
-
지정한 table(nth)의 innerText가 '유효한 상태'가 되고,
|
|
176
|
-
이전 텍스트(prev_text)와 달라질 때까지 대기한 뒤 반환한다.
|
|
177
|
-
|
|
178
|
-
동작 순서:
|
|
179
|
-
1) tbody row 개수 기준으로 테이블이 최소한 로딩되었는지 보장
|
|
180
|
-
2) innerText를 주기적으로 폴링하며
|
|
181
|
-
- 최소 라인 수(min_lines)를 만족하고
|
|
182
|
-
- prev_text가 None이거나, prev_text와 다른 경우 반환
|
|
183
|
-
|
|
184
|
-
특징:
|
|
185
|
-
- DOM이 붙었지만 데이터가 아직 비어 있는 상태를 배제
|
|
186
|
-
- 클릭/토글 이후 실제 데이터 변경을 안정적으로 감지
|
|
187
|
-
- 타임아웃 시 마지막으로 관측된 텍스트를 반환
|
|
188
|
-
|
|
189
|
-
반환값:
|
|
190
|
-
- 변경된(innerText) 문자열
|
|
191
|
-
"""
|
|
192
|
-
|
|
193
|
-
# 0) 최초/혹은 불안정할 때는 row 기준으로 'ready'를 먼저 확보
|
|
194
|
-
await self.wait_table_nth_ready(
|
|
195
|
-
table_selector,
|
|
196
|
-
index=index,
|
|
197
|
-
min_rows=min_rows,
|
|
198
|
-
timeout_ms=int(timeout_sec * 1000),
|
|
199
|
-
poll_ms=int(poll_sec * 1000),
|
|
200
|
-
)
|
|
201
|
-
|
|
202
|
-
# 1) 그 다음 텍스트 기반으로 '유효 + 변경'을 기다림
|
|
203
|
-
start = time.monotonic()
|
|
204
|
-
last_text = ""
|
|
205
|
-
|
|
206
|
-
while True:
|
|
207
|
-
loc = self._page.locator(table_selector).nth(index)
|
|
208
|
-
try:
|
|
209
|
-
text = await loc.inner_text()
|
|
210
|
-
except Exception:
|
|
211
|
-
text = ""
|
|
212
|
-
|
|
213
|
-
lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
|
|
214
|
-
is_valid = len(lines) >= min_lines
|
|
215
|
-
|
|
216
|
-
if is_valid:
|
|
217
|
-
last_text = text
|
|
218
|
-
if prev_text is None or text != prev_text:
|
|
219
|
-
return text
|
|
220
|
-
|
|
221
|
-
if time.monotonic() - start >= timeout_sec:
|
|
222
|
-
return last_text
|
|
223
|
-
|
|
224
|
-
await asyncio.sleep(poll_sec)
|
|
225
|
-
|
|
226
|
-
async def is_attached(self, selector: str, *, index: int = 0) -> bool:
|
|
227
|
-
"""
|
|
228
|
-
selector의 nth(index) 요소가 DOM에 존재하는지(attached) 여부를 반환한다.
|
|
229
|
-
요소가 없거나 접근 중 예외가 발생하면 False를 반환한다.
|
|
230
|
-
"""
|
|
231
|
-
try:
|
|
232
|
-
loc = self._page.locator(selector).nth(index)
|
|
233
|
-
return await loc.count() > 0
|
|
234
|
-
except Exception:
|
|
235
|
-
return False
|
|
236
|
-
|
|
237
|
-
async def computed_style(self, selector: str, *, index: int = 0, prop: str) -> str:
|
|
238
|
-
"""
|
|
239
|
-
selector의 nth(index) 요소에 대해,
|
|
240
|
-
CSS 계산값(getComputedStyle)의 특정 속성(prop)을 문자열로 반환한다.
|
|
241
|
-
(예: display, visibility, opacity 등)
|
|
242
|
-
"""
|
|
243
|
-
loc = self._page.locator(selector).nth(index)
|
|
244
|
-
# attached 보장하고 싶으면 여기서 wait_for(state="attached") 추가 가능
|
|
245
|
-
return await loc.evaluate(
|
|
246
|
-
"(el, prop) => getComputedStyle(el)[prop] || ''", prop
|
|
247
|
-
)
|
|
248
|
-
|
|
249
|
-
async def count_in_nth(
|
|
250
|
-
self,
|
|
251
|
-
scope_selector: str,
|
|
252
|
-
*,
|
|
253
|
-
scope_index: int,
|
|
254
|
-
inner_selector: str,
|
|
255
|
-
) -> int:
|
|
256
|
-
"""
|
|
257
|
-
scope_selector의 nth(scope_index) 범위 안에서
|
|
258
|
-
inner_selector에 매칭되는 요소 개수를 반환한다.
|
|
259
|
-
"""
|
|
260
|
-
scope = self._page.locator(scope_selector).nth(scope_index)
|
|
261
|
-
return await scope.locator(inner_selector).count()
|
|
262
|
-
|
|
263
|
-
async def eval_in_nth_first(
|
|
264
|
-
self,
|
|
265
|
-
scope_selector: str,
|
|
266
|
-
*,
|
|
267
|
-
scope_index: int,
|
|
268
|
-
inner_selector: str,
|
|
269
|
-
expression: str,
|
|
270
|
-
) -> Any:
|
|
271
|
-
"""
|
|
272
|
-
scope(nth) 내부의 inner_selector.first element를 잡고 JS expression을 실행한다.
|
|
273
|
-
|
|
274
|
-
expression 예:
|
|
275
|
-
- "el => window.getComputedStyle(el).display"
|
|
276
|
-
- "el => el.getAttribute('data-content') || ''"
|
|
277
|
-
- "el => el.innerText"
|
|
278
|
-
"""
|
|
279
|
-
scope = self._page.locator(scope_selector).nth(scope_index)
|
|
280
|
-
loc = scope.locator(inner_selector).first
|
|
281
|
-
|
|
282
|
-
# 매칭되는 게 없으면 None
|
|
283
|
-
if await loc.count() == 0:
|
|
284
|
-
return None
|
|
285
|
-
|
|
286
|
-
return await loc.evaluate(expression)
|
|
287
|
-
|
|
288
|
-
async def inner_text_in_nth(
|
|
289
|
-
self,
|
|
290
|
-
scope_selector: str,
|
|
291
|
-
*,
|
|
292
|
-
scope_index: int,
|
|
293
|
-
inner_selector: str,
|
|
294
|
-
inner_index: int = 0,
|
|
295
|
-
timeout_ms: int = 10_000,
|
|
296
|
-
) -> str:
|
|
297
|
-
"""
|
|
298
|
-
scope(nth) 내부에서 inner_selector(nth)의 innerText를 반환.
|
|
299
|
-
- innerText: 렌더링 기준(줄바꿈/숨김 반영)
|
|
300
|
-
"""
|
|
301
|
-
scope = self._page.locator(scope_selector).nth(scope_index)
|
|
302
|
-
inner = scope.locator(inner_selector).nth(inner_index)
|
|
303
|
-
|
|
304
|
-
# 요소가 늦게 뜨는 케이스 대응
|
|
305
|
-
await inner.wait_for(state="attached", timeout=timeout_ms)
|
|
306
|
-
|
|
307
|
-
try:
|
|
308
|
-
return (await inner.inner_text()) or ""
|
|
309
|
-
except Exception:
|
|
310
|
-
# inner_text 자체가 실패하는 순간(사라짐/리렌더)도 있어서 안전하게
|
|
311
|
-
return ""
|
|
312
|
-
|
|
313
|
-
async def text_content_in_nth(
|
|
314
|
-
self,
|
|
315
|
-
scope_selector: str,
|
|
316
|
-
*,
|
|
317
|
-
scope_index: int,
|
|
318
|
-
inner_selector: str,
|
|
319
|
-
inner_index: int = 0,
|
|
320
|
-
timeout_ms: int = 10_000,
|
|
321
|
-
) -> str:
|
|
322
|
-
"""
|
|
323
|
-
scope(nth) 내부에서 inner_selector(nth)의 textContent를 반환.
|
|
324
|
-
- textContent: DOM 기준(숨김 텍스트 포함 가능)
|
|
325
|
-
"""
|
|
326
|
-
scope = self._page.locator(scope_selector).nth(scope_index)
|
|
327
|
-
inner = scope.locator(inner_selector).nth(inner_index)
|
|
328
|
-
|
|
329
|
-
await inner.wait_for(state="attached", timeout=timeout_ms)
|
|
330
|
-
|
|
331
|
-
try:
|
|
332
|
-
return (await inner.text_content()) or ""
|
|
333
|
-
except Exception:
|
|
334
|
-
return ""
|
|
335
|
-
|
|
336
|
-
async def table_records(
|
|
337
|
-
self,
|
|
338
|
-
table_selector: str,
|
|
339
|
-
*,
|
|
340
|
-
header: int | list[int] | None = 0,
|
|
341
|
-
) -> list[dict[str, Any]]:
|
|
342
|
-
await self.wait_attached(table_selector)
|
|
343
|
-
|
|
344
|
-
table = self._page.locator(table_selector).first
|
|
345
|
-
html = await table.evaluate("el => el.outerHTML")
|
|
346
|
-
|
|
347
|
-
try:
|
|
348
|
-
df = pd.read_html(StringIO(html), header=header)[0]
|
|
349
|
-
except Exception as e:
|
|
350
|
-
raise RuntimeError(f"pd.read_html failed: {type(e).__name__}: {e}") from e
|
|
351
|
-
|
|
352
|
-
# 문자열 컬럼일 때만 정규화
|
|
353
|
-
if all(isinstance(c, str) for c in df.columns):
|
|
354
|
-
if "항목" in df.columns:
|
|
355
|
-
df["항목"] = (
|
|
356
|
-
df["항목"].astype(str).str.replace("펼치기", "").str.strip()
|
|
357
|
-
)
|
|
358
|
-
|
|
359
|
-
df.columns = (
|
|
360
|
-
df.columns.astype(str)
|
|
361
|
-
.str.replace("연간컨센서스보기", "", regex=False)
|
|
362
|
-
.str.replace("연간컨센서스닫기", "", regex=False)
|
|
363
|
-
.str.replace("(IFRS연결)", "", regex=False)
|
|
364
|
-
.str.replace("(IFRS별도)", "", regex=False)
|
|
365
|
-
.str.replace("(GAAP개별)", "", regex=False)
|
|
366
|
-
.str.replace("(YoY)", "", regex=False)
|
|
367
|
-
.str.replace("(QoQ)", "", regex=False)
|
|
368
|
-
.str.replace("(E)", "", regex=False)
|
|
369
|
-
.str.replace(".", "", regex=False)
|
|
370
|
-
.str.strip()
|
|
371
|
-
)
|
|
372
|
-
|
|
373
|
-
return df.where(pd.notnull(df), None).to_dict(orient="records")
|
|
26
|
+
_PlaywrightBase.__init__(self, page)
|
|
@@ -4,7 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
import asyncio
|
|
5
5
|
from contextlib import asynccontextmanager
|
|
6
6
|
from dataclasses import dataclass
|
|
7
|
-
from typing import AsyncIterator
|
|
7
|
+
from typing import AsyncIterator, cast
|
|
8
8
|
|
|
9
9
|
from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
|
|
10
10
|
from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
|
|
@@ -54,7 +54,7 @@ class PlaywrightBrowserFactory(BrowserFactoryPort):
|
|
|
54
54
|
for _ in range(self.max_concurrency):
|
|
55
55
|
session = PlaywrightPageSession(headless=self.headless, timeout_ms=self.timeout_ms)
|
|
56
56
|
page = await session.start()
|
|
57
|
-
browser = PlaywrightBrowser(page)
|
|
57
|
+
browser:BrowserPort = cast(BrowserPort, PlaywrightBrowser(page))
|
|
58
58
|
|
|
59
59
|
item = _LeaseItem(session=session, browser=browser)
|
|
60
60
|
self._items.append(item)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py
|
|
2
|
+
from ._base import _PlaywrightBase
|
|
3
|
+
from .navigation import PlaywrightNavigation
|
|
4
|
+
from .wait import PlaywrightWait
|
|
5
|
+
from .interaction import PlaywrightInteraction
|
|
6
|
+
from .text import PlaywrightText
|
|
7
|
+
from .scope import PlaywrightScope
|
|
8
|
+
from .table import PlaywrightTable
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"_PlaywrightBase",
|
|
12
|
+
"PlaywrightNavigation",
|
|
13
|
+
"PlaywrightWait",
|
|
14
|
+
"PlaywrightInteraction",
|
|
15
|
+
"PlaywrightText",
|
|
16
|
+
"PlaywrightScope",
|
|
17
|
+
"PlaywrightTable",
|
|
18
|
+
]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from playwright.async_api import Page
|
|
5
|
+
from logging_hj3415 import logger
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class _PlaywrightBase:
|
|
9
|
+
def __init__(self, page: Page):
|
|
10
|
+
self._page = page
|
|
11
|
+
|
|
12
|
+
async def _wait_for_network_quiet(self, *, timeout_ms: int = 10_000) -> None:
|
|
13
|
+
# networkidle은 사이트에 따라 영원히 안 올 수도 있어서 try로 감싸는 게 안전
|
|
14
|
+
logger.debug("wait for network quiet")
|
|
15
|
+
try:
|
|
16
|
+
await self._page.wait_for_load_state("networkidle", timeout=timeout_ms)
|
|
17
|
+
except Exception:
|
|
18
|
+
# networkidle이 안 와도 다음 단계(앵커 wait)가 더 중요함
|
|
19
|
+
return
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
# scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from playwright.async_api import TimeoutError as PwTimeoutError
|
|
5
|
+
from ._base import _PlaywrightBase
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PlaywrightInteraction(_PlaywrightBase):
|
|
9
|
+
|
|
10
|
+
async def click(
|
|
11
|
+
self,
|
|
12
|
+
selector: str,
|
|
13
|
+
*,
|
|
14
|
+
index: int = 0,
|
|
15
|
+
timeout_ms: int = 4_000,
|
|
16
|
+
force: bool = False,
|
|
17
|
+
) -> None:
|
|
18
|
+
loc = self._page.locator(selector).nth(index)
|
|
19
|
+
await loc.click(timeout=timeout_ms, force=force)
|
|
20
|
+
|
|
21
|
+
async def try_click(
|
|
22
|
+
self,
|
|
23
|
+
selector: str,
|
|
24
|
+
*,
|
|
25
|
+
index: int = 0,
|
|
26
|
+
timeout_ms: int = 1_500,
|
|
27
|
+
force: bool = False,
|
|
28
|
+
) -> bool:
|
|
29
|
+
loc = self._page.locator(selector).nth(index)
|
|
30
|
+
try:
|
|
31
|
+
await loc.click(timeout=timeout_ms, trial=True, force=force)
|
|
32
|
+
return True
|
|
33
|
+
except PwTimeoutError:
|
|
34
|
+
return False
|
|
35
|
+
|
|
36
|
+
async def scroll_into_view(self, selector: str, *, index: int = 0) -> None:
|
|
37
|
+
await self._page.locator(selector).nth(index).scroll_into_view_if_needed()
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from logging_hj3415 import logger
|
|
5
|
+
from ._base import _PlaywrightBase
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PlaywrightNavigation(_PlaywrightBase):
|
|
9
|
+
async def title(self) -> str:
|
|
10
|
+
return await self._page.title()
|
|
11
|
+
|
|
12
|
+
async def current_url(self) -> str:
|
|
13
|
+
return self._page.url
|
|
14
|
+
|
|
15
|
+
async def goto_and_wait_for_stable(
|
|
16
|
+
self, url: str, timeout_ms: int = 10_000
|
|
17
|
+
) -> None:
|
|
18
|
+
logger.info(f"goto: {url}")
|
|
19
|
+
await self._page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
|
|
20
|
+
await self._wait_for_network_quiet(timeout_ms=timeout_ms // 2)
|
|
21
|
+
|
|
22
|
+
async def reload(self, *, timeout_ms: int = 10_000) -> None:
|
|
23
|
+
logger.info("reload")
|
|
24
|
+
await self._page.reload(timeout=timeout_ms, wait_until="domcontentloaded")
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any
|
|
5
|
+
from ._base import _PlaywrightBase
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PlaywrightScope(_PlaywrightBase):
|
|
9
|
+
async def is_attached(self, selector: str, *, index: int = 0) -> bool:
|
|
10
|
+
try:
|
|
11
|
+
loc = self._page.locator(selector).nth(index)
|
|
12
|
+
return await loc.count() > 0
|
|
13
|
+
except Exception:
|
|
14
|
+
return False
|
|
15
|
+
|
|
16
|
+
async def computed_style(self, selector: str, *, index: int = 0, prop: str) -> str:
|
|
17
|
+
loc = self._page.locator(selector).nth(index)
|
|
18
|
+
return await loc.evaluate(
|
|
19
|
+
"(el, prop) => getComputedStyle(el)[prop] || ''", prop
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
async def count_in_nth(
|
|
23
|
+
self,
|
|
24
|
+
scope_selector: str,
|
|
25
|
+
*,
|
|
26
|
+
scope_index: int,
|
|
27
|
+
inner_selector: str,
|
|
28
|
+
) -> int:
|
|
29
|
+
scope = self._page.locator(scope_selector).nth(scope_index)
|
|
30
|
+
return await scope.locator(inner_selector).count()
|
|
31
|
+
|
|
32
|
+
async def eval_in_nth_first(
|
|
33
|
+
self,
|
|
34
|
+
scope_selector: str,
|
|
35
|
+
*,
|
|
36
|
+
scope_index: int,
|
|
37
|
+
inner_selector: str,
|
|
38
|
+
expression: str,
|
|
39
|
+
) -> Any:
|
|
40
|
+
scope = self._page.locator(scope_selector).nth(scope_index)
|
|
41
|
+
loc = scope.locator(inner_selector).first
|
|
42
|
+
|
|
43
|
+
if await loc.count() == 0:
|
|
44
|
+
return None
|
|
45
|
+
|
|
46
|
+
return await loc.evaluate(expression)
|
|
47
|
+
|
|
48
|
+
async def inner_text_in_nth(
|
|
49
|
+
self,
|
|
50
|
+
scope_selector: str,
|
|
51
|
+
*,
|
|
52
|
+
scope_index: int,
|
|
53
|
+
inner_selector: str,
|
|
54
|
+
inner_index: int = 0,
|
|
55
|
+
timeout_ms: int = 10_000,
|
|
56
|
+
) -> str:
|
|
57
|
+
scope = self._page.locator(scope_selector).nth(scope_index)
|
|
58
|
+
inner = scope.locator(inner_selector).nth(inner_index)
|
|
59
|
+
|
|
60
|
+
await inner.wait_for(state="attached", timeout=timeout_ms)
|
|
61
|
+
|
|
62
|
+
try:
|
|
63
|
+
return (await inner.inner_text()) or ""
|
|
64
|
+
except Exception:
|
|
65
|
+
return ""
|
|
66
|
+
|
|
67
|
+
async def text_content_in_nth(
|
|
68
|
+
self,
|
|
69
|
+
scope_selector: str,
|
|
70
|
+
*,
|
|
71
|
+
scope_index: int,
|
|
72
|
+
inner_selector: str,
|
|
73
|
+
inner_index: int = 0,
|
|
74
|
+
timeout_ms: int = 10_000,
|
|
75
|
+
) -> str:
|
|
76
|
+
scope = self._page.locator(scope_selector).nth(scope_index)
|
|
77
|
+
inner = scope.locator(inner_selector).nth(inner_index)
|
|
78
|
+
|
|
79
|
+
await inner.wait_for(state="attached", timeout=timeout_ms)
|
|
80
|
+
|
|
81
|
+
try:
|
|
82
|
+
return (await inner.text_content()) or ""
|
|
83
|
+
except Exception:
|
|
84
|
+
return ""
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from io import StringIO
|
|
6
|
+
from typing import Any
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from ._base import _PlaywrightBase
|
|
9
|
+
|
|
10
|
+
_PERIOD_MM_RE = re.compile(r"\b(19|20)\d{2}/(03|06|09|12)\b")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class PlaywrightTable(_PlaywrightBase):
|
|
14
|
+
async def table_records(
|
|
15
|
+
self,
|
|
16
|
+
table_selector: str,
|
|
17
|
+
*,
|
|
18
|
+
header: int | list[int] | None = 0,
|
|
19
|
+
) -> list[dict[str, Any]]:
|
|
20
|
+
await self.wait_attached(table_selector)
|
|
21
|
+
|
|
22
|
+
table = self._page.locator(table_selector).first
|
|
23
|
+
html = await table.evaluate("el => el.outerHTML")
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
df = pd.read_html(StringIO(html), header=header)[0]
|
|
27
|
+
except Exception as e:
|
|
28
|
+
raise RuntimeError(f"pd.read_html failed: {type(e).__name__}: {e}") from e
|
|
29
|
+
|
|
30
|
+
# 문자열 컬럼일 때만 정규화
|
|
31
|
+
if all(isinstance(c, str) for c in df.columns):
|
|
32
|
+
if "항목" in df.columns:
|
|
33
|
+
df["항목"] = (
|
|
34
|
+
df["항목"].astype(str).str.replace("펼치기", "").str.strip()
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
df.columns = (
|
|
38
|
+
df.columns.astype(str)
|
|
39
|
+
.str.replace("연간컨센서스보기", "", regex=False)
|
|
40
|
+
.str.replace("연간컨센서스닫기", "", regex=False)
|
|
41
|
+
.str.replace("(IFRS연결)", "", regex=False)
|
|
42
|
+
.str.replace("(IFRS별도)", "", regex=False)
|
|
43
|
+
.str.replace("(GAAP개별)", "", regex=False)
|
|
44
|
+
.str.replace("(YoY)", "", regex=False)
|
|
45
|
+
.str.replace("(QoQ)", "", regex=False)
|
|
46
|
+
.str.replace("(E)", "", regex=False)
|
|
47
|
+
.str.replace(".", "", regex=False)
|
|
48
|
+
.str.strip()
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
return df.where(pd.notnull(df), None).to_dict(orient="records")
|
|
52
|
+
|
|
53
|
+
async def table_header_texts_nth(
|
|
54
|
+
self, table_selector: str, *, index: int
|
|
55
|
+
) -> list[str]:
|
|
56
|
+
table = self._page.locator(table_selector).nth(index)
|
|
57
|
+
thead = table.locator("thead")
|
|
58
|
+
if await thead.count() == 0:
|
|
59
|
+
return []
|
|
60
|
+
|
|
61
|
+
ths = thead.locator("th")
|
|
62
|
+
try:
|
|
63
|
+
texts = await ths.all_inner_texts()
|
|
64
|
+
except Exception:
|
|
65
|
+
texts = []
|
|
66
|
+
|
|
67
|
+
out: list[str] = []
|
|
68
|
+
for t in texts:
|
|
69
|
+
t = " ".join((t or "").split())
|
|
70
|
+
if t:
|
|
71
|
+
out.append(t)
|
|
72
|
+
return out
|
|
73
|
+
|
|
74
|
+
async def table_header_periods_mm_nth(
|
|
75
|
+
self, table_selector: str, *, index: int
|
|
76
|
+
) -> list[str]:
|
|
77
|
+
texts = await self.table_header_texts_nth(table_selector, index=index)
|
|
78
|
+
periods: list[str] = []
|
|
79
|
+
for t in texts:
|
|
80
|
+
for m in _PERIOD_MM_RE.finditer(t):
|
|
81
|
+
periods.append(m.group(0))
|
|
82
|
+
|
|
83
|
+
seen: set[str] = set()
|
|
84
|
+
uniq: list[str] = []
|
|
85
|
+
for p in periods:
|
|
86
|
+
if p in seen:
|
|
87
|
+
continue
|
|
88
|
+
seen.add(p)
|
|
89
|
+
uniq.append(p)
|
|
90
|
+
return uniq
|