scraper2-hj3415 2.6.0__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. scraper2_hj3415/.DS_Store +0 -0
  2. scraper2_hj3415/app/adapters/out/.DS_Store +0 -0
  3. scraper2_hj3415/app/adapters/out/playwright/browser.py +22 -369
  4. scraper2_hj3415/app/adapters/out/playwright/browser_factory.py +2 -2
  5. scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py +18 -0
  6. scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py +19 -0
  7. scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py +37 -0
  8. scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py +24 -0
  9. scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py +84 -0
  10. scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py +90 -0
  11. scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py +25 -0
  12. scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py +96 -0
  13. scraper2_hj3415/app/adapters/out/sinks/.DS_Store +0 -0
  14. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +3 -3
  15. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +11 -11
  16. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +220 -9
  17. scraper2_hj3415/app/domain/constants.py +2 -2
  18. scraper2_hj3415/app/parsing/_tables/html_table.py +3 -2
  19. scraper2_hj3415/app/parsing/c103_parser.py +4 -1
  20. scraper2_hj3415/app/parsing/c104_parser.py +4 -1
  21. scraper2_hj3415/app/ports/browser/browser_port.py +25 -108
  22. scraper2_hj3415/app/ports/browser/capabilities/__init__.py +15 -0
  23. scraper2_hj3415/app/ports/browser/capabilities/interaction.py +27 -0
  24. scraper2_hj3415/app/ports/browser/capabilities/navigation.py +18 -0
  25. scraper2_hj3415/app/ports/browser/capabilities/scope.py +66 -0
  26. scraper2_hj3415/app/ports/browser/capabilities/table.py +28 -0
  27. scraper2_hj3415/app/ports/browser/capabilities/text.py +16 -0
  28. scraper2_hj3415/app/ports/browser/capabilities/wait.py +51 -0
  29. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +3 -3
  30. scraper2_hj3415/app/ports/site/wisereport_port.py +20 -10
  31. scraper2_hj3415/app/services/fetch/fetch_c103.py +18 -32
  32. scraper2_hj3415/app/services/fetch/fetch_c104.py +28 -51
  33. scraper2_hj3415/app/services/nfs_doc_builders.py +21 -7
  34. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +2 -2
  35. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +2 -2
  36. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +2 -2
  37. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +2 -2
  38. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +2 -2
  39. scraper2_hj3415/cli.py +10 -7
  40. {scraper2_hj3415-2.6.0.dist-info → scraper2_hj3415-2.7.0.dist-info}/METADATA +1 -1
  41. {scraper2_hj3415-2.6.0.dist-info → scraper2_hj3415-2.7.0.dist-info}/RECORD +44 -26
  42. {scraper2_hj3415-2.6.0.dist-info → scraper2_hj3415-2.7.0.dist-info}/WHEEL +0 -0
  43. {scraper2_hj3415-2.6.0.dist-info → scraper2_hj3415-2.7.0.dist-info}/entry_points.txt +0 -0
  44. {scraper2_hj3415-2.6.0.dist-info → scraper2_hj3415-2.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,25 @@
1
+ # scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py
2
+ from __future__ import annotations
3
+ from ._base import _PlaywrightBase
4
+
5
+
6
class PlaywrightText(_PlaywrightBase):
    """Text-reading helpers built on locators of ``self._page``.

    ``self._page`` is expected to be provided by ``_PlaywrightBase``
    (presumably a Playwright ``Page`` -- confirm against ``_base.py``).
    """

    async def count(self, selector: str) -> int:
        """Return the number of elements matched by ``selector``."""
        matches = self._page.locator(selector)
        return await matches.count()

    async def text_content_first(self, selector: str) -> str:
        """Return the first match's ``text_content()``, or ``""`` when it is falsy."""
        first_match = self._page.locator(selector).first
        content = await first_match.text_content()
        return content or ""

    async def all_texts(self, selector: str) -> list[str]:
        """Return ``all_text_contents()`` for every element matched by ``selector``."""
        return await self._page.locator(selector).all_text_contents()

    async def get_text_by_text(self, needle: str) -> str:
        """Return the text content of the first element found via ``get_by_text(needle)``.

        A falsy result is normalized to ``""``.
        """
        first_hit = self._page.get_by_text(needle).first
        content = await first_hit.text_content()
        return content or ""

    async def inner_text(self, selector: str) -> str:
        """Return ``inner_text()`` of the first element matched by ``selector``."""
        first_match = self._page.locator(selector).first
        return await first_match.inner_text()

    async def outer_html_nth(self, selector: str, index: int) -> str:
        """Return the ``outerHTML`` of the ``index``-th element matched by ``selector``."""
        nth_match = self._page.locator(selector).nth(index)
        # Evaluated in the page: read the element's serialized markup.
        return await nth_match.evaluate("el => el.outerHTML")
@@ -0,0 +1,96 @@
1
+ # scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py
2
+ from __future__ import annotations
3
+
4
+ import asyncio
5
+ import time
6
+ from logging_hj3415 import logger
7
+ from ._base import _PlaywrightBase
8
+
9
class PlaywrightWait(_PlaywrightBase):
    """Waiting/polling helpers for elements and tables on ``self._page``.

    Only ``self._page`` is needed here, so inheriting ``_PlaywrightBase`` is
    not strictly required; the class simply assumes ``self._page`` exists
    (per the original note, ``PlaywrightBrowser`` provides the base).
    """

    async def sleep_ms(self, ms: int) -> None:
        """Suspend the current task for ``ms`` milliseconds."""
        await asyncio.sleep(ms / 1000)

    async def wait_attached(self, selector: str, *, timeout_ms: int = 10_000) -> None:
        """Wait until the first match of ``selector`` reaches the "attached" state."""
        first_match = self._page.locator(selector).first
        await first_match.wait_for(state="attached", timeout=timeout_ms)

    async def wait_visible(self, selector: str, *, timeout_ms: int = 10_000) -> None:
        """Wait until the first match of ``selector`` reaches the "visible" state."""
        first_match = self._page.locator(selector).first
        await first_match.wait_for(state="visible", timeout=timeout_ms)

    async def wait_table_nth_ready(
        self,
        table_selector: str,
        *,
        index: int,
        min_rows: int = 1,
        timeout_ms: int = 20_000,
        poll_ms: int = 200,
    ) -> None:
        """Wait until the ``index``-th table has at least ``min_rows`` body rows.

        The table must first be attached, then ``tbody tr`` is counted every
        ``poll_ms`` milliseconds.

        Args:
            table_selector: Selector matching the candidate tables.
            index: Which match of ``table_selector`` to watch.
            min_rows: Minimum number of ``tbody tr`` rows considered "ready".
            timeout_ms: Budget for the whole call (attach wait + row polling).
            poll_ms: Delay between row counts.

        Raises:
            TimeoutError: If the row count never reaches ``min_rows`` in time.
            Whatever ``wait_for`` raises when the table never attaches
            propagates unchanged.
        """
        logger.debug("wait for table nth_ready")

        # Start the clock before the attach wait so timeout_ms bounds the
        # whole call; previously the row polling got a second full budget.
        deadline = time.monotonic() + timeout_ms / 1000

        table = self._page.locator(table_selector).nth(index)
        await table.wait_for(state="attached", timeout=timeout_ms)

        rows = table.locator("tbody tr")
        poll_sec = max(poll_ms, 0) / 1000

        cnt = 0
        # Check-then-sleep: rows are counted at least once even when the
        # attach wait already consumed the whole budget.
        while True:
            try:
                cnt = await rows.count()
            except Exception as e:
                # Best effort: a failed count is treated as "no rows yet".
                logger.debug(f"row count failed (ignored): {type(e).__name__}: {e}")
                cnt = 0

            if cnt >= min_rows:
                return

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break

            # Never sleep past the deadline.
            await asyncio.sleep(min(poll_sec, remaining))

        logger.warning(f"table rows timeout: last_cnt={cnt}, need>={min_rows}")
        raise TimeoutError(f"nth table not ready: index={index}, rows<{min_rows}")

    async def wait_table_text_changed(
        self,
        table_selector: str,
        *,
        index: int,
        prev_text: str | None,
        min_rows: int = 1,
        min_lines: int = 50,
        timeout_sec: float = 12.0,
        poll_sec: float = 0.2,
    ) -> str:
        """Wait until the ``index``-th table shows valid text that differs from ``prev_text``.

        Text is "valid" when it has at least ``min_lines`` non-blank lines.

        Args:
            table_selector: Selector matching the candidate tables.
            index: Which match of ``table_selector`` to read.
            prev_text: Text from the previous view; ``None`` accepts the first
                valid text.
            min_rows: Forwarded to ``wait_table_nth_ready``.
            min_lines: Minimum number of non-blank lines for valid text.
            timeout_sec: Budget for the text polling, measured from call entry.
            poll_sec: Delay between text reads.

        Returns:
            The first valid text that differs from ``prev_text``. On timeout,
            the last valid text seen (possibly equal to ``prev_text``), or
            ``""`` if no valid text was ever read.

        Raises:
            TimeoutError: Propagated from ``wait_table_nth_ready``.
        """
        # One deadline shared with the row-readiness step: previously the text
        # polling started a fresh timeout_sec budget after that step had
        # already spent its own.
        deadline = time.monotonic() + timeout_sec

        # 0) Row-based readiness first.
        await self.wait_table_nth_ready(
            table_selector,
            index=index,
            min_rows=min_rows,
            timeout_ms=int(timeout_sec * 1000),
            poll_ms=int(poll_sec * 1000),
        )

        # Locators resolve lazily on each action, so building it once is safe.
        table = self._page.locator(table_selector).nth(index)
        pause = max(poll_sec, 0.0)
        last_text = ""

        while True:
            try:
                text = await table.inner_text()
            except Exception:
                # Best effort: an unreadable table counts as empty text.
                text = ""

            non_blank = sum(1 for ln in text.splitlines() if ln.strip())
            if non_blank >= min_lines:
                last_text = text
                if prev_text is None or text != prev_text:
                    return text

            remaining = deadline - time.monotonic()
            if remaining <= 0:
                return last_text

            await asyncio.sleep(min(pause, remaining))
@@ -4,7 +4,7 @@ from __future__ import annotations
4
4
  from typing import Iterable
5
5
 
6
6
  from contracts_hj3415.nfs.nfs_dto import NfsDTO
7
- from contracts_hj3415.nfs.types import Endpoints
7
+ from contracts_hj3415.nfs.types import Endpoint
8
8
 
9
9
  from scraper2_hj3415.app.adapters.out.sinks.store import InMemoryStore
10
10
 
@@ -13,13 +13,13 @@ class MemorySink:
13
13
  def __init__(self, store: InMemoryStore[NfsDTO]):
14
14
  self._store = store
15
15
 
16
- async def write(self, dto: NfsDTO, *, endpoint: Endpoints) -> None:
16
+ async def write(self, dto: NfsDTO, *, endpoint: Endpoint) -> None:
17
17
  await self._store.put(endpoint, dto.code, dto)
18
18
 
19
19
  async def write_many(
20
20
  self,
21
21
  dtos: Iterable[NfsDTO],
22
22
  *,
23
- endpoint: Endpoints,
23
+ endpoint: Endpoint,
24
24
  ) -> None:
25
25
  await self._store.put_many(endpoint, ((d.code, d) for d in dtos))
@@ -7,13 +7,13 @@ from typing import Iterable
7
7
  from pymongo.asynchronous.database import AsyncDatabase
8
8
 
9
9
  from contracts_hj3415.nfs.nfs_dto import NfsDTO
10
- from contracts_hj3415.nfs.types import Endpoints
10
+ from contracts_hj3415.nfs.types import Endpoint
11
11
 
12
12
  from db2_hj3415.nfs.repo import (
13
- upsert_latest_payload,
14
- upsert_latest_payload_many,
15
- insert_snapshot_payload,
16
- insert_snapshots_payload_many,
13
+ upsert_latest,
14
+ upsert_latest_many,
15
+ insert_snapshot,
16
+ insert_snapshots_many,
17
17
  )
18
18
 
19
19
 
@@ -21,17 +21,17 @@ class MongoSink:
21
21
  def __init__(self, db: AsyncDatabase):
22
22
  self._db = db
23
23
 
24
- async def write(self, dto: NfsDTO, *, endpoint: Endpoints) -> None:
24
+ async def write(self, dto: NfsDTO, *, endpoint: Endpoint) -> None:
25
25
  code = str(dto.code).strip()
26
26
  if not code:
27
27
  return
28
28
 
29
29
  payload = dict(dto.payload) # Mapping 방어
30
30
 
31
- await upsert_latest_payload(
31
+ await upsert_latest(
32
32
  self._db, endpoint=endpoint, code=code, payload=payload, asof=dto.asof
33
33
  )
34
- await insert_snapshot_payload(
34
+ await insert_snapshot(
35
35
  self._db, endpoint=endpoint, code=code, payload=payload, asof=dto.asof
36
36
  )
37
37
 
@@ -39,7 +39,7 @@ class MongoSink:
39
39
  self,
40
40
  dtos: Iterable[NfsDTO],
41
41
  *,
42
- endpoint: Endpoints,
42
+ endpoint: Endpoint,
43
43
  ) -> None:
44
44
  items: dict[str, dict] = {}
45
45
  ts: datetime | None = None
@@ -55,9 +55,9 @@ class MongoSink:
55
55
  if not items:
56
56
  return
57
57
 
58
- await upsert_latest_payload_many(
58
+ await upsert_latest_many(
59
59
  self._db, endpoint=endpoint, items=items, asof=ts
60
60
  )
61
- await insert_snapshots_payload_many(
61
+ await insert_snapshots_many(
62
62
  self._db, endpoint=endpoint, items=items, asof=ts
63
63
  )
@@ -1,15 +1,19 @@
1
1
  # scraper2_hj3415/app/adapters/site/wisereport_playwright.py
2
2
  from __future__ import annotations
3
3
 
4
+ import re
4
5
  from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
6
+ from scraper2_hj3415.app.ports.site.wisereport_port import WiseReportPort
5
7
  from logging_hj3415 import logger
6
8
 
9
+ _Q_SIGNAL_RE = re.compile(r"/(03|06|09)\b")
7
10
 
8
- class WiseReportPlaywright:
11
+
12
+ class WiseReportPlaywright(WiseReportPort):
9
13
  def __init__(self, browser: BrowserPort):
10
14
  self.browser = browser
11
15
 
12
- async def ensure_yearly_consensus_open_in_table_nth(
16
+ async def _ensure_yearly_consensus_open_in_table_nth(
13
17
  self,
14
18
  *,
15
19
  table_selector: str, # 예: TABLE_XPATH ("xpath=//div[@id='wrapper']//div//table")
@@ -150,19 +154,226 @@ class WiseReportPlaywright:
150
154
  logger.warning("ensure_yearly_consensus_open_in_table_nth: exceeded max_rounds")
151
155
  return False
152
156
 
153
- async def click_steps(
157
+ async def _click_steps(
154
158
  self,
155
159
  steps: list[tuple[str, str]],
156
160
  *,
157
161
  jitter_sec: float = 0.6,
158
162
  ) -> None:
159
- """
160
- 현재 페이지에서 탭/라디오/검색 버튼 클릭만 수행.
161
- """
162
163
  for _name, selector in steps:
163
164
  await self.browser.wait_attached(selector)
165
+ await self.browser.scroll_into_view(selector)
166
+
164
167
  logger.info(f"click step: {_name}")
165
- await self.browser.click(selector)
166
- # 서버/클라이언트 부담 줄이기: 작은 지터
167
- wait = int((0.5 + (jitter_sec * 0.5))*1000)
168
+
169
+ ok = await self.browser.try_click(selector, timeout_ms=1500, force=False)
170
+ try:
171
+ if ok:
172
+ await self.browser.click(selector, timeout_ms=4000, force=False)
173
+ else:
174
+ await self.browser.click(selector, timeout_ms=4000, force=True)
175
+ except Exception:
176
+ await self.browser.click(selector, timeout_ms=4000, force=True)
177
+
178
+ wait = int((0.5 + (jitter_sec * 0.5)) * 1000)
168
179
  await self.browser.sleep_ms(wait)
180
+
181
async def _is_quarter_view_by_header(
    self, *, table_selector: str, table_index: int
) -> tuple[bool, list[str]]:
    """Decide from the table header whether the quarterly view is shown.

    - If any header period matches YYYY/03, YYYY/06 or YYYY/09, the view is
      confirmed as quarterly.
    - A header with only YYYY/12 periods is NOT treated as quarterly, since
      that more likely means the switch to the quarterly view failed.

    Returns:
        ``(is_quarterly, header_periods)``.
    """
    header_periods = await self.browser.table_header_periods_mm_nth(
        table_selector, index=table_index
    )
    # The first period carrying a quarter signal settles the answer.
    for period in header_periods:
        if _Q_SIGNAL_RE.search(period):
            return True, header_periods
    return False, header_periods
194
+
195
async def set_view_c103(
    self,
    *,
    key: str,
    steps: list[tuple[str, str]],
    table_selector: str,
    table_index: int,
    max_attempts: int = 5,
    stabilize_timeout_sec: float = 10.0,
) -> None:
    """Drive the c103 page into the view requested by ``key`` and confirm it.

    - ``key.endswith("q")`` => the quarterly view is required.
    - any other key (e.g. ending in "y") => the yearly view is required,
      i.e. the header must carry no quarter signal.

    Each attempt: click ``steps`` -> expand the yearly consensus toggle ->
    stabilize (``wait_table_text_changed``) -> judge by header -> on failure,
    recover and retry.

    Args:
        key: Block key; only its trailing "q" is inspected here.
        steps: ``(name, selector)`` pairs clicked in order.
        table_selector: Selector matching the candidate tables.
        table_index: Which match of ``table_selector`` to inspect.
        max_attempts: Maximum number of click/verify rounds.
        stabilize_timeout_sec: Timeout passed to the stabilization wait.

    Raises:
        RuntimeError: If the requested view is not confirmed within
            ``max_attempts`` rounds.
    """
    want_q = key.endswith("q")
    last_periods: list[str] = []
    last_is_q: bool | None = None

    for attempt in range(1, max_attempts + 1):
        logger.info(f"set_view_c103: key={key} attempt={attempt} want_q={want_q}")

        # 1) Click (includes a light pause between steps).
        await self._click_steps(steps, jitter_sec=0.6)

        # 2) Expand the toggle; failure here is not fatal.
        try:
            await self._ensure_yearly_consensus_open_in_table_nth(
                table_selector=table_selector,
                table_index=table_index,
            )
        except Exception as e:
            logger.debug(f"ensure open failed (ignored): {type(e).__name__}: {e}")

        # 3) Render stabilization: cannot guarantee a "change", but reduces
        #    flakiness while the table is loading.
        try:
            await self.browser.wait_table_text_changed(
                table_selector,
                index=table_index,
                prev_text=None,
                min_rows=5,
                min_lines=30,
                timeout_sec=stabilize_timeout_sec,
            )
        except Exception as e:
            logger.debug(
                f"stabilize wait failed (ignored): {type(e).__name__}: {e}"
            )

        # 4) Confirm the state from the header.
        is_q, periods = await self._is_quarter_view_by_header(
            table_selector=table_selector,
            table_index=table_index,
        )
        last_is_q, last_periods = is_q, periods

        logger.info(
            f"set_view_c103: key={key} header periods(head)={periods[:8]} is_q={is_q}"
        )

        if not periods:
            # An empty header proves nothing: is_q is False merely because
            # there was nothing to match, which would otherwise "confirm"
            # the yearly view. Treat it as uncertain and retry, matching
            # set_view_c104.
            logger.warning(
                f"set_view_c103: header periods empty -> retry | key={key}"
            )
        elif want_q == is_q:
            return  # success confirmed

        # 5) Recovery: reset the page with a reload on selected attempts.
        if attempt in (2, 4):
            logger.warning(f"set_view_c103 mismatch -> reload | key={key}")
            await self.browser.reload(timeout_ms=12_000)
            await self.browser.sleep_ms(250)

    raise RuntimeError(
        f"set_view_c103 failed: key={key} want_q={want_q} last_is_q={last_is_q} "
        f"last_periods={last_periods[:12]}"
    )
272
+
273
async def set_view_c104(
    self,
    *,
    key: str,
    steps: list[tuple[str, str]],
    table_selector: str,
    table_index: int,
    prev_text_by_idx: dict[int, str | None],
    max_attempts: int = 5,
    stabilize_timeout_sec: float = 10.0,
    min_rows: int = 5,
    min_lines: int = 30,
    open_consensus: bool = True,
) -> None:
    """Drive the c104 page into the view requested by ``key`` and confirm it.

    - ``key.endswith("q")`` => quarterly view required (header has 03/06/09).
    - otherwise (e.g. "y")  => yearly view required (header has no 03/06/09;
      12 is usually present).

    Per attempt:
        1) click ``steps``
        2) (optional) expand the yearly consensus columns
        3) ``wait_table_text_changed``, tracking the previous text per index
        4) judge quarterly/yearly from the header periods
        5) on mismatch or empty header: reload on selected attempts, retry

    ``prev_text_by_idx`` is updated in place with the latest stabilized text.

    Raises:
        RuntimeError: If the requested view is not confirmed within
            ``max_attempts`` rounds.
    """
    want_q = key.endswith("q")

    # Make sure this table index has a previous-text slot.
    prev_text_by_idx.setdefault(table_index, None)

    last_is_q: bool | None = None
    last_periods: list[str] = []

    attempt = 0
    while attempt < max_attempts:
        attempt += 1
        logger.info(
            f"set_view_c104: key={key} idx={table_index} attempt={attempt} want_q={want_q}"
        )

        # 1) Perform the clicks.
        await self._click_steps(steps, jitter_sec=0.6)

        # 2) Optionally expand the consensus columns; failure is tolerated.
        if open_consensus:
            try:
                await self._ensure_yearly_consensus_open_in_table_nth(
                    table_selector=table_selector,
                    table_index=table_index,
                    wait_timeout_sec=stabilize_timeout_sec,
                )
            except Exception as e:
                logger.debug(
                    f"ensure open failed (ignored): {type(e).__name__}: {e}"
                )

        # 3) Stabilize: wait for changed/valid text, remembered per index.
        try:
            stabilized = await self.browser.wait_table_text_changed(
                table_selector,
                index=table_index,
                prev_text=prev_text_by_idx[table_index],
                min_rows=min_rows,
                min_lines=min_lines,
                timeout_sec=stabilize_timeout_sec,
            )
            prev_text_by_idx[table_index] = stabilized
        except Exception as e:
            logger.debug(
                f"stabilize wait failed (ignored): {type(e).__name__}: {e}"
            )

        # 4) Confirm the state from the header.
        is_q, periods = await self._is_quarter_view_by_header(
            table_selector=table_selector,
            table_index=table_index,
        )
        last_is_q = is_q
        last_periods = periods

        logger.info(
            f"set_view_c104: key={key} idx={table_index} periods(head)={periods[:8]} is_q={is_q}"
        )

        if periods:
            if want_q == is_q:
                return  # success confirmed
        else:
            # An empty header is "uncertain"; retrying is the safe choice.
            logger.warning(
                f"set_view_c104: header periods empty -> retry | key={key} idx={table_index}"
            )

        # 5) Recovery: reload on selected attempts, then pause briefly.
        if attempt in (2, 4):
            logger.warning(
                f"set_view_c104 mismatch/uncertain -> reload | key={key}"
            )
            await self.browser.reload(timeout_ms=12_000)

        await self.browser.sleep_ms(250)

    raise RuntimeError(
        f"set_view_c104 failed: key={key} idx={table_index} want_q={want_q} "
        f"last_is_q={last_is_q} last_periods={last_periods[:8]}"
    )
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
 
4
4
  from typing import Mapping
5
5
 
6
- from contracts_hj3415.nfs.types import BlockKeys
6
+ from contracts_hj3415.nfs.types import BlockKey
7
7
  from contracts_hj3415.nfs.constants import C101_BLOCK_KEYS, C103_BLOCK_KEYS, C104_BLOCK_KEYS, C106_BLOCK_KEYS, C108_BLOCK_KEYS
8
8
  from scraper2_hj3415.app.domain.endpoint import EndpointKind
9
9
 
@@ -25,7 +25,7 @@ def get_block_keys(endpoint: EndpointKind) -> tuple[str, ...]:
25
25
  return BLOCK_KEYS_BY_ENDPOINT.get(endpoint, ())
26
26
 
27
27
 
28
- def is_known_block(endpoint: EndpointKind, key: BlockKeys) -> bool:
28
+ def is_known_block(endpoint: EndpointKind, key: BlockKey) -> bool:
29
29
  """
30
30
  블록 키가 해당 endpoint의 공식 목록에 포함되는지 여부.
31
31
  (검증/필터링/동적 payload merge 등에 사용)
@@ -5,6 +5,7 @@ import pandas as pd
5
5
 
6
6
  from logging_hj3415 import logger
7
7
  from scraper2_hj3415.app.parsing._normalize.table import normalize_metrics_df
8
+ from scraper2_hj3415.app.parsing._normalize.label import sanitize_label
8
9
  from common_hj3415.utils import clean_text
9
10
 
10
11
 
@@ -68,12 +69,12 @@ def df_to_c1034_metric_list(df: pd.DataFrame) -> list[dict[str, Any]]:
68
69
 
69
70
  df = df.copy()
70
71
 
71
- # 정규화 전에 원래 항목 라벨 보존
72
+ # 정규화 전에 원래 항목 라벨 보존(공백, 펼치기, 닫기등 제외)
72
73
  if "항목" in df.columns:
73
74
  df["항목_raw"] = (
74
75
  df["항목"]
75
76
  .where(df["항목"].notna(), None)
76
- .map(lambda x: str(x) if x is not None else None)
77
+ .map(lambda x: sanitize_label(x) if x is not None else None)
77
78
  )
78
79
 
79
80
  df = normalize_metrics_df(df)
@@ -3,7 +3,10 @@ from __future__ import annotations
3
3
  from typing import Any
4
4
 
5
5
  from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
6
- from scraper2_hj3415.app.parsing._tables.html_table import try_html_table_to_df, df_to_c1034_metric_list
6
+ from scraper2_hj3415.app.parsing._tables.html_table import (
7
+ try_html_table_to_df,
8
+ df_to_c1034_metric_list,
9
+ )
7
10
 
8
11
  TABLE_XPATH = "xpath=//div[@id='wrapper']//div//table"
9
12
  TABLE_INDEX = 2
@@ -4,7 +4,10 @@ from __future__ import annotations
4
4
  from typing import Any
5
5
 
6
6
  from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
7
- from scraper2_hj3415.app.parsing._tables.html_table import try_html_table_to_df, df_to_c1034_metric_list
7
+ from scraper2_hj3415.app.parsing._tables.html_table import (
8
+ try_html_table_to_df,
9
+ df_to_c1034_metric_list,
10
+ )
8
11
 
9
12
  TABLE_XPATH = 'xpath=//table[@class="gHead01 all-width data-list"]'
10
13
 
@@ -1,115 +1,32 @@
1
1
  # scraper2_hj3415/app/ports/browser/browser_port.py
2
2
  from __future__ import annotations
3
3
 
4
- from typing import Protocol, Any
4
+ from typing import Protocol
5
5
 
6
+ from .capabilities import (
7
+ BrowserInteractionPort,
8
+ BrowserNavigationPort,
9
+ BrowserScopePort,
10
+ BrowserTablePort,
11
+ BrowserTextPort,
12
+ BrowserWaitPort,
13
+ )
6
14
 
7
- class BrowserPort(Protocol):
8
- async def wait_table_nth_ready(
9
- self,
10
- table_selector: str,
11
- *,
12
- index: int,
13
- min_rows: int = 1,
14
- timeout_ms: int = 20_000,
15
- poll_ms: int = 200,
16
- ) -> None: ...
17
- async def title(self) -> str: ...
18
- async def current_url(self) -> str: ...
19
- async def goto_and_wait_for_stable(self, url: str, timeout_ms: int = 10_000) -> None: ...
20
- async def reload(self, *, timeout_ms: int = 10_000) -> None: ...
21
- async def sleep_ms(self, ms: int) -> None: ...
22
- async def wait_attached(
23
- self, selector: str, *, timeout_ms: int = 10_000
24
- ) -> None: ...
25
- async def wait_visible(
26
- self, selector: str, *, timeout_ms: int = 10_000
27
- ) -> None: ...
28
- async def click(
29
- self,
30
- selector: str,
31
- *,
32
- index: int = 0,
33
- timeout_ms: int = 4_000,
34
- force: bool = False,
35
- ) -> None: ...
36
- async def try_click(
37
- self,
38
- selector: str,
39
- *,
40
- index: int = 0,
41
- timeout_ms: int = 1_500,
42
- force: bool = False,
43
- ) -> bool: ...
44
- async def count(self, selector: str) -> int: ...
45
- async def scroll_into_view(self, selector: str, *, index: int = 0) -> None: ...
46
- async def text_content_first(self, selector: str) -> str: ...
47
- async def all_texts(self, selector: str) -> list[str]: ...
48
- async def get_text_by_text(self, needle: str) -> str: ...
49
- async def inner_text(self, selector: str) -> str: ...
50
- async def outer_html_nth(self, selector: str, index: int) -> str: ...
51
- async def wait_table_text_changed(
52
- self,
53
- table_selector: str,
54
- *,
55
- index: int,
56
- prev_text: str | None,
57
- min_rows: int = 1,
58
- min_lines: int = 50,
59
- timeout_sec: float = 12.0,
60
- poll_sec: float = 0.2,
61
- ) -> str: ...
62
- async def is_attached(self, selector: str, *, index: int = 0) -> bool: ...
63
- async def computed_style(
64
- self, selector: str, *, index: int = 0, prop: str
65
- ) -> str: ...
66
- async def count_in_nth(
67
- self,
68
- scope_selector: str,
69
- *,
70
- scope_index: int,
71
- inner_selector: str,
72
- ) -> int: ...
73
- async def eval_in_nth_first(
74
- self,
75
- scope_selector: str,
76
- *,
77
- scope_index: int,
78
- inner_selector: str,
79
- expression: str,
80
- ) -> Any: ...
81
- async def inner_text_in_nth(
82
- self,
83
- scope_selector: str,
84
- *,
85
- scope_index: int,
86
- inner_selector: str,
87
- inner_index: int = 0,
88
- timeout_ms: int = 10_000,
89
- ) -> str:
90
- """
91
- scope_selector의 nth(scope_index) 요소 안에서
92
- inner_selector의 nth(inner_index) 요소의 innerText를 반환.
93
- (렌더링 기준 텍스트: 줄바꿈/스타일 영향 반영)
94
- """
95
- ...
96
15
 
97
- async def text_content_in_nth(
98
- self,
99
- scope_selector: str,
100
- *,
101
- scope_index: int,
102
- inner_selector: str,
103
- inner_index: int = 0,
104
- timeout_ms: int = 10_000,
105
- ) -> str:
106
- """
107
- scope_selector의 nth(scope_index) 요소 안에서
108
- inner_selector의 nth(inner_index) 요소의 textContent를 반환.
109
- (DOM 기준 텍스트: 숨김 텍스트도 포함될 수 있음)
110
- """
111
- ...
16
+ class BrowserPort(
17
+ BrowserNavigationPort,
18
+ BrowserWaitPort,
19
+ BrowserInteractionPort,
20
+ BrowserTextPort,
21
+ BrowserScopePort,
22
+ BrowserTablePort,
23
+ Protocol,
24
+ ):
25
+ """
26
+ 프로젝트에서 사용하는 최종 BrowserPort.
112
27
 
113
- async def table_records(
114
- self, table_selector: str, *, header: int | list[int] | None = 0
115
- ) -> list[dict[str, Any]]: ...
28
+ - 내부는 capability 단위로 분리되어 있으며,
29
+ 필요하면 파서/유스케이스가 BrowserPort 대신
30
+ 특정 capability 포트만 의존하도록 바꿀 수 있다.
31
+ """
32
+ ...