scraper2-hj3415 2.4.1.tar.gz → 2.6.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {scraper2_hj3415-2.4.1 → scraper2_hj3415-2.6.0}/PKG-INFO +3 -1
- {scraper2_hj3415-2.4.1 → scraper2_hj3415-2.6.0}/pyproject.toml +6 -4
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
- {scraper2_hj3415-2.4.1/src/scraper2 → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
- {scraper2_hj3415-2.4.1/src/scraper2 → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
- {scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/memory → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/composition.py +225 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/blocks.py +61 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/constants.py +33 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/doc.py +16 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/endpoint.py +11 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/series.py +11 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/types.py +19 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/sise.py +47 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101_parser.py +45 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c103_parser.py +19 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c104_parser.py +23 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c106_parser.py +137 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c108_parser.py +254 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/__init__.py +0 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/browser/__init__.py +0 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/site/__init__.py +0 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/__init__.py +0 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/__init__.py +0 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/__init__.py +0 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
- scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
- scraper2_hj3415-2.4.1/src/scraper2/main.py → scraper2_hj3415-2.6.0/src/scraper2_hj3415/cli.py +40 -70
- scraper2_hj3415-2.4.1/src/scraper2/.DS_Store +0 -0
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/.DS_Store +0 -0
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/playwright/browser.py +0 -102
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/.DS_Store +0 -0
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/memory/__init__.py +0 -15
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
- scraper2_hj3415-2.4.1/src/scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
- scraper2_hj3415-2.4.1/src/scraper2/app/composition.py +0 -204
- scraper2_hj3415-2.4.1/src/scraper2/app/parsing/_converters.py +0 -85
- scraper2_hj3415-2.4.1/src/scraper2/app/parsing/_normalize.py +0 -134
- scraper2_hj3415-2.4.1/src/scraper2/app/parsing/c101_parser.py +0 -143
- scraper2_hj3415-2.4.1/src/scraper2/app/parsing/c103_parser.py +0 -128
- scraper2_hj3415-2.4.1/src/scraper2/app/parsing/c104_parser.py +0 -143
- scraper2_hj3415-2.4.1/src/scraper2/app/parsing/c106_parser.py +0 -153
- scraper2_hj3415-2.4.1/src/scraper2/app/parsing/c108_parser.py +0 -65
- scraper2_hj3415-2.4.1/src/scraper2/app/ports/browser/browser_factory_port.py +0 -11
- scraper2_hj3415-2.4.1/src/scraper2/app/ports/browser/browser_port.py +0 -22
- scraper2_hj3415-2.4.1/src/scraper2/app/ports/ingest_port.py +0 -14
- scraper2_hj3415-2.4.1/src/scraper2/app/ports/sinks/base_sink_port.py +0 -14
- scraper2_hj3415-2.4.1/src/scraper2/app/ports/sinks/c101_sink_port.py +0 -9
- scraper2_hj3415-2.4.1/src/scraper2/app/ports/sinks/c103_sink_port.py +0 -9
- scraper2_hj3415-2.4.1/src/scraper2/app/ports/sinks/c104_sink_port.py +0 -9
- scraper2_hj3415-2.4.1/src/scraper2/app/ports/sinks/c106_sink_port.py +0 -9
- scraper2_hj3415-2.4.1/src/scraper2/app/ports/sinks/c108_sink_port.py +0 -9
- scraper2_hj3415-2.4.1/src/scraper2/app/usecases/fetch/fetch_c101.py +0 -43
- scraper2_hj3415-2.4.1/src/scraper2/app/usecases/fetch/fetch_c103.py +0 -103
- scraper2_hj3415-2.4.1/src/scraper2/app/usecases/fetch/fetch_c104.py +0 -76
- scraper2_hj3415-2.4.1/src/scraper2/app/usecases/fetch/fetch_c106.py +0 -90
- scraper2_hj3415-2.4.1/src/scraper2/app/usecases/fetch/fetch_c108.py +0 -49
- scraper2_hj3415-2.4.1/src/scraper2/app/usecases/ingest/ingest_c101.py +0 -36
- scraper2_hj3415-2.4.1/src/scraper2/app/usecases/ingest/ingest_c103.py +0 -37
- scraper2_hj3415-2.4.1/src/scraper2/app/usecases/ingest/ingest_c104.py +0 -37
- scraper2_hj3415-2.4.1/src/scraper2/app/usecases/ingest/ingest_c106.py +0 -38
- scraper2_hj3415-2.4.1/src/scraper2/app/usecases/ingest/ingest_c108.py +0 -39
- {scraper2_hj3415-2.4.1 → scraper2_hj3415-2.6.0}/LICENSE +0 -0
- {scraper2_hj3415-2.4.1 → scraper2_hj3415-2.6.0}/README.md +0 -0
- {scraper2_hj3415-2.4.1/src/scraper2 → scraper2_hj3415-2.6.0/src/scraper2_hj3415}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1/src/scraper2/adapters/out → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1/src/scraper2/adapters/out/playwright → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1/src/scraper2/app → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1/src/scraper2/app/parsing → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1/src/scraper2/app/ports → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1/src/scraper2/app/ports/browser → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1/src/scraper2/app/ports/sinks → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1/src/scraper2/app/usecases → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1/src/scraper2/app/usecases/fetch → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1/src/scraper2/app/usecases/ingest → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
PKG-INFO:

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scraper2-hj3415
-Version: 2.4.1
+Version: 2.6.0
 Summary: Naver WiseReport scraper
 Keywords: example,demo
 Author-email: Hyungjin Kim <hj3415@gmail.com>
@@ -17,6 +17,8 @@ Requires-Dist: lxml>=6.0.2
 Requires-Dist: typer>=0.21.0
 Requires-Dist: db2-hj3415
 Requires-Dist: contracts-hj3415
+Requires-Dist: common-hj3415
+Requires-Dist: logging-hj3415
 
 # scraper2
 
```
pyproject.toml:

```diff
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
 
 [project]
 name = "scraper2-hj3415"  # PyPI name (hyphens allowed)
-version = "2.4.1"
+version = "2.6.0"
 description = "Naver WiseReport scraper"
 readme = "README.md"
 requires-python = ">=3.11"
@@ -25,11 +25,13 @@ dependencies = [
     "typer>=0.21.0",
     "db2-hj3415",
     "contracts-hj3415",
+    "common-hj3415",
+    "logging-hj3415",
 ]
 
 [tool.flit.module]
-name = "scraper2"
-path = "src/scraper2"
+name = "scraper2_hj3415"
+path = "src/scraper2_hj3415"
 
 [project.scripts]
-scraper2 = "scraper2.main:app"
+scraper2 = "scraper2_hj3415.cli:app"
```
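Beyond the version bump and the two new dependencies, the `[tool.flit.module]` change renames the import package: the PyPI distribution is still `scraper2-hj3415`, but the top-level module is now `scraper2_hj3415` rather than `scraper2`, and the `scraper2` console script now targets `scraper2_hj3415.cli:app`. A minimal before/after sketch of what that means for downstream imports, using module names taken from the file listing above:

```python
# 2.4.1: the import package was "scraper2" (tree: src/scraper2/...)
# from scraper2.app import composition

# 2.6.0: the import package matches the distribution name
from scraper2_hj3415.app import composition  # src/scraper2_hj3415/app/composition.py
```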
scraper2_hj3415/app/adapters/out/playwright/browser.py (new file, +373):

```python
# scraper2_hj3415/app/adapters/out/playwright/browser.py
from __future__ import annotations

from typing import Any
from io import StringIO
import pandas as pd
from playwright.async_api import Page, TimeoutError as PwTimeoutError
import asyncio
import time
from logging_hj3415 import logger


class PlaywrightBrowser:
    def __init__(self, page: Page):
        self._page = page

    async def _wait_for_network_quiet(self, *, timeout_ms: int = 10_000) -> None:
        # "networkidle" may never fire on some sites, so it is safer to wrap in try
        logger.debug("wait for network quiet")
        try:
            await self._page.wait_for_load_state("networkidle", timeout=timeout_ms)
        except Exception:
            # even if networkidle never arrives, the next step (anchor wait) matters more
            return

    async def wait_table_nth_ready(
        self,
        table_selector: str,
        *,
        index: int,
        min_rows: int = 1,
        timeout_ms: int = 20_000,
        poll_ms: int = 200,
    ) -> None:
        logger.debug("wait for table nth_ready")
        table = self._page.locator(table_selector).nth(index)
        await table.wait_for(state="attached", timeout=timeout_ms)

        # html = await table.evaluate("el => el.outerHTML")
        # logger.debug(f"TABLE HTML:\n{html}")

        rows = table.locator("tbody tr")
        deadline = time.monotonic() + timeout_ms / 1000

        cnt = 0
        while time.monotonic() < deadline:
            try:
                cnt = await rows.count()
            except Exception:
                cnt = 0

            if cnt >= min_rows:
                return

            await asyncio.sleep(poll_ms / 1000)

        logger.warning(f"table rows timeout: last_cnt={cnt}, need>={min_rows}")
        raise TimeoutError(f"nth table not ready: index={index}, rows<{min_rows}")

    async def title(self) -> str:
        return await self._page.title()

    async def current_url(self) -> str:
        return self._page.url

    async def goto_and_wait_for_stable(
        self, url: str, timeout_ms: int = 10_000
    ) -> None:
        logger.info(f"goto: {url}")
        await self._page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
        await self._wait_for_network_quiet(timeout_ms=timeout_ms // 2)

    async def reload(self, *, timeout_ms: int = 10_000) -> None:
        logger.info("reload")
        await self._page.reload(timeout=timeout_ms, wait_until="domcontentloaded")

    async def sleep_ms(self, ms: int) -> None:
        await asyncio.sleep(ms / 1000)

    async def wait_attached(self, selector: str, *, timeout_ms: int = 10_000) -> None:
        await self._page.locator(selector).first.wait_for(
            state="attached", timeout=timeout_ms
        )

    async def wait_visible(self, selector: str, *, timeout_ms: int = 10_000) -> None:
        await self._page.locator(selector).first.wait_for(
            state="visible", timeout=timeout_ms
        )

    async def click(
        self,
        selector: str,
        *,
        index: int = 0,
        timeout_ms: int = 4_000,
        force: bool = False,
    ) -> None:
        loc = self._page.locator(selector).nth(index)
        await loc.click(timeout=timeout_ms, force=force)

    async def try_click(
        self,
        selector: str,
        *,
        index: int = 0,
        timeout_ms: int = 1_500,
        force: bool = False,
    ) -> bool:
        loc = self._page.locator(selector).nth(index)
        try:
            await loc.click(timeout=timeout_ms, trial=True, force=force)
            return True
        except PwTimeoutError:
            return False

    async def count(self, selector: str) -> int:
        return await self._page.locator(selector).count()

    async def scroll_into_view(self, selector: str, *, index: int = 0) -> None:
        # scroll automatically so the selected element becomes visible on screen
        await self._page.locator(selector).nth(index).scroll_into_view_if_needed()

    async def text_content_first(self, selector: str) -> str:
        # return text_content() of the first element matching selector
        return (await self._page.locator(selector).first.text_content()) or ""

    async def all_texts(self, selector: str) -> list[str]:
        # fetch every element matched by selector via all_text_contents()
        loc = self._page.locator(selector)
        return await loc.all_text_contents()

    async def get_text_by_text(self, needle: str) -> str:
        """
        Return the text_content of the first element on the page
        that contains the given text (needle).

        - Returns an empty string if no element is found
        - Matches on partial text
        """
        return (await self._page.get_by_text(needle).first.text_content()) or ""

    async def inner_text(self, selector: str) -> str:
        """
        Return the innerText of the first element matching selector.

        - Waits until the element is attached to the DOM
        - Based on the text visible on screen (innerText)
        """
        return await self._page.locator(selector).first.inner_text()

    async def outer_html_nth(self, selector: str, index: int) -> str:
        """
        Return the outerHTML of the index-th element matched by selector.

        - index is 0-based
        - raises a Playwright error if the element does not exist
        """
        loc = self._page.locator(selector).nth(index)
        # Playwright raises if index is out of range;
        # wrap it in a friendlier error here if needed.
        return await loc.evaluate("el => el.outerHTML")

    async def wait_table_text_changed(
        self,
        table_selector: str,
        *,
        index: int,
        prev_text: str | None,
        min_rows: int = 1,
        min_lines: int = 50,
        timeout_sec: float = 12.0,
        poll_sec: float = 0.2,
    ) -> str:
        """
        Wait until the innerText of the given table (nth) reaches a
        'valid state' and differs from the previous text (prev_text),
        then return it.

        Steps:
        1) Ensure the table has at least loaded, based on the tbody row count
        2) Poll innerText periodically and return when
           - the minimum line count (min_lines) is satisfied, and
           - prev_text is None, or the text differs from prev_text

        Properties:
        - Rules out the state where the DOM is attached but the data is still empty
        - Reliably detects an actual data change after a click/toggle
        - On timeout, returns the last observed text

        Returns:
        - the changed innerText string
        """

        # 0) On first load, or while unstable, first secure 'ready' based on row count
        await self.wait_table_nth_ready(
            table_selector,
            index=index,
            min_rows=min_rows,
            timeout_ms=int(timeout_sec * 1000),
            poll_ms=int(poll_sec * 1000),
        )

        # 1) then wait, based on the text, for 'valid + changed'
        start = time.monotonic()
        last_text = ""

        while True:
            loc = self._page.locator(table_selector).nth(index)
            try:
                text = await loc.inner_text()
            except Exception:
                text = ""

            lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
            is_valid = len(lines) >= min_lines

            if is_valid:
                last_text = text
                if prev_text is None or text != prev_text:
                    return text

            if time.monotonic() - start >= timeout_sec:
                return last_text

            await asyncio.sleep(poll_sec)

    async def is_attached(self, selector: str, *, index: int = 0) -> bool:
        """
        Return whether the nth(index) element of selector exists in the DOM
        (attached). Returns False if the element is missing or an exception
        occurs while accessing it.
        """
        try:
            loc = self._page.locator(selector).nth(index)
            return await loc.count() > 0
        except Exception:
            return False

    async def computed_style(self, selector: str, *, index: int = 0, prop: str) -> str:
        """
        For the nth(index) element of selector, return a given property (prop)
        of its computed CSS style (getComputedStyle) as a string.
        (e.g., display, visibility, opacity)
        """
        loc = self._page.locator(selector).nth(index)
        # add wait_for(state="attached") here if attachment must be guaranteed
        return await loc.evaluate(
            "(el, prop) => getComputedStyle(el)[prop] || ''", prop
        )

    async def count_in_nth(
        self,
        scope_selector: str,
        *,
        scope_index: int,
        inner_selector: str,
    ) -> int:
        """
        Return the number of elements matching inner_selector within the
        nth(scope_index) element of scope_selector.
        """
        scope = self._page.locator(scope_selector).nth(scope_index)
        return await scope.locator(inner_selector).count()

    async def eval_in_nth_first(
        self,
        scope_selector: str,
        *,
        scope_index: int,
        inner_selector: str,
        expression: str,
    ) -> Any:
        """
        Grab inner_selector's first element inside the scope (nth) and run the
        JS expression on it.

        Example expressions:
        - "el => window.getComputedStyle(el).display"
        - "el => el.getAttribute('data-content') || ''"
        - "el => el.innerText"
        """
        scope = self._page.locator(scope_selector).nth(scope_index)
        loc = scope.locator(inner_selector).first

        # None if nothing matches
        if await loc.count() == 0:
            return None

        return await loc.evaluate(expression)

    async def inner_text_in_nth(
        self,
        scope_selector: str,
        *,
        scope_index: int,
        inner_selector: str,
        inner_index: int = 0,
        timeout_ms: int = 10_000,
    ) -> str:
        """
        Return the innerText of inner_selector (nth) inside the scope (nth).
        - innerText: rendering-based (reflects line breaks / hidden elements)
        """
        scope = self._page.locator(scope_selector).nth(scope_index)
        inner = scope.locator(inner_selector).nth(inner_index)

        # handle elements that show up late
        await inner.wait_for(state="attached", timeout=timeout_ms)

        try:
            return (await inner.inner_text()) or ""
        except Exception:
            # inner_text itself can fail mid-call (element gone / re-rendered), so stay safe
            return ""

    async def text_content_in_nth(
        self,
        scope_selector: str,
        *,
        scope_index: int,
        inner_selector: str,
        inner_index: int = 0,
        timeout_ms: int = 10_000,
    ) -> str:
        """
        Return the textContent of inner_selector (nth) inside the scope (nth).
        - textContent: DOM-based (may include hidden text)
        """
        scope = self._page.locator(scope_selector).nth(scope_index)
        inner = scope.locator(inner_selector).nth(inner_index)

        await inner.wait_for(state="attached", timeout=timeout_ms)

        try:
            return (await inner.text_content()) or ""
        except Exception:
            return ""

    async def table_records(
        self,
        table_selector: str,
        *,
        header: int | list[int] | None = 0,
    ) -> list[dict[str, Any]]:
        await self.wait_attached(table_selector)

        table = self._page.locator(table_selector).first
        html = await table.evaluate("el => el.outerHTML")

        try:
            df = pd.read_html(StringIO(html), header=header)[0]
        except Exception as e:
            raise RuntimeError(f"pd.read_html failed: {type(e).__name__}: {e}") from e

        # normalize only when the columns are strings
        if all(isinstance(c, str) for c in df.columns):
            if "항목" in df.columns:
                df["항목"] = (
                    df["항목"].astype(str).str.replace("펼치기", "").str.strip()
                )

            df.columns = (
                df.columns.astype(str)
                .str.replace("연간컨센서스보기", "", regex=False)
                .str.replace("연간컨센서스닫기", "", regex=False)
                .str.replace("(IFRS연결)", "", regex=False)
                .str.replace("(IFRS별도)", "", regex=False)
                .str.replace("(GAAP개별)", "", regex=False)
                .str.replace("(YoY)", "", regex=False)
                .str.replace("(QoQ)", "", regex=False)
                .str.replace("(E)", "", regex=False)
                .str.replace(".", "", regex=False)
                .str.strip()
            )

        return df.where(pd.notnull(df), None).to_dict(orient="records")
```
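A minimal sketch of driving the new PlaywrightBrowser wrapper directly; the URL and table selector are placeholders (the package's own call sites presumably go through the browser ports and the WiseReport adapter instead):

```python
import asyncio

from playwright.async_api import async_playwright

from scraper2_hj3415.app.adapters.out.playwright.browser import PlaywrightBrowser


async def main() -> None:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        b = PlaywrightBrowser(page)

        # placeholder target; real usage points at WiseReport pages
        await b.goto_and_wait_for_stable("https://example.com")
        print(await b.title())

        if await b.is_attached("table"):
            records = await b.table_records("table")  # rows as list[dict]
            print(records[:3])

        await browser.close()


asyncio.run(main())
```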
scraper2_hj3415/app/adapters/out/playwright/browser_factory.py:

```diff
@@ -1,4 +1,4 @@
-#
+# scraper2_hj3415/app/adapters/out/playwright/browser_factory.py
 from __future__ import annotations
 
 import asyncio
@@ -6,10 +6,10 @@ from contextlib import asynccontextmanager
 from dataclasses import dataclass
 from typing import AsyncIterator
 
-from
-from
-from
-from
+from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+from scraper2_hj3415.app.adapters.out.playwright.session import PlaywrightPageSession
+from scraper2_hj3415.app.adapters.out.playwright.browser import PlaywrightBrowser
 
 
 @dataclass
```
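The factory now types against the ports under scraper2_hj3415/app/ports/browser/. Those files (9 and 115 lines per the listing) are not part of this diff, so as rough orientation only, here is a hypothetical sketch of the port pair these imports refer to, inferred from the PlaywrightBrowser methods above; the names and signatures are assumptions, not the package's actual definitions:

```python
# Hypothetical sketch: the real BrowserPort / BrowserFactoryPort are not shown in this diff.
from __future__ import annotations

from contextlib import AbstractAsyncContextManager
from typing import Any, Protocol


class BrowserPort(Protocol):
    """Assumed shape, inferred from PlaywrightBrowser's public methods."""

    async def goto_and_wait_for_stable(self, url: str, timeout_ms: int = 10_000) -> None: ...

    async def table_records(
        self, table_selector: str, *, header: int | list[int] | None = 0
    ) -> list[dict[str, Any]]: ...


class BrowserFactoryPort(Protocol):
    """Assumed shape: yields a ready BrowserPort and closes the page afterwards."""

    def open(self) -> AbstractAsyncContextManager[BrowserPort]: ...
```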
scraper2_hj3415/app/adapters/out/sinks/memory_sink.py (new file, +25):

```python
# scraper2_hj3415/app/adapters/out/sinks/memory_sink.py
from __future__ import annotations

from typing import Iterable

from contracts_hj3415.nfs.nfs_dto import NfsDTO
from contracts_hj3415.nfs.types import Endpoints

from scraper2_hj3415.app.adapters.out.sinks.store import InMemoryStore


class MemorySink:
    def __init__(self, store: InMemoryStore[NfsDTO]):
        self._store = store

    async def write(self, dto: NfsDTO, *, endpoint: Endpoints) -> None:
        await self._store.put(endpoint, dto.code, dto)

    async def write_many(
        self,
        dtos: Iterable[NfsDTO],
        *,
        endpoint: Endpoints,
    ) -> None:
        await self._store.put_many(endpoint, ((d.code, d) for d in dtos))
```
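A hedged usage sketch of the consolidated MemorySink, which replaces the five per-endpoint c10x_memory_sink classes removed above; the NfsDTO constructor and the endpoint value are assumptions based only on the fields the sinks read:

```python
import asyncio
from datetime import datetime, timezone

from contracts_hj3415.nfs.nfs_dto import NfsDTO
from scraper2_hj3415.app.adapters.out.sinks.memory_sink import MemorySink
from scraper2_hj3415.app.adapters.out.sinks.store import InMemoryStore


async def demo() -> None:
    store: InMemoryStore[NfsDTO] = InMemoryStore(max_history=100)
    sink = MemorySink(store)

    # Assumption: NfsDTO is constructible from the code/payload/asof fields the
    # sinks access; "c101" stands in for a contracts_hj3415 Endpoints value.
    dto = NfsDTO(code="005930", payload={"per": 12.3}, asof=datetime.now(timezone.utc))
    await sink.write(dto, endpoint="c101")          # latest["c101"]["005930"] = dto
    await sink.write_many([dto], endpoint="c101")   # batched variant


asyncio.run(demo())
```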
scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py (new file, +63):

```python
# scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py
from __future__ import annotations

from datetime import datetime
from typing import Iterable

from pymongo.asynchronous.database import AsyncDatabase

from contracts_hj3415.nfs.nfs_dto import NfsDTO
from contracts_hj3415.nfs.types import Endpoints

from db2_hj3415.nfs.repo import (
    upsert_latest_payload,
    upsert_latest_payload_many,
    insert_snapshot_payload,
    insert_snapshots_payload_many,
)


class MongoSink:
    def __init__(self, db: AsyncDatabase):
        self._db = db

    async def write(self, dto: NfsDTO, *, endpoint: Endpoints) -> None:
        code = str(dto.code).strip()
        if not code:
            return

        payload = dict(dto.payload)  # defensive copy (payload may be a read-only Mapping)

        await upsert_latest_payload(
            self._db, endpoint=endpoint, code=code, payload=payload, asof=dto.asof
        )
        await insert_snapshot_payload(
            self._db, endpoint=endpoint, code=code, payload=payload, asof=dto.asof
        )

    async def write_many(
        self,
        dtos: Iterable[NfsDTO],
        *,
        endpoint: Endpoints,
    ) -> None:
        items: dict[str, dict] = {}
        ts: datetime | None = None

        for dto in dtos:
            code = str(dto.code).strip()
            if not code:
                continue
            items[code] = dict(dto.payload)
            if ts is None:
                ts = dto.asof  # use the first dto's asof as the batch baseline

        if not items:
            return

        await upsert_latest_payload_many(
            self._db, endpoint=endpoint, items=items, asof=ts
        )
        await insert_snapshots_payload_many(
            self._db, endpoint=endpoint, items=items, asof=ts
        )
```
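A hedged wiring sketch for the consolidated MongoSink, which replaces the five per-endpoint c10x_mongo_sink classes removed above. The connection URI, database name, endpoint value, and the NfsDTO constructor are illustrative assumptions; only the PyMongo asyncio client (pymongo >= 4.9) is standard API:

```python
import asyncio
from datetime import datetime, timezone

from pymongo import AsyncMongoClient  # PyMongo's asyncio client

from contracts_hj3415.nfs.nfs_dto import NfsDTO
from scraper2_hj3415.app.adapters.out.sinks.mongo_sink import MongoSink


async def demo() -> None:
    client = AsyncMongoClient("mongodb://localhost:27017")  # placeholder URI
    sink = MongoSink(client["nfs"])  # database name is an assumption

    dto = NfsDTO(code="005930", payload={"per": 12.3}, asof=datetime.now(timezone.utc))
    # one write produces one "latest" upsert plus one append-only snapshot
    await sink.write(dto, endpoint="c101")

    await client.close()


asyncio.run(demo())
```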
scraper2_hj3415/app/adapters/out/sinks/store.py:

```diff
@@ -1,29 +1,34 @@
-#
+# scraper2_hj3415/app/adapters/out/sinks/store.py
 from __future__ import annotations
 
 import asyncio
 from collections import defaultdict, deque
 from dataclasses import dataclass
-from typing import
+from typing import Deque, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar
 
 T = TypeVar("T")  # DTO type
 
+
 @dataclass(frozen=True)
 class StoreStats:
     endpoint: str
     latest_count: int
     history_count: int
 
+
 class InMemoryStore(Generic[T]):
     """
     Stores DTOs per endpoint.
     - latest: endpoint -> key (usually the stock code) -> dto
     - history: endpoint -> deque[dto] (the most recent max_history items)
     """
+
     def __init__(self, *, max_history: int = 2000):
         self._lock = asyncio.Lock()
         self._max_history = max_history
-        self._history: Dict[str, Deque[T]] = defaultdict(
+        self._history: Dict[str, Deque[T]] = defaultdict(
+            lambda: deque(maxlen=max_history)
+        )
         self._latest: Dict[str, Dict[str, T]] = defaultdict(dict)
 
     # ---------- write ----------
@@ -62,7 +67,11 @@ class InMemoryStore(Generic[T]):
         async with self._lock:
             latest_count = len(self._latest.get(endpoint, {}))
             history_count = len(self._history.get(endpoint, []))
-            return StoreStats(
+            return StoreStats(
+                endpoint=endpoint,
+                latest_count=latest_count,
+                history_count=history_count,
+            )
 
     async def clear(self, endpoint: str | None = None) -> None:
         async with self._lock:
@@ -71,4 +80,4 @@ class InMemoryStore(Generic[T]):
                 self._latest.clear()
             else:
                 self._history.pop(endpoint, None)
-                self._latest.pop(endpoint, None)
+                self._latest.pop(endpoint, None)
```