scraper2-hj3415 1.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraper2/.DS_Store +0 -0
- scraper2/adapters/out/.DS_Store +0 -0
- scraper2/adapters/out/playwright/browser.py +103 -0
- scraper2/adapters/out/playwright/browser_factory.py +112 -0
- scraper2/adapters/out/playwright/session.py +121 -0
- scraper2/adapters/out/sinks/.DS_Store +0 -0
- scraper2/adapters/out/sinks/memory/__init__.py +15 -0
- scraper2/adapters/out/sinks/memory/c101_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c103_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c104_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c106_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c108_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/store.py +74 -0
- scraper2/adapters/out/sinks/mongo/__init__.py +14 -0
- scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +43 -0
- scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +41 -0
- scraper2/app/composition.py +195 -0
- scraper2/app/parsing/_converters.py +85 -0
- scraper2/app/parsing/_normalize.py +134 -0
- scraper2/app/parsing/c101_parser.py +143 -0
- scraper2/app/parsing/c103_parser.py +128 -0
- scraper2/app/parsing/c104_parser.py +143 -0
- scraper2/app/parsing/c106_parser.py +153 -0
- scraper2/app/parsing/c108_parser.py +65 -0
- scraper2/app/ports/browser/browser_factory_port.py +11 -0
- scraper2/app/ports/browser/browser_port.py +22 -0
- scraper2/app/ports/ingest_port.py +13 -0
- scraper2/app/ports/sinks/base_sink_port.py +14 -0
- scraper2/app/ports/sinks/c101_sink_port.py +9 -0
- scraper2/app/ports/sinks/c103_sink_port.py +9 -0
- scraper2/app/ports/sinks/c104_sink_port.py +9 -0
- scraper2/app/ports/sinks/c106_sink_port.py +9 -0
- scraper2/app/ports/sinks/c108_sink_port.py +9 -0
- scraper2/app/usecases/fetch/fetch_c101.py +43 -0
- scraper2/app/usecases/fetch/fetch_c103.py +103 -0
- scraper2/app/usecases/fetch/fetch_c104.py +76 -0
- scraper2/app/usecases/fetch/fetch_c106.py +90 -0
- scraper2/app/usecases/fetch/fetch_c108.py +49 -0
- scraper2/app/usecases/ingest/ingest_c101.py +36 -0
- scraper2/app/usecases/ingest/ingest_c103.py +37 -0
- scraper2/app/usecases/ingest/ingest_c104.py +37 -0
- scraper2/app/usecases/ingest/ingest_c106.py +38 -0
- scraper2/app/usecases/ingest/ingest_c108.py +39 -0
- scraper2/main.py +257 -0
- scraper2_hj3415-2.1.0.dist-info/METADATA +164 -0
- scraper2_hj3415-2.1.0.dist-info/RECORD +63 -0
- scraper2_hj3415-2.1.0.dist-info/entry_points.txt +3 -0
- scraper2_hj3415/__main__.py +0 -6
- scraper2_hj3415/adapters/_shared/utils.py +0 -29
- scraper2_hj3415/adapters/clients/browser.py +0 -124
- scraper2_hj3415/adapters/clients/http.py +0 -51
- scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py +0 -55
- scraper2_hj3415/adapters/nfs/pipelines/normalize_c1034.py +0 -109
- scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py +0 -51
- scraper2_hj3415/adapters/nfs/sinks/df_to_dto_mappers.py +0 -106
- scraper2_hj3415/adapters/nfs/sources/bundle_source.py +0 -24
- scraper2_hj3415/adapters/nfs/sources/c1034_fetch.py +0 -117
- scraper2_hj3415/adapters/nfs/sources/c1034_session.py +0 -90
- scraper2_hj3415/core/constants.py +0 -47
- scraper2_hj3415/core/ports/sink_port.py +0 -16
- scraper2_hj3415/core/ports/source_port.py +0 -13
- scraper2_hj3415/core/types.py +0 -11
- scraper2_hj3415/core/usecases/c1034_ingest.py +0 -139
- scraper2_hj3415/di.py +0 -103
- scraper2_hj3415/entrypoints/cli.py +0 -226
- scraper2_hj3415/entrypoints/main.py +0 -20
- scraper2_hj3415-1.0.1.dist-info/METADATA +0 -66
- scraper2_hj3415-1.0.1.dist-info/RECORD +0 -35
- scraper2_hj3415-1.0.1.dist-info/entry_points.txt +0 -3
- {scraper2_hj3415 → scraper2}/__init__.py +0 -0
- {scraper2_hj3415/adapters → scraper2/adapters/out}/__init__.py +0 -0
- {scraper2_hj3415/adapters/_shared → scraper2/adapters/out/playwright}/__init__.py +0 -0
- {scraper2_hj3415/adapters/clients → scraper2/app}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/pipelines → scraper2/app/parsing}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/sinks → scraper2/app/ports}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/sources → scraper2/app/ports/browser}/__init__.py +0 -0
- {scraper2_hj3415/core → scraper2/app/ports/sinks}/__init__.py +0 -0
- {scraper2_hj3415/core/ports → scraper2/app/usecases}/__init__.py +0 -0
- {scraper2_hj3415/core/usecases → scraper2/app/usecases/fetch}/__init__.py +0 -0
- {scraper2_hj3415/entrypoints → scraper2/app/usecases/ingest}/__init__.py +0 -0
- {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/licenses/LICENSE +0 -0
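The removed files below are the 1.x layout; the file list above shows its 2.1.0 replacement split into scraper2/app/ports, scraper2/app/usecases, and scraper2/adapters/out (memory and mongo sink packages implementing per-page sink ports). As rough orientation only, here is a minimal sketch of how such a per-page sink port and its in-memory adapter could relate; the names and signatures are hypothetical, the real ones live inside the 2.1.0 wheel (e.g. scraper2/app/ports/sinks/c101_sink_port.py and scraper2/adapters/out/sinks/memory/c101_memory_sink.py).

# Hypothetical sketch (not taken from the wheel): a per-page sink port and an
# in-memory adapter, mirroring the ports/adapters split visible in the file list.
from typing import Any, Protocol


class C101SinkPort(Protocol):
    """Outbound port the ingest use case depends on."""
    async def save(self, payload: dict[str, Any]) -> None: ...


class C101MemorySink:
    """In-memory implementation of the port, useful for tests."""
    def __init__(self) -> None:
        self.items: list[dict[str, Any]] = []

    async def save(self, payload: dict[str, Any]) -> None:
        self.items.append(payload)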
@@ -1,51 +0,0 @@
-# src/scraper2_hj3415/adapters/clients/http.py
-
-import httpx
-from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
-
-# Custom exception for HTTPX errors
-class HttpClientError(Exception):
-    """Base HTTP client exception."""
-
-# Retry decorator (optional)
-def retry_on_http_error():
-    return retry(
-        reraise=True,
-        stop=stop_after_attempt(3),
-        wait=wait_exponential(min=1, max=10),
-        retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)),
-    )
-
-# Factory function that creates an AsyncClient
-def create_http_client(
-    base_url: str | None = None,
-    timeout: float = 10.0,
-    headers: dict | None = None,
-) -> httpx.AsyncClient:
-    """
-    Creates an httpx.AsyncClient instance with shared defaults.
-
-    Args:
-        base_url: base URL (if set, all requests can use relative paths)
-        timeout: request timeout (seconds)
-        headers: default headers (User-Agent is set automatically if omitted)
-    """
-    default_headers = {
-        "User-Agent": (
-            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
-            "AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/127.0.0.0 Safari/537.36"
-        ),
-    }
-    if headers:
-        default_headers.update(headers)
-
-    client = httpx.AsyncClient(
-        base_url=base_url or "",
-        timeout=httpx.Timeout(timeout),
-        headers=default_headers,
-        follow_redirects=True,
-        verify=True,  # SSL verification
-    )
-    return client
-
@@ -1,55 +0,0 @@
-# scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py
-
-import asyncio
-from typing import List, Callable
-from playwright.async_api import Browser
-from scraper2_hj3415.core import constants as C
-from scraper2_hj3415.core.types import NormalizedBundle
-from scraper2_hj3415.adapters.nfs.sources import c1034_fetch as fetch, c1034_session as session
-from scraper2_hj3415.adapters.nfs.pipelines.normalize_c1034 import normalize_dispatch
-from loguru import logger
-
-async def list_bundles(
-    page: C.PAGE,
-    cmp_cd: str,
-    rpt_enum: type[C.C103RPT | C.C104RPT],
-    get_data_func: Callable,
-    *,
-    browser: Browser | None = None,
-    concurrency: int = 2,
-) -> list[NormalizedBundle]:
-    sem = asyncio.Semaphore(max(1, concurrency))
-    session_info = await session.extract_session_info(browser=browser, cmp_cd=cmp_cd, page=page)
-
-    async def _one(rpt, frq):
-        async with sem:
-            meta = {"cmp_cd": cmp_cd, "page": page, "rpt": rpt, "frq": frq}
-            payload = await get_data_func(session_info=session_info, cmp_cd=cmp_cd, rpt=rpt, frq=frq)
-            return normalize_dispatch(payload, meta)
-
-    tasks = [_one(rpt, frq) for rpt in rpt_enum for frq in (C.FRQ.Q, C.FRQ.Y)]
-    results = await asyncio.gather(*tasks, return_exceptions=True)
-    bundles: list[NormalizedBundle] = []
-    for r in results:
-        if isinstance(r, Exception):
-            logger.warning(f"{C.PAGE_TO_LABEL[page]} partial failure: {r}")
-            continue
-        bundles.append(r)
-    return bundles
-
-async def list_c103_bundles(
-    cmp_cd: str,
-    *,
-    browser: Browser | None = None,
-    concurrency: int = 3,  # half of the 6 pages total (quarterly/annual)
-) -> List[NormalizedBundle]:
-    return await list_bundles(C.PAGE.c103, cmp_cd, C.C103RPT, fetch.get_c103_data, browser=browser, concurrency=concurrency)
-
-async def list_c104_bundles(
-    cmp_cd: str,
-    *,
-    browser: Browser | None = None,
-    concurrency: int = 5,  # half of the 10 pages total (quarterly/annual)
-) -> List[NormalizedBundle]:
-    return await list_bundles(C.PAGE.c104, cmp_cd, C.C104RPT, fetch.get_c104_data, browser=browser, concurrency=concurrency)
-
@@ -1,109 +0,0 @@
-# scraper2_hj3415/adapters/nfs/pipeline/normalize_c1034.py
-
-import re
-import pandas as pd
-from scraper2_hj3415.core import constants as C
-from scraper2_hj3415.core.types import NormalizedBundle
-from scraper2_hj3415.adapters._shared.utils import log_df
-
-def _parse_yymm(lbl: str):
-    s = re.sub(r"<br\s*/?>", " ", lbl).strip()
-    # filter out non-date metric labels (year-over-year / quarter-over-quarter comparison sections)
-    if any(key in s for key in ["전년대비", "YoY", "QoQ", "전분기대비"]):
-        return {"label": s, "period": None, "basis": None, "is_estimate": False, "is_yoy_row": True}
-    est = "(E)" in s
-    m = re.search(r"(\d{4})/(\d{2})", s)
-    period = pd.Timestamp(f"{m.group(1)}-{m.group(2)}-01") if m else None
-    b = re.search(r"\(([^)]+)\)$", s.replace("(E)","").strip())
-    basis = b.group(1) if b else None
-    return {"label": s, "period": period, "basis": basis, "is_estimate": est, "is_yoy_row": False}
-
-def normalize_metric_rows(rows: list[dict], yymm: list[str], *, frq: C.FRQ, rate_fields: dict, meta: dict):
-    lab = pd.DataFrame([_parse_yymm(x) for x in yymm]).rename_axis("pos").reset_index()
-    lab_valid = lab[~lab["is_yoy_row"]].copy().reset_index(drop=True)
-    lab_valid["pos1"] = lab_valid.index + 1  # used to map DATA1..K
-    df = pd.DataFrame(rows)
-
-    # convert enum values to human-readable labels
-    meta_lbl = {
-        "cmp_cd": meta["cmp_cd"],
-        "page": C.PAGE_TO_LABEL[meta["page"]],
-        "rpt": C.RPT_TO_LABEL[meta["rpt"]],
-        "frq": C.FRQ_TO_LABEL[meta["frq"]],
-    }
-
-    # fact processing
-    k = len(lab_valid)
-    data_cols = [f"DATA{i}" for i in range(1, k+1) if f"DATA{i}" in df.columns]
-
-    long = df.melt(
-        id_vars=["ACCODE","ACC_NM","LVL","P_ACCODE","GRP_TYP","UNT_TYP","ACKIND","POINT_CNT"],
-        value_vars=data_cols, var_name="data_col", value_name="value"
-    )
-    long["pos1"] = long["data_col"].str.replace("DATA","",regex=False).astype(int)
-    long = long.merge(lab_valid[["pos1","period","basis","is_estimate"]], on="pos1", how="inner")
-    long = long.dropna(subset=["period", "value"])
-
-    for k in ["cmp_cd","page","rpt","frq"]:
-        long[k] = meta_lbl[k]
-    fact = (long
-        .drop(columns=["data_col","pos1"])
-        .rename(columns={
-            "ACCODE":"accode","ACC_NM":"account_name","LVL":"level",
-            "P_ACCODE":"parent_accode","GRP_TYP":"group_type","UNT_TYP":"unit_type",
-            "ACKIND":"acc_kind","POINT_CNT":"precision"
-        })
-        .reset_index(drop=True)
-    )
-
-    # dim_account processing
-    dim_account = (fact[["accode","account_name","level","parent_accode","group_type","unit_type","acc_kind","precision"]]
-        .drop_duplicates().reset_index(drop=True))
-
-    # dim_period processing
-    dim_period = (lab_valid[["period","basis","is_estimate"]].dropna(subset=["period"]).drop_duplicates().reset_index(drop=True))
-    dim_period["frq"] = meta_lbl["frq"]
-
-    # delta processing
-    rate_numeric = rate_fields.get("numeric", [])
-    rate_comments = rate_fields.get("comments", [])
-    use_cols = ["ACCODE"] + [c for c in rate_numeric+rate_comments if c in df.columns]
-    if len(use_cols) > 1:
-        delta = df[use_cols].copy().rename(columns={"ACCODE":"accode"})
-        for k in ["cmp_cd","page","rpt","frq"]:
-            delta[k] = meta_lbl[k]
-    else:
-        delta = pd.DataFrame(columns=["accode"] + rate_numeric + rate_comments + ["cmp_cd","page","rpt","frq"])
-
-    # log the dataframes
-    log_df(fact, "fact_df", 1000)
-    log_df(dim_account, "dim_account_df", 1000)
-    log_df(dim_period, "dim_period_df", 1000)
-    log_df(delta, "delta_df", 1000)
-
-    return fact, dim_account, dim_period, delta
-
-def normalize_quarter_payload(payload: dict, meta: dict):
-    return normalize_metric_rows(
-        payload["DATA"], payload["YYMM"], frq=C.FRQ.Q,
-        rate_fields={"numeric":["QOQ","QOQ_E"], "comments":["QOQ_COMMENT","QOQ_E_COMMENT"]},
-        meta=meta,
-    )
-
-def normalize_year_payload(payload: dict, meta: dict):
-    return normalize_metric_rows(
-        payload["DATA"], payload["YYMM"], frq=C.FRQ.Y,
-        rate_fields={"numeric":["YYOY","YEYOY"], "comments":[]},
-        meta=meta,
-    )
-
-def normalize_dispatch(payload: dict, meta: dict) -> NormalizedBundle:
-    """Selects quarterly/annual normalization automatically based on frq."""
-    frq: C.FRQ = meta["frq"]
-    if frq == C.FRQ.Q:
-        fact, dim_account, dim_period, delta = normalize_quarter_payload(payload, meta)
-    elif frq == C.FRQ.Y:
-        fact, dim_account, dim_period, delta = normalize_year_payload(payload, meta)
-    else:
-        raise ValueError(f"Unsupported FRQ: {frq!r}")
-    return NormalizedBundle(fact, dim_account, dim_period, delta)
@@ -1,51 +0,0 @@
-# scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py
-
-import pandas as pd
-
-from .df_to_dto_mappers import (
-    rows_to_dim_account, rows_to_dim_period,
-    rows_to_fact_finance, rows_to_delta_finance,
-)
-from scraper2_hj3415.adapters._shared.utils import chunked
-from scraper2_hj3415.core.ports.sink_port import C1034SinkPort
-from contracts_hj3415.ports.c1034_write_repo import C1034WriteRepoPort
-
-DEFAULT_CHUNK = 1_000
-
-class C1034Sink(C1034SinkPort):
-    def __init__(self, repo: C1034WriteRepoPort, chunk: int = DEFAULT_CHUNK):
-        self.repo = repo
-        self.chunk = chunk
-
-    async def save_dim_account(self, df: pd.DataFrame) -> None:
-        for batch in chunked(rows_to_dim_account(df), self.chunk):
-            await self.repo.upsert_dim_account(batch)
-
-    async def save_dim_period(self, df: pd.DataFrame) -> None:
-        for batch in chunked(rows_to_dim_period(df), self.chunk):
-            await self.repo.upsert_dim_period(batch)
-
-    async def save_fact_finance(self, df: pd.DataFrame) -> None:
-        for batch in chunked(rows_to_fact_finance(df), self.chunk):
-            await self.repo.upsert_fact_finance(batch)
-
-    async def save_delta_finance(self, df: pd.DataFrame) -> None:
-        for batch in chunked(rows_to_delta_finance(df), self.chunk):
-            await self.repo.upsert_delta_finance(batch)
-
-    async def save_all(
-        self,
-        *,
-        dim_account_df: pd.DataFrame,
-        dim_period_df: pd.DataFrame,
-        fact_df: pd.DataFrame,
-        delta_df: pd.DataFrame,
-    ) -> None:
-        if dim_account_df is not None:
-            await self.save_dim_account(dim_account_df)
-        if dim_period_df is not None:
-            await self.save_dim_period(dim_period_df)
-        if fact_df is not None:
-            await self.save_fact_finance(fact_df)
-        if delta_df is not None:
-            await self.save_delta_finance(delta_df)
@@ -1,106 +0,0 @@
-# scraper2_hj3415/adapters/nfs/sinks/mappers.py
-# DF → DTO conversion utilities
-
-from typing import Iterator
-import math
-import pandas as pd
-from pydantic import ValidationError
-from contracts_hj3415.nfs.dto import FactC1034DTO, DimAccountDTO, DimPeriodDTO, DeltaC1034DTO
-
-def _none_if_nan(v):
-    # pandas NaN/NaT → None
-    if v is None:
-        return None
-    if isinstance(v, float) and math.isnan(v):
-        return None
-    if pd.isna(v):
-        return None
-    return v
-
-def rows_to_dim_account(df: pd.DataFrame) -> Iterator[DimAccountDTO]:
-    # expected columns: ['accode','account_name','level','parent_accode','group_type','unit_type','acc_kind','precision']
-    for row in df.itertuples(index=False):
-        try:
-            yield DimAccountDTO(
-                accode=row.accode,
-                account_name=row.account_name,
-                level=_none_if_nan(getattr(row, "level", None)),
-                parent_accode=_none_if_nan(getattr(row, "parent_accode", None)),
-                group_type=_none_if_nan(getattr(row, "group_type", None)),
-                unit_type=_none_if_nan(getattr(row, "unit_type", None)),
-                acc_kind=_none_if_nan(getattr(row, "acc_kind", None)),
-                precision=_none_if_nan(getattr(row, "precision", None)),
-            )
-        except ValidationError:
-            # log/collect here if needed
-            continue
-
-def _sanitize_basis(v):
-    return "" if v is None or (isinstance(v, float) and pd.isna(v)) else v
-
-def rows_to_dim_period(df: pd.DataFrame) -> Iterator[DimPeriodDTO]:
-    # expected columns: ['period','basis','is_estimate','frq']
-    # period must already be converted to a date to be safe
-    if df["period"].dtype != "datetime64[ns]":
-        df = df.copy()
-        df["period"] = pd.to_datetime(df["period"], errors="coerce").dt.date
-
-    for row in df.itertuples(index=False):
-        try:
-            yield DimPeriodDTO(
-                period=row.period,
-                basis=_sanitize_basis(getattr(row, "basis", None)),
-                is_estimate=bool(getattr(row, "is_estimate", False)),
-                frq=row.frq,
-            )
-        except ValidationError:
-            continue
-
-def rows_to_fact_finance(df: pd.DataFrame) -> Iterator[FactC1034DTO]:
-    # expected columns: ['cmp_cd','page','rpt','frq','accode','account_name','period','value', ...optional fields]
-    if df["period"].dtype != "datetime64[ns]":
-        df = df.copy()
-        df["period"] = pd.to_datetime(df["period"], errors="coerce").dt.date
-
-    for row in df.itertuples(index=False):
-        try:
-            yield FactC1034DTO(
-                cmp_cd=row.cmp_cd,
-                page=row.page,
-                rpt=row.rpt,
-                frq=row.frq,
-                accode=row.accode,
-                account_name=row.account_name,
-                period=row.period,
-                value=float(row.value),
-                basis=_none_if_nan(getattr(row, "basis", None)),
-                is_estimate=bool(getattr(row, "is_estimate", False)),
-                unit_type=_none_if_nan(getattr(row, "unit_type", None)),
-                level=_none_if_nan(getattr(row, "level", None)),
-                parent_accode=_none_if_nan(getattr(row, "parent_accode", None)),
-                group_type=_none_if_nan(getattr(row, "group_type", None)),
-                acc_kind=_none_if_nan(getattr(row, "acc_kind", None)),
-                precision=_none_if_nan(getattr(row, "precision", None)),
-            )
-        except (ValidationError, TypeError, ValueError):
-            continue
-
-def rows_to_delta_finance(df: pd.DataFrame) -> Iterator[DeltaC1034DTO]:
-    # expected columns: ['cmp_cd','page','rpt','frq','accode','qoq','yoy','qoq_e','yoy_e','qoq_comment','yoy_comment']
-    for row in df.itertuples(index=False):
-        try:
-            yield DeltaC1034DTO(
-                cmp_cd=row.cmp_cd,
-                page=row.page,
-                rpt=row.rpt,
-                frq=row.frq,
-                accode=row.accode,
-                qoq=_none_if_nan(getattr(row, "qoq", None)),
-                yoy=_none_if_nan(getattr(row, "yoy", None)),
-                qoq_e=_none_if_nan(getattr(row, "qoq_e", None)),
-                yoy_e=_none_if_nan(getattr(row, "yoy_e", None)),
-                qoq_comment=_none_if_nan(getattr(row, "qoq_comment", None)),
-                yoy_comment=_none_if_nan(getattr(row, "yoy_comment", None)),
-            )
-        except ValidationError:
-            continue
@@ -1,24 +0,0 @@
-# scraper2_hj3415/adapters/nfs/sources/bundle_source.py
-
-from typing import List, Literal
-from scraper2_hj3415.core.types import NormalizedBundle
-from scraper2_hj3415.core.ports.source_port import C1034BundleSourcePort
-from scraper2_hj3415.adapters.nfs.pipelines.c1034_pipeline import list_c103_bundles, list_c104_bundles
-from scraper2_hj3415.adapters.clients.browser import PlaywrightSession  # the session is managed by the adapter
-
-class NfsBundleSource(C1034BundleSourcePort):
-    def __init__(self, session: PlaywrightSession):
-        self.session = session
-        self.browser = session.browser
-
-    async def list_bundles(
-        self,
-        cmp_cd: str,
-        page: Literal["c103", "c104"],
-        *,
-        concurrency: int = 2,
-    ) -> List[NormalizedBundle]:
-        if page == "c103":
-            return await list_c103_bundles(cmp_cd, browser=self.browser, concurrency=concurrency)
-        else:
-            return await list_c104_bundles(cmp_cd, browser=self.browser, concurrency=concurrency)
@@ -1,117 +0,0 @@
-# scraper2-hj3415/adapters/nfs/sources/c1034_fetch.py
-
-import httpx
-from loguru import logger
-from scraper2_hj3415.core.constants import ASPXInner, FRQ, C103RPT, C104RPT
-from scraper2_hj3415.adapters.clients.http import create_http_client
-
-async def fetch_financial_json(
-    *,
-    cmp_cd: str = "005930",
-    aspx_inner: ASPXInner = ASPXInner.c103,
-    rpt: C103RPT | C104RPT = C103RPT.손익계산서,
-    frq: FRQ = FRQ.Y,
-    encparam: str,
-    cookies: str,
-    referer: str,
-    user_agent: str,
-) -> dict | list:
-    if not encparam:
-        raise ValueError("encparam is missing")
-
-    request_url = f"https://navercomp.wisereport.co.kr/v2/company/{aspx_inner}?cmp_cd={cmp_cd}"
-
-    params = {
-        "cmp_cd": cmp_cd,
-        "frq": frq,
-        "rpt": rpt,
-        "finGubun": "MAIN",
-        "frqTyp": frq,  # annual
-        "encparam": encparam,
-    }
-
-    headers = {
-        "Cookie": cookies,
-        "Referer": referer,
-        "User-Agent": user_agent,
-        "Accept": "application/json, text/javascript, */*; q=0.01",
-        "X-Requested-With": "XMLHttpRequest",  # some servers prefer the AJAX header
-    }
-
-    async with create_http_client(timeout=10.0) as client:
-        r = await client.get(request_url, params=params, headers=headers)
-        r.raise_for_status()
-        try:
-            import json
-            payload = r.json()
-            for i, row in enumerate(payload["DATA"]):
-                logger.debug(
-                    "row {}/{} ACC_NM={}, ACCODE={}",
-                    i + 1,
-                    len(payload["DATA"]),
-                    row.get("ACC_NM"),
-                    row.get("ACCODE"),
-                )
-                logger.debug(
-                    "row raw:\n{}", json.dumps(row, indent=2, ensure_ascii=False)
-                )
-            return payload
-        except Exception:
-            # even if the response is text/html, try to parse it as JSON when it looks like JSON
-            text = r.text.strip()
-            if text.startswith("{") or text.startswith("["):
-                import json
-                return json.loads(text)
-            # raise only when it really is HTML
-            ctype = r.headers.get("Content-Type", "")
-            snippet = text[:2000]
-            raise httpx.HTTPStatusError(
-                f"Unexpected content-type: {ctype}. Snippet:\n{snippet}",
-                request=r.request,
-                response=r,
-            )
-
-async def get_c103_data(
-    *,
-    session_info: dict,
-    cmp_cd: str = "005930",
-    rpt: C103RPT = C103RPT.손익계산서,
-    frq: FRQ = FRQ.Y,
-) -> dict | list:
-
-    aspx_inner = ASPXInner.c103
-
-    resp = await fetch_financial_json(
-        cmp_cd=cmp_cd,
-        aspx_inner=aspx_inner,
-        rpt=rpt,
-        frq=frq,
-        encparam=session_info["encparam"],
-        cookies=session_info["cookies"],
-        referer=session_info["referer"],
-        user_agent=session_info["user_agent"],
-    )
-
-    return resp
-
-async def get_c104_data(
-    *,
-    session_info: dict,
-    cmp_cd: str = "005930",
-    rpt: C104RPT = C104RPT.수익성,
-    frq: FRQ = FRQ.Y,
-) -> dict | list:
-
-    aspx_inner = ASPXInner.c104
-
-    resp = await fetch_financial_json(
-        cmp_cd=cmp_cd,
-        aspx_inner=aspx_inner,
-        rpt=rpt,
-        frq=frq,
-        encparam=session_info["encparam"],
-        cookies=session_info["cookies"],
-        referer=session_info["referer"],
-        user_agent=session_info["user_agent"],
-    )
-    return resp
@@ -1,90 +0,0 @@
-# scraper2_hj3415/adapters/nfs/sources/c1034_session.py
-
-import re
-from loguru import logger
-
-from scraper2_hj3415.adapters.clients.browser import browser_context
-from scraper2_hj3415.core.constants import PAGE
-from playwright.async_api import Browser
-
-async def extract_session_info(
-    *,
-    browser: Browser | None = None,
-    cmp_cd: str = "005930",
-    page: PAGE = PAGE.c103,
-) -> dict:
-    """
-    Uses Playwright only to extract the referer/UA/cookies/encparam.
-    - Reuses an external browser if one is passed; otherwise opens and closes one internally.
-    - Only builds the session, so the hybrid setup (data calls go through httpx) is preserved.
-    """
-    is_external = browser is not None
-
-    if not is_external:
-        # created internally
-        browser_cm = browser_context(headless=True)  # existing @asynccontextmanager
-        browser = await browser_cm.__aenter__()
-
-    context = None
-    page_ = None
-    try:
-        # 1) create an isolated context (separate cookies/storage)
-        context = await browser.new_context()
-
-        # 2) block resources: cut unnecessary rendering cost
-        await context.route("**/*", lambda route: (
-            route.abort()
-            if route.request.resource_type in {"image", "font", "stylesheet", "media"}
-            else route.continue_()
-        ))
-
-        # 3) open the page
-        page_ = await context.new_page()
-        # use .value explicitly to be safe, even though PAGE is a StrEnum
-        url = f"https://navercomp.wisereport.co.kr/v2/company/{page.value}?cmp_cd={cmp_cd}"
-
-        # the UA is the same if read after creating the context, but reading it from the page is more intuitive
-        await page_.goto(url, wait_until="domcontentloaded")
-
-        # 4) give the script containing 'encparam' a short window to load
-        # prefer a stable selector if possible (e.g. a table/tab); otherwise fall back to a timeout
-        try:
-            # match text inside scripts (using Playwright's has-text selector)
-            await page_.wait_for_selector("script:has-text('encparam')", timeout=3000)
-        except Exception:
-            # fallback: short wait
-            await page_.wait_for_timeout(300)
-
-        html = await page_.content()
-        m = re.search(
-            r"""encparam\s*(?:[:=])\s*['"]([^'"]+)['"]""", html, re.IGNORECASE
-        )
-        encparam = m.group(1) if m else None
-        if not encparam:
-            logger.warning("encparam not found in page HTML")
-
-        # 5) collect UA / cookies (from the context)
-        ua = await page_.evaluate("navigator.userAgent")
-        cookies = await context.cookies()
-        cookie_header = "; ".join(f"{c['name']}={c['value']}" for c in cookies)
-
-        logger.debug(f"encparam={encparam!r}")
-        logger.debug(f"cookies={cookie_header!r}")
-
-        return {
-            "encparam": encparam,
-            "cookies": cookie_header,
-            "referer": url,
-            "user_agent": ua,
-        }
-
-    finally:
-        # clean up in reverse order of opening
-        try:
-            if page_:
-                await page_.close()
-        finally:
-            if context:
-                await context.close()
-            if not is_external:
-                await browser_cm.__aexit__(None, None, None)
@@ -1,47 +0,0 @@
-# scraper2_hj3415/core/constants.py
-
-from enum import StrEnum
-
-class PAGE(StrEnum):
-    c101 = "c1010001.aspx"
-    c103 = "c1030001.aspx"
-    c104 = "c1040001.aspx"
-
-PAGE_TO_LABEL = {
-    PAGE.c101: "c101",
-    PAGE.c103: "c103",
-    PAGE.c104: "c104"
-}
-
-class ASPXInner(StrEnum):
-    c103 = "cF3002.aspx"
-    c104 = "cF4002.aspx"
-
-class FRQ(StrEnum):
-    Y = "0"  # annual
-    Q = "1"  # quarterly
-
-FRQ_TO_LABEL = {FRQ.Y: "y", FRQ.Q: "q"}  # labels for storage/logging
-
-class C103RPT(StrEnum):
-    손익계산서 = "0"
-    재무상태표 = "1"
-    현금흐름표 = "2"
-
-class C104RPT(StrEnum):
-    수익성 = "1"
-    성장성 = "2"
-    안정성 = "3"
-    활동성 = "4"
-    가치분석 = "5"
-
-RPT_TO_LABEL = {
-    C103RPT.손익계산서: "손익계산서",
-    C103RPT.재무상태표: "재무상태표",
-    C103RPT.현금흐름표: "현금흐름표",
-    C104RPT.수익성: "수익성",
-    C104RPT.성장성: "성장성",
-    C104RPT.안정성: "안정성",
-    C104RPT.활동성: "활동성",
-    C104RPT.가치분석: "가치분석",
-}
@@ -1,16 +0,0 @@
-# scraper2_hj3415/core/ports/sink_port.py
-
-from typing import Protocol
-import pandas as pd
-
-class C1034SinkPort(Protocol):
-    async def save_all(
-        self,
-        *,
-        dim_account_df: pd.DataFrame,
-        dim_period_df: pd.DataFrame,
-        fact_df: pd.DataFrame,
-        delta_df: pd.DataFrame,
-    ) -> None: ...
-
-