scraper2-hj3415 1.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. scraper2/.DS_Store +0 -0
  2. scraper2/adapters/out/.DS_Store +0 -0
  3. scraper2/adapters/out/playwright/browser.py +103 -0
  4. scraper2/adapters/out/playwright/browser_factory.py +112 -0
  5. scraper2/adapters/out/playwright/session.py +121 -0
  6. scraper2/adapters/out/sinks/.DS_Store +0 -0
  7. scraper2/adapters/out/sinks/memory/__init__.py +15 -0
  8. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +20 -0
  9. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +20 -0
  10. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +20 -0
  11. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +20 -0
  12. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +20 -0
  13. scraper2/adapters/out/sinks/memory/store.py +74 -0
  14. scraper2/adapters/out/sinks/mongo/__init__.py +14 -0
  15. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +43 -0
  16. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +41 -0
  17. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +41 -0
  18. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +41 -0
  19. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +41 -0
  20. scraper2/app/composition.py +195 -0
  21. scraper2/app/parsing/_converters.py +85 -0
  22. scraper2/app/parsing/_normalize.py +134 -0
  23. scraper2/app/parsing/c101_parser.py +143 -0
  24. scraper2/app/parsing/c103_parser.py +128 -0
  25. scraper2/app/parsing/c104_parser.py +143 -0
  26. scraper2/app/parsing/c106_parser.py +153 -0
  27. scraper2/app/parsing/c108_parser.py +65 -0
  28. scraper2/app/ports/browser/browser_factory_port.py +11 -0
  29. scraper2/app/ports/browser/browser_port.py +22 -0
  30. scraper2/app/ports/ingest_port.py +13 -0
  31. scraper2/app/ports/sinks/base_sink_port.py +14 -0
  32. scraper2/app/ports/sinks/c101_sink_port.py +9 -0
  33. scraper2/app/ports/sinks/c103_sink_port.py +9 -0
  34. scraper2/app/ports/sinks/c104_sink_port.py +9 -0
  35. scraper2/app/ports/sinks/c106_sink_port.py +9 -0
  36. scraper2/app/ports/sinks/c108_sink_port.py +9 -0
  37. scraper2/app/usecases/fetch/fetch_c101.py +43 -0
  38. scraper2/app/usecases/fetch/fetch_c103.py +103 -0
  39. scraper2/app/usecases/fetch/fetch_c104.py +76 -0
  40. scraper2/app/usecases/fetch/fetch_c106.py +90 -0
  41. scraper2/app/usecases/fetch/fetch_c108.py +49 -0
  42. scraper2/app/usecases/ingest/ingest_c101.py +36 -0
  43. scraper2/app/usecases/ingest/ingest_c103.py +37 -0
  44. scraper2/app/usecases/ingest/ingest_c104.py +37 -0
  45. scraper2/app/usecases/ingest/ingest_c106.py +38 -0
  46. scraper2/app/usecases/ingest/ingest_c108.py +39 -0
  47. scraper2/main.py +257 -0
  48. scraper2_hj3415-2.1.0.dist-info/METADATA +164 -0
  49. scraper2_hj3415-2.1.0.dist-info/RECORD +63 -0
  50. scraper2_hj3415-2.1.0.dist-info/entry_points.txt +3 -0
  51. scraper2_hj3415/__main__.py +0 -6
  52. scraper2_hj3415/adapters/_shared/utils.py +0 -29
  53. scraper2_hj3415/adapters/clients/browser.py +0 -124
  54. scraper2_hj3415/adapters/clients/http.py +0 -51
  55. scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py +0 -55
  56. scraper2_hj3415/adapters/nfs/pipelines/normalize_c1034.py +0 -109
  57. scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py +0 -51
  58. scraper2_hj3415/adapters/nfs/sinks/df_to_dto_mappers.py +0 -106
  59. scraper2_hj3415/adapters/nfs/sources/bundle_source.py +0 -24
  60. scraper2_hj3415/adapters/nfs/sources/c1034_fetch.py +0 -117
  61. scraper2_hj3415/adapters/nfs/sources/c1034_session.py +0 -90
  62. scraper2_hj3415/core/constants.py +0 -47
  63. scraper2_hj3415/core/ports/sink_port.py +0 -16
  64. scraper2_hj3415/core/ports/source_port.py +0 -13
  65. scraper2_hj3415/core/types.py +0 -11
  66. scraper2_hj3415/core/usecases/c1034_ingest.py +0 -139
  67. scraper2_hj3415/di.py +0 -103
  68. scraper2_hj3415/entrypoints/cli.py +0 -226
  69. scraper2_hj3415/entrypoints/main.py +0 -20
  70. scraper2_hj3415-1.0.1.dist-info/METADATA +0 -66
  71. scraper2_hj3415-1.0.1.dist-info/RECORD +0 -35
  72. scraper2_hj3415-1.0.1.dist-info/entry_points.txt +0 -3
  73. {scraper2_hj3415 → scraper2}/__init__.py +0 -0
  74. {scraper2_hj3415/adapters → scraper2/adapters/out}/__init__.py +0 -0
  75. {scraper2_hj3415/adapters/_shared → scraper2/adapters/out/playwright}/__init__.py +0 -0
  76. {scraper2_hj3415/adapters/clients → scraper2/app}/__init__.py +0 -0
  77. {scraper2_hj3415/adapters/nfs/pipelines → scraper2/app/parsing}/__init__.py +0 -0
  78. {scraper2_hj3415/adapters/nfs/sinks → scraper2/app/ports}/__init__.py +0 -0
  79. {scraper2_hj3415/adapters/nfs/sources → scraper2/app/ports/browser}/__init__.py +0 -0
  80. {scraper2_hj3415/core → scraper2/app/ports/sinks}/__init__.py +0 -0
  81. {scraper2_hj3415/core/ports → scraper2/app/usecases}/__init__.py +0 -0
  82. {scraper2_hj3415/core/usecases → scraper2/app/usecases/fetch}/__init__.py +0 -0
  83. {scraper2_hj3415/entrypoints → scraper2/app/usecases/ingest}/__init__.py +0 -0
  84. {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/WHEEL +0 -0
  85. {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,51 +0,0 @@
- # src/scraper2_hj3415/adapters/clients/http.py
-
- import httpx
- from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
-
- # Custom exception for HTTPX errors
- class HttpClientError(Exception):
-     """Base HTTP client exception."""
-
- # Retry decorator (optional)
- def retry_on_http_error():
-     return retry(
-         reraise=True,
-         stop=stop_after_attempt(3),
-         wait=wait_exponential(min=1, max=10),
-         retry=retry_if_exception_type((httpx.RequestError, httpx.HTTPStatusError)),
-     )
-
- # Factory function that creates an AsyncClient
- def create_http_client(
-     base_url: str | None = None,
-     timeout: float = 10.0,
-     headers: dict | None = None,
- ) -> httpx.AsyncClient:
-     """
-     Create an httpx.AsyncClient instance with shared defaults.
-
-     Args:
-         base_url: Base URL (when set, requests may use relative paths)
-         timeout: Request timeout in seconds
-         headers: Default headers (a User-Agent is set automatically if omitted)
-     """
-     default_headers = {
-         "User-Agent": (
-             "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
-             "AppleWebKit/537.36 (KHTML, like Gecko) "
-             "Chrome/127.0.0.0 Safari/537.36"
-         ),
-     }
-     if headers:
-         default_headers.update(headers)
-
-     client = httpx.AsyncClient(
-         base_url=base_url or "",
-         timeout=httpx.Timeout(timeout),
-         headers=default_headers,
-         follow_redirects=True,
-         verify=True,  # SSL verification
-     )
-     return client
-
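Note: the module above was removed in 2.1.0. For context, a minimal usage sketch of the 1.0.1 helpers it provided (hypothetical code, not part of either package; it relies only on the create_http_client and retry_on_http_error signatures shown in the removed lines):

    import asyncio
    from scraper2_hj3415.adapters.clients.http import create_http_client, retry_on_http_error

    @retry_on_http_error()  # up to 3 attempts on httpx.RequestError / httpx.HTTPStatusError
    async def fetch_status(url: str) -> int:
        # The factory sets a desktop User-Agent, redirect following, and SSL verification by default.
        async with create_http_client(timeout=5.0) as client:
            r = await client.get(url)
            r.raise_for_status()
            return r.status_code

    if __name__ == "__main__":
        print(asyncio.run(fetch_status("https://example.com")))
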
@@ -1,55 +0,0 @@
- # scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py
-
- import asyncio
- from typing import List, Callable
- from playwright.async_api import Browser
- from scraper2_hj3415.core import constants as C
- from scraper2_hj3415.core.types import NormalizedBundle
- from scraper2_hj3415.adapters.nfs.sources import c1034_fetch as fetch, c1034_session as session
- from scraper2_hj3415.adapters.nfs.pipelines.normalize_c1034 import normalize_dispatch
- from loguru import logger
-
- async def list_bundles(
-     page: C.PAGE,
-     cmp_cd: str,
-     rpt_enum: type[C.C103RPT | C.C104RPT],
-     get_data_func: Callable,
-     *,
-     browser: Browser | None = None,
-     concurrency: int = 2,
- ) -> list[NormalizedBundle]:
-     sem = asyncio.Semaphore(max(1, concurrency))
-     session_info = await session.extract_session_info(browser=browser, cmp_cd=cmp_cd, page=page)
-
-     async def _one(rpt, frq):
-         async with sem:
-             meta = {"cmp_cd": cmp_cd, "page": page, "rpt": rpt, "frq": frq}
-             payload = await get_data_func(session_info=session_info, cmp_cd=cmp_cd, rpt=rpt, frq=frq)
-             return normalize_dispatch(payload, meta)
-
-     tasks = [_one(rpt, frq) for rpt in rpt_enum for frq in (C.FRQ.Q, C.FRQ.Y)]
-     results = await asyncio.gather(*tasks, return_exceptions=True)
-     bundles: list[NormalizedBundle] = []
-     for r in results:
-         if isinstance(r, Exception):
-             logger.warning(f"{C.PAGE_TO_LABEL[page]} partial failure: {r}")
-             continue
-         bundles.append(r)
-     return bundles
-
- async def list_c103_bundles(
-     cmp_cd: str,
-     *,
-     browser: Browser | None = None,
-     concurrency: int = 3,  # half of the 6 quarterly/annual pages in total
- ) -> List[NormalizedBundle]:
-     return await list_bundles(C.PAGE.c103, cmp_cd, C.C103RPT, fetch.get_c103_data, browser=browser, concurrency=concurrency)
-
- async def list_c104_bundles(
-     cmp_cd: str,
-     *,
-     browser: Browser | None = None,
-     concurrency: int = 5,  # half of the 10 quarterly/annual pages in total
- ) -> List[NormalizedBundle]:
-     return await list_bundles(C.PAGE.c104, cmp_cd, C.C104RPT, fetch.get_c104_data, browser=browser, concurrency=concurrency)
-
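Note: a hedged sketch of how these removed 1.0.1 pipeline entry points were driven (illustrative only; "005930" is an arbitrary ticker code and the call uses nothing beyond the list_c103_bundles signature above):

    import asyncio
    from scraper2_hj3415.adapters.nfs.pipelines.c1034_pipeline import list_c103_bundles

    async def main() -> None:
        # With browser=None, extract_session_info opens and closes a headless browser internally.
        bundles = await list_c103_bundles("005930", concurrency=3)
        print(f"collected {len(bundles)} NormalizedBundle objects")

    asyncio.run(main())
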
@@ -1,109 +0,0 @@
- # scraper2_hj3415/adapters/nfs/pipeline/normalize_c1034.py
-
- import re
- import pandas as pd
- from scraper2_hj3415.core import constants as C
- from scraper2_hj3415.core.types import NormalizedBundle
- from scraper2_hj3415.adapters._shared.utils import log_df
-
- def _parse_yymm(lbl: str):
-     s = re.sub(r"<br\s*/?>", " ", lbl).strip()
-     # Filter out labels that are not dates (the YoY/QoQ comparison sections)
-     if any(key in s for key in ["전년대비", "YoY", "QoQ", "전분기대비"]):
-         return {"label": s, "period": None, "basis": None, "is_estimate": False, "is_yoy_row": True}
-     est = "(E)" in s
-     m = re.search(r"(\d{4})/(\d{2})", s)
-     period = pd.Timestamp(f"{m.group(1)}-{m.group(2)}-01") if m else None
-     b = re.search(r"\(([^)]+)\)$", s.replace("(E)","").strip())
-     basis = b.group(1) if b else None
-     return {"label": s, "period": period, "basis": basis, "is_estimate": est, "is_yoy_row": False}
-
- def normalize_metric_rows(rows: list[dict], yymm: list[str], *, frq: C.FRQ, rate_fields: dict, meta: dict):
-     lab = pd.DataFrame([_parse_yymm(x) for x in yymm]).rename_axis("pos").reset_index()
-     lab_valid = lab[~lab["is_yoy_row"]].copy().reset_index(drop=True)
-     lab_valid["pos1"] = lab_valid.index + 1  # for mapping DATA1..K
-     df = pd.DataFrame(rows)
-
-     # Convert enum values into human-readable labels
-     meta_lbl = {
-         "cmp_cd": meta["cmp_cd"],
-         "page": C.PAGE_TO_LABEL[meta["page"]],
-         "rpt": C.RPT_TO_LABEL[meta["rpt"]],
-         "frq": C.FRQ_TO_LABEL[meta["frq"]],
-     }
-
-     # Build the fact frame
-     k = len(lab_valid)
-     data_cols = [f"DATA{i}" for i in range(1, k+1) if f"DATA{i}" in df.columns]
-
-     long = df.melt(
-         id_vars=["ACCODE","ACC_NM","LVL","P_ACCODE","GRP_TYP","UNT_TYP","ACKIND","POINT_CNT"],
-         value_vars=data_cols, var_name="data_col", value_name="value"
-     )
-     long["pos1"] = long["data_col"].str.replace("DATA","",regex=False).astype(int)
-     long = long.merge(lab_valid[["pos1","period","basis","is_estimate"]], on="pos1", how="inner")
-     long = long.dropna(subset=["period", "value"])
-
-     for k in ["cmp_cd","page","rpt","frq"]:
-         long[k] = meta_lbl[k]
-     fact = (long
-         .drop(columns=["data_col","pos1"])
-         .rename(columns={
-             "ACCODE":"accode","ACC_NM":"account_name","LVL":"level",
-             "P_ACCODE":"parent_accode","GRP_TYP":"group_type","UNT_TYP":"unit_type",
-             "ACKIND":"acc_kind","POINT_CNT":"precision"
-         })
-         .reset_index(drop=True)
-     )
-
-     # Build dim_account
-     dim_account = (fact[["accode","account_name","level","parent_accode","group_type","unit_type","acc_kind","precision"]]
-         .drop_duplicates().reset_index(drop=True))
-
-     # Build dim_period
-     dim_period = (lab_valid[["period","basis","is_estimate"]].dropna(subset=["period"]).drop_duplicates().reset_index(drop=True))
-     dim_period["frq"] = meta_lbl["frq"]
-
-     # Build delta
-     rate_numeric = rate_fields.get("numeric", [])
-     rate_comments = rate_fields.get("comments", [])
-     use_cols = ["ACCODE"] + [c for c in rate_numeric+rate_comments if c in df.columns]
-     if len(use_cols) > 1:
-         delta = df[use_cols].copy().rename(columns={"ACCODE":"accode"})
-         for k in ["cmp_cd","page","rpt","frq"]:
-             delta[k] = meta_lbl[k]
-     else:
-         delta = pd.DataFrame(columns=["accode"] + rate_numeric + rate_comments + ["cmp_cd","page","rpt","frq"])
-
-     # Log the resulting DataFrames
-     log_df(fact, "fact_df", 1000)
-     log_df(dim_account, "dim_account_df", 1000)
-     log_df(dim_period, "dim_period_df", 1000)
-     log_df(delta, "delta_df", 1000)
-
-     return fact, dim_account, dim_period, delta
-
- def normalize_quarter_payload(payload: dict, meta: dict):
-     return normalize_metric_rows(
-         payload["DATA"], payload["YYMM"], frq=C.FRQ.Q,
-         rate_fields={"numeric":["QOQ","QOQ_E"], "comments":["QOQ_COMMENT","QOQ_E_COMMENT"]},
-         meta=meta,
-     )
-
- def normalize_year_payload(payload: dict, meta: dict):
-     return normalize_metric_rows(
-         payload["DATA"], payload["YYMM"], frq=C.FRQ.Y,
-         rate_fields={"numeric":["YYOY","YEYOY"], "comments":[]},
-         meta=meta,
-     )
-
- def normalize_dispatch(payload: dict, meta: dict) -> NormalizedBundle:
-     """Automatically select quarterly or annual normalization based on frq."""
-     frq: C.FRQ = meta["frq"]
-     if frq == C.FRQ.Q:
-         fact, dim_account, dim_period, delta = normalize_quarter_payload(payload, meta)
-     elif frq == C.FRQ.Y:
-         fact, dim_account, dim_period, delta = normalize_year_payload(payload, meta)
-     else:
-         raise ValueError(f"Unsupported FRQ: {frq!r}")
-     return NormalizedBundle(fact, dim_account, dim_period, delta)
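Note: to make the removed normalizer easier to follow, here is an illustrative payload in the shape it expects. The DATA/YYMM/DATA1..K/YYOY keys mirror the code above; the concrete values and the single account row are invented for illustration:

    from scraper2_hj3415.core import constants as C
    from scraper2_hj3415.adapters.nfs.pipelines.normalize_c1034 import normalize_dispatch

    payload = {
        "YYMM": ["2023/12<br />(IFRS연결)", "2024/12(E)<br />(IFRS연결)", "전년대비"],
        "DATA": [{
            "ACCODE": "121000", "ACC_NM": "매출액", "LVL": 1, "P_ACCODE": None,
            "GRP_TYP": "A", "UNT_TYP": "1", "ACKIND": "1", "POINT_CNT": 1,
            "DATA1": 2589355.0, "DATA2": 3089000.0, "YYOY": 19.3, "YEYOY": None,
        }],
    }
    meta = {"cmp_cd": "005930", "page": C.PAGE.c103, "rpt": C.C103RPT.손익계산서, "frq": C.FRQ.Y}

    # Returns NormalizedBundle(fact, dim_account, dim_period, delta) built from pandas DataFrames.
    bundle = normalize_dispatch(payload, meta)
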
@@ -1,51 +0,0 @@
- # scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py
-
- import pandas as pd
-
- from .df_to_dto_mappers import (
-     rows_to_dim_account, rows_to_dim_period,
-     rows_to_fact_finance, rows_to_delta_finance,
- )
- from scraper2_hj3415.adapters._shared.utils import chunked
- from scraper2_hj3415.core.ports.sink_port import C1034SinkPort
- from contracts_hj3415.ports.c1034_write_repo import C1034WriteRepoPort
-
- DEFAULT_CHUNK = 1_000
-
- class C1034Sink(C1034SinkPort):
-     def __init__(self, repo: C1034WriteRepoPort, chunk: int = DEFAULT_CHUNK):
-         self.repo = repo
-         self.chunk = chunk
-
-     async def save_dim_account(self, df: pd.DataFrame) -> None:
-         for batch in chunked(rows_to_dim_account(df), self.chunk):
-             await self.repo.upsert_dim_account(batch)
-
-     async def save_dim_period(self, df: pd.DataFrame) -> None:
-         for batch in chunked(rows_to_dim_period(df), self.chunk):
-             await self.repo.upsert_dim_period(batch)
-
-     async def save_fact_finance(self, df: pd.DataFrame) -> None:
-         for batch in chunked(rows_to_fact_finance(df), self.chunk):
-             await self.repo.upsert_fact_finance(batch)
-
-     async def save_delta_finance(self, df: pd.DataFrame) -> None:
-         for batch in chunked(rows_to_delta_finance(df), self.chunk):
-             await self.repo.upsert_delta_finance(batch)
-
-     async def save_all(
-         self,
-         *,
-         dim_account_df: pd.DataFrame,
-         dim_period_df: pd.DataFrame,
-         fact_df: pd.DataFrame,
-         delta_df: pd.DataFrame,
-     ) -> None:
-         if dim_account_df is not None:
-             await self.save_dim_account(dim_account_df)
-         if dim_period_df is not None:
-             await self.save_dim_period(dim_period_df)
-         if fact_df is not None:
-             await self.save_fact_finance(fact_df)
-         if delta_df is not None:
-             await self.save_delta_finance(delta_df)
@@ -1,106 +0,0 @@
- # scraper2_hj3415/adapters/nfs/sinks/mappers.py
- # DF → DTO conversion utilities
-
- from typing import Iterator
- import math
- import pandas as pd
- from pydantic import ValidationError
- from contracts_hj3415.nfs.dto import FactC1034DTO, DimAccountDTO, DimPeriodDTO, DeltaC1034DTO
-
- def _none_if_nan(v):
-     # pandas NaN/NaT → None
-     if v is None:
-         return None
-     if isinstance(v, float) and math.isnan(v):
-         return None
-     if pd.isna(v):
-         return None
-     return v
-
- def rows_to_dim_account(df: pd.DataFrame) -> Iterator[DimAccountDTO]:
-     # Expected columns: ['accode','account_name','level','parent_accode','group_type','unit_type','acc_kind','precision']
-     for row in df.itertuples(index=False):
-         try:
-             yield DimAccountDTO(
-                 accode=row.accode,
-                 account_name=row.account_name,
-                 level=_none_if_nan(getattr(row, "level", None)),
-                 parent_accode=_none_if_nan(getattr(row, "parent_accode", None)),
-                 group_type=_none_if_nan(getattr(row, "group_type", None)),
-                 unit_type=_none_if_nan(getattr(row, "unit_type", None)),
-                 acc_kind=_none_if_nan(getattr(row, "acc_kind", None)),
-                 precision=_none_if_nan(getattr(row, "precision", None)),
-             )
-         except ValidationError:
-             # Log/collect here if needed
-             continue
-
- def _sanitize_basis(v):
-     return "" if v is None or (isinstance(v, float) and pd.isna(v)) else v
-
- def rows_to_dim_period(df: pd.DataFrame) -> Iterator[DimPeriodDTO]:
-     # Expected columns: ['period','basis','is_estimate','frq']
-     # period must already be converted to a date to be safe
-     if df["period"].dtype != "datetime64[ns]":
-         df = df.copy()
-         df["period"] = pd.to_datetime(df["period"], errors="coerce").dt.date
-
-     for row in df.itertuples(index=False):
-         try:
-             yield DimPeriodDTO(
-                 period=row.period,
-                 basis=_sanitize_basis(getattr(row, "basis", None)),
-                 is_estimate=bool(getattr(row, "is_estimate", False)),
-                 frq=row.frq,
-             )
-         except ValidationError:
-             continue
-
- def rows_to_fact_finance(df: pd.DataFrame) -> Iterator[FactC1034DTO]:
-     # Expected columns: ['cmp_cd','page','rpt','frq','accode','account_name','period','value', ...optional columns]
-     if df["period"].dtype != "datetime64[ns]":
-         df = df.copy()
-         df["period"] = pd.to_datetime(df["period"], errors="coerce").dt.date
-
-     for row in df.itertuples(index=False):
-         try:
-             yield FactC1034DTO(
-                 cmp_cd=row.cmp_cd,
-                 page=row.page,
-                 rpt=row.rpt,
-                 frq=row.frq,
-                 accode=row.accode,
-                 account_name=row.account_name,
-                 period=row.period,
-                 value=float(row.value),
-                 basis=_none_if_nan(getattr(row, "basis", None)),
-                 is_estimate=bool(getattr(row, "is_estimate", False)),
-                 unit_type=_none_if_nan(getattr(row, "unit_type", None)),
-                 level=_none_if_nan(getattr(row, "level", None)),
-                 parent_accode=_none_if_nan(getattr(row, "parent_accode", None)),
-                 group_type=_none_if_nan(getattr(row, "group_type", None)),
-                 acc_kind=_none_if_nan(getattr(row, "acc_kind", None)),
-                 precision=_none_if_nan(getattr(row, "precision", None)),
-             )
-         except (ValidationError, TypeError, ValueError):
-             continue
-
- def rows_to_delta_finance(df: pd.DataFrame) -> Iterator[DeltaC1034DTO]:
-     # Expected columns: ['cmp_cd','page','rpt','frq','accode','qoq','yoy','qoq_e','yoy_e','qoq_comment','yoy_comment']
-     for row in df.itertuples(index=False):
-         try:
-             yield DeltaC1034DTO(
-                 cmp_cd=row.cmp_cd,
-                 page=row.page,
-                 rpt=row.rpt,
-                 frq=row.frq,
-                 accode=row.accode,
-                 qoq=_none_if_nan(getattr(row, "qoq", None)),
-                 yoy=_none_if_nan(getattr(row, "yoy", None)),
-                 qoq_e=_none_if_nan(getattr(row, "qoq_e", None)),
-                 yoy_e=_none_if_nan(getattr(row, "yoy_e", None)),
-                 qoq_comment=_none_if_nan(getattr(row, "qoq_comment", None)),
-                 yoy_comment=_none_if_nan(getattr(row, "yoy_comment", None)),
-             )
-         except ValidationError:
-             continue
@@ -1,24 +0,0 @@
- # scraper2_hj3415/adapters/nfs/sources/bundle_source.py
-
- from typing import List, Literal
- from scraper2_hj3415.core.types import NormalizedBundle
- from scraper2_hj3415.core.ports.source_port import C1034BundleSourcePort
- from scraper2_hj3415.adapters.nfs.pipelines.c1034_pipeline import list_c103_bundles, list_c104_bundles
- from scraper2_hj3415.adapters.clients.browser import PlaywrightSession  # the session is managed by the adapter
-
- class NfsBundleSource(C1034BundleSourcePort):
-     def __init__(self, session: PlaywrightSession):
-         self.session = session
-         self.browser = session.browser
-
-     async def list_bundles(
-         self,
-         cmp_cd: str,
-         page: Literal["c103", "c104"],
-         *,
-         concurrency: int = 2,
-     ) -> List[NormalizedBundle]:
-         if page == "c103":
-             return await list_c103_bundles(cmp_cd, browser=self.browser, concurrency=concurrency)
-         else:
-             return await list_c104_bundles(cmp_cd, browser=self.browser, concurrency=concurrency)
@@ -1,117 +0,0 @@
- # scraper2-hj3415/adapters/nfs/sources/c1034_fetch.py
-
- import httpx
- from loguru import logger
- from scraper2_hj3415.core.constants import ASPXInner, FRQ, C103RPT, C104RPT
- from scraper2_hj3415.adapters.clients.http import create_http_client
-
- async def fetch_financial_json(
-     *,
-     cmp_cd: str = "005930",
-     aspx_inner: ASPXInner = ASPXInner.c103,
-     rpt: C103RPT | C104RPT = C103RPT.손익계산서,
-     frq: FRQ = FRQ.Y,
-     encparam: str,
-     cookies: str,
-     referer: str,
-     user_agent: str,
- ) -> dict | list:
-     if not encparam:
-         raise ValueError("encparam is missing")
-
-     request_url = f"https://navercomp.wisereport.co.kr/v2/company/{aspx_inner}?cmp_cd={cmp_cd}"
-
-     params = {
-         "cmp_cd": cmp_cd,
-         "frq": frq,
-         "rpt": rpt,
-         "finGubun": "MAIN",
-         "frqTyp": frq,  # annual
-         "encparam": encparam,
-     }
-
-     headers = {
-         "Cookie": cookies,
-         "Referer": referer,
-         "User-Agent": user_agent,
-         "Accept": "application/json, text/javascript, */*; q=0.01",
-         "X-Requested-With": "XMLHttpRequest",  # some servers prefer the AJAX header
-     }
-
-     async with create_http_client(timeout=10.0) as client:
-         r = await client.get(request_url, params=params, headers=headers)
-         r.raise_for_status()
-         try:
-             import json
-             payload = r.json()
-             for i, row in enumerate(payload["DATA"]):
-                 logger.debug(
-                     "row {}/{} ACC_NM={}, ACCODE={}",
-                     i + 1,
-                     len(payload["DATA"]),
-                     row.get("ACC_NM"),
-                     row.get("ACCODE"),
-                 )
-                 logger.debug(
-                     "row raw:\n{}", json.dumps(row, indent=2, ensure_ascii=False)
-                 )
-             return payload
-         except Exception:
-             # Even if the content type is text/html, try parsing it when it looks like JSON
-             text = r.text.strip()
-             if text.startswith("{") or text.startswith("["):
-                 import json
-                 return json.loads(text)
-             # Raise only when the response is genuinely HTML
-             ctype = r.headers.get("Content-Type", "")
-             snippet = text[:2000]
-             raise httpx.HTTPStatusError(
-                 f"Unexpected content-type: {ctype}. Snippet:\n{snippet}",
-                 request=r.request,
-                 response=r,
-             )
-
- async def get_c103_data(
-     *,
-     session_info: dict,
-     cmp_cd: str = "005930",
-     rpt: C103RPT = C103RPT.손익계산서,
-     frq: FRQ = FRQ.Y,
- ) -> dict | list:
-
-     aspx_inner = ASPXInner.c103
-
-     resp = await fetch_financial_json(
-         cmp_cd=cmp_cd,
-         aspx_inner=aspx_inner,
-         rpt=rpt,
-         frq=frq,
-         encparam=session_info["encparam"],
-         cookies=session_info["cookies"],
-         referer=session_info["referer"],
-         user_agent=session_info["user_agent"],
-     )
-
-     return resp
-
- async def get_c104_data(
-     *,
-     session_info: dict,
-     cmp_cd: str = "005930",
-     rpt: C104RPT = C104RPT.수익성,
-     frq: FRQ = FRQ.Y,
- ) -> dict | list:
-
-     aspx_inner = ASPXInner.c104
-
-     resp = await fetch_financial_json(
-         cmp_cd=cmp_cd,
-         aspx_inner=aspx_inner,
-         rpt=rpt,
-         frq=frq,
-         encparam=session_info["encparam"],
-         cookies=session_info["cookies"],
-         referer=session_info["referer"],
-         user_agent=session_info["user_agent"],
-     )
-     return resp
@@ -1,90 +0,0 @@
- # scraper2_hj3415/adapters/nfs/sources/c1034_session.py
-
- import re
- from loguru import logger
-
- from scraper2_hj3415.adapters.clients.browser import browser_context
- from scraper2_hj3415.core.constants import PAGE
- from playwright.async_api import Browser
-
- async def extract_session_info(
-     *,
-     browser: Browser | None = None,
-     cmp_cd: str = "005930",
-     page: PAGE = PAGE.c103,
- ) -> dict:
-     """
-     Extract only the referer/UA/cookies/encparam with Playwright.
-     - Reuses an external browser if one is passed; otherwise opens and closes its own.
-     - Only builds the session, satisfying the hybrid setup in which the data calls go through httpx.
-     """
-     is_external = browser is not None
-
-     if not is_external:
-         # Create a browser internally
-         browser_cm = browser_context(headless=True)  # the existing @asynccontextmanager
-         browser = await browser_cm.__aenter__()
-
-     context = None
-     page_ = None
-     try:
-         # 1) Create an isolated context (separate cookies/storage)
-         context = await browser.new_context()
-
-         # 2) Block resources: avoid unnecessary rendering cost
-         await context.route("**/*", lambda route: (
-             route.abort()
-             if route.request.resource_type in {"image", "font", "stylesheet", "media"}
-             else route.continue_()
-         ))
-
-         # 3) Open the page
-         page_ = await context.new_page()
-         # Even though PAGE is a StrEnum, spell out .value to be safe
-         url = f"https://navercomp.wisereport.co.kr/v2/company/{page.value}?cmp_cd={cmp_cd}"
-
-         # The UA is the same if read right after creating the context, but reading it from the page is more intuitive
-         await page_.goto(url, wait_until="domcontentloaded")
-
-         # 4) Give the script containing 'encparam' a short window to load.
-         #    Prefer a stable selector if available (e.g. a table/tab); otherwise fall back to a timeout.
-         try:
-             # Match text inside a script tag (using Playwright's has-text selector)
-             await page_.wait_for_selector("script:has-text('encparam')", timeout=3000)
-         except Exception:
-             # Fallback: short wait
-             await page_.wait_for_timeout(300)
-
-         html = await page_.content()
-         m = re.search(
-             r"""encparam\s*(?:[:=])\s*['"]([^'"]+)['"]""", html, re.IGNORECASE
-         )
-         encparam = m.group(1) if m else None
-         if not encparam:
-             logger.warning("encparam not found in page HTML")
-
-         # 5) Collect the UA / cookies (from the context)
-         ua = await page_.evaluate("navigator.userAgent")
-         cookies = await context.cookies()
-         cookie_header = "; ".join(f"{c['name']}={c['value']}" for c in cookies)
-
-         logger.debug(f"encparam={encparam!r}")
-         logger.debug(f"cookies={cookie_header!r}")
-
-         return {
-             "encparam": encparam,
-             "cookies": cookie_header,
-             "referer": url,
-             "user_agent": ua,
-         }
-
-     finally:
-         # Clean up in the reverse order of opening
-         try:
-             if page_:
-                 await page_.close()
-         finally:
-             if context:
-                 await context.close()
-             if not is_external:
-                 await browser_cm.__aexit__(None, None, None)
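Note: the two removed source modules were designed as a pair. A hedged sketch of the hybrid flow they implemented in 1.0.1 (Playwright only to bootstrap the session, httpx for the data call); the code is illustrative and uses only the signatures shown above:

    import asyncio
    from scraper2_hj3415.core.constants import PAGE, FRQ, C103RPT
    from scraper2_hj3415.adapters.nfs.sources.c1034_session import extract_session_info
    from scraper2_hj3415.adapters.nfs.sources.c1034_fetch import get_c103_data

    async def main() -> None:
        # No external browser passed: a headless one is opened and closed internally.
        session_info = await extract_session_info(cmp_cd="005930", page=PAGE.c103)

        # session_info carries encparam / cookies / referer / user_agent for the httpx call.
        payload = await get_c103_data(
            session_info=session_info,
            cmp_cd="005930",
            rpt=C103RPT.손익계산서,
            frq=FRQ.Y,
        )
        print(type(payload))  # dict (or list) parsed from the WiseReport endpoint

    asyncio.run(main())
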
@@ -1,47 +0,0 @@
- # scraper2_hj3415/core/constants.py
-
- from enum import StrEnum
-
- class PAGE(StrEnum):
-     c101 = "c1010001.aspx"
-     c103 = "c1030001.aspx"
-     c104 = "c1040001.aspx"
-
- PAGE_TO_LABEL = {
-     PAGE.c101: "c101",
-     PAGE.c103: "c103",
-     PAGE.c104: "c104"
- }
-
- class ASPXInner(StrEnum):
-     c103 = "cF3002.aspx"
-     c104 = "cF4002.aspx"
-
- class FRQ(StrEnum):
-     Y = "0"  # annual
-     Q = "1"  # quarterly
-
- FRQ_TO_LABEL = {FRQ.Y: "y", FRQ.Q: "q"}  # labels for storage/logging
-
- class C103RPT(StrEnum):
-     손익계산서 = "0"
-     재무상태표 = "1"
-     현금흐름표 = "2"
-
- class C104RPT(StrEnum):
-     수익성 = "1"
-     성장성 = "2"
-     안정성 = "3"
-     활동성 = "4"
-     가치분석 = "5"
-
- RPT_TO_LABEL = {
-     C103RPT.손익계산서: "손익계산서",
-     C103RPT.재무상태표: "재무상태표",
-     C103RPT.현금흐름표: "현금흐름표",
-     C104RPT.수익성: "수익성",
-     C104RPT.성장성: "성장성",
-     C104RPT.안정성: "안정성",
-     C104RPT.활동성: "활동성",
-     C104RPT.가치분석: "가치분석",
- }
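Note: a small sketch of how these StrEnum constants feed the request built by the removed c1034_fetch.py (the URL shape and parameter names are taken from that file; encparam is omitted here because it comes from extract_session_info):

    from scraper2_hj3415.core.constants import ASPXInner, FRQ, C104RPT, RPT_TO_LABEL, FRQ_TO_LABEL

    cmp_cd = "005930"
    # A StrEnum renders as its value, so ASPXInner.c104 becomes "cF4002.aspx" in the URL.
    url = f"https://navercomp.wisereport.co.kr/v2/company/{ASPXInner.c104}?cmp_cd={cmp_cd}"
    params = {"cmp_cd": cmp_cd, "rpt": C104RPT.수익성, "frq": FRQ.Q, "frqTyp": FRQ.Q, "finGubun": "MAIN"}

    print(url)
    print(RPT_TO_LABEL[C104RPT.수익성], FRQ_TO_LABEL[FRQ.Q])  # prints the storage/logging labels: "수익성" "q"
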
@@ -1,16 +0,0 @@
- # scraper2_hj3415/core/ports/sink_port.py
-
- from typing import Protocol
- import pandas as pd
-
- class C1034SinkPort(Protocol):
-     async def save_all(
-         self,
-         *,
-         dim_account_df: pd.DataFrame,
-         dim_period_df: pd.DataFrame,
-         fact_df: pd.DataFrame,
-         delta_df: pd.DataFrame,
-     ) -> None: ...
-
-