scraper2-hj3415 2.4.1__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
- scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
- scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
- {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
- scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
- scraper2_hj3415/app/composition.py +225 -0
- scraper2_hj3415/app/domain/blocks.py +61 -0
- scraper2_hj3415/app/domain/constants.py +33 -0
- scraper2_hj3415/app/domain/doc.py +16 -0
- scraper2_hj3415/app/domain/endpoint.py +11 -0
- scraper2_hj3415/app/domain/series.py +11 -0
- scraper2_hj3415/app/domain/types.py +19 -0
- scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
- scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
- scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
- scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
- scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
- scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
- scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
- scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
- scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
- scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
- scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
- scraper2_hj3415/app/parsing/c101/sise.py +47 -0
- scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
- scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
- scraper2_hj3415/app/parsing/c101_parser.py +45 -0
- scraper2_hj3415/app/parsing/c103_parser.py +19 -0
- scraper2_hj3415/app/parsing/c104_parser.py +23 -0
- scraper2_hj3415/app/parsing/c106_parser.py +137 -0
- scraper2_hj3415/app/parsing/c108_parser.py +254 -0
- scraper2_hj3415/app/ports/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
- scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
- scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
- scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
- scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
- scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
- scraper2_hj3415/app/ports/site/__init__.py +0 -0
- scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
- scraper2_hj3415/app/services/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
- scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
- scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
- scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
- scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
- scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
- scraper2_hj3415/app/usecases/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
- scraper2/main.py → scraper2_hj3415/cli.py +40 -70
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
- scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
- scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
- scraper2/.DS_Store +0 -0
- scraper2/adapters/out/.DS_Store +0 -0
- scraper2/adapters/out/playwright/browser.py +0 -102
- scraper2/adapters/out/sinks/.DS_Store +0 -0
- scraper2/adapters/out/sinks/memory/__init__.py +0 -15
- scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
- scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
- scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
- scraper2/app/composition.py +0 -204
- scraper2/app/parsing/_converters.py +0 -85
- scraper2/app/parsing/_normalize.py +0 -134
- scraper2/app/parsing/c101_parser.py +0 -143
- scraper2/app/parsing/c103_parser.py +0 -128
- scraper2/app/parsing/c104_parser.py +0 -143
- scraper2/app/parsing/c106_parser.py +0 -153
- scraper2/app/parsing/c108_parser.py +0 -65
- scraper2/app/ports/browser/browser_factory_port.py +0 -11
- scraper2/app/ports/browser/browser_port.py +0 -22
- scraper2/app/ports/ingest_port.py +0 -14
- scraper2/app/ports/sinks/base_sink_port.py +0 -14
- scraper2/app/ports/sinks/c101_sink_port.py +0 -9
- scraper2/app/ports/sinks/c103_sink_port.py +0 -9
- scraper2/app/ports/sinks/c104_sink_port.py +0 -9
- scraper2/app/ports/sinks/c106_sink_port.py +0 -9
- scraper2/app/ports/sinks/c108_sink_port.py +0 -9
- scraper2/app/usecases/fetch/fetch_c101.py +0 -43
- scraper2/app/usecases/fetch/fetch_c103.py +0 -103
- scraper2/app/usecases/fetch/fetch_c104.py +0 -76
- scraper2/app/usecases/fetch/fetch_c106.py +0 -90
- scraper2/app/usecases/fetch/fetch_c108.py +0 -49
- scraper2/app/usecases/ingest/ingest_c101.py +0 -36
- scraper2/app/usecases/ingest/ingest_c103.py +0 -37
- scraper2/app/usecases/ingest/ingest_c104.py +0 -37
- scraper2/app/usecases/ingest/ingest_c106.py +0 -38
- scraper2/app/usecases/ingest/ingest_c108.py +0 -39
- scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
- scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
- {scraper2 → scraper2_hj3415}/__init__.py +0 -0
- {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
- {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
- {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
- {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
- {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
- {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
- {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
- {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
- {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
- {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,57 @@
+# scraper2_hj3415/app/parsing/c101/major_shareholders.py
+from __future__ import annotations
+
+from typing import Any
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+from scraper2_hj3415.app.parsing._normalize.label import normalize_key_label
+from scraper2_hj3415.app.parsing._normalize.values import to_int, to_float
+
+def _pick_value_by_norm_key(row: dict[str, Any], candidates: list[str]) -> Any:
+    # Build a map of normalized keys -> original keys, then look up each normalized candidate
+    norm_map: dict[str, str] = {
+        normalize_key_label(k): k for k in row.keys()
+    }
+    for cand in candidates:
+        rk = norm_map.get(normalize_key_label(cand))
+        if rk is None:
+            continue
+        v = row.get(rk)
+        # If the key exists but the value is empty, move on to the next candidate
+        if v is None:
+            continue
+        if isinstance(v, str) and not v.strip():
+            continue
+        return v
+    return None
+
+
+async def parse_c101_major_shareholders(browser: BrowserPort) -> list[dict[str, Any]]:
+    table_sel = "#cTB13"
+    await browser.wait_attached(table_sel)
+
+    records = await browser.table_records(table_sel, header=0)
+
+    if not records:
+        return []
+
+    out: list[dict[str, Any]] = []
+    for r in records:
+        name = normalize_text(_pick_value_by_norm_key(r, ["주요주주", "주요주주명"]))
+        if not name:
+            continue
+
+        shares_raw = _pick_value_by_norm_key(
+            r, ["보유주식수(보통)", "보유주식수", "보유주식수(보통주)"]
+        )
+        ratio_raw = _pick_value_by_norm_key(r, ["보유지분(%)", "보유지분", "보유지분%"])
+
+        out.append(
+            {
+                "주요주주": name,
+                "보유주식수": to_int(shares_raw),  # may be None if parsing fails
+                "보유지분": to_float(ratio_raw),  # the parser must also handle values like "0.91%"
+            }
+        )
+
+    return out
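
A minimal sketch of the candidate-key fallback that _pick_value_by_norm_key implements above; the toy normalize_key_label here (lowercase, strip spaces/parentheses/percent signs) is an assumption for illustration, not the package's real normalizer:

from typing import Any

def normalize_key_label(s: str) -> str:
    # stand-in normalizer: lowercase and drop spaces, parentheses, '%'
    return "".join(ch for ch in s.lower() if ch not in " ()%")

def pick_value_by_norm_key(row: dict[str, Any], candidates: list[str]) -> Any:
    norm_map = {normalize_key_label(k): k for k in row}
    for cand in candidates:
        rk = norm_map.get(normalize_key_label(cand))
        if rk is not None and row.get(rk) not in (None, ""):
            return row[rk]
    return None

row = {"보유지분 (%)": "0.91", "주요주주명": "국민연금공단"}
print(pick_value_by_norm_key(row, ["보유지분(%)", "보유지분"]))  # -> "0.91"

Normalizing both the row's keys and the candidates means header variants like "보유지분 (%)" and "보유지분(%)" resolve to the same cell.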
@@ -0,0 +1,47 @@
+# scraper2_hj3415/app/parsing/c101/sise.py
+from __future__ import annotations
+
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+from common_hj3415.utils import clean_text
+from ._sise_normalizer import normalize_sise_kv_map
+
+_SISE_TABLE = "#cTB11"
+
+async def parse_c101_sise_table(browser: BrowserPort) -> dict[str, str]:
+    """
+    Extract the #cTB11 market-price table as a dict of th (item name) -> td (value).
+    - Based on the text visible on screen (innerText)
+    """
+    await browser.wait_attached(_SISE_TABLE)
+
+    row_cnt = await browser.count_in_nth(
+        _SISE_TABLE,
+        scope_index=0,
+        inner_selector="tbody tr",
+    )
+
+    out: dict[str, str] = {}
+
+    for i in range(1, row_cnt + 1):  # nth-child is 1-based
+        row_sel = f"tbody tr:nth-child({i})"
+
+        key = await browser.inner_text_in_nth(
+            _SISE_TABLE,
+            scope_index=0,
+            inner_selector=f"{row_sel} th",
+            inner_index=0,
+        )
+        val = await browser.inner_text_in_nth(
+            _SISE_TABLE,
+            scope_index=0,
+            inner_selector=f"{row_sel} td",
+            inner_index=0,
+        )
+
+        k = clean_text(key)
+        v = clean_text(val)
+        if k:
+            out[k] = v
+    return normalize_sise_kv_map(out)
+
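
For reference, an offline sketch of the same th -> td extraction using only the standard library; it assumes the same one-key-one-value row shape, whereas the real parser reads innerText from the live page through the BrowserPort:

from html.parser import HTMLParser

class KVTable(HTMLParser):
    """Collects (tag, text) pairs for th/td cells in document order."""
    def __init__(self) -> None:
        super().__init__()
        self.cells: list[tuple[str, str]] = []
        self._tag: str | None = None
        self._buf: list[str] = []
    def handle_starttag(self, tag, attrs):
        if tag in ("th", "td"):
            self._tag, self._buf = tag, []
    def handle_data(self, data):
        if self._tag:
            self._buf.append(data)
    def handle_endtag(self, tag):
        if self._tag == tag:
            self.cells.append((tag, "".join(self._buf).strip()))
            self._tag = None

p = KVTable()
p.feed("<table><tbody><tr><th>거래량</th><td>9,482,543</td></tr></tbody></table>")
kv = {k: v for (t, k), (_, v) in zip(p.cells[::2], p.cells[1::2]) if t == "th"}
print(kv)  # {'거래량': '9,482,543'}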
@@ -0,0 +1,87 @@
+# scraper2_hj3415/app/parsing/c101/summary_cmp.py
+from __future__ import annotations
+
+from typing import Any
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+from common_hj3415.utils import clean_text
+from scraper2_hj3415.app.parsing._normalize.values import to_number
+
+
+async def parse_c101_summary_cmp_table(browser: BrowserPort) -> dict[str, Any]:
+    """
+    Extract stock basics plus EPS/BPS/PER/... from <table class="cmp-table"> (the company summary table).
+
+    Example return value:
+    {
+        "종목명": "삼성전자",
+        "코드": "005930",
+        "영문명": "SamsungElec",
+        "시장": "KOSPI : 코스피 전기·전자",
+        "WICS": "WICS : 반도체와반도체장비",
+        "EPS": 4816,
+        "BPS": 60632,
+        "PER": 31.58,
+        "업종PER": 21.93,
+        "PBR": 2.51,
+        "현금배당수익률": 0.95,
+        "결산": "12월 결산",
+    }
+    """
+    out: dict[str, Any] = {}
+
+    # Confirm the table exists
+    await browser.wait_attached("table.cmp-table")
+
+    # --- 1) td0101: name / code / English name / market / WICS ---
+    out["종목명"] = clean_text(
+        await browser.text_content_first("table.cmp-table td.td0101 span.name")
+    )
+    out["코드"] = clean_text(
+        await browser.text_content_first("table.cmp-table td.td0101 b.num")
+    )
+
+    # Read and classify the dt texts inside td0101
+    dt0101 = await browser.all_texts("table.cmp-table td.td0101 dl > dt")
+    for t in dt0101[1:] if dt0101 else []:
+        t = clean_text(t)
+        if not t:
+            continue
+        if t.startswith("KOSPI") or t.startswith("KOSDAQ"):
+            out["시장"] = t
+        elif t.startswith("WICS"):
+            out["WICS"] = t
+        else:
+            if "영문명" not in out:
+                out["영문명"] = t
+
+    # --- 2) td0301: EPS / BPS / PER / sector PER / PBR / cash dividend yield / fiscal year-end ---
+    base_dl = "table.cmp-table td.td0301 dl"
+    dt_sel = f"{base_dl} > dt"
+
+    dt_texts = await browser.all_texts(dt_sel)  # full dt texts (numbers included)
+    if not dt_texts:
+        return out
+
+    # dt elements appear in DOM order 1..N
+    for i, raw_dt in enumerate(dt_texts, start=1):
+        dt_text = clean_text(raw_dt)
+        if not dt_text:
+            continue
+
+        num_sel = f"{base_dl} > dt:nth-child({i}) b.num"
+
+        # Lines without a number, e.g. "12월 결산"
+        if not await browser.is_attached(num_sel):
+            if "결산" in dt_text:
+                out["결산"] = dt_text
+            continue
+
+        num_text = clean_text(await browser.text_content_first(num_sel))
+        if not num_text:
+            continue
+
+        label = clean_text(dt_text.replace(num_text, "")).replace(":", "")
+        if label:
+            out[label] = to_number(num_text)
+
+    return out
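
The label extraction above removes the number from the dt text and strips the colon; a quick sketch with a stand-in clean_text (the real one lives in common_hj3415.utils and may behave differently; a trailing strip() is added here for clarity):

def clean_text(s: str | None) -> str:
    # stand-in: collapse all whitespace runs into single spaces
    return " ".join((s or "").split())

dt_text = clean_text("PER\xa0:  31.58")   # -> "PER : 31.58"
num_text = "31.58"
label = clean_text(dt_text.replace(num_text, "")).replace(":", "").strip()
print(label)  # -> "PER", so the parser would store out["PER"] = to_number("31.58")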
@@ -0,0 +1,197 @@
+# scraper2_hj3415/app/parsing/c101/yearly_consensus.py
+from __future__ import annotations
+
+from io import StringIO
+import re
+from typing import Any
+
+import pandas as pd
+
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+from scraper2_hj3415.app.parsing._normalize.values import to_float
+from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+from common_hj3415.utils import clean_text
+from logging_hj3415 import logger
+
+_YEARLY_CONSENSUS_TABLE = "#cTB25"
+
+
+# -----------------------------
+# column / period normalize
+# -----------------------------
+_COL_UNIT_RE = re.compile(r"\([^)]*\)")  # strips unit suffixes such as (억원, %), (원), (배)
+_PERIOD_RE = re.compile(r"^\s*(\d{4})\s*\(?([A-Za-z])?\)?\s*$")  # 2022(A), 2025(E)
+
+
+def _flatten_col(col: Any) -> str:
+    """
+    Flatten a MultiIndex column created by pd.read_html(header=[0,1]) into a single key like '매출액_금액'.
+    - ('매출액(억원, %)', '금액') -> '매출액_금액'
+    - ('매출액(억원, %)', 'YoY') -> '매출액_YoY'
+    - Unit parentheses are removed
+    """
+    if isinstance(col, tuple):
+        parts = [clean_text(str(p)) for p in col if clean_text(str(p))]
+        if len(parts) == 2 and parts[0] == parts[1]:
+            s = parts[0]
+        else:
+            s = "_".join(parts) if parts else ""
+    else:
+        s = clean_text(str(col))
+
+    # Remove unit parentheses
+    s = _COL_UNIT_RE.sub("", s)
+    s = clean_text(s)
+
+    # Repair a known broken column label
+    s = s.replace("주재 무제표", "주재무제표")
+
+    # Remove spaces (stabilizes keys)
+    s = s.replace(" ", "")
+    return s
+
+
+def _normalize_period(
+    s: Any,
+    *,
+    keep_suffix: bool = False,
+) -> str | None:
+    """
+    Normalize a period string into the standard period key.
+
+    - Handles "2022(A)", "2026(E)", "2022", etc.
+    - Default policy: annual = YYYY/12
+    """
+    t = normalize_text(s)
+    if not t:
+        return None
+
+    # Guard against the header row
+    if t == "재무년월":
+        return None
+
+    # Already in the standard format; return as-is
+    if re.fullmatch(r"\d{4}/\d{2}", t):
+        return t
+
+    m = _PERIOD_RE.match(t)
+    if not m:
+        return None
+
+    year, suffix = m.groups()  # suffix: "A" | "E" | None
+
+    if keep_suffix and suffix:
+        return f"{year}{suffix}"
+
+    return f"{year}/12"
+
+
+def _normalize_metric_key(col_key: str) -> str:
+    """
+    Tidy the final metric key into a human-friendly form.
+    """
+    k = col_key
+
+    # 매출액 is split into '금액'/'YoY' columns, so pin the names explicitly
+    if k.startswith("매출액_금액"):
+        return "매출액"
+    if k.startswith("매출액_YoY"):
+        return "매출액YoY"
+
+    # Everything else passes through (units/spaces were already removed by _flatten_col)
+    # e.g. "영업이익", "당기순이익", "EPS", "PER", "PBR", "ROE", "EV/EBITDA", "순부채비율"
+    return k
+
+
+def _html_to_df(html: str) -> pd.DataFrame | None:
+    """
+    The yearly consensus table has a two-row header, so read it with header=[0,1] and flatten.
+    """
+    try:
+        dfs = pd.read_html(StringIO(html), header=[0, 1])
+    except Exception as e:
+        logger.exception("pd.read_html failed: {}", e)
+        return None
+    if not dfs:
+        return None
+    df = dfs[0]
+    if df is None or df.empty:
+        return None
+
+    df = df.copy()
+    df.columns = [_flatten_col(c) for c in df.columns]
+    return df
+
+
+def _df_to_metric_map(df: pd.DataFrame) -> dict[str, dict[str, Any]]:
+    """
+    Pivot the DataFrame (row: period, col: metric) into {metric: {period: value}}.
+    """
+    if df is None or df.empty:
+        return {}
+
+    # NaN -> None
+    df = df.where(pd.notnull(df), None)
+
+    # Locate the '재무년월' (fiscal period) column
+    # It usually flattens to "재무년월", but guard against broken labels
+    period_col = None
+    for c in df.columns:
+        if "재무년월" in c:
+            period_col = c
+            break
+    if not period_col:
+        logger.warning("[cTB25] period column not found")
+        return {}
+
+    out: dict[str, dict[str, Any]] = {}
+
+    for _, row in df.iterrows():
+        period = _normalize_period(row.get(period_col), keep_suffix=True)
+        if not period:
+            continue
+
+        for col, raw_val in row.items():
+            if col == period_col:
+                continue
+            # Exclude 주재무제표 from the metric map (it could be split out as meta instead)
+            if "주재무제표" in str(col):
+                continue
+
+            metric = _normalize_metric_key(str(col))
+
+            num = to_float(raw_val)
+            val: Any = num if num is not None else (normalize_text(raw_val) or None)
+
+            out.setdefault(metric, {})[period] = val
+
+    return out
+
+
+async def parse_c101_yearly_consensus_table(
+    browser: BrowserPort,
+) -> dict[str, dict[str, Any]]:
+    """
+    Return the #cTB25 table (3 actual years + 2 estimated years) as
+    {metric: {period: value}}.
+    """
+    await browser.wait_attached(_YEARLY_CONSENSUS_TABLE)
+    await browser.wait_table_nth_ready(
+        _YEARLY_CONSENSUS_TABLE,
+        index=0,
+        min_rows=5,
+        timeout_ms=30_000,
+        poll_ms=200,
+    )
+
+    html = await browser.outer_html_nth(_YEARLY_CONSENSUS_TABLE, 0)
+    if not html or "<table" not in html:
+        logger.warning("[cTB25] outerHTML invalid or empty")
+        return {}
+
+    df = _html_to_df(html)
+    if df is None:
+        logger.warning("[cTB25] df is empty/invalid")
+        return {}
+
+    return _df_to_metric_map(df)
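
A self-contained sketch of the period normalization and the {metric: {period: value}} pivot used above (metric values are illustrative):

import re

_PERIOD_RE = re.compile(r"^\s*(\d{4})\s*\(?([A-Za-z])?\)?\s*$")

def normalize_period(t: str, keep_suffix: bool = False) -> str | None:
    m = _PERIOD_RE.match(t)
    if not m:
        return None
    year, suffix = m.groups()
    return f"{year}{suffix}" if keep_suffix and suffix else f"{year}/12"

print(normalize_period("2022(A)"))                    # -> "2022/12"
print(normalize_period("2026(E)", keep_suffix=True))  # -> "2026E"

rows = [("2024A", {"매출액": 3009.8, "PER": 31.58})]   # (period, {metric: value})
out: dict[str, dict[str, float]] = {}
for period, metrics in rows:
    for metric, val in metrics.items():
        out.setdefault(metric, {})[period] = val
print(out)  # {'매출액': {'2024A': 3009.8}, 'PER': {'2024A': 31.58}}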
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from typing import Any
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+from logging_hj3415 import logger
+
+from .c101.sise import parse_c101_sise_table
+from .c101.earning_surprise import parse_c101_earnings_surprise_table
+from .c101.fundamentals import parse_c101_fundamentals_table
+from .c101.major_shareholders import parse_c101_major_shareholders
+from .c101.company_overview import parse_c101_company_overview
+from .c101.summary_cmp import parse_c101_summary_cmp_table
+from .c101.yearly_consensus import parse_c101_yearly_consensus_table
+
+async def parse_c101_to_dict(browser: BrowserPort) -> dict[str, Any]:
+    parsed_summary_cmp = await parse_c101_summary_cmp_table(browser)
+    logger.debug(f"parsed_summary_cmp data: {parsed_summary_cmp}")
+
+    parsed_sise = await parse_c101_sise_table(browser)
+    logger.debug(f"parsed_sise data: {parsed_sise}")
+
+    parsed_company_overview = await parse_c101_company_overview(browser)
+    logger.debug(f"parsed_company_overview data: {parsed_company_overview}")
+
+    parsed_major_shareholders = await parse_c101_major_shareholders(browser)
+    logger.debug(f"parsed_major_shareholders data: {parsed_major_shareholders}")
+
+    parsed_fundamentals = await parse_c101_fundamentals_table(browser)
+    logger.debug(f"parsed_fundamentals data: {parsed_fundamentals}")
+
+    parsed_earnings_surprise = await parse_c101_earnings_surprise_table(browser)
+    logger.debug(f"parsed_earnings_surprise data: {parsed_earnings_surprise}")
+
+    parsed_yearly_consensus = await parse_c101_yearly_consensus_table(browser)
+    logger.debug(f"parsed_yearly_consensus data: {parsed_yearly_consensus}")
+
+    return {
+        "요약": parsed_summary_cmp,
+        "시세": parsed_sise,
+        "주주현황": parsed_major_shareholders,
+        "기업개요": parsed_company_overview,
+        "펀더멘털": parsed_fundamentals,
+        "어닝서프라이즈": parsed_earnings_surprise,
+        "연간컨센서스": parsed_yearly_consensus,
+    }
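
All seven section parsers above take the same BrowserPort, i.e. they read one already-loaded c101 page, so they are awaited one at a time rather than concurrently; a toy illustration of that sequential, shared-state pattern (names here are hypothetical):

import asyncio

async def parse_section(name: str, page_state: dict) -> str:
    # each parser reads the same, already-prepared page state
    return f"{name} parsed from {page_state['url']}"

async def main() -> None:
    page_state = {"url": "c101-page"}  # stand-in for the live page behind BrowserPort
    results: dict[str, str] = {}
    for section in ("요약", "시세", "주주현황"):
        results[section] = await parse_section(section, page_state)  # sequential awaits
    print(results)

asyncio.run(main())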
@@ -0,0 +1,19 @@
+# scraper2_hj3415/app/parsing/c103_parser.py
+from __future__ import annotations
+from typing import Any
+
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+from scraper2_hj3415.app.parsing._tables.html_table import try_html_table_to_df, df_to_c1034_metric_list
+
+TABLE_XPATH = "xpath=//div[@id='wrapper']//div//table"
+TABLE_INDEX = 2
+
+
+async def parse_c103_current_table(browser: BrowserPort) -> list[dict[str, Any]]:
+    """
+    ✅ Assumes the current screen state (tab / annual / quarterly / search result) is already prepared.
+    In that state, read only the table at TABLE_INDEX and convert it to rows.
+    """
+    html = await browser.outer_html_nth(TABLE_XPATH, TABLE_INDEX)
+    df = try_html_table_to_df(html)
+    return df_to_c1034_metric_list(df)
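
Both the c103 and c104 wrappers (the c104 version follows below) delegate to try_html_table_to_df / df_to_c1034_metric_list; an offline sketch of what that outerHTML -> DataFrame -> records pipeline presumably looks like (table contents illustrative, helper internals assumed):

from io import StringIO
import pandas as pd

html = """<table><thead><tr><th>항목</th><th>2023/12</th><th>2024/12</th></tr></thead>
<tbody><tr><td>매출액</td><td>2589</td><td>3009</td></tr></tbody></table>"""

df = pd.read_html(StringIO(html))[0]   # header row inferred from <thead>
records = df.to_dict(orient="records")
print(records)  # [{'항목': '매출액', '2023/12': 2589, '2024/12': 3009}]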
@@ -0,0 +1,23 @@
+# scraper2_hj3415/app/parsing/c104_parser.py
+from __future__ import annotations
+
+from typing import Any
+
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+from scraper2_hj3415.app.parsing._tables.html_table import try_html_table_to_df, df_to_c1034_metric_list
+
+TABLE_XPATH = 'xpath=//table[@class="gHead01 all-width data-list"]'
+
+
+async def parse_c104_current_table(
+    browser: BrowserPort,
+    *,
+    table_index: int,
+) -> list[dict[str, Any]]:
+    """
+    ✅ Assumes the current screen state (tab / annual / quarterly / search result) is already prepared.
+    In that state, read only the table at the given table_index and convert it to rows.
+    """
+    html = await browser.outer_html_nth(TABLE_XPATH, table_index)
+    df = try_html_table_to_df(html)
+    return df_to_c1034_metric_list(df)
@@ -0,0 +1,137 @@
+# scraper2_hj3415/app/parsing/c106_parser.py
+from __future__ import annotations
+
+from io import StringIO
+import re
+import numpy as np
+import pandas as pd
+from typing import Any
+
+from common_hj3415.utils import clean_text
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+from scraper2_hj3415.app.parsing._normalize.label import (
+    normalize_metric_label,
+    sanitize_label,
+)
+from logging_hj3415 import logger
+
+_CODE_RE = re.compile(r"\b\d{6}\b")
+
+
+async def parse_c106_header_codes(browser: BrowserPort) -> list[str]:
+    """
+    Extract only the 6-digit ticker codes from the '기업간비교자료' header (company names) on the current page.
+    (No goto/sleep.)
+    """
+    selector = (
+        'xpath=//caption[contains(normalize-space(.), "기업간비교자료")]'
+        "/following-sibling::thead//th[not(@colspan)]"
+    )
+    await browser.wait_attached(selector)
+    th_texts = await browser.all_texts(selector)
+
+    codes: list[str] = []
+    for t in th_texts:
+        text = (t or "").strip()
+        if not text:
+            continue
+        m = _CODE_RE.search(text)
+        if not m:
+            continue
+        codes.append(m.group(0))
+
+    # De-duplicate while preserving order
+    seen: set[str] = set()
+    uniq: list[str] = []
+    for c in codes:
+        if c not in seen:
+            seen.add(c)
+            uniq.append(c)
+    logger.debug(f"c106 header codes: {uniq}")
+    return uniq
+
+
+def html_table_to_df(html: str, codes: list[str]) -> pd.DataFrame:
+    df = pd.read_html(StringIO(html), header=None)[0]
+    if df is None or df.empty:
+        return pd.DataFrame()
+
+    df.columns = ["항목_group", "항목"] + codes
+    df["항목_group"] = df["항목_group"].ffill()
+
+    # Inject '주가데이터' as the group for the first two rows (keeps the existing logic)
+    for i in range(min(2, len(df))):
+        row = df.loc[i].tolist()
+        new_row = ["주가데이터"] + row
+        df.loc[i] = new_row[: len(df.columns)]
+
+    df = df[df["항목"].notna()].reset_index(drop=True)
+    df.loc[df["항목"].isin(["투자의견", "목표주가(원)"]), "항목_group"] = "기타지표"
+    df = df[df["항목"] != "재무연월"].reset_index(drop=True)
+
+    for col in df.columns[2:]:
+        df[col] = df[col].replace("-", "0")
+        df[col] = pd.to_numeric(df[col], errors="coerce")
+
+    df["항목_group"] = df["항목_group"].astype("string").map(clean_text)
+    df["항목"] = df["항목"].astype("string").map(clean_text)
+
+    return df.replace({np.nan: None})
+
+
+def df_to_c106_metric_list(df: pd.DataFrame) -> list[dict[str, Any]]:
+    """
+    C106 DataFrame -> records (list[dict])
+
+    Plan A applied:
+    - The item key (항목) is strongly normalized with normalize_metric_label (parentheses/asterisks removed)
+    - 항목_raw stores the pre-normalization label (with only UI noise removed)
+    - The 항목_group columns are dropped here (when present)
+    """
+    if df is None or df.empty:
+        return []
+
+    df = df.copy()
+
+    # Preserve the raw label (pre-normalization, only UI noise removed)
+    raw = df["항목"].where(df["항목"].notna(), None)
+    df["항목_raw"] = raw.map(
+        lambda x: sanitize_label(str(x)) if x is not None else None
+    )
+
+    # Drop the 항목_group columns (only when present)
+    drop_cols = [c for c in ("항목_group", "항목_group_raw") if c in df.columns]
+    if drop_cols:
+        df = df.drop(columns=drop_cols)
+
+    # Normalize the key (plan A)
+    df["항목"] = df["항목"].map(
+        lambda x: normalize_metric_label(str(x)) if x is not None else ""
+    )
+
+    # Keep only valid rows
+    df = df[df["항목"].astype(str).str.strip() != ""].reset_index(drop=True)
+
+    # NaN -> None
+    df = df.where(pd.notnull(df), None)
+
+    return df.to_dict(orient="records")
+
+
+async def parse_c106_current_table(
+    browser: BrowserPort,
+    *,
+    columns: list[str],
+    table_selector: str = "#cTB611",
+    table_index: int = 0,
+    timeout_ms: int = 10_000,
+) -> list[dict[str, Any]]:
+    """
+    ✅ Parses only the comparison table from the current page (goto/wait already completed).
+    """
+    await browser.wait_table_nth_ready(
+        table_selector, index=table_index, min_rows=3, timeout_ms=timeout_ms
+    )
+    html = await browser.outer_html_nth(table_selector, table_index)
+    df = html_table_to_df(html, columns)
+    return df_to_c106_metric_list(df)
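
A self-contained sketch of the header-code extraction and order-preserving de-duplication used by parse_c106_header_codes (the header texts are illustrative):

import re

_CODE_RE = re.compile(r"\b\d{6}\b")

th_texts = ["삼성전자 005930", "SK하이닉스 000660", "삼성전자 005930", "종목비교"]
codes = [m.group(0) for t in th_texts if (m := _CODE_RE.search(t))]
uniq = list(dict.fromkeys(codes))  # dict preserves insertion order (Python 3.7+)
print(uniq)  # ['005930', '000660']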