scraper2-hj3415 2.4.1__py3-none-any.whl → 2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraper2_hj3415/app/adapters/out/playwright/browser.py +26 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +7 -7
- scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py +18 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py +19 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py +37 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py +24 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py +84 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py +90 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py +25 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py +96 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
- scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
- scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
- {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
- scraper2_hj3415/app/adapters/site/wisereport_playwright.py +379 -0
- scraper2_hj3415/app/composition.py +225 -0
- scraper2_hj3415/app/domain/blocks.py +61 -0
- scraper2_hj3415/app/domain/constants.py +33 -0
- scraper2_hj3415/app/domain/doc.py +16 -0
- scraper2_hj3415/app/domain/endpoint.py +11 -0
- scraper2_hj3415/app/domain/series.py +11 -0
- scraper2_hj3415/app/domain/types.py +19 -0
- scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
- scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
- scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
- scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
- scraper2_hj3415/app/parsing/_tables/html_table.py +89 -0
- scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
- scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
- scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
- scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
- scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
- scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
- scraper2_hj3415/app/parsing/c101/sise.py +47 -0
- scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
- scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
- scraper2_hj3415/app/parsing/c101_parser.py +45 -0
- scraper2_hj3415/app/parsing/c103_parser.py +22 -0
- scraper2_hj3415/app/parsing/c104_parser.py +26 -0
- scraper2_hj3415/app/parsing/c106_parser.py +137 -0
- scraper2_hj3415/app/parsing/c108_parser.py +254 -0
- scraper2_hj3415/app/ports/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
- scraper2_hj3415/app/ports/browser/browser_port.py +32 -0
- scraper2_hj3415/app/ports/browser/capabilities/__init__.py +15 -0
- scraper2_hj3415/app/ports/browser/capabilities/interaction.py +27 -0
- scraper2_hj3415/app/ports/browser/capabilities/navigation.py +18 -0
- scraper2_hj3415/app/ports/browser/capabilities/scope.py +66 -0
- scraper2_hj3415/app/ports/browser/capabilities/table.py +28 -0
- scraper2_hj3415/app/ports/browser/capabilities/text.py +16 -0
- scraper2_hj3415/app/ports/browser/capabilities/wait.py +51 -0
- scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
- scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
- scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
- scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
- scraper2_hj3415/app/ports/site/__init__.py +0 -0
- scraper2_hj3415/app/ports/site/wisereport_port.py +30 -0
- scraper2_hj3415/app/services/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
- scraper2_hj3415/app/services/fetch/fetch_c103.py +121 -0
- scraper2_hj3415/app/services/fetch/fetch_c104.py +160 -0
- scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
- scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
- scraper2_hj3415/app/services/nfs_doc_builders.py +304 -0
- scraper2_hj3415/app/usecases/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
- scraper2/main.py → scraper2_hj3415/cli.py +45 -72
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/METADATA +3 -1
- scraper2_hj3415-2.7.0.dist-info/RECORD +93 -0
- scraper2_hj3415-2.7.0.dist-info/entry_points.txt +3 -0
- scraper2/adapters/out/playwright/browser.py +0 -102
- scraper2/adapters/out/sinks/memory/__init__.py +0 -15
- scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
- scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
- scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
- scraper2/app/composition.py +0 -204
- scraper2/app/parsing/_converters.py +0 -85
- scraper2/app/parsing/_normalize.py +0 -134
- scraper2/app/parsing/c101_parser.py +0 -143
- scraper2/app/parsing/c103_parser.py +0 -128
- scraper2/app/parsing/c104_parser.py +0 -143
- scraper2/app/parsing/c106_parser.py +0 -153
- scraper2/app/parsing/c108_parser.py +0 -65
- scraper2/app/ports/browser/browser_factory_port.py +0 -11
- scraper2/app/ports/browser/browser_port.py +0 -22
- scraper2/app/ports/ingest_port.py +0 -14
- scraper2/app/ports/sinks/base_sink_port.py +0 -14
- scraper2/app/ports/sinks/c101_sink_port.py +0 -9
- scraper2/app/ports/sinks/c103_sink_port.py +0 -9
- scraper2/app/ports/sinks/c104_sink_port.py +0 -9
- scraper2/app/ports/sinks/c106_sink_port.py +0 -9
- scraper2/app/ports/sinks/c108_sink_port.py +0 -9
- scraper2/app/usecases/fetch/fetch_c101.py +0 -43
- scraper2/app/usecases/fetch/fetch_c103.py +0 -103
- scraper2/app/usecases/fetch/fetch_c104.py +0 -76
- scraper2/app/usecases/fetch/fetch_c106.py +0 -90
- scraper2/app/usecases/fetch/fetch_c108.py +0 -49
- scraper2/app/usecases/ingest/ingest_c101.py +0 -36
- scraper2/app/usecases/ingest/ingest_c103.py +0 -37
- scraper2/app/usecases/ingest/ingest_c104.py +0 -37
- scraper2/app/usecases/ingest/ingest_c106.py +0 -38
- scraper2/app/usecases/ingest/ingest_c108.py +0 -39
- scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
- scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
- {scraper2 → scraper2_hj3415}/.DS_Store +0 -0
- {scraper2 → scraper2_hj3415}/__init__.py +0 -0
- {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
- {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/.DS_Store +0 -0
- {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
- {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/sinks/.DS_Store +0 -0
- {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
- {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
- {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
- {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
- {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
- {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Any
|
|
5
|
+
from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
|
|
6
|
+
from common_hj3415.utils import clean_text
|
|
7
|
+
|
|
8
|
+
_EARNING_SURPRISE_TABLE = "#earning_list"
|
|
9
|
+
|
|
10
|
+
# Characters that break numeric parsing: thousands separators plus the
# bullet/arrow glyphs the site prepends to values (e.g. "● 120,064.0").
_NOISE_CHARS = str.maketrans("", "", ",●○▲▼")


def _strip_bullets_commas(s: str) -> str:
    """Strip bullets, arrows and thousands separators from a cell text.

    Example: "● 120,064.0" -> "120064.0".

    Uses a single ``str.translate`` pass instead of chained ``replace``
    calls; the trailing ``clean_text`` collapses whitespace left behind
    by the removed glyphs.
    """
    return clean_text(clean_text(s).translate(_NOISE_CHARS))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _to_number_like(x: Any) -> Any:
    """Coerce *x* to a float when it looks numeric; otherwise return text.

    ``None`` passes through, ints/floats are returned untouched, and any
    other value is cleaned first (bullets/commas stripped) before the
    numeric check.
    """
    if x is None:
        return None
    if isinstance(x, (int, float)):
        return x

    cleaned = _strip_bullets_commas(str(x))
    if not cleaned:
        return None

    # Plain decimal pattern: "65.00", "209.17", "-123.4", ...
    if re.fullmatch(r"[-+]?\d+(\.\d+)?", cleaned):
        # Policy note: "65.0" stays a float; an int downcast could be
        # added here if ever desired.
        try:
            return float(cleaned)
        except Exception:
            return cleaned

    return cleaned
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _norm_item_label(item: str) -> str:
    """Normalize a row-item label (the <th> text) to a canonical key.

    Examples:
        "전분기대비보기 전년동기대비" -> "전년동기대비"  (button caption mixed in)
        "SURPRISE" / "surprise"      -> "Surprise"

    Unrecognized labels are returned cleaned but otherwise unchanged.
    """
    t = clean_text(item)

    # Button caption can leak into the cell: "전분기대비보기 전년동기대비".
    if ("전분기대비" in t) and ("전년동기대비" in t):
        return "전년동기대비"
    if "전분기대비" in t:
        return "전분기대비"
    if "전년동기대비" in t:
        return "전년동기대비"
    if "컨센서스" in t:
        return "컨센서스"
    if "잠정치" in t:
        return "잠정치"
    # Case-insensitive match covers "Surprise"/"SURPRISE"/"surprise" and
    # any mixed-case variant the site might emit.
    if "surprise" in t.casefold():
        return "Surprise"

    return t
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
async def _row_cells_texts(
    browser: BrowserPort,
    *,
    row_sel: str,
) -> list[str]:
    """Collect the th/td texts of one tbody row, left to right.

    ``row_sel`` is a selector relative to the earnings-surprise table
    (e.g. ``"tbody tr:nth-child(3)"``).
    """
    # One combined selector for both header and data cells of the row.
    cell_selector = f"{row_sel} th, {row_sel} td"

    total = await browser.count_in_nth(
        _EARNING_SURPRISE_TABLE,
        scope_index=0,
        inner_selector=cell_selector,
    )

    texts: list[str] = []
    for idx in range(total):
        raw = await browser.inner_text_in_nth(
            _EARNING_SURPRISE_TABLE,
            scope_index=0,
            inner_selector=cell_selector,
            inner_index=idx,
        )
        texts.append(clean_text(raw))
    return texts
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
async def parse_c101_earnings_surprise_table(
    browser: BrowserPort,
    *,
    debug_rows: bool = False,
) -> dict[str, Any]:
    """Parse the #earning_list (earnings-surprise) table into a nested map.

    Written against the specific HTML layout of the table, so it tolerates
    rowspan'd section cells and the leading empty <th> of extension rows.

    Args:
        browser: Browser port used to query the attached page.
        debug_rows: When True, the raw cell texts of every row are also
            returned under the "rows" key for debugging.

    Returns:
        {
            "periods": [...],                                # e.g. ["2025/09", "2025/12"]
            "metrics": {section: {item: {period: value}}},
            "meta": {...},
            ...("rows": raw cell rows, only when debug_rows)
        }
    """
    await browser.wait_attached(_EARNING_SURPRISE_TABLE)

    row_cnt = await browser.count_in_nth(
        _EARNING_SURPRISE_TABLE,
        scope_index=0,
        inner_selector="tbody tr",
    )
    if not row_cnt:
        # Empty table: return the empty skeleton so callers get a stable shape.
        out = {"periods": [], "metrics": {}, "meta": {}}
        if debug_rows:
            out["rows"] = []
        return out

    raw_cells_rows: list[list[str]] = []

    periods: list[str] = []
    period_count = 0

    metrics: dict[str, dict[str, dict[str, Any]]] = {}
    meta: dict[str, dict[str, Any]] = {}

    # Section label carried down across rowspan'd rows.
    current_section: str | None = None

    for i in range(1, row_cnt + 1):  # nth-child is 1-based
        row_sel = f"tbody tr:nth-child({i})"
        cells = await _row_cells_texts(browser, row_sel=row_sel)
        raw_cells_rows.append(cells)

        if not cells:
            continue

        joined = " ".join([c for c in cells if c])

        # 1) Extract periods from the "재무연월" header row.
        #    HTML: <th colspan="2">재무연월</th> + <th>2025/09</th> + <th>2025/12</th>
        if ("재무연월" in joined) and not periods:
            # cells may be e.g. ["재무연월", "2025/09", "2025/12"], or 3-4 cells
            # depending on structure; keeping only "YYYY/NN"-shaped cells is
            # the safest extraction.
            periods = [c for c in cells if re.fullmatch(r"\d{4}/\d{2}", c)]
            period_count = len(periods)
            continue

        # Body rows cannot be interpreted before periods are known.
        if not periods:
            continue

        # 2) Meta row: announcement (expected) date / accounting standard.
        if "잠정치발표(예정)일/회계기준" in joined:
            # Typically: ["잠정치발표(예정)일/회계기준", "2025/10/14(연결)", "2026/01/08(연결)"]
            vals = [c for c in cells if c and "잠정치발표" not in c]
            vals = vals[-period_count:] if period_count else vals
            meta["잠정치발표(예정)일/회계기준"] = {
                periods[idx]: vals[idx] if idx < len(vals) else None
                for idx in range(period_count)
            }
            continue

        # 3) Normalize body rows to the shape [section, item, v1, v2, ...].
        #    Cases observed in the HTML:
        #    - section-start row: cells = ["영업이익", "컨센서스", v1, v2]
        #    - later row inside a rowspan: cells = ["잠정치", v1, v2]
        #      (section cell absent -> needs left padding)
        #    - extension row: cells = ["", "전분기대비", v1, v2] (empty <th>)
        #
        #    With period_count == 2 the canonical length is 2 + period_count == 4.
        want_len = 2 + period_count

        norm = cells[:]
        if len(norm) == want_len - 1:
            # Section <th> missing: ["잠정치", v1, v2] -> ["", "잠정치", v1, v2]
            norm = [""] + norm
        elif len(norm) < want_len:
            # Ambiguous short row: pad on the left with empty strings.
            norm = ([""] * (want_len - len(norm))) + norm
        norm = norm[-want_len:]

        section_cell = clean_text(norm[0])
        item_cell = clean_text(norm[1])
        value_cells = norm[2 : 2 + period_count]

        # Update the active section when this row starts a new one.
        if section_cell:
            current_section = section_cell
            metrics.setdefault(current_section, {})
        if not current_section:
            # No section seen yet, so the row cannot be attributed; skip.
            continue

        item = _norm_item_label(item_cell)
        if not item:
            continue

        # Map each value cell to its period.
        bucket = metrics[current_section].setdefault(item, {})
        for idx, p in enumerate(periods):
            raw_v = value_cells[idx] if idx < len(value_cells) else None
            bucket[p] = _to_number_like(raw_v)

    out: dict[str, Any] = {"periods": periods, "metrics": metrics, "meta": meta}
    if debug_rows:
        out["rows"] = raw_cells_rows
    return out
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
# scraper2_hj3415/app/parsing/c101/fundamentals.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from typing import Any
|
|
6
|
+
from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
|
|
7
|
+
from common_hj3415.utils import clean_text
|
|
8
|
+
from scraper2_hj3415.app.parsing._normalize.text import normalize_text
|
|
9
|
+
from scraper2_hj3415.app.parsing._normalize.values import to_number_or_text
|
|
10
|
+
|
|
11
|
+
_FUNDAMENTALS_TABLE = "div.fund.fl_le table.gHead03"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _normalize_period_key(s: str) -> str:
|
|
15
|
+
"""
|
|
16
|
+
예)
|
|
17
|
+
"2024/12(A)" -> "2024/12"
|
|
18
|
+
"2025/12(E)" -> "2025/12"
|
|
19
|
+
"2025/12" -> "2025/12"
|
|
20
|
+
"""
|
|
21
|
+
s = s.strip()
|
|
22
|
+
# 뒤쪽 괄호 주석 제거: (A) (E) (P) 등
|
|
23
|
+
s = re.sub(r"\([^)]*\)$", "", s).strip()
|
|
24
|
+
return s
|
|
25
|
+
|
|
26
|
+
# Rows whose metric name appears here are dropped from the result.
EXCLUDED_METRICS = {"회계기준"}

async def parse_c101_fundamentals_table(
    browser: BrowserPort,
) -> dict[str, dict[str, Any]]:
    """Parse the fundamentals (actual/consensus key-indicator) table.

    Returns metric_key -> {period_key -> value}, e.g.:

        {
            "PBR": {"2024/12": 13.62, "2025/12": None},
            ...
        }

    Rows listed in EXCLUDED_METRICS are skipped.
    """
    await browser.wait_attached(_FUNDAMENTALS_TABLE)

    rows = await browser.table_records(_FUNDAMENTALS_TABLE, header=0)
    if not rows:
        return {}

    cleaned_rows: list[dict[str, Any]] = []
    for r in rows:
        rr: dict[str, Any] = {}
        for k, v in r.items():
            kk = clean_text(k)
            if not kk:
                continue
            # The metric-name column ("주요지표") stays text; every other
            # column is coerced to a number when possible.
            rr[kk] = normalize_text(v) if kk == "주요지표" else to_number_or_text(v)

        if rr.get("주요지표"):
            cleaned_rows.append(rr)

    if not cleaned_rows:
        return {}

    # Merge column names across all rows while preserving first-seen order.
    seen: set[str] = set()
    columns: list[str] = []
    for rr in cleaned_rows:
        for kk in rr.keys():
            if kk not in seen:
                seen.add(kk)
                columns.append(kk)

    metric_col = "주요지표" if "주요지표" in columns else columns[0]
    raw_value_cols = [c for c in columns if c != metric_col]

    # Normalize period columns (strip the "(A)"/"(E)" suffix).
    # NOTE: the original column name ("2024/12(A)") must be kept so that
    # rr.get(...) still works — carry (original, normalized) pairs.
    col_pairs: list[tuple[str, str]] = [(c, _normalize_period_key(c)) for c in raw_value_cols]

    metrics: dict[str, dict[str, Any]] = {}

    for rr in cleaned_rows:
        name = rr.get(metric_col)
        if not name:
            continue

        metric_key = str(name).strip()
        if metric_key in EXCLUDED_METRICS:
            continue  # excluded metric row (e.g. "회계기준")

        bucket = metrics.setdefault(metric_key, {})
        for raw_c, norm_c in col_pairs:
            bucket[norm_c] = rr.get(raw_c)

    return metrics
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# scraper2_hj3415/app/parsing/c101/major_shareholders.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any
|
|
5
|
+
from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
|
|
6
|
+
from scraper2_hj3415.app.parsing._normalize.text import normalize_text
|
|
7
|
+
from scraper2_hj3415.app.parsing._normalize.label import normalize_key_label
|
|
8
|
+
from scraper2_hj3415.app.parsing._normalize.values import to_int, to_float
|
|
9
|
+
|
|
10
|
+
def _pick_value_by_norm_key(row: dict[str, Any], candidates: list[str]) -> Any:
    """Return the first candidate key's non-blank value from *row*.

    Keys are compared after ``normalize_key_label`` normalization, so
    header variants ("보유지분(%)" vs "보유지분") resolve to the same slot.
    Returns None when no candidate yields a usable value.
    """
    # Map normalized key -> original key once for this row.
    by_norm = {normalize_key_label(key): key for key in row.keys()}

    for candidate in candidates:
        original = by_norm.get(normalize_key_label(candidate))
        if original is None:
            continue
        value = row.get(original)
        # "Key present but value blank" -> keep searching.
        if value is None:
            continue
        if isinstance(value, str) and not value.strip():
            continue
        return value
    return None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
async def parse_c101_major_shareholders(browser: BrowserPort) -> list[dict[str, Any]]:
    """Parse the #cTB13 major-shareholders table into a list of records.

    Each record has keys "주요주주", "보유주식수", "보유지분"; rows without
    a shareholder name are dropped.
    """
    table_sel = "#cTB13"
    await browser.wait_attached(table_sel)

    records = await browser.table_records(table_sel, header=0)
    if not records:
        return []

    parsed: list[dict[str, Any]] = []
    for record in records:
        holder = normalize_text(_pick_value_by_norm_key(record, ["주요주주", "주요주주명"]))
        if not holder:
            continue

        shares = _pick_value_by_norm_key(
            record, ["보유주식수(보통)", "보유주식수", "보유주식수(보통주)"]
        )
        ratio = _pick_value_by_norm_key(record, ["보유지분(%)", "보유지분", "보유지분%"])

        parsed.append(
            {
                "주요주주": holder,
                "보유주식수": to_int(shares),  # may be None on parse failure
                "보유지분": to_float(ratio),  # parser must also accept "0.91%"
            }
        )

    return parsed
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# scraper2_hj3415/app/parsing/c101/sise.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
|
|
5
|
+
from common_hj3415.utils import clean_text
|
|
6
|
+
from ._sise_normalizer import normalize_sise_kv_map
|
|
7
|
+
|
|
8
|
+
_SISE_TABLE = "#cTB11"
|
|
9
|
+
|
|
10
|
+
async def parse_c101_sise_table(browser: BrowserPort) -> dict[str, str]:
    """Extract the #cTB11 price-info table as a th(label) -> td(value) map.

    Reads the visible text (innerText) and hands the raw mapping to
    ``normalize_sise_kv_map`` for final normalization.
    """
    await browser.wait_attached(_SISE_TABLE)

    row_cnt = await browser.count_in_nth(
        _SISE_TABLE,
        scope_index=0,
        inner_selector="tbody tr",
    )

    raw: dict[str, str] = {}

    # nth-child is 1-based.
    for row_no in range(1, row_cnt + 1):
        row_sel = f"tbody tr:nth-child({row_no})"

        label = await browser.inner_text_in_nth(
            _SISE_TABLE,
            scope_index=0,
            inner_selector=f"{row_sel} th",
            inner_index=0,
        )
        value = await browser.inner_text_in_nth(
            _SISE_TABLE,
            scope_index=0,
            inner_selector=f"{row_sel} td",
            inner_index=0,
        )

        key = clean_text(label)
        if key:
            raw[key] = clean_text(value)

    return normalize_sise_kv_map(raw)
|
|
47
|
+
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# scraper2_hj3415/app/parsing/c101/summary_cmp.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any
|
|
5
|
+
from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
|
|
6
|
+
from common_hj3415.utils import clean_text
|
|
7
|
+
from scraper2_hj3415.app.parsing._normalize.values import to_number
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
async def parse_c101_summary_cmp_table(browser: BrowserPort) -> dict[str, Any]:
    """Parse <table class="cmp-table"> (company summary) into a flat dict.

    Extracts the stock basics plus EPS/BPS/PER/... figures, e.g.:

        {
            "종목명": "삼성전자",
            "코드": "005930",
            "영문명": "SamsungElec",
            "시장": "KOSPI : 코스피 전기·전자",
            "WICS": "WICS : 반도체와반도체장비",
            "EPS": 4816,
            "BPS": 60632,
            "PER": 31.58,
            "업종PER": 21.93,
            "PBR": 2.51,
            "현금배당수익률": 0.95,
            "결산": "12월 결산",
        }
    """
    out: dict[str, Any] = {}

    # Make sure the table exists before querying it.
    await browser.wait_attached("table.cmp-table")

    # --- 1) td0101: name / code / English name / market / WICS ---
    out["종목명"] = clean_text(
        await browser.text_content_first("table.cmp-table td.td0101 span.name")
    )
    out["코드"] = clean_text(
        await browser.text_content_first("table.cmp-table td.td0101 b.num")
    )

    # Classify the remaining <dt> texts of td0101 by their prefix.
    # The first <dt> is skipped — presumably it repeats the company name;
    # TODO confirm against the live markup.
    dt0101 = await browser.all_texts("table.cmp-table td.td0101 dl > dt")
    for t in dt0101[1:] if dt0101 else []:
        t = clean_text(t)
        if not t:
            continue
        if t.startswith("KOSPI") or t.startswith("KOSDAQ"):
            out["시장"] = t
        elif t.startswith("WICS"):
            out["WICS"] = t
        else:
            # First unclassified line is taken as the English name.
            if "영문명" not in out:
                out["영문명"] = t

    # --- 2) td0301: EPS/BPS/PER/업종PER/PBR/현금배당수익률/결산 ---
    base_dl = "table.cmp-table td.td0301 dl"
    dt_sel = f"{base_dl} > dt"

    dt_texts = await browser.all_texts(dt_sel)  # full <dt> texts (numbers included)
    if not dt_texts:
        return out

    # <dt> elements appear in DOM order 1..N.
    # NOTE(review): nth-child counts *all* element children of the <dl>; if
    # the <dl> also contains <dd> siblings this index would drift — confirm
    # the markup holds only <dt> children.
    for i, raw_dt in enumerate(dt_texts, start=1):
        dt_text = clean_text(raw_dt)
        if not dt_text:
            continue

        num_sel = f"{base_dl} > dt:nth-child({i}) b.num"

        # Line without a number, e.g. "12월 결산".
        if not await browser.is_attached(num_sel):
            if "결산" in dt_text:
                out["결산"] = dt_text
            continue

        num_text = clean_text(await browser.text_content_first(num_sel))
        if not num_text:
            continue

        # The label is the <dt> text minus the number, with ":" removed.
        label = clean_text(dt_text.replace(num_text, "")).replace(":", "")
        if label:
            out[label] = to_number(num_text)

    return out
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# scraper2_hj3415/app/parsing/c101/yearly_consensus.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from io import StringIO
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
|
|
11
|
+
from scraper2_hj3415.app.parsing._normalize.values import to_float
|
|
12
|
+
from scraper2_hj3415.app.parsing._normalize.text import normalize_text
|
|
13
|
+
from common_hj3415.utils import clean_text
|
|
14
|
+
from logging_hj3415 import logger
|
|
15
|
+
|
|
16
|
+
_YEARLY_CONSENSUS_TABLE = "#cTB25"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# -----------------------------
|
|
20
|
+
# column / period normalize
|
|
21
|
+
# -----------------------------
|
|
22
|
+
_COL_UNIT_RE = re.compile(r"\([^)]*\)") # (억원, %), (원), (배) ... 제거용
|
|
23
|
+
_PERIOD_RE = re.compile(r"^\s*(\d{4})\s*\(?([A-Za-z])?\)?\s*$") # 2022(A), 2025(E)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _flatten_col(col: Any) -> str:
    """Flatten a MultiIndex column from pd.read_html(header=[0,1]) into one key.

    - ('매출액(억원, %)', '금액') -> '매출액_금액'
    - ('매출액(억원, %)', 'YoY') -> '매출액_YoY'
    - unit parentheses and spaces are removed for key stability
    """
    if isinstance(col, tuple):
        parts = [clean_text(str(part)) for part in col if clean_text(str(part))]
        if len(parts) == 2 and parts[0] == parts[1]:
            flat = parts[0]
        else:
            flat = "_".join(parts) if parts else ""
    else:
        flat = clean_text(str(col))

    # Drop unit annotations such as "(억원, %)", "(원)", "(배)".
    flat = clean_text(_COL_UNIT_RE.sub("", flat))

    # Repair a known mangled header.
    flat = flat.replace("주재 무제표", "주재무제표")

    # Strip remaining spaces so keys are stable.
    return flat.replace(" ", "")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _normalize_period(
    s: Any,
    *,
    keep_suffix: bool = False,
) -> str | None:
    """Normalize a period string to the standard period key.

    - handles "2022(A)", "2026(E)", "2022", ...
    - default policy: a yearly figure maps to "YYYY/12"
    - keep_suffix=True yields "2022A"/"2026E" instead
    """
    text = normalize_text(s)
    if not text:
        return None

    # Guard against the header cell itself leaking through.
    if text == "재무년월":
        return None

    # Already in canonical "YYYY/MM" form.
    if re.fullmatch(r"\d{4}/\d{2}", text):
        return text

    match = _PERIOD_RE.match(text)
    if match is None:
        return None

    year, suffix = match.groups()  # suffix: "A" | "E" | None

    if keep_suffix and suffix:
        return f"{year}{suffix}"

    return f"{year}/12"
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _normalize_metric_key(col_key: str) -> str:
|
|
90
|
+
"""
|
|
91
|
+
최종 metric key를 사람이 쓰기 좋은 형태로 정리.
|
|
92
|
+
"""
|
|
93
|
+
k = col_key
|
|
94
|
+
|
|
95
|
+
# 매출액은 '금액'/'YoY'가 분리되어 있으니 명시적으로 이름을 고정
|
|
96
|
+
if k.startswith("매출액_금액"):
|
|
97
|
+
return "매출액"
|
|
98
|
+
if k.startswith("매출액_YoY"):
|
|
99
|
+
return "매출액YoY"
|
|
100
|
+
|
|
101
|
+
# 나머지는 그대로(단위/공백은 _flatten_col에서 제거됨)
|
|
102
|
+
# 예: "영업이익", "당기순이익", "EPS", "PER", "PBR", "ROE", "EV/EBITDA", "순부채비율"
|
|
103
|
+
return k
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _html_to_df(html: str) -> pd.DataFrame | None:
    """Read the yearly-consensus table HTML into a DataFrame.

    The table has a two-row header, so it is read with header=[0, 1] and
    the resulting MultiIndex columns are flattened. Returns None when
    parsing fails or yields an empty frame.
    """
    try:
        frames = pd.read_html(StringIO(html), header=[0, 1])
    except Exception as e:
        logger.exception("pd.read_html failed: {}", e)
        return None

    if not frames:
        return None

    first = frames[0]
    if first is None or first.empty:
        return None

    flattened = first.copy()
    flattened.columns = [_flatten_col(c) for c in flattened.columns]
    return flattened
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _df_to_metric_map(df: pd.DataFrame) -> dict[str, dict[str, Any]]:
    """Pivot the DataFrame (rows: periods, cols: metrics) into
    {metric: {period: value}}.

    Values are converted to float when possible, otherwise kept as
    normalized text (or None). The "주재무제표" column is excluded.
    """
    if df is None or df.empty:
        return {}

    # NaN -> None so downstream serialization stays clean.
    df = df.where(pd.notnull(df), None)

    # Locate the "재무년월" (period) column defensively: flattening usually
    # yields exactly "재무년월", but fall back to suffix/substring matches
    # in case the header was mangled.
    period_col = None
    for c in df.columns:
        if "재무년월" == c or c.endswith("재무년월") or "재무년월" in c:
            period_col = c
            break
    if not period_col:
        logger.warning("[cTB25] period column not found")
        return {}

    out: dict[str, dict[str, Any]] = {}

    for _, row in df.iterrows():
        # keep_suffix=True keeps "2025E"-style keys distinct from actuals.
        period = _normalize_period(row.get(period_col), keep_suffix=True)
        if not period:
            continue

        for col, raw_val in row.items():
            if col == period_col:
                continue
            # Exclude 주재무제표 from the metric map (could become meta later).
            if "주재무제표" in str(col):
                continue

            metric = _normalize_metric_key(str(col))

            num = to_float(raw_val)
            val: Any = num if num is not None else (normalize_text(raw_val) or None)

            out.setdefault(metric, {})[period] = val

    return out
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
async def parse_c101_yearly_consensus_table(
    browser: BrowserPort,
) -> dict[str, dict[str, Any]]:
    """Parse the #cTB25 table (3 actual + 2 estimated years).

    Returns {metric: {period: value}}; an empty dict when the table HTML
    is missing or unparsable.
    """
    await browser.wait_attached(_YEARLY_CONSENSUS_TABLE)
    await browser.wait_table_nth_ready(
        _YEARLY_CONSENSUS_TABLE,
        index=0,
        min_rows=5,
        timeout_ms=30_000,
        poll_ms=200,
    )

    html = await browser.outer_html_nth(_YEARLY_CONSENSUS_TABLE, 0)
    if not html or "<table" not in html:
        logger.warning("[cTB25] outerHTML invalid or empty")
        return {}

    frame = _html_to_df(html)
    if frame is None:
        logger.warning("[cTB25] df is empty/invalid")
        return {}

    return _df_to_metric_map(frame)
|