scraper2-hj3415 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
- scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
- scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
- {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
- scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
- scraper2_hj3415/app/composition.py +225 -0
- scraper2_hj3415/app/domain/blocks.py +61 -0
- scraper2_hj3415/app/domain/constants.py +33 -0
- scraper2_hj3415/app/domain/doc.py +16 -0
- scraper2_hj3415/app/domain/endpoint.py +11 -0
- scraper2_hj3415/app/domain/series.py +11 -0
- scraper2_hj3415/app/domain/types.py +19 -0
- scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
- scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
- scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
- scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
- scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
- scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
- scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
- scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
- scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
- scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
- scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
- scraper2_hj3415/app/parsing/c101/sise.py +47 -0
- scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
- scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
- scraper2_hj3415/app/parsing/c101_parser.py +45 -0
- scraper2_hj3415/app/parsing/c103_parser.py +19 -0
- scraper2_hj3415/app/parsing/c104_parser.py +23 -0
- scraper2_hj3415/app/parsing/c106_parser.py +137 -0
- scraper2_hj3415/app/parsing/c108_parser.py +254 -0
- scraper2_hj3415/app/ports/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
- scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
- scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
- scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
- scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
- scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
- scraper2_hj3415/app/ports/site/__init__.py +0 -0
- scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
- scraper2_hj3415/app/services/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
- scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
- scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
- scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
- scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
- scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
- scraper2_hj3415/app/usecases/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
- scraper2/main.py → scraper2_hj3415/cli.py +40 -80
- {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
- scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
- scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
- scraper2/.DS_Store +0 -0
- scraper2/adapters/out/.DS_Store +0 -0
- scraper2/adapters/out/playwright/browser.py +0 -102
- scraper2/adapters/out/sinks/.DS_Store +0 -0
- scraper2/adapters/out/sinks/memory/__init__.py +0 -15
- scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
- scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
- scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
- scraper2/app/composition.py +0 -204
- scraper2/app/parsing/_converters.py +0 -85
- scraper2/app/parsing/_normalize.py +0 -134
- scraper2/app/parsing/c101_parser.py +0 -143
- scraper2/app/parsing/c103_parser.py +0 -128
- scraper2/app/parsing/c104_parser.py +0 -143
- scraper2/app/parsing/c106_parser.py +0 -153
- scraper2/app/parsing/c108_parser.py +0 -65
- scraper2/app/ports/browser/browser_factory_port.py +0 -11
- scraper2/app/ports/browser/browser_port.py +0 -22
- scraper2/app/ports/ingest_port.py +0 -14
- scraper2/app/ports/sinks/base_sink_port.py +0 -14
- scraper2/app/ports/sinks/c101_sink_port.py +0 -9
- scraper2/app/ports/sinks/c103_sink_port.py +0 -9
- scraper2/app/ports/sinks/c104_sink_port.py +0 -9
- scraper2/app/ports/sinks/c106_sink_port.py +0 -9
- scraper2/app/ports/sinks/c108_sink_port.py +0 -9
- scraper2/app/usecases/fetch/fetch_c101.py +0 -43
- scraper2/app/usecases/fetch/fetch_c103.py +0 -103
- scraper2/app/usecases/fetch/fetch_c104.py +0 -76
- scraper2/app/usecases/fetch/fetch_c106.py +0 -90
- scraper2/app/usecases/fetch/fetch_c108.py +0 -49
- scraper2/app/usecases/ingest/ingest_c101.py +0 -36
- scraper2/app/usecases/ingest/ingest_c103.py +0 -37
- scraper2/app/usecases/ingest/ingest_c104.py +0 -37
- scraper2/app/usecases/ingest/ingest_c106.py +0 -38
- scraper2/app/usecases/ingest/ingest_c108.py +0 -39
- scraper2_hj3415-2.4.0.dist-info/RECORD +0 -63
- scraper2_hj3415-2.4.0.dist-info/entry_points.txt +0 -3
- {scraper2 → scraper2_hj3415}/__init__.py +0 -0
- {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
- {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
- {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
- {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
- {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
- {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
- {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
- {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
- {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
- {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
- {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,254 @@
+# scraper2_hj3415/app/parsing/c108_parser.py
+from __future__ import annotations
+
+import re
+from html import unescape
+from typing import Any
+from common_hj3415.utils import clean_text
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+
+_TAGS = re.compile(r"<[^>]+>")
+_WS = re.compile(r"\s+")
+
+_TD_ID_RE = re.compile(r"^td(\d+)$")  # td0, td1, ...
+_C_ID_RE = re.compile(r"^c(\d+)$")  # c0, c1, ...
+
+
+def _clean_text(x: Any) -> str:
+    """
+    Safely turn Any into a human-readable string at boundary/logging/parsing stages.
+    - Any → str
+    - unescape HTML entities
+    - then apply clean_text
+    """
+    if x is None:
+        return ""
+    s = unescape(str(x))  # ❗ handle only None, not `x or ""` (preserves falsy values)
+    return clean_text(s)
+
+
+def _clean_html_to_text(html: str) -> str:
+    s = unescape(html or "")
+    s = s.replace("<br/>", "\n").replace("<br>", "\n").replace("<br />", "\n")
+    s = _TAGS.sub("", s)
+    s = s.replace("\r", "")
+    lines = [ln.strip() for ln in s.split("\n")]
+    lines = [ln for ln in lines if ln]
+    return "\n".join(lines).strip()
+
+
+_UI_LINES = {"요약정보닫기"}
+_UI_PREFIXES = ("요약정보 :", "요약정보:")
+_BULLET_RE = re.compile(r"^\s*▶\s*")
+_MULTI_NL = re.compile(r"\n{3,}")
+
+
+def _prettify_report_text(
+    text: str,
+    *,
+    bullet: str = "- ",
+) -> str:
+    if not text:
+        return ""
+
+    lines = [ln.strip() for ln in text.split("\n")]
+    out: list[str] = []
+
+    for ln in lines:
+        if not ln:
+            continue
+
+        # strip UI residue (prefix)
+        for p in _UI_PREFIXES:
+            if ln.startswith(p):
+                ln = ln[len(p) :].strip()
+                break
+        if not ln:
+            continue
+
+        if ln in _UI_LINES:
+            continue
+
+        # normalize bullets
+        if _BULLET_RE.match(ln):
+            ln = _BULLET_RE.sub(bullet, ln)
+
+        out.append(ln)
+
+    s = "\n".join(out)
+    s = _MULTI_NL.sub("\n\n", s).strip()
+    return s
+
+
+def _parse_target_price(x: Any) -> int | None:
+    s = _clean_text(x)
+    if not s:
+        return None
+    s2 = re.sub(r"[^0-9]", "", s)
+    if not s2:
+        return None
+    try:
+        return int(s2)
+    except Exception:
+        return None
+
+
+def _parse_pages(x: Any) -> int | None:
+    s = _clean_text(x)
+    m = re.search(r"(\d+)", s)
+    return int(m.group(1)) if m else None
+
+
+async def parse_c108_recent_reports_dom(
+    browser: BrowserPort,
+    *,
+    table_selector: str = "#tableCmpDetail",
+) -> list[dict[str, Any]]:
+    """
+    Extract rows reliably from the DOM, without pandas(read_html).
+
+    Assumptions:
+    - A "normal" row contains td[id^='td'], and that id has the form tdN.
+    - The hidden "detail summary" row is attached as td[id='cN'] data-content.
+    - summary lives in td[id='tdN'] data-content; comment lives in td[id='cN'] data-content.
+
+    Required BrowserPort capabilities:
+    - wait_attached(selector)
+    - count_in_nth(scope_selector, scope_index, inner_selector) -> int
+    - eval_in_nth_first(scope_selector, scope_index, inner_selector, expression) -> Any
+    (used as-is, exactly as already defined on the port)
+    """
+
+    await browser.wait_attached(table_selector)
+
+    # number of tbody tr rows
+    tr_count = await browser.count_in_nth(
+        table_selector, scope_index=0, inner_selector="tbody tr"
+    )
+    if tr_count <= 0:
+        return []
+
+    out: list[dict[str, Any]] = []
+
+    for tr_idx in range(tr_count):
+        # row scope: table_selector >> tbody tr (nth=tr_idx)
+        row_scope = f"{table_selector} >> tbody tr >> nth={tr_idx}"
+
+        # 1) decide whether this is a "normal" row: it must contain td[id^='td']
+        td_id = await browser.eval_in_nth_first(
+            row_scope,
+            scope_index=0,
+            inner_selector="td[id^='td']",
+            expression="el => el.id",
+        )
+        td_id = _clean_text(td_id)
+        m = _TD_ID_RE.match(td_id)
+        if not m:
+            # skip hidden detail rows (cN) and the like
+            continue
+
+        n = m.group(1)  # row_id
+        # 2) extract column texts (by td position, matching the C108 table layout)
+        # typically: 1=date, 2=title, 3=authors, 4=provider, 5=rating, 6=target price, 7=pages ...
+        date = _clean_text(
+            await browser.eval_in_nth_first(
+                row_scope,
+                scope_index=0,
+                inner_selector="td:nth-child(1)",
+                expression="el => el.innerText",
+            )
+        )
+        title = _clean_text(
+            await browser.eval_in_nth_first(
+                row_scope,
+                scope_index=0,
+                inner_selector="td:nth-child(2)",
+                expression="el => el.innerText",
+            )
+        )
+
+        # minimal filter
+        if not date or not title:
+            continue
+
+        authors = _clean_text(
+            await browser.eval_in_nth_first(
+                row_scope,
+                scope_index=0,
+                inner_selector="td:nth-child(3)",
+                expression="el => el.innerText",
+            )
+        ) or None
+
+        provider = _clean_text(
+            await browser.eval_in_nth_first(
+                row_scope,
+                scope_index=0,
+                inner_selector="td:nth-child(4)",
+                expression="el => el.innerText",
+            )
+        ) or None
+
+        rating = _clean_text(
+            await browser.eval_in_nth_first(
+                row_scope,
+                scope_index=0,
+                inner_selector="td:nth-child(5)",
+                expression="el => el.innerText",
+            )
+        ) or None
+
+        target_price_raw = await browser.eval_in_nth_first(
+            row_scope,
+            scope_index=0,
+            inner_selector="td:nth-child(6)",
+            expression="el => el.innerText",
+        )
+        target_price = _parse_target_price(target_price_raw)
+
+        pages_raw = await browser.eval_in_nth_first(
+            row_scope,
+            scope_index=0,
+            inner_selector="td:nth-child(7)",
+            expression="el => el.innerText",
+        )
+        pages = _parse_pages(pages_raw)
+
+        # 3) summary/comment: read the data-content of tdN / cN directly, keyed by N
+        # (data-content is readable even when the element is display:none in the DOM)
+        summary_html = await browser.eval_in_nth_first(
+            table_selector,
+            scope_index=0,
+            inner_selector=f"td#td{n}",
+            expression="el => el.getAttribute('data-content') || ''",
+        )
+        comment_html = await browser.eval_in_nth_first(
+            table_selector,
+            scope_index=0,
+            inner_selector=f"td#c{n}",
+            expression="el => el.getAttribute('data-content') || ''",
+        )
+
+        summary = _prettify_report_text(_clean_html_to_text(_clean_text(summary_html)))
+        comment = _prettify_report_text(_clean_html_to_text(_clean_text(comment_html)))
+
+        out.append(
+            {
+                "row_id": n,
+                "date": date,
+                "title": title,
+                "authors": authors,
+                "provider": provider,
+                "rating": rating,
+                "target_price": target_price,
+                "pages": pages,
+                "summary": summary or None,
+                "comment": comment or None,
+            }
+        )
+
+    return out
+
+
+async def parse_c108_to_dict(browser: BrowserPort) -> dict[str, list[dict[str, Any]]]:
+    return {"리포트": await parse_c108_recent_reports_dom(browser)}
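
The _clean_html_to_text → _prettify_report_text pipeline is fully determined by the code above; a quick sanity check (the input string is invented, and these are module-private helpers imported here purely for illustration):

    from scraper2_hj3415.app.parsing.c108_parser import (
        _clean_html_to_text,
        _prettify_report_text,
    )

    raw = "요약정보 : ▶ 1Q 어닝 서프라이즈<br/>▶ 목표주가 상향<br>요약정보닫기"
    print(_prettify_report_text(_clean_html_to_text(raw)))
    # - 1Q 어닝 서프라이즈
    # - 목표주가 상향

The "요약정보"/"요약정보닫기" UI residue is stripped and "▶" bullets are normalized to "- ", which is exactly the text that feeds the summary/comment fields.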
File without changes
File without changes
@@ -0,0 +1,9 @@
+# scraper2_hj3415/app/ports/browser/browser_factory_port.py
+from __future__ import annotations
+from typing import Protocol, AsyncContextManager
+
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+
+class BrowserFactoryPort(Protocol):
+    def lease(self) -> AsyncContextManager[BrowserPort]: ...
+    async def aclose(self) -> None: ...
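
Because lease() is declared as a plain (non-async) method returning an AsyncContextManager, contextlib.asynccontextmanager satisfies the protocol directly. A minimal sketch of a conforming factory (hypothetical, e.g. for tests; the real Playwright-backed factory lives in adapters/out/playwright/browser_factory.py):

    import contextlib
    from typing import AsyncIterator

    from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort


    class SingleBrowserFactory:
        """Hypothetical factory that hands out one shared BrowserPort."""

        def __init__(self, browser: BrowserPort) -> None:
            self._browser = browser
            self._closed = False

        @contextlib.asynccontextmanager
        async def lease(self) -> AsyncIterator[BrowserPort]:
            # the decorator makes lease() return an AsyncContextManager[BrowserPort],
            # which structurally matches BrowserFactoryPort
            if self._closed:
                raise RuntimeError("factory already closed")
            yield self._browser

        async def aclose(self) -> None:
            self._closed = True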
@@ -0,0 +1,115 @@
+# scraper2_hj3415/app/ports/browser/browser_port.py
+from __future__ import annotations
+
+from typing import Protocol, Any
+
+
+class BrowserPort(Protocol):
+    async def wait_table_nth_ready(
+        self,
+        table_selector: str,
+        *,
+        index: int,
+        min_rows: int = 1,
+        timeout_ms: int = 20_000,
+        poll_ms: int = 200,
+    ) -> None: ...
+    async def title(self) -> str: ...
+    async def current_url(self) -> str: ...
+    async def goto_and_wait_for_stable(self, url: str, timeout_ms: int = 10_000) -> None: ...
+    async def reload(self, *, timeout_ms: int = 10_000) -> None: ...
+    async def sleep_ms(self, ms: int) -> None: ...
+    async def wait_attached(
+        self, selector: str, *, timeout_ms: int = 10_000
+    ) -> None: ...
+    async def wait_visible(
+        self, selector: str, *, timeout_ms: int = 10_000
+    ) -> None: ...
+    async def click(
+        self,
+        selector: str,
+        *,
+        index: int = 0,
+        timeout_ms: int = 4_000,
+        force: bool = False,
+    ) -> None: ...
+    async def try_click(
+        self,
+        selector: str,
+        *,
+        index: int = 0,
+        timeout_ms: int = 1_500,
+        force: bool = False,
+    ) -> bool: ...
+    async def count(self, selector: str) -> int: ...
+    async def scroll_into_view(self, selector: str, *, index: int = 0) -> None: ...
+    async def text_content_first(self, selector: str) -> str: ...
+    async def all_texts(self, selector: str) -> list[str]: ...
+    async def get_text_by_text(self, needle: str) -> str: ...
+    async def inner_text(self, selector: str) -> str: ...
+    async def outer_html_nth(self, selector: str, index: int) -> str: ...
+    async def wait_table_text_changed(
+        self,
+        table_selector: str,
+        *,
+        index: int,
+        prev_text: str | None,
+        min_rows: int = 1,
+        min_lines: int = 50,
+        timeout_sec: float = 12.0,
+        poll_sec: float = 0.2,
+    ) -> str: ...
+    async def is_attached(self, selector: str, *, index: int = 0) -> bool: ...
+    async def computed_style(
+        self, selector: str, *, index: int = 0, prop: str
+    ) -> str: ...
+    async def count_in_nth(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+    ) -> int: ...
+    async def eval_in_nth_first(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+        expression: str,
+    ) -> Any: ...
+    async def inner_text_in_nth(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+        inner_index: int = 0,
+        timeout_ms: int = 10_000,
+    ) -> str:
+        """
+        Return the innerText of the nth(inner_index) element matching
+        inner_selector inside the nth(scope_index) element matching scope_selector.
+        (Rendered text: reflects line breaks and styling.)
+        """
+        ...
+
+    async def text_content_in_nth(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+        inner_index: int = 0,
+        timeout_ms: int = 10_000,
+    ) -> str:
+        """
+        Return the textContent of the nth(inner_index) element matching
+        inner_selector inside the nth(scope_index) element matching scope_selector.
+        (DOM text: may include hidden text.)
+        """
+        ...
+
+    async def table_records(
+        self, table_selector: str, *, header: int | list[int] | None = 0
+    ) -> list[dict[str, Any]]: ...
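
The two scoped helpers that c108_parser leans on (count_in_nth, eval_in_nth_first) map naturally onto Playwright locators. A sketch of how the adapter in adapters/out/playwright/browser.py could implement them; the page attribute name and the missing-node → None behavior are assumptions:

    from typing import Any

    from playwright.async_api import Page


    class _ScopedQueries:
        page: Page  # assumed to be provided by the concrete adapter

        async def count_in_nth(
            self, scope_selector: str, *, scope_index: int, inner_selector: str
        ) -> int:
            scope = self.page.locator(scope_selector).nth(scope_index)
            return await scope.locator(inner_selector).count()

        async def eval_in_nth_first(
            self,
            scope_selector: str,
            *,
            scope_index: int,
            inner_selector: str,
            expression: str,
        ) -> Any:
            scope = self.page.locator(scope_selector).nth(scope_index)
            inner = scope.locator(inner_selector)
            if await inner.count() == 0:
                return None  # assumption: absent nodes yield None instead of raising
            return await inner.first.evaluate(expression)

Locator chaining also accepts the ">> nth=i" selectors that parse_c108_recent_reports_dom builds for its row scopes.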
File without changes
@@ -0,0 +1,28 @@
+# scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py
+from __future__ import annotations
+from typing import Protocol, Iterable, Optional, TypeVar
+from datetime import datetime
+
+from contracts_hj3415.nfs.nfs_dto import NfsDTO
+
+TDto = TypeVar("TDto", bound=NfsDTO)
+
+
+class NfsIngestPort(Protocol[TDto]):
+    async def execute(
+        self,
+        code: str,
+        *,
+        sleep_sec: float = ...,
+        asof: Optional[datetime] = None,
+    ) -> TDto:
+        ...
+
+    async def execute_many(
+        self,
+        codes: Iterable[str],
+        *,
+        sleep_sec: float = ...,
+        asof: Optional[datetime] = None,
+    ) -> list[TDto]:
+        ...
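
The "sleep_sec: float = ..." Ellipsis defaults mean the Protocol fixes only the call shape and lets each implementation choose its own default delay. A hypothetical call site, assuming a concrete ingest usecase resolved from app/composition.py:

    from scraper2_hj3415.app.ports.ingest.nfs_ingest_port import NfsIngestPort


    async def run(ingest: NfsIngestPort) -> None:
        # single ticker, explicit politeness delay
        dto = await ingest.execute("005930", sleep_sec=2.0)
        # batch ingest over several tickers
        dtos = await ingest.execute_many(["005930", "000660"], sleep_sec=2.0)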
File without changes
@@ -0,0 +1,20 @@
+# scraper2_hj3415/app/ports/sinks/nfs_sink_port.py
+from __future__ import annotations
+
+from typing import Protocol, Iterable, TypeVar
+from contracts_hj3415.nfs.types import Endpoints
+from contracts_hj3415.nfs.nfs_dto import NfsDTO
+
+TDto = TypeVar("TDto", bound=NfsDTO)
+
+class NfsSinkPort(Protocol[TDto]):
+    async def write(
+        self, dto: TDto, *, endpoint: Endpoints
+    ) -> None: ...
+
+    async def write_many(
+        self,
+        dtos: Iterable[TDto],
+        *,
+        endpoint: Endpoints
+    ) -> None: ...
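
Any object with matching async write/write_many methods satisfies the protocol structurally. A minimal in-memory sink sketch (the shipped memory_sink.py above may differ in detail):

    from typing import Generic, Iterable, TypeVar

    from contracts_hj3415.nfs.nfs_dto import NfsDTO
    from contracts_hj3415.nfs.types import Endpoints

    TDto = TypeVar("TDto", bound=NfsDTO)


    class InMemorySink(Generic[TDto]):
        """Collects written DTOs in a list, keyed by endpoint (sketch for tests)."""

        def __init__(self) -> None:
            self.rows: list[tuple[Endpoints, TDto]] = []

        async def write(self, dto: TDto, *, endpoint: Endpoints) -> None:
            self.rows.append((endpoint, dto))

        async def write_many(self, dtos: Iterable[TDto], *, endpoint: Endpoints) -> None:
            for dto in dtos:
                await self.write(dto, endpoint=endpoint)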
File without changes
@@ -0,0 +1,20 @@
+# scraper2_hj3415/app/ports/site/wisereport_port.py
+from __future__ import annotations
+from typing import Protocol
+
+class WiseReportPort(Protocol):
+    async def ensure_yearly_consensus_open_in_table_nth(
+        self,
+        *,
+        table_selector: str,  # e.g. TABLE_XPATH ("xpath=//div[@id='wrapper']//div//table")
+        table_index: int,  # e.g. TABLE_INDEX (2)
+        after_click_sleep_ms: int = 150,
+        max_rounds: int = 6,
+        wait_timeout_sec: float = 12.0,
+    ) -> bool: ...
+    async def click_steps(
+        self,
+        steps: list[tuple[str, str]],
+        *,
+        jitter_sec: float = 0.6,
+    ) -> None: ...
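
click_steps takes (label, selector) pairs: the first element reads as a human-readable label (presumably for logging), the second is the selector actually clicked. The BTN_SETS table in fetch_c103.py below is the canonical producer of this shape; one of its step sets, as a usage sketch:

    from scraper2_hj3415.app.ports.site.wisereport_port import WiseReportPort


    async def open_annual_income_statement(wr: WiseReportPort) -> None:
        # switch to the income-statement tab, pick the annual period, press search
        steps = [
            ("손익계산서", 'xpath=//*[@id="rpt_tab1"]'),
            ("연간", 'xpath=//*[@id="frqTyp0"]'),
            ("검색", 'xpath=//*[@id="hfinGubun"]'),
        ]
        await wr.click_steps(steps, jitter_sec=0.6)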
File without changes
File without changes
@@ -0,0 +1,59 @@
+# scraper2_hj3415/app/services/fetch/fetch_c101.py
+from __future__ import annotations
+
+import asyncio
+import random
+from typing import Iterable
+
+from logging_hj3415 import logger
+from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+from scraper2_hj3415.app.parsing.c101_parser import parse_c101_to_dict
+
+from scraper2_hj3415.app.services.nfs_doc_builders import build_c101_doc_from_parsed
+from scraper2_hj3415.app.domain.endpoint import EndpointKind
+from scraper2_hj3415.app.domain.doc import NfsDoc
+from scraper2_hj3415.app.domain.blocks import BLOCK_KEYS_BY_ENDPOINT
+
+
+class FetchC101:
+    def __init__(self, factory: BrowserFactoryPort):
+        self.factory = factory
+
+    async def _fetch_one(self, code: str, *, sleep_sec: float) -> NfsDoc | None:
+        async with self.factory.lease() as browser:
+            url = f"https://navercomp.wisereport.co.kr/v2/company/c1010001.aspx?cmp_cd={code}"
+            await browser.goto_and_wait_for_stable(url, timeout_ms=10_000)
+
+            if sleep_sec > 0:
+                await asyncio.sleep(sleep_sec + random.uniform(0, 1.0))
+
+            parsed = await parse_c101_to_dict(browser)
+
+            logger.debug(f"parsed data: {parsed}")
+            block_keys = BLOCK_KEYS_BY_ENDPOINT[EndpointKind.C101]
+            if not parsed or all(not (parsed.get(str(bk)) or []) for bk in block_keys):
+                logger.warning(
+                    f"c101 fetch: parsed result empty; return None | code={code}"
+                )
+                return None
+
+            doc = build_c101_doc_from_parsed(
+                code=code, parsed=parsed, keep_empty_blocks=True
+            )
+            logger.debug(f"c101 doc: {doc}")
+            return doc
+
+    async def execute(self, code: str, *, sleep_sec: float = 2.0) -> NfsDoc | None:
+        return await self._fetch_one(code, sleep_sec=sleep_sec)
+
+    async def execute_many(
+        self,
+        codes: Iterable[str],
+        *,
+        sleep_sec: float = 2.0,
+    ) -> list[NfsDoc]:
+        results = await asyncio.gather(
+            *(self._fetch_one(c, sleep_sec=sleep_sec) for c in codes),
+            return_exceptions=False,
+        )
+        return [r for r in results if r is not None]
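
A minimal driver sketch, assuming the concrete factory is obtained from app/composition.py (whose constructor is not shown in this diff):

    import asyncio

    from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort


    async def main(factory: BrowserFactoryPort) -> None:
        fetch = FetchC101(factory)
        try:
            doc = await fetch.execute("005930", sleep_sec=2.0)  # e.g. Samsung Electronics
            print(doc)
        finally:
            await factory.aclose()  # release the underlying browser resources

Note that execute_many gathers all codes at once with return_exceptions=False, so one failing code aborts the whole batch; concurrency is bounded only by however many leases the factory will grant.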
@@ -0,0 +1,135 @@
+# scraper2_hj3415/app/services/fetch/fetch_c103.py
+from __future__ import annotations
+
+import asyncio
+import random
+from typing import Iterable, Any
+
+from logging_hj3415 import logger
+from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+from scraper2_hj3415.app.ports.site.wisereport_port import WiseReportPort
+
+from scraper2_hj3415.app.adapters.site.wisereport_playwright import WiseReportPlaywright
+from scraper2_hj3415.app.parsing.c103_parser import parse_c103_current_table
+from scraper2_hj3415.app.services.nfs_doc_builders import build_metrics_doc_from_parsed
+
+from scraper2_hj3415.app.domain.endpoint import EndpointKind
+from scraper2_hj3415.app.domain.blocks import BLOCK_KEYS_BY_ENDPOINT
+from scraper2_hj3415.app.domain.doc import NfsDoc
+
+BTN_SETS: dict[str, list[tuple[str, str]]] = {
+    "손익계산서y": [
+        ("손익계산서", 'xpath=//*[@id="rpt_tab1"]'),
+        ("연간", 'xpath=//*[@id="frqTyp0"]'),
+        ("검색", 'xpath=//*[@id="hfinGubun"]'),
+    ],
+    "재무상태표y": [
+        ("재무상태표", 'xpath=//*[@id="rpt_tab2"]'),
+        ("연간", 'xpath=//*[@id="frqTyp0"]'),
+        ("검색", 'xpath=//*[@id="hfinGubun"]'),
+    ],
+    "현금흐름표y": [
+        ("현금흐름표", 'xpath=//*[@id="rpt_tab3"]'),
+        ("연간", 'xpath=//*[@id="frqTyp0"]'),
+        ("검색", 'xpath=//*[@id="hfinGubun"]'),
+    ],
+    "손익계산서q": [
+        ("손익계산서", 'xpath=//*[@id="rpt_tab1"]'),
+        ("분기", 'xpath=//*[@id="frqTyp1"]'),
+        ("검색", 'xpath=//*[@id="hfinGubun"]'),
+    ],
+    "재무상태표q": [
+        ("재무상태표", 'xpath=//*[@id="rpt_tab2"]'),
+        ("분기", 'xpath=//*[@id="frqTyp1"]'),
+        ("검색", 'xpath=//*[@id="hfinGubun"]'),
+    ],
+    "현금흐름표q": [
+        ("현금흐름표", 'xpath=//*[@id="rpt_tab3"]'),
+        ("분기", 'xpath=//*[@id="frqTyp1"]'),
+        ("검색", 'xpath=//*[@id="hfinGubun"]'),
+    ],
+}
+
+
+class FetchC103:
+    def __init__(self, factory: BrowserFactoryPort):
+        self.factory = factory
+
+    async def _fetch_one(self, code: str, *, sleep_sec: float) -> NfsDoc | None:
+        async with self.factory.lease() as browser:
+            wr: WiseReportPort = WiseReportPlaywright(browser)
+
+            url = (
+                "https://navercomp.wisereport.co.kr/v2/company/c1030001.aspx"
+                f"?cn=&cmp_cd={code}"
+            )
+            await browser.goto_and_wait_for_stable(url, timeout_ms=10_000)
+
+            if sleep_sec > 0:
+                await asyncio.sleep(sleep_sec + random.uniform(0, 1.0))
+
+            parsed: dict[str, list[dict[str, Any]]] = {}
+            prev_text: str | None = None
+
+            # grab an initial baseline text (the flow still works without one)
+            prev_text = await browser.wait_table_text_changed(
+                "xpath=//div[@id='wrapper']//div//table",
+                index=2,
+                prev_text=None,
+                min_rows=5,
+                min_lines=50,
+                timeout_sec=10.0,
+            )
+
+            for key, steps in BTN_SETS.items():
+                # ✅ state transition (action)
+                await wr.click_steps(steps, jitter_sec=0.6)  # best moved into the port/adapter
+                await wr.ensure_yearly_consensus_open_in_table_nth(
+                    table_selector="xpath=//div[@id='wrapper']//div//table",
+                    table_index=2,
+                )
+
+                # ✅ wait for the data to change (action)
+                prev_text = await browser.wait_table_text_changed(
+                    "xpath=//div[@id='wrapper']//div//table",
+                    index=2,
+                    prev_text=prev_text,
+                    min_rows=5,
+                    min_lines=50,
+                    timeout_sec=12.0,
+                )
+
+                # ✅ parse only the table currently on screen
+                try:
+                    parsed[key] = await parse_c103_current_table(browser)
+                except Exception:
+                    parsed[key] = []
+
+            block_keys = BLOCK_KEYS_BY_ENDPOINT[EndpointKind.C103]
+            if not parsed or all(not (parsed.get(str(bk)) or []) for bk in block_keys):
+                logger.warning(
+                    f"c103 fetch: parsed result empty; return None | code={code}"
+                )
+                return None
+
+            doc = build_metrics_doc_from_parsed(
+                code=code,
+                endpoint_kind=EndpointKind.C103,
+                parsed=parsed,
+                block_keys=block_keys,
+                item_key="항목",
+                raw_label_key="항목_raw",
+                keep_empty_blocks=True,
+            )
+            return doc
+
+    async def execute(self, code: str, *, sleep_sec: float = 2.0) -> NfsDoc | None:
+        return await self._fetch_one(code, sleep_sec=sleep_sec)
+
+    async def execute_many(
+        self, codes: Iterable[str], *, sleep_sec: float = 2.0
+    ) -> list[NfsDoc]:
+        results = await asyncio.gather(
+            *(self._fetch_one(c, sleep_sec=sleep_sec) for c in codes)
+        )
+        return [r for r in results if r is not None]
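
Each lease walks all six BTN_SETS tab/period combinations sequentially, diff-checking the table text between clicks so stale data is never parsed; execute_many then fans this whole sequence out per code in parallel. If the pool behind the factory is small, a semaphore keeps that fan-out polite. A sketch of that optional pattern (not part of the package; it assumes the FetchC103 class above):

    import asyncio
    from typing import Iterable

    from scraper2_hj3415.app.domain.doc import NfsDoc


    async def fetch_many_bounded(
        fetch: FetchC103,
        codes: Iterable[str],
        *,
        limit: int = 4,
        sleep_sec: float = 2.0,
    ) -> list[NfsDoc]:
        sem = asyncio.Semaphore(limit)  # cap concurrent browser leases

        async def one(code: str) -> NfsDoc | None:
            async with sem:
                return await fetch.execute(code, sleep_sec=sleep_sec)

        results = await asyncio.gather(*(one(c) for c in codes))
        return [r for r in results if r is not None]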