scraper2-hj3415 2.4.1-py3-none-any.whl → 2.6.0-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
- scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
- scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
- {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
- scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
- scraper2_hj3415/app/composition.py +225 -0
- scraper2_hj3415/app/domain/blocks.py +61 -0
- scraper2_hj3415/app/domain/constants.py +33 -0
- scraper2_hj3415/app/domain/doc.py +16 -0
- scraper2_hj3415/app/domain/endpoint.py +11 -0
- scraper2_hj3415/app/domain/series.py +11 -0
- scraper2_hj3415/app/domain/types.py +19 -0
- scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
- scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
- scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
- scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
- scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
- scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
- scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
- scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
- scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
- scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
- scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
- scraper2_hj3415/app/parsing/c101/sise.py +47 -0
- scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
- scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
- scraper2_hj3415/app/parsing/c101_parser.py +45 -0
- scraper2_hj3415/app/parsing/c103_parser.py +19 -0
- scraper2_hj3415/app/parsing/c104_parser.py +23 -0
- scraper2_hj3415/app/parsing/c106_parser.py +137 -0
- scraper2_hj3415/app/parsing/c108_parser.py +254 -0
- scraper2_hj3415/app/ports/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
- scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
- scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
- scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
- scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
- scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
- scraper2_hj3415/app/ports/site/__init__.py +0 -0
- scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
- scraper2_hj3415/app/services/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
- scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
- scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
- scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
- scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
- scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
- scraper2_hj3415/app/usecases/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
- scraper2/main.py → scraper2_hj3415/cli.py +40 -70
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
- scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
- scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
- scraper2/.DS_Store +0 -0
- scraper2/adapters/out/.DS_Store +0 -0
- scraper2/adapters/out/playwright/browser.py +0 -102
- scraper2/adapters/out/sinks/.DS_Store +0 -0
- scraper2/adapters/out/sinks/memory/__init__.py +0 -15
- scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
- scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
- scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
- scraper2/app/composition.py +0 -204
- scraper2/app/parsing/_converters.py +0 -85
- scraper2/app/parsing/_normalize.py +0 -134
- scraper2/app/parsing/c101_parser.py +0 -143
- scraper2/app/parsing/c103_parser.py +0 -128
- scraper2/app/parsing/c104_parser.py +0 -143
- scraper2/app/parsing/c106_parser.py +0 -153
- scraper2/app/parsing/c108_parser.py +0 -65
- scraper2/app/ports/browser/browser_factory_port.py +0 -11
- scraper2/app/ports/browser/browser_port.py +0 -22
- scraper2/app/ports/ingest_port.py +0 -14
- scraper2/app/ports/sinks/base_sink_port.py +0 -14
- scraper2/app/ports/sinks/c101_sink_port.py +0 -9
- scraper2/app/ports/sinks/c103_sink_port.py +0 -9
- scraper2/app/ports/sinks/c104_sink_port.py +0 -9
- scraper2/app/ports/sinks/c106_sink_port.py +0 -9
- scraper2/app/ports/sinks/c108_sink_port.py +0 -9
- scraper2/app/usecases/fetch/fetch_c101.py +0 -43
- scraper2/app/usecases/fetch/fetch_c103.py +0 -103
- scraper2/app/usecases/fetch/fetch_c104.py +0 -76
- scraper2/app/usecases/fetch/fetch_c106.py +0 -90
- scraper2/app/usecases/fetch/fetch_c108.py +0 -49
- scraper2/app/usecases/ingest/ingest_c101.py +0 -36
- scraper2/app/usecases/ingest/ingest_c103.py +0 -37
- scraper2/app/usecases/ingest/ingest_c104.py +0 -37
- scraper2/app/usecases/ingest/ingest_c106.py +0 -38
- scraper2/app/usecases/ingest/ingest_c108.py +0 -39
- scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
- scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
- {scraper2 → scraper2_hj3415}/__init__.py +0 -0
- {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
- {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
- {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
- {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
- {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
- {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
- {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
- {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
- {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
- {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,168 @@
# scraper2_hj3415/app/adapters/site/wisereport_playwright.py
from __future__ import annotations

from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
from logging_hj3415 import logger


class WiseReportPlaywright:
    def __init__(self, browser: BrowserPort):
        self.browser = browser

    async def ensure_yearly_consensus_open_in_table_nth(
        self,
        *,
        table_selector: str,  # e.g. TABLE_XPATH ("xpath=//div[@id='wrapper']//div//table")
        table_index: int,  # e.g. TABLE_INDEX (2)
        after_click_sleep_ms: int = 150,
        max_rounds: int = 6,
        wait_timeout_sec: float = 12.0,
    ) -> bool:
        """
        Goal: make sure the yearly-consensus columns end up expanded.
        Strategy:
        - within the nth-table scope,
        - among the <a> toggles that are btn_moreY or btn_moreQQ
        - and carry the text '연간컨센서스보기' ("show yearly consensus"),
        - click every one whose computedStyle(display) != 'none',
        - waiting for the table text to change after each click.
        """

        table_scoped = f"{table_selector} >> nth={table_index}"

        # Grab only the toggle anchors inside the table (both btn_moreY and btn_moreQQ).
        VIEW_ALL = (
            f"{table_scoped} >> xpath=.//a["
            "("
            "contains(@class,'btn_moreY') or contains(@class,'btn_moreQQ')"
            ")"
            " and .//span[contains(normalize-space(.),'연간컨센서스보기')]"
            "]"
        )

        CLOSE_ALL = (
            f"{table_scoped} >> xpath=.//a["
            "("
            "contains(@class,'btn_moreY') or contains(@class,'btn_moreQQ')"
            ")"
            " and .//span[contains(normalize-space(.),'연간컨센서스닫기')]"
            "]"
        )

        # Baseline "prev_text" for detecting table text changes.
        prev_text = await self.browser.wait_table_text_changed(
            table_selector,
            index=table_index,
            prev_text=None,
            timeout_sec=wait_timeout_sec,
            min_lines=10,
        )

        logger.debug("ensure_yearly_consensus_open_in_table_nth: start")

        # Why rounds:
        # - there can be several "show" toggles, and clicking may reflow the DOM
        # - if they cannot all be clicked in one pass, rescan in the next round
        for round_no in range(1, max_rounds + 1):
            view_cnt = await self.browser.count(VIEW_ALL)
            close_cnt = await self.browser.count(CLOSE_ALL)
            logger.debug(
                f"round={round_no} toggle exists: view={view_cnt}, close={close_cnt}"
            )

            # No "show" toggles at all -> either everything is already expanded
            # (only "close" toggles remain) or the page structure differs and
            # they cannot be found. Treat this as success.
            if view_cnt == 0:
                logger.debug("no VIEW toggles found in-table -> treat as OPEN")
                return True

            clicked_any = False

            # Walk i = 0..view_cnt-1 and click only toggles with display != none.
            # (The DOM may change mid-loop and shift count/order, so keep going on failure.)
            for i in range(view_cnt):
                try:
                    # If the DOM changed and this index no longer exists, skip it.
                    if not await self.browser.is_attached(VIEW_ALL, index=i):
                        continue

                    disp = await self.browser.computed_style(
                        VIEW_ALL, index=i, prop="display"
                    )
                    if disp.strip().lower() == "none":
                        continue

                    # Off-screen clicks can fail, so scroll into view first.
                    await self.browser.scroll_into_view(VIEW_ALL, index=i)

                    # Trial click (keep going even if it fails).
                    _ = await self.browser.try_click(
                        VIEW_ALL, index=i, timeout_ms=1500, force=False
                    )

                    # The real click.
                    try:
                        await self.browser.click(
                            VIEW_ALL, index=i, timeout_ms=4000, force=False
                        )
                    except Exception:
                        await self.browser.click(
                            VIEW_ALL, index=i, timeout_ms=4000, force=True
                        )

                    await self.browser.sleep_ms(after_click_sleep_ms)

                    # Wait for the table text to change after the click.
                    prev_text = await self.browser.wait_table_text_changed(
                        table_selector,
                        index=table_index,
                        prev_text=prev_text,
                        timeout_sec=wait_timeout_sec,
                        min_lines=10,
                    )

                    clicked_any = True
                    logger.debug(f"clicked VIEW toggle: idx={i}, display={disp}")

                except Exception as e:
                    logger.debug(
                        f"click VIEW toggle failed: idx={i}, err={type(e).__name__}: {e}"
                    )
                    continue

            # If nothing was clicked this round, then either:
            # - every VIEW toggle was display:none, or
            # - the clicks were blocked
            # => re-check whether any VIEW with display != none remains.
            if not clicked_any:
                remain = await self.browser.count(VIEW_ALL)
                logger.debug(f"no clicks in round; remain VIEW count={remain}")
                # VIEW toggles that are all display:none effectively mean "open"
                # (only the close toggles are visible), so treat this as success.
                return True

            # Otherwise rescan next round and click any VIEW (display != none)
            # still left (once everything is clicked, nothing remains to click).

        # Getting here after all rounds means the "show" toggles keep staying
        # display != none: a structure/permission/overlay issue prevents opening.
        logger.warning("ensure_yearly_consensus_open_in_table_nth: exceeded max_rounds")
        return False

    async def click_steps(
        self,
        steps: list[tuple[str, str]],
        *,
        jitter_sec: float = 0.6,
    ) -> None:
        """
        Performs only tab/radio/search-button clicks on the current page.
        """
        for _name, selector in steps:
            await self.browser.wait_attached(selector)
            logger.info(f"click step: {_name}")
            await self.browser.click(selector)
            # Short pause to ease load on server and client.
            wait = int((0.5 + jitter_sec * 0.5) * 1000)
            await self.browser.sleep_ms(wait)
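
For orientation, this is what the scoped Playwright selector strings above expand to when the method is called with the example values from the parameter comments (the composition is taken verbatim from the method body):

table_selector = "xpath=//div[@id='wrapper']//div//table"
table_index = 2

table_scoped = f"{table_selector} >> nth={table_index}"
# -> "xpath=//div[@id='wrapper']//div//table >> nth=2"

view_all = (
    f"{table_scoped} >> xpath=.//a["
    "(contains(@class,'btn_moreY') or contains(@class,'btn_moreQQ'))"
    " and .//span[contains(normalize-space(.),'연간컨센서스보기')]"
    "]"
)  # Playwright chains the xpath engine onto the nth-scoped table locator
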
@@ -0,0 +1,225 @@
# scraper2_hj3415/app/composition.py
from __future__ import annotations

import os
from dataclasses import dataclass
from typing import Optional

from pymongo.asynchronous.database import AsyncDatabase

from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
from scraper2_hj3415.app.adapters.out.playwright.browser_factory import (
    PlaywrightBrowserFactory,
)

from scraper2_hj3415.app.services.fetch.fetch_c101 import FetchC101
from scraper2_hj3415.app.services.fetch.fetch_c103 import FetchC103
from scraper2_hj3415.app.services.fetch.fetch_c104 import FetchC104
from scraper2_hj3415.app.services.fetch.fetch_c106 import FetchC106
from scraper2_hj3415.app.services.fetch.fetch_c108 import FetchC108

from scraper2_hj3415.app.usecases.ingest.ingest_c101 import IngestC101
from scraper2_hj3415.app.usecases.ingest.ingest_c103 import IngestC103
from scraper2_hj3415.app.usecases.ingest.ingest_c104 import IngestC104
from scraper2_hj3415.app.usecases.ingest.ingest_c106 import IngestC106
from scraper2_hj3415.app.usecases.ingest.ingest_c108 import IngestC108


from scraper2_hj3415.app.ports.sinks.nfs_sink_port import NfsSinkPort
from contracts_hj3415.nfs.c101_dto import C101DTO
from contracts_hj3415.nfs.c103_dto import C103DTO
from contracts_hj3415.nfs.c104_dto import C104DTO
from contracts_hj3415.nfs.c106_dto import C106DTO
from contracts_hj3415.nfs.c108_dto import C108DTO

from scraper2_hj3415.app.adapters.out.sinks.mongo_sink import MongoSink
from scraper2_hj3415.app.adapters.out.sinks.memory_sink import MemorySink

from scraper2_hj3415.app.adapters.out.sinks.store import InMemoryStore

from db2_hj3415.mongo import Mongo

from scraper2_hj3415.app.domain.types import Sink


def _env_bool(key: str, default: bool) -> bool:
    v = os.getenv(key)
    return (
        default if v is None else v.strip().lower() in {"1", "true", "yes", "y", "on"}
    )


def _env_int(key: str, default: int) -> int:
    v = os.getenv(key)
    if v is None:
        return default
    try:
        return int(v)
    except ValueError:
        return default


def build_browser_factory() -> BrowserFactoryPort:
    return PlaywrightBrowserFactory(
        headless=_env_bool("SCRAPER_HEADLESS", True),
        timeout_ms=_env_int("SCRAPER_TIMEOUT_MS", 20_000),
        max_concurrency=_env_int("SCRAPER_MAX_CONCURRENCY", 2),
    )


# -------------------------
# Bundles
# -------------------------


@dataclass(frozen=True)
class FetchUsecases:
    c101: FetchC101
    c103: FetchC103
    c104: FetchC104
    c106: FetchC106
    c108: FetchC108


@dataclass(frozen=True)
class Sinks:
    c101: NfsSinkPort[C101DTO]
    c103: NfsSinkPort[C103DTO]
    c104: NfsSinkPort[C104DTO]
    c106: NfsSinkPort[C106DTO]
    c108: NfsSinkPort[C108DTO]


@dataclass(frozen=True)
class IngestUsecases:
    c101: IngestC101
    c103: IngestC103
    c104: IngestC104
    c106: IngestC106
    c108: IngestC108


@dataclass(frozen=True)
class Usecases:
    fetch: FetchUsecases
    ingest: IngestUsecases
    sinks: Sinks
    store: InMemoryStore | None = None  # set only for the memory sink
    mongo: Mongo | None = None  # set only for the mongo sink
    db: AsyncDatabase | None = None  # set only for the mongo sink
    browser_factory: Optional[BrowserFactoryPort] = None

    async def aclose(self) -> None:
        if self.browser_factory is not None:
            await self.browser_factory.aclose()

        if self.mongo is not None:
            await self.mongo.close()


# -------------------------
# builders
# -------------------------


def build_fetch_usecases(*, factory: BrowserFactoryPort) -> FetchUsecases:
    return FetchUsecases(
        c101=FetchC101(factory=factory),
        c103=FetchC103(factory=factory),
        c104=FetchC104(factory=factory),
        c106=FetchC106(factory=factory),
        c108=FetchC108(factory=factory),
    )


@dataclass(frozen=True)
class MemoryBundle:
    store: InMemoryStore
    sinks: Sinks


def build_memory_bundle() -> MemoryBundle:
    store = InMemoryStore()
    c101_sink: NfsSinkPort[C101DTO] = MemorySink(store)
    c103_sink: NfsSinkPort[C103DTO] = MemorySink(store)
    c104_sink: NfsSinkPort[C104DTO] = MemorySink(store)
    c106_sink: NfsSinkPort[C106DTO] = MemorySink(store)
    c108_sink: NfsSinkPort[C108DTO] = MemorySink(store)
    sinks = Sinks(
        c101=c101_sink,
        c103=c103_sink,
        c104=c104_sink,
        c106=c106_sink,
        c108=c108_sink,
    )
    return MemoryBundle(store=store, sinks=sinks)


# ---- mongo bundle ----


@dataclass(frozen=True)
class MongoBundle:
    mongo: Mongo
    db: AsyncDatabase
    sinks: Sinks


def build_mongo_bundle() -> MongoBundle:
    mongo = Mongo()  # db2 reads its settings from env (DB2_MONGO_URI etc.)
    db = mongo.get_db()
    c101_sink: NfsSinkPort[C101DTO] = MongoSink(db)
    c103_sink: NfsSinkPort[C103DTO] = MongoSink(db)
    c104_sink: NfsSinkPort[C104DTO] = MongoSink(db)
    c106_sink: NfsSinkPort[C106DTO] = MongoSink(db)
    c108_sink: NfsSinkPort[C108DTO] = MongoSink(db)
    sinks = Sinks(
        c101=c101_sink,
        c103=c103_sink,
        c104=c104_sink,
        c106=c106_sink,
        c108=c108_sink,
    )
    return MongoBundle(mongo=mongo, db=db, sinks=sinks)


def build_ingest_usecases(*, fetch: FetchUsecases, sinks: Sinks) -> IngestUsecases:
    return IngestUsecases(
        c101=IngestC101(fetch=fetch.c101, sink=sinks.c101),
        c103=IngestC103(fetch=fetch.c103, sink=sinks.c103),
        c104=IngestC104(fetch=fetch.c104, sink=sinks.c104),
        c106=IngestC106(fetch=fetch.c106, sink=sinks.c106),
        c108=IngestC108(fetch=fetch.c108, sink=sinks.c108),
    )


def build_usecases(
    *, factory: BrowserFactoryPort | None = None, sink: Sink = "memory"
) -> Usecases:
    factory = factory or build_browser_factory()
    fetch = build_fetch_usecases(factory=factory)

    if sink == "memory":
        bundle = build_memory_bundle()
        ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
        return Usecases(
            fetch=fetch,
            ingest=ingest,
            sinks=bundle.sinks,
            store=bundle.store,
            browser_factory=factory,
        )

    if sink == "mongo":
        bundle = build_mongo_bundle()
        ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
        return Usecases(
            fetch=fetch,
            ingest=ingest,
            sinks=bundle.sinks,
            mongo=bundle.mongo,
            db=bundle.db,
            browser_factory=factory,
        )

    raise ValueError(f"Unknown sink: {sink}")
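
A minimal driver sketch for this composition root. Only build_usecases and aclose are taken from this hunk; the call signatures of the individual fetch/ingest usecases live in files not shown here, so that part is left elided:

import asyncio

from scraper2_hj3415.app.composition import build_usecases

async def main() -> None:
    # sink="mongo" also works; db2_hj3415 then reads DB2_MONGO_URI etc. from env
    uc = build_usecases(sink="memory")
    try:
        ...  # drive uc.ingest.c101 / uc.fetch.c103 etc. (signatures not in this hunk)
    finally:
        await uc.aclose()  # closes the browser factory, and the mongo client when present

asyncio.run(main())
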
@@ -0,0 +1,61 @@
# scraper2_hj3415/app/domain/blocks.py
from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Mapping, Sequence
from scraper2_hj3415.app.domain.constants import BLOCK_KEYS_BY_ENDPOINT
from scraper2_hj3415.app.domain.endpoint import EndpointKind
from scraper2_hj3415.app.domain.types import BlockKey, MetricKey, Record
from scraper2_hj3415.app.domain.series import MetricSeries


def _validate_block_key(endpoint_kind: EndpointKind, block_key: str) -> None:
    allowed = BLOCK_KEYS_BY_ENDPOINT.get(endpoint_kind)
    if allowed is not None and block_key not in allowed:
        raise ValueError(f"Invalid block key for {endpoint_kind}: {block_key!r}")


@dataclass(frozen=True)
class MetricsBlock:
    endpoint_kind: EndpointKind
    block_key: BlockKey
    metrics: Mapping[MetricKey, MetricSeries]

    def __post_init__(self) -> None:
        _validate_block_key(self.endpoint_kind, self.block_key)

        # Guard against container key / entity key mismatches (optional).
        for k, m in self.metrics.items():
            if m.key != k:
                raise ValueError(
                    f"Metric key mismatch: map key={k!r} != series key={m.key!r}"
                )

    def get(self, key: MetricKey) -> MetricSeries | None:
        m = self.metrics.get(key)
        if m and m.key != key:
            raise ValueError("Metric key mismatch")
        return m


# TODO: more block shapes to come; revisit later.


@dataclass(frozen=True)
class RecordsBlock:
    endpoint_kind: EndpointKind
    block_key: BlockKey
    rows: Sequence[Record]

    def __post_init__(self) -> None:
        _validate_block_key(self.endpoint_kind, self.block_key)


@dataclass(frozen=True)
class KvBlock:
    endpoint_kind: EndpointKind
    block_key: BlockKey
    values: Mapping[str, Any]


BlockData = MetricsBlock | RecordsBlock | KvBlock
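
A quick sketch of the key-mismatch guard, assuming the C103 key list from contracts_hj3415 is non-empty (get_block_keys comes from the constants module in the next hunk; the metric name "매출액" is illustrative):

from scraper2_hj3415.app.domain.blocks import MetricsBlock
from scraper2_hj3415.app.domain.constants import get_block_keys
from scraper2_hj3415.app.domain.endpoint import EndpointKind
from scraper2_hj3415.app.domain.series import MetricSeries

block_key = get_block_keys(EndpointKind.C103)[0]  # an official key, not hard-coded here
series = MetricSeries(key="매출액", values={"2024/12": 100.0})
block = MetricsBlock(
    endpoint_kind=EndpointKind.C103,
    block_key=block_key,
    metrics={"매출액": series},
)
assert block.get("매출액") is series
# metrics={"영업이익": series} would raise ValueError in __post_init__,
# since each mapping key must equal the series' own key.
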
@@ -0,0 +1,33 @@
# scraper2_hj3415/app/domain/constants.py
from __future__ import annotations

from typing import Mapping

from contracts_hj3415.nfs.types import BlockKeys
from contracts_hj3415.nfs.constants import (
    C101_BLOCK_KEYS,
    C103_BLOCK_KEYS,
    C104_BLOCK_KEYS,
    C106_BLOCK_KEYS,
    C108_BLOCK_KEYS,
)
from scraper2_hj3415.app.domain.endpoint import EndpointKind


BLOCK_KEYS_BY_ENDPOINT: Mapping[EndpointKind, tuple[str, ...]] = {
    EndpointKind.C101: C101_BLOCK_KEYS,
    EndpointKind.C103: C103_BLOCK_KEYS,
    EndpointKind.C104: C104_BLOCK_KEYS,
    EndpointKind.C106: C106_BLOCK_KEYS,
    EndpointKind.C108: C108_BLOCK_KEYS,
}


def get_block_keys(endpoint: EndpointKind) -> tuple[str, ...]:
    """
    The "official" block-key list for an endpoint.
    - Kept in the domain layer, but implementation details such as
      selectors or table indexes do not belong here.
    """
    return BLOCK_KEYS_BY_ENDPOINT.get(endpoint, ())


def is_known_block(endpoint: EndpointKind, key: BlockKeys) -> bool:
    """
    Whether the block key is in the endpoint's official list.
    (Used for validation, filtering, dynamic payload merging, etc.)
    """
    return key in BLOCK_KEYS_BY_ENDPOINT.get(endpoint, ())
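
Usage is straightforward; the concrete key strings live in contracts_hj3415 and are not part of this diff:

from scraper2_hj3415.app.domain.constants import get_block_keys, is_known_block
from scraper2_hj3415.app.domain.endpoint import EndpointKind

keys = get_block_keys(EndpointKind.C103)  # official tuple, or () for an unmapped endpoint
ok = all(is_known_block(EndpointKind.C103, k) for k in keys)  # True by construction
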
@@ -0,0 +1,16 @@
# scraper2_hj3415/app/domain/doc.py
from __future__ import annotations

from dataclasses import dataclass
from typing import Mapping
from scraper2_hj3415.app.domain.endpoint import EndpointKind
from scraper2_hj3415.app.domain.types import BlockKey, LabelsMap
from scraper2_hj3415.app.domain.blocks import BlockData


@dataclass(frozen=True)
class NfsDoc:
    code: str
    endpoint_kind: EndpointKind
    blocks: Mapping[BlockKey, BlockData]
    labels: Mapping[BlockKey, LabelsMap]
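
A minimal assembly sketch using KvBlock, which (unlike MetricsBlock/RecordsBlock) performs no block-key validation; the block key and values here are illustrative:

from scraper2_hj3415.app.domain.blocks import KvBlock
from scraper2_hj3415.app.domain.doc import NfsDoc
from scraper2_hj3415.app.domain.endpoint import EndpointKind

kv = KvBlock(endpoint_kind=EndpointKind.C101, block_key="overview", values={"name": "삼성전자"})
doc = NfsDoc(
    code="005930",
    endpoint_kind=EndpointKind.C101,
    blocks={"overview": kv},
    labels={"overview": {"name": "회사명"}},  # metric key -> raw label
)
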
@@ -0,0 +1,11 @@
# scraper2_hj3415/app/domain/series.py
from __future__ import annotations

from dataclasses import dataclass
from typing import Mapping
from scraper2_hj3415.app.domain.types import MetricKey, Period, Num


@dataclass(frozen=True)
class MetricSeries:
    key: MetricKey
    values: Mapping[Period, Num]
@@ -0,0 +1,19 @@
# scraper2_hj3415/app/domain/types.py
from __future__ import annotations

from typing import Mapping, Any, Sequence, TypeAlias, Literal

BlockKey = str
MetricKey = str
Period = str
Num = float | int | None

Record: TypeAlias = Mapping[str, Any]
Records: TypeAlias = Sequence[Record]
RawLabel = str
LabelsMap = dict[MetricKey, RawLabel]

Sink = Literal["memory", "mongo"]
@@ -0,0 +1,92 @@
# scraper2_hj3415/app/parsing/_normalize/label.py
from __future__ import annotations

import re
from typing import Any

from common_hj3415.utils import clean_text
from scraper2_hj3415.app.parsing._normalize.text import normalize_text

# -----------------------------
# Generic label normalization
# -----------------------------

UI_LABEL_NOISE = (
    "펼치기",  # "expand"
    "접기",  # "collapse"
    "더보기",  # "show more"
)


def sanitize_label(x: Any) -> str:
    """
    From a raw label (including 항목_raw):
    - strip UI text such as '펼치기' ("expand")
    - collapse excessive whitespace
    - trim leading/trailing whitespace
    """
    s = normalize_text(x)

    # Remove UI noise words.
    for w in UI_LABEL_NOISE:
        s = s.replace(w, " ")

    return clean_text(s)


# -----------------------------
# Metric-label normalization
# -----------------------------
_BRACKET_PATTERN = re.compile(r"\[[^\]]*\]")  # e.g. [구K-IFRS]
_EXTRA_WORDS_PATTERN = re.compile(r"(펼치기|연간컨센서스보기|연간컨센서스닫기)")
_ALL_PAREN_PATTERN = re.compile(r"\([^)]*\)")  # strips every parenthesized part


def normalize_label_base(text: str | None) -> str:
    s = sanitize_label(text)
    s = _EXTRA_WORDS_PATTERN.sub("", s)
    s = _BRACKET_PATTERN.sub("", s)
    s = _ALL_PAREN_PATTERN.sub("", s)
    s = s.replace("*", "")
    return clean_text(s)


def normalize_metric_label(text: str | None) -> str:
    # "보유 지분 (%)" → "보유 지분" (spaces preserved)
    return normalize_label_base(text)


def normalize_key_label(text: str | None) -> str:
    # "보유 지분 (%)" → "보유지분"
    s = normalize_label_base(text)
    return s.replace(" ", "").replace("\xa0", "").replace("%", "").strip()


# -----------------------------
# Column-name normalization
# -----------------------------
_COL_PAREN_PATTERN = re.compile(r"\((IFRS[^)]*|E|YoY|QoQ)[^)]*\)")
_COL_EXTRA_WORDS = re.compile(r"(연간컨센서스보기|연간컨센서스닫기)")
_COL_DOTNUM = re.compile(r"\.\d+$")  # strips pandas duplicate-column suffixes (.1, .2 ...)


def normalize_col_label(col: str | None) -> str:
    """
    Normalize a column name, e.g.
    "2024/12 (IFRS연결) 연간컨센서스보기" -> "2024/12"
    "2025/12(E) (IFRS연결) 연간컨센서스닫기" -> "2025/12"
    "전년대비 (YoY)" -> "전년대비"
    "전년대비 (YoY).1" -> "전년대비" (duplicates are split into _2/_3 in post-processing)
    """
    s = normalize_text(col)
    # 1) Drop pandas-appended suffixes like .1 (normalization collisions are handled downstream).
    s = _COL_DOTNUM.sub("", s)

    # 2) Drop the consensus toggle phrases.
    s = _COL_EXTRA_WORDS.sub("", s)

    # 3) Drop parenthesized annotations: (IFRS...), (E), (YoY), (QoQ).
    s = _COL_PAREN_PATTERN.sub("", s)

    return clean_text(s)
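
For reference, how the three normalizers differ on the same raw text (expected results, assuming normalize_text/clean_text collapse runs of whitespace and trim, as the docstrings above imply):

from scraper2_hj3415.app.parsing._normalize.label import (
    normalize_col_label,
    normalize_key_label,
    normalize_metric_label,
)

normalize_metric_label("보유 지분 (%) 펼치기")  # -> "보유 지분" (spaces kept)
normalize_key_label("보유 지분 (%) 펼치기")     # -> "보유지분" (spaces and % dropped)
normalize_col_label("2025/12(E) (IFRS연결) 연간컨센서스닫기")  # -> "2025/12"
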
@@ -0,0 +1,53 @@
# scraper2_hj3415/app/parsing/_normalize/table.py
from __future__ import annotations

from collections import Counter

import numpy as np
import pandas as pd
from .label import normalize_col_label, normalize_metric_label


def _dedupe_columns(cols: list[str]) -> list[str]:
    """
    If normalization produces duplicate column names, append _2, _3, ...
    automatically to keep them unique.
    e.g. ["전년대비", "전년대비"] -> ["전년대비", "전년대비_2"]
    """
    seen: Counter[str] = Counter()
    out: list[str] = []
    for c in cols:
        c = c or ""
        seen[c] += 1
        if seen[c] == 1:
            out.append(c)
        else:
            out.append(f"{c}_{seen[c]}")
    return out


# -----------------------------
# Whole-DataFrame normalization + records conversion
# -----------------------------
def normalize_metrics_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    - normalize every column name
    - normalize the '항목' (metric) column values
    - NaN -> None
    - split duplicate column names automatically (_2/_3)
    """
    if df is None or df.empty:
        return df

    df = df.copy()

    # Normalize column names and keep them unique.
    norm_cols = [normalize_col_label(c) for c in df.columns.astype(str).tolist()]
    df.columns = _dedupe_columns(norm_cols)

    # Normalize the metric labels.
    if "항목" in df.columns:
        df["항목"] = df["항목"].map(normalize_metric_label)

    # NaN -> None
    df = df.replace({np.nan: None})
    return df