scraper2-hj3415 2.4.1__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
  3. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  4. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  5. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  6. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  7. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
  8. scraper2_hj3415/app/composition.py +225 -0
  9. scraper2_hj3415/app/domain/blocks.py +61 -0
  10. scraper2_hj3415/app/domain/constants.py +33 -0
  11. scraper2_hj3415/app/domain/doc.py +16 -0
  12. scraper2_hj3415/app/domain/endpoint.py +11 -0
  13. scraper2_hj3415/app/domain/series.py +11 -0
  14. scraper2_hj3415/app/domain/types.py +19 -0
  15. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  16. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  17. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  18. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  19. scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
  20. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  21. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  22. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  23. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  24. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  25. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  26. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  27. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  28. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  29. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  30. scraper2_hj3415/app/parsing/c103_parser.py +19 -0
  31. scraper2_hj3415/app/parsing/c104_parser.py +23 -0
  32. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  33. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  34. scraper2_hj3415/app/ports/__init__.py +0 -0
  35. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  36. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  37. scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
  38. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  39. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  40. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  41. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  42. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
  44. scraper2_hj3415/app/services/__init__.py +0 -0
  45. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  46. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  47. scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
  48. scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
  49. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  50. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  51. scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
  52. scraper2_hj3415/app/usecases/__init__.py +0 -0
  53. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  55. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  56. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  57. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  58. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  59. scraper2/main.py → scraper2_hj3415/cli.py +40 -70
  60. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
  61. scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
  62. scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
  63. scraper2/.DS_Store +0 -0
  64. scraper2/adapters/out/.DS_Store +0 -0
  65. scraper2/adapters/out/playwright/browser.py +0 -102
  66. scraper2/adapters/out/sinks/.DS_Store +0 -0
  67. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  68. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  69. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  70. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  71. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  72. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  73. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  74. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  75. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  76. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  77. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  78. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  79. scraper2/app/composition.py +0 -204
  80. scraper2/app/parsing/_converters.py +0 -85
  81. scraper2/app/parsing/_normalize.py +0 -134
  82. scraper2/app/parsing/c101_parser.py +0 -143
  83. scraper2/app/parsing/c103_parser.py +0 -128
  84. scraper2/app/parsing/c104_parser.py +0 -143
  85. scraper2/app/parsing/c106_parser.py +0 -153
  86. scraper2/app/parsing/c108_parser.py +0 -65
  87. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  88. scraper2/app/ports/browser/browser_port.py +0 -22
  89. scraper2/app/ports/ingest_port.py +0 -14
  90. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  91. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  92. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  93. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  94. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  95. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  96. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  97. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  98. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  99. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  100. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  101. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  102. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  103. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  104. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  105. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  106. scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
  107. scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
  108. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  109. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  110. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  111. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  112. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  113. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  114. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  115. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  116. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  117. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  118. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  119. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
  120. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,168 @@
+ # scraper2_hj3415/app/adapters/site/wisereport_playwright.py
+ from __future__ import annotations
+
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from logging_hj3415 import logger
+
+
+ class WiseReportPlaywright:
+     def __init__(self, browser: BrowserPort):
+         self.browser = browser
+
+     async def ensure_yearly_consensus_open_in_table_nth(
+         self,
+         *,
+         table_selector: str,  # e.g. TABLE_XPATH ("xpath=//div[@id='wrapper']//div//table")
+         table_index: int,  # e.g. TABLE_INDEX (2)
+         after_click_sleep_ms: int = 150,
+         max_rounds: int = 6,
+         wait_timeout_sec: float = 12.0,
+     ) -> bool:
+         """
+         Goal: make sure the yearly-consensus columns end up expanded.
+         Strategy:
+         - within the TABLE_NTH scope,
+         - take the <a> toggles that are btn_moreY or btn_moreQQ
+         - and carry the '연간컨센서스보기' (show yearly consensus) text,
+         - click every one whose computedStyle(display) != 'none',
+         - waiting for the table text to change after each click.
+         """
+
+         table_scoped = f"{table_selector} >> nth={table_index}"
+
+         # Match only the toggle anchors inside the table (both btn_moreY and btn_moreQQ).
+         VIEW_ALL = (
+             f"{table_scoped} >> xpath=.//a["
+             "("
+             "contains(@class,'btn_moreY') or contains(@class,'btn_moreQQ')"
+             ")"
+             " and .//span[contains(normalize-space(.),'연간컨센서스보기')]"
+             "]"
+         )
+
+         CLOSE_ALL = (
+             f"{table_scoped} >> xpath=.//a["
+             "("
+             "contains(@class,'btn_moreY') or contains(@class,'btn_moreQQ')"
+             ")"
+             " and .//span[contains(normalize-space(.),'연간컨센서스닫기')]"
+             "]"
+         )
+
+         # Baseline "prev_text" used to detect table-text changes.
+         prev_text = await self.browser.wait_table_text_changed(
+             table_selector,
+             index=table_index,
+             prev_text=None,
+             timeout_sec=wait_timeout_sec,
+             min_lines=10,
+         )
+
+         logger.debug("ensure_yearly_consensus_open_in_table_nth: start")
+
+         # Why rounds:
+         # - there are several "show" toggles, and a click may reshuffle the DOM
+         # - if we cannot click them all in one pass, the next round rescans
+         for round_no in range(1, max_rounds + 1):
+             view_cnt = await self.browser.count(VIEW_ALL)
+             close_cnt = await self.browser.count(CLOSE_ALL)
+             logger.debug(
+                 f"round={round_no} toggle exists: view={view_cnt}, close={close_cnt}"
+             )
+
+             # No "show" toggles at all -> either everything is already expanded
+             # (only "close" toggles remain) or the page structure differs and we
+             # cannot find them. Treat this as success.
+             if view_cnt == 0:
+                 logger.debug("no VIEW toggles found in-table -> treat as OPEN")
+                 return True
+
+             clicked_any = False
+
+             # Walk i = 0..view_cnt-1 and click only toggles with display != none.
+             # (The DOM may change mid-loop, shifting count/order, so keep going on failure.)
+             for i in range(view_cnt):
+                 try:
+                     # If the DOM changed and this index is gone, skip it.
+                     if not await self.browser.is_attached(VIEW_ALL, index=i):
+                         continue
+
+                     disp = await self.browser.computed_style(
+                         VIEW_ALL, index=i, prop="display"
+                     )
+                     if disp.strip().lower() == "none":
+                         continue
+
+                     # Off-screen elements can fail to click, so scroll first.
+                     await self.browser.scroll_into_view(VIEW_ALL, index=i)
+
+                     # Trial click (keep going even if it fails).
+                     _ = await self.browser.try_click(
+                         VIEW_ALL, index=i, timeout_ms=1500, force=False
+                     )
+
+                     # The real click.
+                     try:
+                         await self.browser.click(
+                             VIEW_ALL, index=i, timeout_ms=4000, force=False
+                         )
+                     except Exception:
+                         await self.browser.click(
+                             VIEW_ALL, index=i, timeout_ms=4000, force=True
+                         )
+
+                     await self.browser.sleep_ms(after_click_sleep_ms)
+
+                     # Wait for the table text to change after the click.
+                     prev_text = await self.browser.wait_table_text_changed(
+                         table_selector,
+                         index=table_index,
+                         prev_text=prev_text,
+                         timeout_sec=wait_timeout_sec,
+                         min_lines=10,
+                     )
+
+                     clicked_any = True
+                     logger.debug(f"clicked VIEW toggle: idx={i}, display={disp}")
+
+                 except Exception as e:
+                     logger.debug(
+                         f"click VIEW toggle failed: idx={i}, err={type(e).__name__}: {e}"
+                     )
+                     continue
+
+             # If nothing got clicked this round:
+             # - every VIEW toggle was display:none, or
+             # - the clicks were blocked
+             # => recheck whether any VIEW (display != none) remains
+             if not clicked_any:
+                 remain = await self.browser.count(VIEW_ALL)
+                 logger.debug(f"no clicks in round; remain VIEW count={remain}")
+                 # If VIEW toggles exist but all are display:none, the table is
+                 # effectively expanded (only "close" toggles are visible),
+                 # so treat this as success.
+                 return True
+
+             # The next round rescans; any remaining VIEW (display != none) gets clicked again.
+             # (Once everything has been clicked, nothing is left to click.)
+
+         # Falling through all rounds means the "show" toggles keep staying
+         # display != none: a structure/permission/overlay issue prevents expanding.
+         logger.warning("ensure_yearly_consensus_open_in_table_nth: exceeded max_rounds")
+         return False
+
+     async def click_steps(
+         self,
+         steps: list[tuple[str, str]],
+         *,
+         jitter_sec: float = 0.6,
+     ) -> None:
+         """
+         Perform only tab/radio/search-button clicks on the current page.
+         """
+         for _name, selector in steps:
+             await self.browser.wait_attached(selector)
+             logger.info(f"click step: {_name}")
+             await self.browser.click(selector)
+             # Ease load on server/client: a small pause
+             # (note: computed deterministically from jitter_sec, not randomized)
+             wait = int((0.5 + (jitter_sec * 0.5)) * 1000)
+             await self.browser.sleep_ms(wait)
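
For orientation, a minimal sketch of driving this adapter. Assumptions: the factory returned by `build_browser_factory` (see the `composition.py` hunk below) hands out `BrowserPort` instances via a `new_browser()` coroutine, a placeholder name since the `BrowserFactoryPort` surface is not part of this diff; the selector values come from the inline comments above.

import asyncio

from scraper2_hj3415.app.adapters.site.wisereport_playwright import WiseReportPlaywright
from scraper2_hj3415.app.composition import build_browser_factory

TABLE_XPATH = "xpath=//div[@id='wrapper']//div//table"  # example from the comments above
TABLE_INDEX = 2


async def main() -> None:
    factory = build_browser_factory()
    try:
        # Hypothetical acquisition method; the factory port is not shown in this diff.
        browser = await factory.new_browser()
        site = WiseReportPlaywright(browser)
        opened = await site.ensure_yearly_consensus_open_in_table_nth(
            table_selector=TABLE_XPATH,
            table_index=TABLE_INDEX,
        )
        print("yearly consensus expanded:", opened)
    finally:
        await factory.aclose()


asyncio.run(main())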
@@ -0,0 +1,225 @@
+ # scraper2_hj3415/app/composition.py
+ from __future__ import annotations
+
+ import os
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from pymongo.asynchronous.database import AsyncDatabase
+
+ from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+ from scraper2_hj3415.app.adapters.out.playwright.browser_factory import (
+     PlaywrightBrowserFactory,
+ )
+
+ from scraper2_hj3415.app.services.fetch.fetch_c101 import FetchC101
+ from scraper2_hj3415.app.services.fetch.fetch_c103 import FetchC103
+ from scraper2_hj3415.app.services.fetch.fetch_c104 import FetchC104
+ from scraper2_hj3415.app.services.fetch.fetch_c106 import FetchC106
+ from scraper2_hj3415.app.services.fetch.fetch_c108 import FetchC108
+
+ from scraper2_hj3415.app.usecases.ingest.ingest_c101 import IngestC101
+ from scraper2_hj3415.app.usecases.ingest.ingest_c103 import IngestC103
+ from scraper2_hj3415.app.usecases.ingest.ingest_c104 import IngestC104
+ from scraper2_hj3415.app.usecases.ingest.ingest_c106 import IngestC106
+ from scraper2_hj3415.app.usecases.ingest.ingest_c108 import IngestC108
+
+
+ from scraper2_hj3415.app.ports.sinks.nfs_sink_port import NfsSinkPort
+ from contracts_hj3415.nfs.c101_dto import C101DTO
+ from contracts_hj3415.nfs.c103_dto import C103DTO
+ from contracts_hj3415.nfs.c104_dto import C104DTO
+ from contracts_hj3415.nfs.c106_dto import C106DTO
+ from contracts_hj3415.nfs.c108_dto import C108DTO
+
+ from scraper2_hj3415.app.adapters.out.sinks.mongo_sink import MongoSink
+ from scraper2_hj3415.app.adapters.out.sinks.memory_sink import MemorySink
+
+ from scraper2_hj3415.app.adapters.out.sinks.store import InMemoryStore
+
+ from db2_hj3415.mongo import Mongo
+
+ from scraper2_hj3415.app.domain.types import Sink
+
+
+ def _env_bool(key: str, default: bool) -> bool:
+     v = os.getenv(key)
+     return (
+         default if v is None else v.strip().lower() in {"1", "true", "yes", "y", "on"}
+     )
+
+
+ def _env_int(key: str, default: int) -> int:
+     v = os.getenv(key)
+     if v is None:
+         return default
+     try:
+         return int(v)
+     except ValueError:
+         return default
+
+
+ def build_browser_factory() -> BrowserFactoryPort:
+     return PlaywrightBrowserFactory(
+         headless=_env_bool("SCRAPER_HEADLESS", True),
+         timeout_ms=_env_int("SCRAPER_TIMEOUT_MS", 20_000),
+         max_concurrency=_env_int("SCRAPER_MAX_CONCURRENCY", 2),
+     )
+
+
+ # -------------------------
+ # Bundles
+ # -------------------------
+
+
+ @dataclass(frozen=True)
+ class FetchUsecases:
+     c101: FetchC101
+     c103: FetchC103
+     c104: FetchC104
+     c106: FetchC106
+     c108: FetchC108
+
+
+ @dataclass(frozen=True)
+ class Sinks:
+     c101: NfsSinkPort[C101DTO]
+     c103: NfsSinkPort[C103DTO]
+     c104: NfsSinkPort[C104DTO]
+     c106: NfsSinkPort[C106DTO]
+     c108: NfsSinkPort[C108DTO]
+
+
+ @dataclass(frozen=True)
+ class IngestUsecases:
+     c101: IngestC101
+     c103: IngestC103
+     c104: IngestC104
+     c106: IngestC106
+     c108: IngestC108
+
+
+ @dataclass(frozen=True)
+ class Usecases:
+     fetch: FetchUsecases
+     ingest: IngestUsecases
+     sinks: Sinks
+     store: InMemoryStore | None = None  # ✅ memory sink only
+     mongo: Mongo | None = None  # ✅ mongo sink only
+     db: AsyncDatabase | None = None  # ✅ mongo sink only
+     browser_factory: Optional[BrowserFactoryPort] = None
+
+     async def aclose(self) -> None:
+         if self.browser_factory is not None:
+             await self.browser_factory.aclose()
+
+         if self.mongo is not None:
+             await self.mongo.close()
+
+
+ # -------------------------
+ # builders
+ # -------------------------
+
+
+ def build_fetch_usecases(*, factory: BrowserFactoryPort) -> FetchUsecases:
+     return FetchUsecases(
+         c101=FetchC101(factory=factory),
+         c103=FetchC103(factory=factory),
+         c104=FetchC104(factory=factory),
+         c106=FetchC106(factory=factory),
+         c108=FetchC108(factory=factory),
+     )
+
+
+ @dataclass(frozen=True)
+ class MemoryBundle:
+     store: InMemoryStore
+     sinks: Sinks
+
+
+ def build_memory_bundle() -> MemoryBundle:
+     store = InMemoryStore()
+     c101_sink: NfsSinkPort[C101DTO] = MemorySink(store)
+     c103_sink: NfsSinkPort[C103DTO] = MemorySink(store)
+     c104_sink: NfsSinkPort[C104DTO] = MemorySink(store)
+     c106_sink: NfsSinkPort[C106DTO] = MemorySink(store)
+     c108_sink: NfsSinkPort[C108DTO] = MemorySink(store)
+     sinks = Sinks(
+         c101=c101_sink,
+         c103=c103_sink,
+         c104=c104_sink,
+         c106=c106_sink,
+         c108=c108_sink,
+     )
+     return MemoryBundle(store=store, sinks=sinks)
+
+
+ # ---- mongo bundle ----
+
+
+ @dataclass(frozen=True)
+ class MongoBundle:
+     mongo: Mongo
+     db: AsyncDatabase
+     sinks: Sinks
+
+
+ def build_mongo_bundle() -> MongoBundle:
+     mongo = Mongo()  # db2 reads its settings from env (DB2_MONGO_URI, etc.)
+     db = mongo.get_db()
+     c101_sink: NfsSinkPort[C101DTO] = MongoSink(db)
+     c103_sink: NfsSinkPort[C103DTO] = MongoSink(db)
+     c104_sink: NfsSinkPort[C104DTO] = MongoSink(db)
+     c106_sink: NfsSinkPort[C106DTO] = MongoSink(db)
+     c108_sink: NfsSinkPort[C108DTO] = MongoSink(db)
+     sinks = Sinks(
+         c101=c101_sink,
+         c103=c103_sink,
+         c104=c104_sink,
+         c106=c106_sink,
+         c108=c108_sink,
+     )
+     return MongoBundle(mongo=mongo, db=db, sinks=sinks)
+
+
+ def build_ingest_usecases(*, fetch: FetchUsecases, sinks: Sinks) -> IngestUsecases:
+     return IngestUsecases(
+         c101=IngestC101(fetch=fetch.c101, sink=sinks.c101),
+         c103=IngestC103(fetch=fetch.c103, sink=sinks.c103),
+         c104=IngestC104(fetch=fetch.c104, sink=sinks.c104),
+         c106=IngestC106(fetch=fetch.c106, sink=sinks.c106),
+         c108=IngestC108(fetch=fetch.c108, sink=sinks.c108),
+     )
+
+
+ def build_usecases(
+     *, factory: BrowserFactoryPort | None = None, sink: Sink = "memory"
+ ) -> Usecases:
+     factory = factory or build_browser_factory()
+     fetch = build_fetch_usecases(factory=factory)
+
+     if sink == "memory":
+         bundle = build_memory_bundle()
+         ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
+         return Usecases(
+             fetch=fetch,
+             ingest=ingest,
+             sinks=bundle.sinks,
+             store=bundle.store,
+             browser_factory=factory,
+         )
+
+     if sink == "mongo":
+         bundle = build_mongo_bundle()
+         ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
+         return Usecases(
+             fetch=fetch,
+             ingest=ingest,
+             sinks=bundle.sinks,
+             mongo=bundle.mongo,
+             db=bundle.db,
+             browser_factory=factory,
+         )
+
+     raise ValueError(f"Unknown sink_kind: {sink}")
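
A minimal sketch of using this composition root. The one assumption is how an ingest usecase is invoked; `execute(code)` below is a placeholder name, since only the usecases' construction appears in this diff.

import asyncio

from scraper2_hj3415.app.composition import build_usecases


async def main() -> None:
    uc = build_usecases(sink="memory")  # sink="mongo" needs DB2_MONGO_URI etc. in env
    try:
        # Hypothetical method name; the usecase call surface is not in this diff.
        await uc.ingest.c101.execute("005930")
        print(uc.store)  # InMemoryStore is populated only for the memory sink
    finally:
        await uc.aclose()  # closes the browser factory and, if present, Mongo


asyncio.run(main())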
@@ -0,0 +1,61 @@
+ # scraper2_hj3415/app/domain/blocks.py
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Any, Mapping, Sequence
+ from scraper2_hj3415.app.domain.constants import BLOCK_KEYS_BY_ENDPOINT
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.types import BlockKey, MetricKey, Record
+ from scraper2_hj3415.app.domain.series import MetricSeries
+
+
+ def _validate_block_key(endpoint_kind: EndpointKind, block_key: str) -> None:
+     allowed = BLOCK_KEYS_BY_ENDPOINT.get(endpoint_kind)
+     if allowed is not None and block_key not in allowed:
+         raise ValueError(f"Invalid block key for {endpoint_kind}: {block_key!r}")
+
+
+ @dataclass(frozen=True)
+ class MetricsBlock:
+     endpoint_kind: EndpointKind
+     block_key: BlockKey
+     metrics: Mapping[MetricKey, MetricSeries]
+
+     def __post_init__(self) -> None:
+         _validate_block_key(self.endpoint_kind, self.block_key)
+
+         # Guard against container-key / entity-key mismatches (optional)
+         for k, m in self.metrics.items():
+             if m.key != k:
+                 raise ValueError(
+                     f"Metric key mismatch: map key={k!r} != series key={m.key!r}"
+                 )
+
+     def get(self, key: MetricKey) -> MetricSeries | None:
+         m = self.metrics.get(key)
+         if m and m.key != key:
+             raise ValueError("Metric key mismatch")
+         return m
+
+
+ # More block shapes may be needed; revisit later.
+
+
+ @dataclass(frozen=True)
+ class RecordsBlock:
+     endpoint_kind: EndpointKind
+     block_key: BlockKey
+     rows: Sequence[Record]
+
+     def __post_init__(self) -> None:
+         _validate_block_key(self.endpoint_kind, self.block_key)
+
+
+ @dataclass(frozen=True)
+ class KvBlock:
+     endpoint_kind: EndpointKind
+     block_key: BlockKey
+     values: Mapping[str, Any]
+
+
+ BlockData = MetricsBlock | RecordsBlock | KvBlock
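
A short sketch of the validation these dataclasses enforce. Assumption: "수익성" (profitability) is among the C104 block keys exported by contracts_hj3415; any key outside BLOCK_KEYS_BY_ENDPOINT[C104] makes __post_init__ raise.

from scraper2_hj3415.app.domain.blocks import MetricsBlock
from scraper2_hj3415.app.domain.endpoint import EndpointKind
from scraper2_hj3415.app.domain.series import MetricSeries

roe = MetricSeries(key="ROE", values={"2023/12": 4.1, "2024/12": None})
block = MetricsBlock(
    endpoint_kind=EndpointKind.C104,
    block_key="수익성",  # assumed member of C104_BLOCK_KEYS
    metrics={"ROE": roe},
)
assert block.get("ROE") is roe

# A map key that disagrees with the series key fails fast:
# MetricsBlock(..., metrics={"PER": roe})  -> ValueError("Metric key mismatch: ...")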
@@ -0,0 +1,33 @@
+ # scraper2_hj3415/app/domain/constants.py
+ from __future__ import annotations
+
+ from typing import Mapping
+
+ from contracts_hj3415.nfs.types import BlockKeys
+ from contracts_hj3415.nfs.constants import C101_BLOCK_KEYS, C103_BLOCK_KEYS, C104_BLOCK_KEYS, C106_BLOCK_KEYS, C108_BLOCK_KEYS
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+
+
+ BLOCK_KEYS_BY_ENDPOINT: Mapping[EndpointKind, tuple[str, ...]] = {
+     EndpointKind.C101: C101_BLOCK_KEYS,
+     EndpointKind.C103: C103_BLOCK_KEYS,
+     EndpointKind.C104: C104_BLOCK_KEYS,
+     EndpointKind.C106: C106_BLOCK_KEYS,
+     EndpointKind.C108: C108_BLOCK_KEYS,
+ }
+
+
+ def get_block_keys(endpoint: EndpointKind) -> tuple[str, ...]:
+     """
+     The "official" block-key list for an endpoint.
+     - Lives in the domain layer; implementation details such as selectors
+       or table indexes stay out of it.
+     """
+     return BLOCK_KEYS_BY_ENDPOINT.get(endpoint, ())
+
+
+ def is_known_block(endpoint: EndpointKind, key: BlockKeys) -> bool:
+     """
+     Whether the block key belongs to the endpoint's official list.
+     (Used for validation, filtering, dynamic payload merges, etc.)
+     """
+     return key in BLOCK_KEYS_BY_ENDPOINT.get(endpoint, ())
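
These helpers can be exercised with nothing beyond the imports shown above; the key tuples themselves come from contracts_hj3415.

from scraper2_hj3415.app.domain.constants import get_block_keys, is_known_block
from scraper2_hj3415.app.domain.endpoint import EndpointKind

keys = get_block_keys(EndpointKind.C103)  # tuple taken straight from contracts_hj3415
assert all(is_known_block(EndpointKind.C103, k) for k in keys)
assert not is_known_block(EndpointKind.C103, "no-such-block")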
@@ -0,0 +1,16 @@
+ # scraper2_hj3415/app/domain/doc.py
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Mapping
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.types import BlockKey, LabelsMap
+ from scraper2_hj3415.app.domain.blocks import BlockData
+
+
+ @dataclass(frozen=True)
+ class NfsDoc:
+     code: str
+     endpoint_kind: EndpointKind
+     blocks: Mapping[BlockKey, BlockData]
+     labels: Mapping[BlockKey, LabelsMap]
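
A sketch of assembling an NfsDoc from a KvBlock. Assumption: "sise" is one of the C101 block keys in contracts_hj3415; note that KvBlock, unlike MetricsBlock/RecordsBlock, performs no key validation in the code above.

from scraper2_hj3415.app.domain.blocks import KvBlock
from scraper2_hj3415.app.domain.doc import NfsDoc
from scraper2_hj3415.app.domain.endpoint import EndpointKind

sise = KvBlock(endpoint_kind=EndpointKind.C101, block_key="sise", values={"현재가": 71200})
doc = NfsDoc(
    code="005930",
    endpoint_kind=EndpointKind.C101,
    blocks={"sise": sise},
    labels={"sise": {"현재가": "현재가 (원)"}},
)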
@@ -0,0 +1,11 @@
+ # scraper2_hj3415/app/domain/endpoint.py
+ from __future__ import annotations
+
+ from enum import StrEnum
+
+ class EndpointKind(StrEnum):
+     C101 = "c101"
+     C103 = "c103"
+     C104 = "c104"
+     C106 = "c106"
+     C108 = "c108"
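
StrEnum (Python ≥ 3.11) makes each member interchangeable with its lowercase string value, which is convenient wherever the kind is used as a plain string key:

from scraper2_hj3415.app.domain.endpoint import EndpointKind

assert EndpointKind.C101 == "c101"            # members compare equal to their str value
assert EndpointKind("c103") is EndpointKind.C103
assert f"{EndpointKind.C104}" == "c104"       # str() / f-strings yield the raw value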
@@ -0,0 +1,11 @@
+ # scraper2_hj3415/app/domain/series.py
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Mapping
+ from scraper2_hj3415.app.domain.types import MetricKey, Period, Num
+
+ @dataclass(frozen=True)
+ class MetricSeries:
+     key: MetricKey
+     values: Mapping[Period, Num]
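
MetricSeries is a frozen value object; a quick sketch of its intended shape (period keys like "2024/12" match the column labels the normalizers later in this diff produce):

from dataclasses import replace

from scraper2_hj3415.app.domain.series import MetricSeries

eps = MetricSeries(key="EPS", values={"2023/12": 5421.0, "2024/12": None})
assert eps.values["2024/12"] is None
updated = replace(eps, values={**eps.values, "2025/12": 6100.0})  # frozen -> copy, don't mutate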
@@ -0,0 +1,19 @@
+ # scraper2_hj3415/app/domain/types.py
+ from __future__ import annotations
+
+ from typing import Mapping, Any, Sequence, TypeAlias, Literal
+
+ BlockKey = str
+ MetricKey = str
+ Period = str
+ Num = float | int | None
+
+ Record: TypeAlias = Mapping[str, Any]
+ Records: TypeAlias = Sequence[Record]
+ RawLabel = str
+ LabelsMap = dict[MetricKey, RawLabel]
+
+ Sink = Literal["memory", "mongo"]
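
These aliases are plain typing shims; Sink in particular is what build_usecases in composition.py narrows on:

from scraper2_hj3415.app.domain.types import LabelsMap, Record, Sink

sink: Sink = "memory"   # a type checker rejects anything but "memory" / "mongo"
row: Record = {"항목": "ROE", "2024/12": 4.1}
labels: LabelsMap = {"ROE": "ROE(%)"}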
@@ -0,0 +1,92 @@
+ # scraper2_hj3415/app/parsing/_normalize/label.py
+ from __future__ import annotations
+
+ import re
+ from typing import Any
+
+ from common_hj3415.utils import clean_text
+ from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+
+ # -----------------------------
+ # General label normalization
+ # -----------------------------
+
+ UI_LABEL_NOISE = (
+     "펼치기",  # "expand"
+     "접기",  # "collapse"
+     "더보기",  # "more"
+ )
+
+
+ def sanitize_label(x: Any) -> str:
+     """
+     From a raw label (including 항목_raw):
+     - strip UI text such as '펼치기' (expand)
+     - collapse excessive whitespace
+     - trim leading/trailing whitespace
+     """
+     s = normalize_text(x)
+
+     # Strip UI noise words
+     for w in UI_LABEL_NOISE:
+         s = s.replace(w, " ")
+
+     return clean_text(s)
+
+
+ # -----------------------------
+ # Metric label normalization
+ # -----------------------------
+ _BRACKET_PATTERN = re.compile(r"\[[^\]]*\]")  # e.g. [구K-IFRS]
+ _EXTRA_WORDS_PATTERN = re.compile(r"(펼치기|연간컨센서스보기|연간컨센서스닫기)")
+ _ALL_PAREN_PATTERN = re.compile(r"\([^)]*\)")  # ★ strips every parenthesized chunk
+
+
+ def normalize_label_base(text: str | None) -> str:
+     s = sanitize_label(text)
+     s = _EXTRA_WORDS_PATTERN.sub("", s)
+     s = _BRACKET_PATTERN.sub("", s)
+     s = _ALL_PAREN_PATTERN.sub("", s)
+     s = s.replace("*", "")
+     return clean_text(s)
+
+
+ def normalize_metric_label(text: str | None) -> str:
+     # "보유 지분 (%)" → "보유 지분" (inner spacing preserved)
+     return normalize_label_base(text)
+
+
+ def normalize_key_label(text: str | None) -> str:
+     # "보유 지분 (%)" → "보유지분"
+     s = normalize_label_base(text)
+     return s.replace(" ", "").replace("\xa0", "").replace("%", "").strip()
+
+
+ # -----------------------------
+ # Column-name normalization
+ # -----------------------------
+ _COL_PAREN_PATTERN = re.compile(r"\((IFRS[^)]*|E|YoY|QoQ)[^)]*\)")
+ _COL_EXTRA_WORDS = re.compile(r"(연간컨센서스보기|연간컨센서스닫기)")
+ _COL_DOTNUM = re.compile(r"\.\d+$")  # strips pandas duplicate-column suffixes (.1, .2, ...)
+
+
+ def normalize_col_label(col: str | None) -> str:
+     """
+     Normalize a column name, e.g.:
+     "2024/12 (IFRS연결) 연간컨센서스보기" -> "2024/12"
+     "2025/12(E) (IFRS연결) 연간컨센서스닫기" -> "2025/12"
+     "전년대비 (YoY)" -> "전년대비"
+     "전년대비 (YoY).1" -> "전년대비" (duplicates are split into _2/_3 downstream)
+     """
+     s = normalize_text(col)
+     # 1) strip pandas-added suffixes like .1 (normalization collisions are handled downstream)
+     s = _COL_DOTNUM.sub("", s)
+
+     # 2) strip the consensus toggle phrases
+     s = _COL_EXTRA_WORDS.sub("", s)
+
+     # 3) strip parenthesized annotations: (IFRS...), (E), (YoY), (QoQ)
+     s = _COL_PAREN_PATTERN.sub("", s)
+
+     return clean_text(s)
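
The docstring examples above can be checked directly (assuming clean_text from common_hj3415 collapses whitespace runs and trims, as its use here implies):

from scraper2_hj3415.app.parsing._normalize.label import (
    normalize_col_label,
    normalize_key_label,
    normalize_metric_label,
)

assert normalize_col_label("2024/12 (IFRS연결) 연간컨센서스보기") == "2024/12"
assert normalize_col_label("전년대비 (YoY).1") == "전년대비"
assert normalize_metric_label("보유 지분 (%)") == "보유 지분"
assert normalize_key_label("보유 지분 (%)") == "보유지분"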
@@ -0,0 +1,53 @@
+ # scraper2_hj3415/app/parsing/_normalize/table.py
+ from __future__ import annotations
+
+ from collections import Counter
+
+ import numpy as np
+ import pandas as pd
+ from .label import normalize_col_label, normalize_metric_label
+
+
+ def _dedupe_columns(cols: list[str]) -> list[str]:
+     """
+     If normalization leaves duplicate column names, append _2, _3, ... to make them unique.
+     e.g. ["전년대비", "전년대비"] -> ["전년대비", "전년대비_2"]
+     """
+     seen: Counter[str] = Counter()
+     out: list[str] = []
+     for c in cols:
+         c = c or ""
+         seen[c] += 1
+         if seen[c] == 1:
+             out.append(c)
+         else:
+             out.append(f"{c}_{seen[c]}")
+     return out
+
+
+ # -----------------------------
+ # 3) Whole-DataFrame normalization + records conversion
+ # -----------------------------
+ def normalize_metrics_df(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     - normalize every column name
+     - normalize the '항목' (item) values
+     - NaN -> None
+     - auto-split duplicate column names (_2/_3)
+     """
+     if df is None or df.empty:
+         return df
+
+     df = df.copy()
+
+     # Normalize column names + keep them unique
+     norm_cols = [normalize_col_label(c) for c in df.columns.astype(str).tolist()]
+     df.columns = _dedupe_columns(norm_cols)
+
+     # Normalize the 항목 values
+     if "항목" in df.columns:
+         df["항목"] = df["항목"].map(normalize_metric_label)
+
+     # NaN -> None
+     df = df.replace({np.nan: None})
+     return df
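
End to end, the column pipeline plus _dedupe_columns behaves as below (same clean_text assumption as for label.py; the expected outputs follow the docstrings above):

import pandas as pd

from scraper2_hj3415.app.parsing._normalize.table import normalize_metrics_df

df = pd.DataFrame(
    [["영업이익률 펼치기", 7.5, 1.2, float("nan")]],
    columns=["항목", "2024/12 (IFRS연결) 연간컨센서스보기", "전년대비 (YoY)", "전년대비 (YoY).1"],
)
out = normalize_metrics_df(df)
assert list(out.columns) == ["항목", "2024/12", "전년대비", "전년대비_2"]
assert out.iloc[0].tolist() == ["영업이익률", 7.5, 1.2, None]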