scraper2-hj3415 2.4.1__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +26 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +7 -7
  3. scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py +18 -0
  4. scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py +19 -0
  5. scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py +37 -0
  6. scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py +24 -0
  7. scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py +84 -0
  8. scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py +90 -0
  9. scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py +25 -0
  10. scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py +96 -0
  11. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  12. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  13. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  14. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  15. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +379 -0
  16. scraper2_hj3415/app/composition.py +225 -0
  17. scraper2_hj3415/app/domain/blocks.py +61 -0
  18. scraper2_hj3415/app/domain/constants.py +33 -0
  19. scraper2_hj3415/app/domain/doc.py +16 -0
  20. scraper2_hj3415/app/domain/endpoint.py +11 -0
  21. scraper2_hj3415/app/domain/series.py +11 -0
  22. scraper2_hj3415/app/domain/types.py +19 -0
  23. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  24. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  25. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  26. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  27. scraper2_hj3415/app/parsing/_tables/html_table.py +89 -0
  28. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  29. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  30. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  31. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  32. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  33. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  34. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  35. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  36. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  37. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  38. scraper2_hj3415/app/parsing/c103_parser.py +22 -0
  39. scraper2_hj3415/app/parsing/c104_parser.py +26 -0
  40. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  41. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  42. scraper2_hj3415/app/ports/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  44. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  45. scraper2_hj3415/app/ports/browser/browser_port.py +32 -0
  46. scraper2_hj3415/app/ports/browser/capabilities/__init__.py +15 -0
  47. scraper2_hj3415/app/ports/browser/capabilities/interaction.py +27 -0
  48. scraper2_hj3415/app/ports/browser/capabilities/navigation.py +18 -0
  49. scraper2_hj3415/app/ports/browser/capabilities/scope.py +66 -0
  50. scraper2_hj3415/app/ports/browser/capabilities/table.py +28 -0
  51. scraper2_hj3415/app/ports/browser/capabilities/text.py +16 -0
  52. scraper2_hj3415/app/ports/browser/capabilities/wait.py +51 -0
  53. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  55. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  56. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  57. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  58. scraper2_hj3415/app/ports/site/wisereport_port.py +30 -0
  59. scraper2_hj3415/app/services/__init__.py +0 -0
  60. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  61. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  62. scraper2_hj3415/app/services/fetch/fetch_c103.py +121 -0
  63. scraper2_hj3415/app/services/fetch/fetch_c104.py +160 -0
  64. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  65. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  66. scraper2_hj3415/app/services/nfs_doc_builders.py +304 -0
  67. scraper2_hj3415/app/usecases/__init__.py +0 -0
  68. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  69. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  70. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  71. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  72. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  73. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  74. scraper2/main.py → scraper2_hj3415/cli.py +45 -72
  75. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/METADATA +3 -1
  76. scraper2_hj3415-2.7.0.dist-info/RECORD +93 -0
  77. scraper2_hj3415-2.7.0.dist-info/entry_points.txt +3 -0
  78. scraper2/adapters/out/playwright/browser.py +0 -102
  79. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  80. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  81. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  82. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  83. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  84. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  85. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  86. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  87. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  88. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  89. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  90. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  91. scraper2/app/composition.py +0 -204
  92. scraper2/app/parsing/_converters.py +0 -85
  93. scraper2/app/parsing/_normalize.py +0 -134
  94. scraper2/app/parsing/c101_parser.py +0 -143
  95. scraper2/app/parsing/c103_parser.py +0 -128
  96. scraper2/app/parsing/c104_parser.py +0 -143
  97. scraper2/app/parsing/c106_parser.py +0 -153
  98. scraper2/app/parsing/c108_parser.py +0 -65
  99. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  100. scraper2/app/ports/browser/browser_port.py +0 -22
  101. scraper2/app/ports/ingest_port.py +0 -14
  102. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  103. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  104. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  105. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  106. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  107. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  108. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  109. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  110. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  111. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  112. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  113. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  114. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  115. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  116. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  117. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  118. scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
  119. scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
  120. {scraper2 → scraper2_hj3415}/.DS_Store +0 -0
  121. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  122. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  123. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  124. {scraper2 → scraper2_hj3415/app}/adapters/out/.DS_Store +0 -0
  125. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  126. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  127. {scraper2 → scraper2_hj3415/app}/adapters/out/sinks/.DS_Store +0 -0
  128. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  129. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  130. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  131. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  132. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  133. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  134. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/WHEEL +0 -0
  135. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,379 @@
+ # scraper2_hj3415/app/adapters/site/wisereport_playwright.py
+ from __future__ import annotations
+
+ import re
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from scraper2_hj3415.app.ports.site.wisereport_port import WiseReportPort
+ from logging_hj3415 import logger
+
+ _Q_SIGNAL_RE = re.compile(r"/(03|06|09)\b")
+
+
+ class WiseReportPlaywright(WiseReportPort):
+     def __init__(self, browser: BrowserPort):
+         self.browser = browser
+
+     async def _ensure_yearly_consensus_open_in_table_nth(
+         self,
+         *,
+         table_selector: str,  # e.g. TABLE_XPATH ("xpath=//div[@id='wrapper']//div//table")
+         table_index: int,  # e.g. TABLE_INDEX (2)
+         after_click_sleep_ms: int = 150,
+         max_rounds: int = 6,
+         wait_timeout_sec: float = 12.0,
+     ) -> bool:
+         """
+         Goal: make sure the yearly-consensus columns end up expanded.
+         Strategy:
+           - within the TABLE_NTH scope,
+           - among the <a> toggles that are btn_moreY or btn_moreQQ
+           - and carry the '연간컨센서스보기' ("show yearly consensus") text,
+           - click every one whose computedStyle(display) != 'none',
+           - waiting for the table text to change after each click.
+         """
+
+         table_scoped = f"{table_selector} >> nth={table_index}"
+
+         # Match only the toggle anchors inside the table (both btn_moreY and btn_moreQQ).
+         VIEW_ALL = (
+             f"{table_scoped} >> xpath=.//a["
+             "("
+             "contains(@class,'btn_moreY') or contains(@class,'btn_moreQQ')"
+             ")"
+             " and .//span[contains(normalize-space(.),'연간컨센서스보기')]"
+             "]"
+         )
+
+         CLOSE_ALL = (
+             f"{table_scoped} >> xpath=.//a["
+             "("
+             "contains(@class,'btn_moreY') or contains(@class,'btn_moreQQ')"
+             ")"
+             " and .//span[contains(normalize-space(.),'연간컨센서스닫기')]"
+             "]"
+         )
+
+         # "prev_text" baseline used to detect table-text changes.
+         prev_text = await self.browser.wait_table_text_changed(
+             table_selector,
+             index=table_index,
+             prev_text=None,
+             timeout_sec=wait_timeout_sec,
+             min_lines=10,
+         )
+
+         logger.debug("ensure_yearly_consensus_open_in_table_nth: start")
+
+         # Why rounds:
+         # - there can be several "show" toggles, and clicking may reshuffle the DOM
+         # - anything missed in one pass gets rescanned in the next round
+         for round_no in range(1, max_rounds + 1):
+             view_cnt = await self.browser.count(VIEW_ALL)
+             close_cnt = await self.browser.count(CLOSE_ALL)
+             logger.debug(
+                 f"round={round_no} toggle exists: view={view_cnt}, close={close_cnt}"
+             )
+
+             # No "show" toggles at all -> either everything is already expanded
+             # (only "close" toggles remain), or the page structure differs and we
+             # cannot find them. Treat this as success.
+             if view_cnt == 0:
+                 logger.debug("no VIEW toggles found in-table -> treat as OPEN")
+                 return True
+
+             clicked_any = False
+
+             # Walk i over 0..view_cnt-1 and click only those with display != none.
+             # (The DOM may change mid-loop, shifting count/order, so keep going on failure.)
+             for i in range(view_cnt):
+                 try:
+                     # Skip if the DOM changed and this index is gone.
+                     if not await self.browser.is_attached(VIEW_ALL, index=i):
+                         continue
+
+                     disp = await self.browser.computed_style(
+                         VIEW_ALL, index=i, prop="display"
+                     )
+                     if disp.strip().lower() == "none":
+                         continue
+
+                     # Scroll into view; off-screen elements may fail to click.
+                     await self.browser.scroll_into_view(VIEW_ALL, index=i)
+
+                     # Trial click (proceed even if it fails).
+                     _ = await self.browser.try_click(
+                         VIEW_ALL, index=i, timeout_ms=1500, force=False
+                     )
+
+                     # The real click.
+                     try:
+                         await self.browser.click(
+                             VIEW_ALL, index=i, timeout_ms=4000, force=False
+                         )
+                     except Exception:
+                         await self.browser.click(
+                             VIEW_ALL, index=i, timeout_ms=4000, force=True
+                         )
+
+                     await self.browser.sleep_ms(after_click_sleep_ms)
+
+                     # Wait for the table text to change after the click.
+                     prev_text = await self.browser.wait_table_text_changed(
+                         table_selector,
+                         index=table_index,
+                         prev_text=prev_text,
+                         timeout_sec=wait_timeout_sec,
+                         min_lines=10,
+                     )
+
+                     clicked_any = True
+                     logger.debug(f"clicked VIEW toggle: idx={i}, display={disp}")
+
+                 except Exception as e:
+                     logger.debug(
+                         f"click VIEW toggle failed: idx={i}, err={type(e).__name__}: {e}"
+                     )
+                     continue
+
+             # If nothing was clicked this round:
+             # - every VIEW was display:none, or
+             # - clicking was blocked
+             # => re-check whether any VIEW (display != none) remains.
+             if not clicked_any:
+                 remain = await self.browser.count(VIEW_ALL)
+                 logger.debug(f"no clicks in round; remain VIEW count={remain}")
+                 # If VIEW toggles exist but are all display:none, the view is
+                 # effectively open (only "close" toggles are visible).
+                 # Treat this as success.
+                 return True
+
+             # The next round rescans; any VIEW (display != none) left over gets
+             # clicked again. (Once everything is clicked, nothing remains.)
+
+         # Reaching this point after all rounds means the "show" toggles keep
+         # staying display != none: a layout that will not open, permissions, an overlay, etc.
+         logger.warning("ensure_yearly_consensus_open_in_table_nth: exceeded max_rounds")
+         return False
+
+     async def _click_steps(
+         self,
+         steps: list[tuple[str, str]],
+         *,
+         jitter_sec: float = 0.6,
+     ) -> None:
+         for _name, selector in steps:
+             await self.browser.wait_attached(selector)
+             await self.browser.scroll_into_view(selector)
+
+             logger.info(f"click step: {_name}")
+
+             ok = await self.browser.try_click(selector, timeout_ms=1500, force=False)
+             try:
+                 if ok:
+                     await self.browser.click(selector, timeout_ms=4000, force=False)
+                 else:
+                     await self.browser.click(selector, timeout_ms=4000, force=True)
+             except Exception:
+                 await self.browser.click(selector, timeout_ms=4000, force=True)
+
+             wait = int((0.5 + (jitter_sec * 0.5)) * 1000)
+             await self.browser.sleep_ms(wait)
+
+     async def _is_quarter_view_by_header(
+         self, *, table_selector: str, table_index: int
+     ) -> tuple[bool, list[str]]:
+         """
+         Quarter-view detection:
+         - if the header contains any YYYY/03, YYYY/06, or YYYY/09, it is definitely the quarterly view
+         - YYYY/12 alone is not treated as quarterly (the switch to quarters most likely failed)
+         """
+         periods = await self.browser.table_header_periods_mm_nth(
+             table_selector, index=table_index
+         )
+         is_q = any(_Q_SIGNAL_RE.search(p) for p in periods)
+         return is_q, periods
+
+     async def set_view_c103(
+         self,
+         *,
+         key: str,
+         steps: list[tuple[str, str]],
+         table_selector: str,
+         table_index: int,
+         max_attempts: int = 5,
+         stabilize_timeout_sec: float = 10.0,
+     ) -> None:
+         """
+         Pin the c103 view to the state requested by `key`.
+         - key.endswith("q") => must be the quarterly view
+         - key.endswith("y") => must be the yearly view (i.e. no quarter signal)
+
+         Click -> stabilize (wait_table_text_changed) -> verify via header -> recover/retry on failure.
+         """
+         want_q = key.endswith("q")
+         last_periods: list[str] = []
+         last_is_q: bool | None = None
+
+         for attempt in range(1, max_attempts + 1):
+             logger.info(f"set_view_c103: key={key} attempt={attempt} want_q={want_q}")
+
+             # 1) Click (with light jitter).
+             await self._click_steps(steps, jitter_sec=0.6)
+
+             # 2) Expand the toggles (reusing the helper above as-is).
+             try:
+                 await self._ensure_yearly_consensus_open_in_table_nth(
+                     table_selector=table_selector,
+                     table_index=table_index,
+                 )
+             except Exception as e:
+                 logger.debug(f"ensure open failed (ignored): {type(e).__name__}: {e}")
+
+             # 3) Render stabilization: cannot guarantee a change, but damps loading churn.
+             try:
+                 _ = await self.browser.wait_table_text_changed(
+                     table_selector,
+                     index=table_index,
+                     prev_text=None,
+                     min_rows=5,
+                     min_lines=30,
+                     timeout_sec=stabilize_timeout_sec,
+                 )
+             except Exception as e:
+                 logger.debug(
+                     f"stabilize wait failed (ignored): {type(e).__name__}: {e}"
+                 )
+
+             # 4) Confirm the state via the header.
+             is_q, periods = await self._is_quarter_view_by_header(
+                 table_selector=table_selector,
+                 table_index=table_index,
+             )
+             last_is_q, last_periods = is_q, periods
+
+             logger.info(
+                 f"set_view_c103: key={key} header periods(head)={periods[:8]} is_q={is_q}"
+             )
+
+             if want_q == is_q:
+                 return  # ✅ success confirmed
+
+             # 5) Recovery strategy on failure.
+             # Escalate as attempts climb: scroll/force-click hardening belongs in
+             # _click_steps, so here we just reset with a reload.
+             if attempt in (2, 4):
+                 logger.warning(f"set_view_c103 mismatch -> reload | key={key}")
+                 await self.browser.reload(timeout_ms=12_000)
+                 await self.browser.sleep_ms(250)
+
+         raise RuntimeError(
+             f"set_view_c103 failed: key={key} want_q={want_q} last_is_q={last_is_q} "
+             f"last_periods={last_periods[:12]}"
+         )
+
+     async def set_view_c104(
+         self,
+         *,
+         key: str,
+         steps: list[tuple[str, str]],
+         table_selector: str,
+         table_index: int,
+         prev_text_by_idx: dict[int, str | None],
+         max_attempts: int = 5,
+         stabilize_timeout_sec: float = 10.0,
+         min_rows: int = 5,
+         min_lines: int = 30,
+         open_consensus: bool = True,
+     ) -> None:
+         """
+         Pin the c104 view to the state requested by `key`.
+
+         - key.endswith("q") => must be the quarterly view (03/06/09 present in the header)
+         - key.endswith("y") => must be the yearly view (no 03/06/09 in the header; 12 is usually present)
+
+         Steps:
+           1) run the click steps
+           2) (optional) expand the yearly consensus
+           3) wait_table_text_changed (tracking prev_text per idx)
+           4) decide q/y from the header periods
+           5) reload/retry on mismatch
+         """
+         want_q = key.endswith("q")
+
+         # Defensive init of the per-idx prev_text.
+         if table_index not in prev_text_by_idx:
+             prev_text_by_idx[table_index] = None
+
+         last_periods: list[str] = []
+         last_is_q: bool | None = None
+
+         for attempt in range(1, max_attempts + 1):
+             logger.info(
+                 f"set_view_c104: key={key} idx={table_index} attempt={attempt} want_q={want_q}"
+             )
+
+             # 1) Click (the action).
+             await self._click_steps(steps, jitter_sec=0.6)
+
+             # 2) Expand the consensus (optional).
+             if open_consensus:
+                 try:
+                     await self._ensure_yearly_consensus_open_in_table_nth(
+                         table_selector=table_selector,
+                         table_index=table_index,
+                         wait_timeout_sec=stabilize_timeout_sec,
+                     )
+                 except Exception as e:
+                     logger.debug(
+                         f"ensure open failed (ignored): {type(e).__name__}: {e}"
+                     )
+
+             # 3) Stabilize (wait for changed/valid text), tracking prev per idx.
+             try:
+                 prev_text_by_idx[
+                     table_index
+                 ] = await self.browser.wait_table_text_changed(
+                     table_selector,
+                     index=table_index,
+                     prev_text=prev_text_by_idx[table_index],
+                     min_rows=min_rows,
+                     min_lines=min_lines,
+                     timeout_sec=stabilize_timeout_sec,
+                 )
+             except Exception as e:
+                 logger.debug(
+                     f"stabilize wait failed (ignored): {type(e).__name__}: {e}"
+                 )
+
+             # 4) Confirm the state via the header.
+             is_q, periods = await self._is_quarter_view_by_header(
+                 table_selector=table_selector,
+                 table_index=table_index,
+             )
+             last_is_q, last_periods = is_q, periods
+
+             logger.info(
+                 f"set_view_c104: key={key} idx={table_index} periods(head)={periods[:8]} is_q={is_q}"
+             )
+
+             # If periods itself is empty, it is safer to treat the state as
+             # undetermined and retry.
+             if not periods:
+                 logger.warning(
+                     f"set_view_c104: header periods empty -> retry | key={key} idx={table_index}"
+                 )
+             else:
+                 if want_q == is_q:
+                     return  # ✅ success confirmed
+
+             # 5) Recovery strategy on failure.
+             if attempt in (2, 4):
+                 logger.warning(
+                     f"set_view_c104 mismatch/uncertain -> reload | key={key}"
+                 )
+                 await self.browser.reload(timeout_ms=12_000)
+
+             await self.browser.sleep_ms(250)
+
+         raise RuntimeError(
+             f"set_view_c104 failed: key={key} idx={table_index} want_q={want_q} "
+             f"last_is_q={last_is_q} last_periods={last_periods[:8]}"
+         )
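
Note on the hunk above: `_Q_SIGNAL_RE` treats only 03/06/09 column headers as a quarterly signal, so a header showing only YYYY/12 reads as yearly. A minimal call-site sketch follows, assuming a concrete BrowserPort instance; the key string and the step selector are placeholders (only the table XPath and index come from the docstring example above):

import re

from scraper2_hj3415.app.adapters.site.wisereport_playwright import WiseReportPlaywright

# The regex from the file above: quarter-end months are the quarterly signal.
_Q_SIGNAL_RE = re.compile(r"/(03|06|09)\b")
assert _Q_SIGNAL_RE.search("2024/06")       # quarterly header column -> is_q=True
assert not _Q_SIGNAL_RE.search("2024/12")   # year-end only -> treated as yearly

async def force_quarterly_view(browser) -> None:
    # browser: any BrowserPort implementation (e.g. the Playwright adapter).
    site = WiseReportPlaywright(browser)
    await site.set_view_c103(
        key="example_q",  # endswith("q") -> the quarterly header must appear
        steps=[("quarter tab", "xpath=//a[@id='frqTyp1']")],  # hypothetical selector
        table_selector="xpath=//div[@id='wrapper']//div//table",  # docstring example
        table_index=2,  # docstring example (TABLE_INDEX)
    )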
@@ -0,0 +1,225 @@
+ # scraper2_hj3415/app/composition.py
+ from __future__ import annotations
+
+ import os
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from pymongo.asynchronous.database import AsyncDatabase
+
+ from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+ from scraper2_hj3415.app.adapters.out.playwright.browser_factory import (
+     PlaywrightBrowserFactory,
+ )
+
+ from scraper2_hj3415.app.services.fetch.fetch_c101 import FetchC101
+ from scraper2_hj3415.app.services.fetch.fetch_c103 import FetchC103
+ from scraper2_hj3415.app.services.fetch.fetch_c104 import FetchC104
+ from scraper2_hj3415.app.services.fetch.fetch_c106 import FetchC106
+ from scraper2_hj3415.app.services.fetch.fetch_c108 import FetchC108
+
+ from scraper2_hj3415.app.usecases.ingest.ingest_c101 import IngestC101
+ from scraper2_hj3415.app.usecases.ingest.ingest_c103 import IngestC103
+ from scraper2_hj3415.app.usecases.ingest.ingest_c104 import IngestC104
+ from scraper2_hj3415.app.usecases.ingest.ingest_c106 import IngestC106
+ from scraper2_hj3415.app.usecases.ingest.ingest_c108 import IngestC108
+
+
+ from scraper2_hj3415.app.ports.sinks.nfs_sink_port import NfsSinkPort
+ from contracts_hj3415.nfs.c101_dto import C101DTO
+ from contracts_hj3415.nfs.c103_dto import C103DTO
+ from contracts_hj3415.nfs.c104_dto import C104DTO
+ from contracts_hj3415.nfs.c106_dto import C106DTO
+ from contracts_hj3415.nfs.c108_dto import C108DTO
+
+ from scraper2_hj3415.app.adapters.out.sinks.mongo_sink import MongoSink
+ from scraper2_hj3415.app.adapters.out.sinks.memory_sink import MemorySink
+
+ from scraper2_hj3415.app.adapters.out.sinks.store import InMemoryStore
+
+ from db2_hj3415.mongo import Mongo
+
+ from scraper2_hj3415.app.domain.types import Sink
+
+
+ def _env_bool(key: str, default: bool) -> bool:
+     v = os.getenv(key)
+     return (
+         default if v is None else v.strip().lower() in {"1", "true", "yes", "y", "on"}
+     )
+
+
+ def _env_int(key: str, default: int) -> int:
+     v = os.getenv(key)
+     if v is None:
+         return default
+     try:
+         return int(v)
+     except ValueError:
+         return default
+
+
+ def build_browser_factory() -> BrowserFactoryPort:
+     return PlaywrightBrowserFactory(
+         headless=_env_bool("SCRAPER_HEADLESS", True),
+         timeout_ms=_env_int("SCRAPER_TIMEOUT_MS", 20_000),
+         max_concurrency=_env_int("SCRAPER_MAX_CONCURRENCY", 2),
+     )
+
+
+ # -------------------------
+ # Bundles
+ # -------------------------
+
+
+ @dataclass(frozen=True)
+ class FetchUsecases:
+     c101: FetchC101
+     c103: FetchC103
+     c104: FetchC104
+     c106: FetchC106
+     c108: FetchC108
+
+
+ @dataclass(frozen=True)
+ class Sinks:
+     c101: NfsSinkPort[C101DTO]
+     c103: NfsSinkPort[C103DTO]
+     c104: NfsSinkPort[C104DTO]
+     c106: NfsSinkPort[C106DTO]
+     c108: NfsSinkPort[C108DTO]
+
+
+ @dataclass(frozen=True)
+ class IngestUsecases:
+     c101: IngestC101
+     c103: IngestC103
+     c104: IngestC104
+     c106: IngestC106
+     c108: IngestC108
+
+
+ @dataclass(frozen=True)
+ class Usecases:
+     fetch: FetchUsecases
+     ingest: IngestUsecases
+     sinks: Sinks
+     store: InMemoryStore | None = None  # ✅ only when sink == "memory"
+     mongo: Mongo | None = None  # ✅ only when sink == "mongo"
+     db: AsyncDatabase | None = None  # ✅ only when sink == "mongo"
+     browser_factory: Optional[BrowserFactoryPort] = None
+
+     async def aclose(self) -> None:
+         if self.browser_factory is not None:
+             await self.browser_factory.aclose()
+
+         if self.mongo is not None:
+             await self.mongo.close()
+
+
+ # -------------------------
+ # builders
+ # -------------------------
+
+
+ def build_fetch_usecases(*, factory: BrowserFactoryPort) -> FetchUsecases:
+     return FetchUsecases(
+         c101=FetchC101(factory=factory),
+         c103=FetchC103(factory=factory),
+         c104=FetchC104(factory=factory),
+         c106=FetchC106(factory=factory),
+         c108=FetchC108(factory=factory),
+     )
+
+
+ @dataclass(frozen=True)
+ class MemoryBundle:
+     store: InMemoryStore
+     sinks: Sinks
+
+
+ def build_memory_bundle() -> MemoryBundle:
+     store = InMemoryStore()
+     c101_sink: NfsSinkPort[C101DTO] = MemorySink(store)
+     c103_sink: NfsSinkPort[C103DTO] = MemorySink(store)
+     c104_sink: NfsSinkPort[C104DTO] = MemorySink(store)
+     c106_sink: NfsSinkPort[C106DTO] = MemorySink(store)
+     c108_sink: NfsSinkPort[C108DTO] = MemorySink(store)
+     sinks = Sinks(
+         c101=c101_sink,
+         c103=c103_sink,
+         c104=c104_sink,
+         c106=c106_sink,
+         c108=c108_sink,
+     )
+     return MemoryBundle(store=store, sinks=sinks)
+
+
+ # ---- mongo bundle ----
+
+
+ @dataclass(frozen=True)
+ class MongoBundle:
+     mongo: Mongo
+     db: AsyncDatabase
+     sinks: Sinks
+
+
+ def build_mongo_bundle() -> MongoBundle:
+     mongo = Mongo()  # db2 reads its settings from env (DB2_MONGO_URI, etc.)
+     db = mongo.get_db()
+     c101_sink: NfsSinkPort[C101DTO] = MongoSink(db)
+     c103_sink: NfsSinkPort[C103DTO] = MongoSink(db)
+     c104_sink: NfsSinkPort[C104DTO] = MongoSink(db)
+     c106_sink: NfsSinkPort[C106DTO] = MongoSink(db)
+     c108_sink: NfsSinkPort[C108DTO] = MongoSink(db)
+     sinks = Sinks(
+         c101=c101_sink,
+         c103=c103_sink,
+         c104=c104_sink,
+         c106=c106_sink,
+         c108=c108_sink,
+     )
+     return MongoBundle(mongo=mongo, db=db, sinks=sinks)
+
+
+ def build_ingest_usecases(*, fetch: FetchUsecases, sinks: Sinks) -> IngestUsecases:
+     return IngestUsecases(
+         c101=IngestC101(fetch=fetch.c101, sink=sinks.c101),
+         c103=IngestC103(fetch=fetch.c103, sink=sinks.c103),
+         c104=IngestC104(fetch=fetch.c104, sink=sinks.c104),
+         c106=IngestC106(fetch=fetch.c106, sink=sinks.c106),
+         c108=IngestC108(fetch=fetch.c108, sink=sinks.c108),
+     )
+
+
+ def build_usecases(
+     *, factory: BrowserFactoryPort | None = None, sink: Sink = "memory"
+ ) -> Usecases:
+     factory = factory or build_browser_factory()
+     fetch = build_fetch_usecases(factory=factory)
+
+     if sink == "memory":
+         bundle = build_memory_bundle()
+         ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
+         return Usecases(
+             fetch=fetch,
+             ingest=ingest,
+             sinks=bundle.sinks,
+             store=bundle.store,
+             browser_factory=factory,
+         )
+
+     if sink == "mongo":
+         bundle = build_mongo_bundle()
+         ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
+         return Usecases(
+             fetch=fetch,
+             ingest=ingest,
+             sinks=bundle.sinks,
+             mongo=bundle.mongo,
+             db=bundle.db,
+             browser_factory=factory,
+         )
+
+     raise ValueError(f"Unknown sink_kind: {sink}")
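
For context, a minimal sketch of driving composition.py end to end; build_usecases, the SCRAPER_* env vars, and aclose() come from the hunk above, while the ingest call (execute("005930")) is a placeholder, since the ingest usecases' method names are not shown in this diff:

import asyncio

from scraper2_hj3415.app.composition import build_usecases

async def main() -> None:
    # Memory sink: results land in uc.store (an InMemoryStore).
    # sink="mongo" instead requires db2_hj3415's env settings (DB2_MONGO_URI, etc.).
    uc = build_usecases(sink="memory")
    try:
        await uc.ingest.c101.execute("005930")  # placeholder call; real signature not in this diff
    finally:
        await uc.aclose()  # closes the browser factory (and mongo, when present)

asyncio.run(main())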
@@ -0,0 +1,61 @@
+ # scraper2_hj3415/app/domain/blocks.py
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Any, Mapping, Sequence
+ from scraper2_hj3415.app.domain.constants import BLOCK_KEYS_BY_ENDPOINT
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.types import BlockKey, MetricKey, Record
+ from scraper2_hj3415.app.domain.series import MetricSeries
+
+
+ def _validate_block_key(endpoint_kind: EndpointKind, block_key: str) -> None:
+     allowed = BLOCK_KEYS_BY_ENDPOINT.get(endpoint_kind)
+     if allowed is not None and block_key not in allowed:
+         raise ValueError(f"Invalid block key for {endpoint_kind}: {block_key!r}")
+
+
+ @dataclass(frozen=True)
+ class MetricsBlock:
+     endpoint_kind: EndpointKind
+     block_key: BlockKey
+     metrics: Mapping[MetricKey, MetricSeries]
+
+     def __post_init__(self) -> None:
+         _validate_block_key(self.endpoint_kind, self.block_key)
+
+         # Guard against container key / entity key mismatches (optional).
+         for k, m in self.metrics.items():
+             if m.key != k:
+                 raise ValueError(
+                     f"Metric key mismatch: map key={k!r} != series key={m.key!r}"
+                 )
+
+     def get(self, key: MetricKey) -> MetricSeries | None:
+         m = self.metrics.get(key)
+         if m and m.key != key:
+             raise ValueError("Metric key mismatch")
+         return m
+
+
+ # Various block shapes; needs revisiting later.
+
+
+ @dataclass(frozen=True)
+ class RecordsBlock:
+     endpoint_kind: EndpointKind
+     block_key: BlockKey
+     rows: Sequence[Record]
+
+     def __post_init__(self) -> None:
+         _validate_block_key(self.endpoint_kind, self.block_key)
+
+
+ @dataclass(frozen=True)
+ class KvBlock:
+     endpoint_kind: EndpointKind
+     block_key: BlockKey
+     values: Mapping[str, Any]
+
+
+ BlockData = MetricsBlock | RecordsBlock | KvBlock
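
The frozen dataclasses above validate eagerly: MetricsBlock and RecordsBlock check their block key in __post_init__, while KvBlock does not. A small sketch of the failure path, using a deliberately bogus key (the EndpointKind members appear in constants.py below):

from scraper2_hj3415.app.domain.blocks import RecordsBlock
from scraper2_hj3415.app.domain.endpoint import EndpointKind

try:
    RecordsBlock(endpoint_kind=EndpointKind.C101, block_key="no_such_block", rows=[])
except ValueError as err:
    print(err)  # e.g. "Invalid block key for EndpointKind.C101: 'no_such_block'"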
@@ -0,0 +1,33 @@
+ # scraper2_hj3415/app/domain/constants.py
+ from __future__ import annotations
+
+ from typing import Mapping
+
+ from contracts_hj3415.nfs.types import BlockKey
+ from contracts_hj3415.nfs.constants import C101_BLOCK_KEYS, C103_BLOCK_KEYS, C104_BLOCK_KEYS, C106_BLOCK_KEYS, C108_BLOCK_KEYS
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+
+
+ BLOCK_KEYS_BY_ENDPOINT: Mapping[EndpointKind, tuple[str, ...]] = {
+     EndpointKind.C101: C101_BLOCK_KEYS,
+     EndpointKind.C103: C103_BLOCK_KEYS,
+     EndpointKind.C104: C104_BLOCK_KEYS,
+     EndpointKind.C106: C106_BLOCK_KEYS,
+     EndpointKind.C108: C108_BLOCK_KEYS,
+ }
+
+
+ def get_block_keys(endpoint: EndpointKind) -> tuple[str, ...]:
+     """
+     The "official" block-key list for an endpoint.
+     - Lives in the domain layer, but implementation details such as selectors
+       or table indexes do not belong here.
+     """
+     return BLOCK_KEYS_BY_ENDPOINT.get(endpoint, ())
+
+
+ def is_known_block(endpoint: EndpointKind, key: BlockKey) -> bool:
+     """
+     Whether the block key is in the endpoint's official list.
+     (Used for validation, filtering, dynamic payload merges, etc.)
+     """
+     return key in BLOCK_KEYS_BY_ENDPOINT.get(endpoint, ())
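
A usage sketch for the two helpers above; keep_known_blocks is an illustrative wrapper, and the concrete key strings live in contracts_hj3415, which is not part of this diff:

from scraper2_hj3415.app.domain.constants import get_block_keys, is_known_block
from scraper2_hj3415.app.domain.endpoint import EndpointKind

def keep_known_blocks(endpoint: EndpointKind, payload: dict) -> dict:
    # Drop any block whose key is not in the endpoint's official list.
    return {k: v for k, v in payload.items() if is_known_block(endpoint, k)}

print(get_block_keys(EndpointKind.C103))  # the official C103 block keys, as a tuple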