scraper2-hj3415 2.4.1__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
  3. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  4. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  5. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  6. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  7. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
  8. scraper2_hj3415/app/composition.py +225 -0
  9. scraper2_hj3415/app/domain/blocks.py +61 -0
  10. scraper2_hj3415/app/domain/constants.py +33 -0
  11. scraper2_hj3415/app/domain/doc.py +16 -0
  12. scraper2_hj3415/app/domain/endpoint.py +11 -0
  13. scraper2_hj3415/app/domain/series.py +11 -0
  14. scraper2_hj3415/app/domain/types.py +19 -0
  15. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  16. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  17. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  18. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  19. scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
  20. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  21. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  22. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  23. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  24. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  25. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  26. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  27. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  28. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  29. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  30. scraper2_hj3415/app/parsing/c103_parser.py +19 -0
  31. scraper2_hj3415/app/parsing/c104_parser.py +23 -0
  32. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  33. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  34. scraper2_hj3415/app/ports/__init__.py +0 -0
  35. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  36. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  37. scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
  38. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  39. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  40. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  41. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  42. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
  44. scraper2_hj3415/app/services/__init__.py +0 -0
  45. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  46. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  47. scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
  48. scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
  49. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  50. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  51. scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
  52. scraper2_hj3415/app/usecases/__init__.py +0 -0
  53. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  55. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  56. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  57. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  58. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  59. scraper2/main.py → scraper2_hj3415/cli.py +40 -70
  60. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
  61. scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
  62. scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
  63. scraper2/.DS_Store +0 -0
  64. scraper2/adapters/out/.DS_Store +0 -0
  65. scraper2/adapters/out/playwright/browser.py +0 -102
  66. scraper2/adapters/out/sinks/.DS_Store +0 -0
  67. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  68. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  69. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  70. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  71. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  72. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  73. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  74. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  75. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  76. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  77. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  78. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  79. scraper2/app/composition.py +0 -204
  80. scraper2/app/parsing/_converters.py +0 -85
  81. scraper2/app/parsing/_normalize.py +0 -134
  82. scraper2/app/parsing/c101_parser.py +0 -143
  83. scraper2/app/parsing/c103_parser.py +0 -128
  84. scraper2/app/parsing/c104_parser.py +0 -143
  85. scraper2/app/parsing/c106_parser.py +0 -153
  86. scraper2/app/parsing/c108_parser.py +0 -65
  87. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  88. scraper2/app/ports/browser/browser_port.py +0 -22
  89. scraper2/app/ports/ingest_port.py +0 -14
  90. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  91. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  92. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  93. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  94. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  95. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  96. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  97. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  98. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  99. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  100. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  101. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  102. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  103. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  104. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  105. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  106. scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
  107. scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
  108. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  109. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  110. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  111. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  112. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  113. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  114. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  115. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  116. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  117. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  118. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  119. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
  120. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,254 @@
+ # scraper2_hj3415/app/parsing/c108_parser.py
+ from __future__ import annotations
+
+ import re
+ from html import unescape
+ from typing import Any
+ from common_hj3415.utils import clean_text
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+
+ _TAGS = re.compile(r"<[^>]+>")
+ _WS = re.compile(r"\s+")
+
+ _TD_ID_RE = re.compile(r"^td(\d+)$")  # td0, td1, ...
+ _C_ID_RE = re.compile(r"^c(\d+)$")  # c0, c1, ...
+
+
+ def _clean_text(x: Any) -> str:
+     """
+     Safely turn Any into a human-readable string at boundary/logging/parsing stages.
+     - Any → str
+     - unescape HTML entities
+     - then apply clean_text
+     """
+     if x is None:
+         return ""
+     s = unescape(str(x))  # ❗ only special-case None (not `x or ""`) so falsy values are preserved
+     return clean_text(s)
+
+
+ def _clean_html_to_text(html: str) -> str:
+     s = unescape(html or "")
+     s = s.replace("<br/>", "\n").replace("<br>", "\n").replace("<br />", "\n")
+     s = _TAGS.sub("", s)
+     s = s.replace("\r", "")
+     lines = [ln.strip() for ln in s.split("\n")]
+     lines = [ln for ln in lines if ln]
+     return "\n".join(lines).strip()
+
+
+ _UI_LINES = {"요약정보닫기"}
+ _UI_PREFIXES = ("요약정보 :", "요약정보:")
+ _BULLET_RE = re.compile(r"^\s*▶\s*")
+ _MULTI_NL = re.compile(r"\n{3,}")
+
+
+ def _prettify_report_text(
+     text: str,
+     *,
+     bullet: str = "- ",
+ ) -> str:
+     if not text:
+         return ""
+
+     lines = [ln.strip() for ln in text.split("\n")]
+     out: list[str] = []
+
+     for ln in lines:
+         if not ln:
+             continue
+
+         # strip UI residue (prefixes)
+         for p in _UI_PREFIXES:
+             if ln.startswith(p):
+                 ln = ln[len(p) :].strip()
+                 break
+         if not ln:
+             continue
+
+         if ln in _UI_LINES:
+             continue
+
+         # normalize bullets
+         if _BULLET_RE.match(ln):
+             ln = _BULLET_RE.sub(bullet, ln)
+
+         out.append(ln)
+
+     s = "\n".join(out)
+     s = _MULTI_NL.sub("\n\n", s).strip()
+     return s
+
+
+ def _parse_target_price(x: Any) -> int | None:
+     s = _clean_text(x)
+     if not s:
+         return None
+     s2 = re.sub(r"[^0-9]", "", s)
+     if not s2:
+         return None
+     try:
+         return int(s2)
+     except Exception:
+         return None
+
+
+ def _parse_pages(x: Any) -> int | None:
+     s = _clean_text(x)
+     m = re.search(r"(\d+)", s)
+     return int(m.group(1)) if m else None
+
+
+ async def parse_c108_recent_reports_dom(
+     browser: BrowserPort,
+     *,
+     table_selector: str = "#tableCmpDetail",
+ ) -> list[dict[str, Any]]:
+     """
+     Extract rows from the DOM directly, reliably, without pandas (read_html).
+
+     Assumptions:
+     - A "normal row" contains td[id^='td'] whose id has the form tdN.
+     - The "detail summary (hidden)" row is attached as data-content on td[id='cN'].
+     - summary lives in td[id='tdN'] data-content, comment in td[id='cN'] data-content.
+
+     Required BrowserPort capabilities:
+     - wait_attached(selector)
+     - count_in_nth(scope_selector, scope_index, inner_selector) -> int
+     - eval_in_nth_first(scope_selector, scope_index, inner_selector, expression) -> Any
+       (used exactly in the form already defined on the port)
+     """
+
+     await browser.wait_attached(table_selector)
+
+     # number of tbody tr rows
+     tr_count = await browser.count_in_nth(
+         table_selector, scope_index=0, inner_selector="tbody tr"
+     )
+     if tr_count <= 0:
+         return []
+
+     out: list[dict[str, Any]] = []
+
+     for tr_idx in range(tr_count):
+         # row scope: table_selector >> tbody tr (nth=tr_idx)
+         row_scope = f"{table_selector} >> tbody tr >> nth={tr_idx}"
+
+         # 1) decide whether this is a "normal row": it must contain td[id^='td']
+         td_id = await browser.eval_in_nth_first(
+             row_scope,
+             scope_index=0,
+             inner_selector="td[id^='td']",
+             expression="el => el.id",
+         )
+         td_id = _clean_text(td_id)
+         m = _TD_ID_RE.match(td_id)
+         if not m:
+             # skip hidden detail rows (cN) and the like
+             continue
+
+         n = m.group(1)  # row_id
+         # 2) extract column text (by td order, following the C108 table layout)
+         #    typically: 1=date, 2=title, 3=authors, 4=provider, 5=rating, 6=target price, 7=pages ...
+         date = _clean_text(
+             await browser.eval_in_nth_first(
+                 row_scope,
+                 scope_index=0,
+                 inner_selector="td:nth-child(1)",
+                 expression="el => el.innerText",
+             )
+         )
+         title = _clean_text(
+             await browser.eval_in_nth_first(
+                 row_scope,
+                 scope_index=0,
+                 inner_selector="td:nth-child(2)",
+                 expression="el => el.innerText",
+             )
+         )
+
+         # minimal filter
+         if not date or not title:
+             continue
+
+         authors = _clean_text(
+             await browser.eval_in_nth_first(
+                 row_scope,
+                 scope_index=0,
+                 inner_selector="td:nth-child(3)",
+                 expression="el => el.innerText",
+             )
+         ) or None
+
+         provider = _clean_text(
+             await browser.eval_in_nth_first(
+                 row_scope,
+                 scope_index=0,
+                 inner_selector="td:nth-child(4)",
+                 expression="el => el.innerText",
+             )
+         ) or None
+
+         rating = _clean_text(
+             await browser.eval_in_nth_first(
+                 row_scope,
+                 scope_index=0,
+                 inner_selector="td:nth-child(5)",
+                 expression="el => el.innerText",
+             )
+         ) or None
+
+         target_price_raw = await browser.eval_in_nth_first(
+             row_scope,
+             scope_index=0,
+             inner_selector="td:nth-child(6)",
+             expression="el => el.innerText",
+         )
+         target_price = _parse_target_price(target_price_raw)
+
+         pages_raw = await browser.eval_in_nth_first(
+             row_scope,
+             scope_index=0,
+             inner_selector="td:nth-child(7)",
+             expression="el => el.innerText",
+         )
+         pages = _parse_pages(pages_raw)
+
+         # 3) summary/comment: read the data-content of tdN / cN directly via N
+         #    (data-content is readable even when the element is in the DOM but display:none)
+         summary_html = await browser.eval_in_nth_first(
+             table_selector,
+             scope_index=0,
+             inner_selector=f"td#td{n}",
+             expression="el => el.getAttribute('data-content') || ''",
+         )
+         comment_html = await browser.eval_in_nth_first(
+             table_selector,
+             scope_index=0,
+             inner_selector=f"td#c{n}",
+             expression="el => el.getAttribute('data-content') || ''",
+         )
+
+         summary = _prettify_report_text(_clean_html_to_text(_clean_text(summary_html)))
+         comment = _prettify_report_text(_clean_html_to_text(_clean_text(comment_html)))
+
+         out.append(
+             {
+                 "row_id": n,
+                 "date": date,
+                 "title": title,
+                 "authors": authors,
+                 "provider": provider,
+                 "rating": rating,
+                 "target_price": target_price,
+                 "pages": pages,
+                 "summary": summary or None,
+                 "comment": comment or None,
+             }
+         )
+
+     return out
+
+
+ async def parse_c108_to_dict(browser: BrowserPort) -> dict[str, list[dict[str, Any]]]:
+     return {"리포트": await parse_c108_recent_reports_dom(browser)}
File without changes
File without changes
@@ -0,0 +1,9 @@
+ # scraper2_hj3415/app/ports/browser/browser_factory_port.py
+ from __future__ import annotations
+ from typing import Protocol, AsyncContextManager
+
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+
+ class BrowserFactoryPort(Protocol):
+     def lease(self) -> AsyncContextManager[BrowserPort]: ...
+     async def aclose(self) -> None: ...
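Because lease() returns an AsyncContextManager rather than being an async method itself, a concrete factory can satisfy the protocol with contextlib.asynccontextmanager. The following is an illustrative sketch of that shape only; the shipped Playwright-backed factory is app/adapters/out/playwright/browser_factory.py, whose body is not reproduced here.

from contextlib import asynccontextmanager
from typing import AsyncIterator, Awaitable, Callable

from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort


class CallbackBrowserFactory:
    """Illustrative only: shows the shape BrowserFactoryPort expects, not the shipped adapter."""

    def __init__(
        self,
        acquire: Callable[[], Awaitable[BrowserPort]],
        release: Callable[[BrowserPort], Awaitable[None]],
    ) -> None:
        self._acquire = acquire
        self._release = release

    @asynccontextmanager
    async def lease(self) -> AsyncIterator[BrowserPort]:
        browser = await self._acquire()
        try:
            yield browser
        finally:
            await self._release(browser)

    async def aclose(self) -> None:
        # nothing pooled in this sketch; a real factory would dispose its browsers here
        return None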
@@ -0,0 +1,115 @@
+ # scraper2_hj3415/app/ports/browser/browser_port.py
+ from __future__ import annotations
+
+ from typing import Protocol, Any
+
+
+ class BrowserPort(Protocol):
+     async def wait_table_nth_ready(
+         self,
+         table_selector: str,
+         *,
+         index: int,
+         min_rows: int = 1,
+         timeout_ms: int = 20_000,
+         poll_ms: int = 200,
+     ) -> None: ...
+     async def title(self) -> str: ...
+     async def current_url(self) -> str: ...
+     async def goto_and_wait_for_stable(self, url: str, timeout_ms: int = 10_000) -> None: ...
+     async def reload(self, *, timeout_ms: int = 10_000) -> None: ...
+     async def sleep_ms(self, ms: int) -> None: ...
+     async def wait_attached(
+         self, selector: str, *, timeout_ms: int = 10_000
+     ) -> None: ...
+     async def wait_visible(
+         self, selector: str, *, timeout_ms: int = 10_000
+     ) -> None: ...
+     async def click(
+         self,
+         selector: str,
+         *,
+         index: int = 0,
+         timeout_ms: int = 4_000,
+         force: bool = False,
+     ) -> None: ...
+     async def try_click(
+         self,
+         selector: str,
+         *,
+         index: int = 0,
+         timeout_ms: int = 1_500,
+         force: bool = False,
+     ) -> bool: ...
+     async def count(self, selector: str) -> int: ...
+     async def scroll_into_view(self, selector: str, *, index: int = 0) -> None: ...
+     async def text_content_first(self, selector: str) -> str: ...
+     async def all_texts(self, selector: str) -> list[str]: ...
+     async def get_text_by_text(self, needle: str) -> str: ...
+     async def inner_text(self, selector: str) -> str: ...
+     async def outer_html_nth(self, selector: str, index: int) -> str: ...
+     async def wait_table_text_changed(
+         self,
+         table_selector: str,
+         *,
+         index: int,
+         prev_text: str | None,
+         min_rows: int = 1,
+         min_lines: int = 50,
+         timeout_sec: float = 12.0,
+         poll_sec: float = 0.2,
+     ) -> str: ...
+     async def is_attached(self, selector: str, *, index: int = 0) -> bool: ...
+     async def computed_style(
+         self, selector: str, *, index: int = 0, prop: str
+     ) -> str: ...
+     async def count_in_nth(
+         self,
+         scope_selector: str,
+         *,
+         scope_index: int,
+         inner_selector: str,
+     ) -> int: ...
+     async def eval_in_nth_first(
+         self,
+         scope_selector: str,
+         *,
+         scope_index: int,
+         inner_selector: str,
+         expression: str,
+     ) -> Any: ...
+     async def inner_text_in_nth(
+         self,
+         scope_selector: str,
+         *,
+         scope_index: int,
+         inner_selector: str,
+         inner_index: int = 0,
+         timeout_ms: int = 10_000,
+     ) -> str:
+         """
+         Return the innerText of the nth(inner_index) element matching inner_selector
+         inside the nth(scope_index) element matching scope_selector.
+         (Rendered text: reflects line breaks and styling.)
+         """
+         ...
+
+     async def text_content_in_nth(
+         self,
+         scope_selector: str,
+         *,
+         scope_index: int,
+         inner_selector: str,
+         inner_index: int = 0,
+         timeout_ms: int = 10_000,
+     ) -> str:
+         """
+         Return the textContent of the nth(inner_index) element matching inner_selector
+         inside the nth(scope_index) element matching scope_selector.
+         (DOM text: may include hidden text.)
+         """
+         ...
+
+     async def table_records(
+         self, table_selector: str, *, header: int | list[int] | None = 0
+     ) -> list[dict[str, Any]]: ...
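The scoped helpers (count_in_nth, eval_in_nth_first, inner_text_in_nth, text_content_in_nth) are what let parsers address "the Nth table on the page" without importing Playwright types. A minimal sketch of how a Playwright-backed adapter might satisfy the two methods the C108 parser relies on, assuming a playwright.async_api Page held by the adapter; the shipped adapter is app/adapters/out/playwright/browser.py, whose body is not included in this excerpt.

from typing import Any

from playwright.async_api import Page


class PlaywrightBrowserSketch:
    """Illustrative partial implementation of BrowserPort, not the shipped adapter."""

    def __init__(self, page: Page) -> None:
        self._page = page

    async def count_in_nth(
        self, scope_selector: str, *, scope_index: int, inner_selector: str
    ) -> int:
        # count matches of inner_selector inside the nth(scope_index) scope element
        scope = self._page.locator(scope_selector).nth(scope_index)
        return await scope.locator(inner_selector).count()

    async def eval_in_nth_first(
        self,
        scope_selector: str,
        *,
        scope_index: int,
        inner_selector: str,
        expression: str,
    ) -> Any:
        # evaluate a JS expression (e.g. "el => el.innerText") against the first match
        scope = self._page.locator(scope_selector).nth(scope_index)
        return await scope.locator(inner_selector).first.evaluate(expression)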
File without changes
@@ -0,0 +1,28 @@
+ # scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py
+ from __future__ import annotations
+ from typing import Protocol, Iterable, Optional, TypeVar
+ from datetime import datetime
+
+ from contracts_hj3415.nfs.nfs_dto import NfsDTO
+
+ TDto = TypeVar("TDto", bound=NfsDTO)
+
+
+ class NfsIngestPort(Protocol[TDto]):
+     async def execute(
+         self,
+         code: str,
+         *,
+         sleep_sec: float = ...,
+         asof: Optional[datetime] = None,
+     ) -> TDto:
+         ...
+
+     async def execute_many(
+         self,
+         codes: Iterable[str],
+         *,
+         sleep_sec: float = ...,
+         asof: Optional[datetime] = None,
+     ) -> list[TDto]:
+         ...
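Because the protocol is generic in the DTO type, callers can stay agnostic about which endpoint they are ingesting. A short illustrative helper typed against the port follows; only NfsIngestPort and NfsDTO come from this diff, the rest is a sketch (the concrete ingest use cases for C101/C103/C104/C106/C108 listed above are presumably what gets passed in).

from typing import Iterable, TypeVar

from contracts_hj3415.nfs.nfs_dto import NfsDTO
from scraper2_hj3415.app.ports.ingest.nfs_ingest_port import NfsIngestPort

T = TypeVar("T", bound=NfsDTO)


async def ingest_codes(port: NfsIngestPort[T], codes: Iterable[str]) -> list[T]:
    # The same call works for any endpoint-specific ingest implementation
    # that satisfies the protocol; the DTO type flows through unchanged.
    return await port.execute_many(codes, sleep_sec=2.0)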
File without changes
@@ -0,0 +1,20 @@
+ # scraper2_hj3415/app/ports/sinks/nfs_sink_port.py
+ from __future__ import annotations
+
+ from typing import Protocol, Iterable, TypeVar
+ from contracts_hj3415.nfs.types import Endpoints
+ from contracts_hj3415.nfs.nfs_dto import NfsDTO
+
+ TDto = TypeVar("TDto", bound=NfsDTO)
+
+ class NfsSinkPort(Protocol[TDto]):
+     async def write(
+         self, dto: TDto, *, endpoint: Endpoints
+     ) -> None: ...
+
+     async def write_many(
+         self,
+         dtos: Iterable[TDto],
+         *,
+         endpoint: Endpoints
+     ) -> None: ...
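A sink only has to implement write/write_many keyed by endpoint. The package ships memory_sink.py and mongo_sink.py (listed in the file table above but not shown in this excerpt); the following is a minimal illustrative stand-in conforming to the protocol, not their actual code, and it assumes Endpoints values are hashable.

from collections import defaultdict
from typing import Iterable

from contracts_hj3415.nfs.types import Endpoints
from contracts_hj3415.nfs.nfs_dto import NfsDTO


class DictSink:
    """Illustrative in-memory sink satisfying NfsSinkPort; not the shipped memory sink."""

    def __init__(self) -> None:
        self.by_endpoint: dict[Endpoints, list[NfsDTO]] = defaultdict(list)

    async def write(self, dto: NfsDTO, *, endpoint: Endpoints) -> None:
        self.by_endpoint[endpoint].append(dto)

    async def write_many(self, dtos: Iterable[NfsDTO], *, endpoint: Endpoints) -> None:
        for dto in dtos:
            await self.write(dto, endpoint=endpoint)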
File without changes
@@ -0,0 +1,20 @@
+ # scraper2_hj3415/app/ports/site/wisereport_port.py
+ from __future__ import annotations
+ from typing import Protocol
+
+ class WiseReportPort(Protocol):
+     async def ensure_yearly_consensus_open_in_table_nth(
+         self,
+         *,
+         table_selector: str,  # e.g. TABLE_XPATH ("xpath=//div[@id='wrapper']//div//table")
+         table_index: int,  # e.g. TABLE_INDEX (2)
+         after_click_sleep_ms: int = 150,
+         max_rounds: int = 6,
+         wait_timeout_sec: float = 12.0,
+     ) -> bool: ...
+     async def click_steps(
+         self,
+         steps: list[tuple[str, str]],
+         *,
+         jitter_sec: float = 0.6,
+     ) -> None: ...
File without changes
File without changes
@@ -0,0 +1,59 @@
+ # scraper2_hj3415/app/usecases/fetch/fetch_c101.py
+ from __future__ import annotations
+
+ import asyncio
+ import random
+ from typing import Iterable
+
+ from logging_hj3415 import logger
+ from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+ from scraper2_hj3415.app.parsing.c101_parser import parse_c101_to_dict
+
+ from scraper2_hj3415.app.services.nfs_doc_builders import build_c101_doc_from_parsed
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.doc import NfsDoc
+ from scraper2_hj3415.app.domain.blocks import BLOCK_KEYS_BY_ENDPOINT
+
+
+ class FetchC101:
+     def __init__(self, factory: BrowserFactoryPort):
+         self.factory = factory
+
+     async def _fetch_one(self, code: str, *, sleep_sec: float) -> NfsDoc | None:
+         async with self.factory.lease() as browser:
+             url = f"https://navercomp.wisereport.co.kr/v2/company/c1010001.aspx?cmp_cd={code}"
+             await browser.goto_and_wait_for_stable(url, timeout_ms=10_000)
+
+             if sleep_sec > 0:
+                 await asyncio.sleep(sleep_sec + random.uniform(0, 1.0))
+
+             parsed = await parse_c101_to_dict(browser)
+
+             logger.debug(f"parsed data: {parsed}")
+             block_keys = BLOCK_KEYS_BY_ENDPOINT[EndpointKind.C101]
+             if not parsed or all(not (parsed.get(str(bk)) or []) for bk in block_keys):
+                 logger.warning(
+                     f"c101 fetch: parsed result empty; return None | code={code}"
+                 )
+                 return None
+
+             doc = build_c101_doc_from_parsed(
+                 code=code, parsed=parsed, keep_empty_blocks=True
+             )
+             logger.debug(f"c101 doc: {doc}")
+             return doc
+
+     async def execute(self, code: str, *, sleep_sec: float = 2.0) -> NfsDoc | None:
+         return await self._fetch_one(code, sleep_sec=sleep_sec)
+
+     async def execute_many(
+         self,
+         codes: Iterable[str],
+         *,
+         sleep_sec: float = 2.0,
+     ) -> list[NfsDoc]:
+         results = await asyncio.gather(
+             *(self._fetch_one(c, sleep_sec=sleep_sec) for c in codes),
+             return_exceptions=False,
+         )
+         return [r for r in results if r is not None]
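Driving the use case only requires a BrowserFactoryPort. Below is a hedged caller sketch, not code from this diff: the factory is assumed to come from the wiring in app/composition.py, and FetchC101 is imported under the services/fetch path given in the file list above.

from scraper2_hj3415.app.domain.doc import NfsDoc
from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
from scraper2_hj3415.app.services.fetch.fetch_c101 import FetchC101


async def fetch_c101_docs(factory: BrowserFactoryPort, codes: list[str]) -> list[NfsDoc]:
    # One leased browser per code, run concurrently via execute_many; codes whose
    # parse comes back empty are dropped by the use case itself.
    try:
        return await FetchC101(factory).execute_many(codes, sleep_sec=2.0)
    finally:
        await factory.aclose()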
@@ -0,0 +1,135 @@
+ # scraper2_hj3415/app/usecases/fetch/fetch_c103.py
+ from __future__ import annotations
+
+ import asyncio
+ import random
+ from typing import Iterable, Any
+
+ from logging_hj3415 import logger
+ from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+ from scraper2_hj3415.app.ports.site.wisereport_port import WiseReportPort
+
+ from scraper2_hj3415.app.adapters.site.wisereport_playwright import WiseReportPlaywright
+ from scraper2_hj3415.app.parsing.c103_parser import parse_c103_current_table
+ from scraper2_hj3415.app.services.nfs_doc_builders import build_metrics_doc_from_parsed
+
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.blocks import BLOCK_KEYS_BY_ENDPOINT
+ from scraper2_hj3415.app.domain.doc import NfsDoc
+
+ BTN_SETS: dict[str, list[tuple[str, str]]] = {
+     "손익계산서y": [
+         ("손익계산서", 'xpath=//*[@id="rpt_tab1"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "재무상태표y": [
+         ("재무상태표", 'xpath=//*[@id="rpt_tab2"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "현금흐름표y": [
+         ("현금흐름표", 'xpath=//*[@id="rpt_tab3"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "손익계산서q": [
+         ("손익계산서", 'xpath=//*[@id="rpt_tab1"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "재무상태표q": [
+         ("재무상태표", 'xpath=//*[@id="rpt_tab2"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "현금흐름표q": [
+         ("현금흐름표", 'xpath=//*[@id="rpt_tab3"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+ }
+
+
+ class FetchC103:
+     def __init__(self, factory: BrowserFactoryPort):
+         self.factory = factory
+
+     async def _fetch_one(self, code: str, *, sleep_sec: float) -> NfsDoc | None:
+         async with self.factory.lease() as browser:
+             wr: WiseReportPort = WiseReportPlaywright(browser)
+
+             url = (
+                 "https://navercomp.wisereport.co.kr/v2/company/c1030001.aspx"
+                 f"?cn=&cmp_cd={code}"
+             )
+             await browser.goto_and_wait_for_stable(url, timeout_ms=10_000)
+
+             if sleep_sec > 0:
+                 await asyncio.sleep(sleep_sec + random.uniform(0, 1.0))
+
+             parsed: dict[str, list[dict[str, Any]]] = {}
+             prev_text: str | None = None
+
+             # capture the initial baseline text (still works if none is available)
+             prev_text = await browser.wait_table_text_changed(
+                 "xpath=//div[@id='wrapper']//div//table",
+                 index=2,
+                 prev_text=None,
+                 min_rows=5,
+                 min_lines=50,
+                 timeout_sec=10.0,
+             )
+
+             for key, steps in BTN_SETS.items():
+                 # ✅ state transition (action)
+                 await wr.click_steps(steps, jitter_sec=0.6)  # recommended to move into the port/adapter
+                 await wr.ensure_yearly_consensus_open_in_table_nth(
+                     table_selector="xpath=//div[@id='wrapper']//div//table",
+                     table_index=2,
+                 )
+
+                 # ✅ wait for the data to change (action)
+                 prev_text = await browser.wait_table_text_changed(
+                     "xpath=//div[@id='wrapper']//div//table",
+                     index=2,
+                     prev_text=prev_text,
+                     min_rows=5,
+                     min_lines=50,
+                     timeout_sec=12.0,
+                 )
+
+                 # ✅ parse only the table currently on screen
+                 try:
+                     parsed[key] = await parse_c103_current_table(browser)
+                 except Exception:
+                     parsed[key] = []
+
+             block_keys = BLOCK_KEYS_BY_ENDPOINT[EndpointKind.C103]
+             if not parsed or all(not (parsed.get(str(bk)) or []) for bk in block_keys):
+                 logger.warning(
+                     f"c103 fetch: parsed result empty; return None | code={code}"
+                 )
+                 return None
+
+             doc = build_metrics_doc_from_parsed(
+                 code=code,
+                 endpoint_kind=EndpointKind.C103,
+                 parsed=parsed,
+                 block_keys=block_keys,
+                 item_key="항목",
+                 raw_label_key="항목_raw",
+                 keep_empty_blocks=True,
+             )
+             return doc
+
+     async def execute(self, code: str, *, sleep_sec: float = 2.0) -> NfsDoc | None:
+         return await self._fetch_one(code, sleep_sec=sleep_sec)
+
+     async def execute_many(
+         self, codes: Iterable[str], *, sleep_sec: float = 2.0
+     ) -> list[NfsDoc]:
+         results = await asyncio.gather(
+             *(self._fetch_one(c, sleep_sec=sleep_sec) for c in codes)
+         )
+         return [r for r in results if r is not None]