scraper2-hj3415 2.4.1-py3-none-any.whl → 2.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +26 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +7 -7
  3. scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py +18 -0
  4. scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py +19 -0
  5. scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py +37 -0
  6. scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py +24 -0
  7. scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py +84 -0
  8. scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py +90 -0
  9. scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py +25 -0
  10. scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py +96 -0
  11. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  12. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  13. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  14. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  15. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +379 -0
  16. scraper2_hj3415/app/composition.py +225 -0
  17. scraper2_hj3415/app/domain/blocks.py +61 -0
  18. scraper2_hj3415/app/domain/constants.py +33 -0
  19. scraper2_hj3415/app/domain/doc.py +16 -0
  20. scraper2_hj3415/app/domain/endpoint.py +11 -0
  21. scraper2_hj3415/app/domain/series.py +11 -0
  22. scraper2_hj3415/app/domain/types.py +19 -0
  23. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  24. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  25. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  26. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  27. scraper2_hj3415/app/parsing/_tables/html_table.py +89 -0
  28. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  29. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  30. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  31. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  32. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  33. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  34. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  35. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  36. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  37. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  38. scraper2_hj3415/app/parsing/c103_parser.py +22 -0
  39. scraper2_hj3415/app/parsing/c104_parser.py +26 -0
  40. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  41. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  42. scraper2_hj3415/app/ports/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  44. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  45. scraper2_hj3415/app/ports/browser/browser_port.py +32 -0
  46. scraper2_hj3415/app/ports/browser/capabilities/__init__.py +15 -0
  47. scraper2_hj3415/app/ports/browser/capabilities/interaction.py +27 -0
  48. scraper2_hj3415/app/ports/browser/capabilities/navigation.py +18 -0
  49. scraper2_hj3415/app/ports/browser/capabilities/scope.py +66 -0
  50. scraper2_hj3415/app/ports/browser/capabilities/table.py +28 -0
  51. scraper2_hj3415/app/ports/browser/capabilities/text.py +16 -0
  52. scraper2_hj3415/app/ports/browser/capabilities/wait.py +51 -0
  53. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  55. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  56. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  57. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  58. scraper2_hj3415/app/ports/site/wisereport_port.py +30 -0
  59. scraper2_hj3415/app/services/__init__.py +0 -0
  60. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  61. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  62. scraper2_hj3415/app/services/fetch/fetch_c103.py +121 -0
  63. scraper2_hj3415/app/services/fetch/fetch_c104.py +160 -0
  64. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  65. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  66. scraper2_hj3415/app/services/nfs_doc_builders.py +304 -0
  67. scraper2_hj3415/app/usecases/__init__.py +0 -0
  68. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  69. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  70. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  71. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  72. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  73. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  74. scraper2/main.py → scraper2_hj3415/cli.py +45 -72
  75. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/METADATA +3 -1
  76. scraper2_hj3415-2.7.0.dist-info/RECORD +93 -0
  77. scraper2_hj3415-2.7.0.dist-info/entry_points.txt +3 -0
  78. scraper2/adapters/out/playwright/browser.py +0 -102
  79. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  80. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  81. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  82. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  83. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  84. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  85. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  86. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  87. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  88. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  89. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  90. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  91. scraper2/app/composition.py +0 -204
  92. scraper2/app/parsing/_converters.py +0 -85
  93. scraper2/app/parsing/_normalize.py +0 -134
  94. scraper2/app/parsing/c101_parser.py +0 -143
  95. scraper2/app/parsing/c103_parser.py +0 -128
  96. scraper2/app/parsing/c104_parser.py +0 -143
  97. scraper2/app/parsing/c106_parser.py +0 -153
  98. scraper2/app/parsing/c108_parser.py +0 -65
  99. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  100. scraper2/app/ports/browser/browser_port.py +0 -22
  101. scraper2/app/ports/ingest_port.py +0 -14
  102. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  103. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  104. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  105. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  106. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  107. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  108. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  109. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  110. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  111. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  112. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  113. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  114. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  115. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  116. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  117. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  118. scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
  119. scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
  120. {scraper2 → scraper2_hj3415}/.DS_Store +0 -0
  121. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  122. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  123. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  124. {scraper2 → scraper2_hj3415/app}/adapters/out/.DS_Store +0 -0
  125. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  126. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  127. {scraper2 → scraper2_hj3415/app}/adapters/out/sinks/.DS_Store +0 -0
  128. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  129. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  130. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  131. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  132. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  133. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  134. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/WHEEL +0 -0
  135. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/licenses/LICENSE +0 -0
scraper2_hj3415/app/parsing/c101/earning_surprise.py
@@ -0,0 +1,217 @@
+ from __future__ import annotations
+
+ import re
+ from typing import Any
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from common_hj3415.utils import clean_text
+
+ _EARNING_SURPRISE_TABLE = "#earning_list"
+
+ def _strip_bullets_commas(s: str) -> str:
+     """
+     Strip anything that interferes with number parsing from text such as
+     "● 120,064.0" / "101,922.8".
+     """
+     s = clean_text(s)
+     s = s.replace(",", "")
+     s = s.replace("●", "")
+     s = s.replace("○", "")
+     s = s.replace("▲", "")
+     s = s.replace("▼", "")
+     return clean_text(s)
+
+
+ def _to_number_like(x: Any) -> Any:
+     """
+     Return float/int when the value is numeric; otherwise return the string unchanged.
+     """
+     if x is None:
+         return None
+     if isinstance(x, (int, float)):
+         return x
+     s = _strip_bullets_commas(str(x))
+     if not s:
+         return None
+
+     # convert when it matches a numeric pattern
+     # - "65.00" "209.17" "-123.4"
+     if re.fullmatch(r"[-+]?\d+(\.\d+)?", s):
+         # branch here if integers should be kept as int
+         try:
+             f = float(s)
+             # policy choice: whether "65.0"-style .0 floats should become int
+             return f
+         except Exception:
+             return s
+
+     return s
+
+
+ def _norm_item_label(item: str) -> str:
+     """
+     Normalize the item (th text):
+     - "전분기대비보기 전년동기대비" -> "전년동기대비"
+     - "Surprise" etc. stay as-is
+     """
+     t = clean_text(item)
+
+     # button text mixed into the cell: "전분기대비보기 전년동기대비"
+     if ("전분기대비" in t) and ("전년동기대비" in t):
+         return "전년동기대비"
+     if "전분기대비" in t:
+         return "전분기대비"
+     if "전년동기대비" in t:
+         return "전년동기대비"
+     if "컨센서스" in t:
+         return "컨센서스"
+     if "잠정치" in t:
+         return "잠정치"
+     if "Surprise" in t or "SURPRISE" in t or "surprise" in t:
+         return "Surprise"
+
+     return t
+
+
+ async def _row_cells_texts(
+     browser: BrowserPort,
+     *,
+     row_sel: str,
+ ) -> list[str]:
+     """
+     Collect all th/td texts of one tbody tr, in left-to-right order.
+     """
+     # total number of th,td cells
+     n = await browser.count_in_nth(
+         _EARNING_SURPRISE_TABLE,
+         scope_index=0,
+         inner_selector=f"{row_sel} th, {row_sel} td",
+     )
+
+     out: list[str] = []
+     for j in range(n):
+         txt = await browser.inner_text_in_nth(
+             _EARNING_SURPRISE_TABLE,
+             scope_index=0,
+             inner_selector=f"{row_sel} th, {row_sel} td",
+             inner_index=j,
+         )
+         out.append(clean_text(txt))
+     return out
+
+
+ async def parse_c101_earnings_surprise_table(
+     browser: BrowserPort,
+     *,
+     debug_rows: bool = False,
+ ) -> dict[str, Any]:
+     """
+     Stable parser matched to the earning_list HTML structure (as provided).
+
+     Returns:
+         {
+             "periods": [...],
+             "metrics": { section: { item: {period: value} } },
+             "meta": {...},
+             ...(plus "rows": raw_cells_rows when debug_rows is set)
+         }
+     """
+     await browser.wait_attached(_EARNING_SURPRISE_TABLE)
+
+     row_cnt = await browser.count_in_nth(
+         _EARNING_SURPRISE_TABLE,
+         scope_index=0,
+         inner_selector="tbody tr",
+     )
+     if not row_cnt:
+         out = {"periods": [], "metrics": {}, "meta": {}}
+         if debug_rows:
+             out["rows"] = []
+         return out
+
+     raw_cells_rows: list[list[str]] = []
+
+     periods: list[str] = []
+     period_count = 0
+
+     metrics: dict[str, dict[str, dict[str, Any]]] = {}
+     meta: dict[str, dict[str, Any]] = {}
+
+     current_section: str | None = None
+
+     for i in range(1, row_cnt + 1):  # nth-child is 1-based
+         row_sel = f"tbody tr:nth-child({i})"
+         cells = await _row_cells_texts(browser, row_sel=row_sel)
+         raw_cells_rows.append(cells)
+
+         if not cells:
+             continue
+
+         joined = " ".join([c for c in cells if c])
+
+         # 1) extract periods from the "재무연월" header row
+         #    HTML: <th colspan="2">재무연월</th> + <th>2025/09</th> + <th>2025/12</th>
+         if ("재무연월" in joined) and not periods:
+             # cells e.g. ["재무연월", "2025/09", "2025/12"]; 3-4 cells depending on the table
+             # picking only "YYYY/NN" patterns is the safest approach here
+             periods = [c for c in cells if re.fullmatch(r"\d{4}/\d{2}", c)]
+             period_count = len(periods)
+             continue
+
+         # without periods the body rows cannot be interpreted
+         if not periods:
+             continue
+
+         # 2) meta row: "잠정치발표(예정)일/회계기준"
+         if "잠정치발표(예정)일/회계기준" in joined:
+             # typically cells: ["잠정치발표(예정)일/회계기준", "2025/10/14(연결)", "2026/01/08(연결)"]
+             vals = [c for c in cells if c and "잠정치발표" not in c]
+             vals = vals[-period_count:] if period_count else vals
+             meta["잠정치발표(예정)일/회계기준"] = {
+                 periods[idx]: vals[idx] if idx < len(vals) else None
+                 for idx in range(period_count)
+             }
+             continue
+
+         # 3) normalize body rows to the shape [section, item, v1, v2, ...]
+         #    HTML cases:
+         #    - section start row (영업이익/당기순이익): cells = ["영업이익", "컨센서스", v1, v2]
+         #    - next row inside a rowspan: cells = ["잠정치", v1, v2] (no section → pad on the left)
+         #    - ext0 row (전분기대비): cells = ["", "전분기대비", v1, v2] (leading empty th)
+         #
+         #    with period_count == 2, the canonical length is 2 + period_count = 4
+         want_len = 2 + period_count
+
+         norm = cells[:]
+         if len(norm) == want_len - 1:
+             # section th missing: ["잠정치", v1, v2] -> ["", "잠정치", v1, v2]
+             norm = [""] + norm
+         elif len(norm) < want_len:
+             # ambiguous case: pad the left with empty strings
+             norm = ([""] * (want_len - len(norm))) + norm
+         norm = norm[-want_len:]
+
+         section_cell = clean_text(norm[0])
+         item_cell = clean_text(norm[1])
+         value_cells = norm[2 : 2 + period_count]
+
+         # update the current section
+         if section_cell:
+             current_section = section_cell
+             metrics.setdefault(current_section, {})
+         if not current_section:
+             # skip while no section has been seen yet
+             continue
+
+         item = _norm_item_label(item_cell)
+         if not item:
+             continue
+
+         # map values to periods
+         bucket = metrics[current_section].setdefault(item, {})
+         for idx, p in enumerate(periods):
+             raw_v = value_cells[idx] if idx < len(value_cells) else None
+             bucket[p] = _to_number_like(raw_v)
+
+     out: dict[str, Any] = {"periods": periods, "metrics": metrics, "meta": meta}
+     if debug_rows:
+         out["rows"] = raw_cells_rows
+     return out
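
A quick sketch of how the helpers above behave on the sample strings from their own docstrings, assuming clean_text from common_hj3415.utils trims and collapses whitespace (that helper is not part of this diff):

    _strip_bullets_commas("● 120,064.0")            # -> "120064.0"
    _to_number_like("● 120,064.0")                  # -> 120064.0 (float)
    _to_number_like("209.17")                       # -> 209.17
    _to_number_like("N/A")                          # -> "N/A" (non-numeric text passes through)
    _norm_item_label("전분기대비보기 전년동기대비")  # -> "전년동기대비" (button text stripped)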
@@ -0,0 +1,95 @@
+ # scraper2_hj3415/app/parsing/c101/fundamentals.py
+ from __future__ import annotations
+
+ import re
+ from typing import Any
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from common_hj3415.utils import clean_text
+ from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+ from scraper2_hj3415.app.parsing._normalize.values import to_number_or_text
+
+ _FUNDAMENTALS_TABLE = "div.fund.fl_le table.gHead03"
+
+
+ def _normalize_period_key(s: str) -> str:
+     """
+     e.g.
+         "2024/12(A)" -> "2024/12"
+         "2025/12(E)" -> "2025/12"
+         "2025/12"    -> "2025/12"
+     """
+     s = s.strip()
+     # drop the trailing parenthesized tag: (A) (E) (P) etc.
+     s = re.sub(r"\([^)]*\)$", "", s).strip()
+     return s
+
+ EXCLUDED_METRICS = {"회계기준"}
+
+ async def parse_c101_fundamentals_table(
+     browser: BrowserPort,
+ ) -> dict[str, dict[str, Any]]:
+     """
+     Return the fundamentals table (actual/consensus key indicators) as
+     metric_key -> {period_key -> value}. Rows listed in EXCLUDED_METRICS
+     (e.g. "회계기준") are dropped.
+
+     Example return:
+         {
+             "PBR": {"2024/12": 13.62, "2025/12": None},
+             ...
+         }
+     """
+     await browser.wait_attached(_FUNDAMENTALS_TABLE)
+
+     rows = await browser.table_records(_FUNDAMENTALS_TABLE, header=0)
+     if not rows:
+         return {}
+
+     cleaned_rows: list[dict[str, Any]] = []
+     for r in rows:
+         rr: dict[str, Any] = {}
+         for k, v in r.items():
+             kk = clean_text(k)
+             if not kk:
+                 continue
+             rr[kk] = normalize_text(v) if kk == "주요지표" else to_number_or_text(v)
+
+         if rr.get("주요지표"):
+             cleaned_rows.append(rr)
+
+     if not cleaned_rows:
+         return {}
+
+     # columns: merge while preserving order
+     seen: set[str] = set()
+     columns: list[str] = []
+     for rr in cleaned_rows:
+         for kk in rr.keys():
+             if kk not in seen:
+                 seen.add(kk)
+                 columns.append(kk)
+
+     metric_col = "주요지표" if "주요지표" in columns else columns[0]
+     raw_value_cols = [c for c in columns if c != metric_col]
+
+     # normalize period columns (strip parentheses)
+     # ⚠️ rr.get(...) needs the original column names like "2024/12(A)" / "2025/12",
+     #    so carry (original column, normalized column) pairs.
+     col_pairs: list[tuple[str, str]] = [(c, _normalize_period_key(c)) for c in raw_value_cols]
+
+     metrics: dict[str, dict[str, Any]] = {}
+
+     for rr in cleaned_rows:
+         name = rr.get(metric_col)
+         if not name:
+             continue
+
+         metric_key = str(name).strip()
+         if metric_key in EXCLUDED_METRICS:
+             continue  # ⬅️ excluded here
+
+         bucket = metrics.setdefault(metric_key, {})
+         for raw_c, norm_c in col_pairs:
+             bucket[norm_c] = rr.get(raw_c)
+
+     return metrics
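
For reference, a sketch of the period-key normalization and the resulting shape, taken from the docstrings above:

    _normalize_period_key("2024/12(A)")  # -> "2024/12"
    _normalize_period_key("2025/12(E)")  # -> "2025/12"

    # parse_c101_fundamentals_table then yields, e.g.:
    # {"PBR": {"2024/12": 13.62, "2025/12": None}, ...}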
@@ -0,0 +1,57 @@
+ # scraper2_hj3415/app/parsing/c101/major_shareholders.py
+ from __future__ import annotations
+
+ from typing import Any
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+ from scraper2_hj3415.app.parsing._normalize.label import normalize_key_label
+ from scraper2_hj3415.app.parsing._normalize.values import to_int, to_float
+
+ def _pick_value_by_norm_key(row: dict[str, Any], candidates: list[str]) -> Any:
+     # build a normalized-key map of the row's keys, then look candidates up by their normalized form
+     norm_map: dict[str, str] = {
+         normalize_key_label(k): k for k in row.keys()
+     }
+     for cand in candidates:
+         rk = norm_map.get(normalize_key_label(cand))
+         if rk is None:
+             continue
+         v = row.get(rk)
+         # key present but value empty: move on to the next candidate
+         if v is None:
+             continue
+         if isinstance(v, str) and not v.strip():
+             continue
+         return v
+     return None
+
+
+ async def parse_c101_major_shareholders(browser: BrowserPort) -> list[dict[str, Any]]:
+     table_sel = "#cTB13"
+     await browser.wait_attached(table_sel)
+
+     records = await browser.table_records(table_sel, header=0)
+
+     if not records:
+         return []
+
+     out: list[dict[str, Any]] = []
+     for r in records:
+         name = normalize_text(_pick_value_by_norm_key(r, ["주요주주", "주요주주명"]))
+         if not name:
+             continue
+
+         shares_raw = _pick_value_by_norm_key(
+             r, ["보유주식수(보통)", "보유주식수", "보유주식수(보통주)"]
+         )
+         ratio_raw = _pick_value_by_norm_key(r, ["보유지분(%)", "보유지분", "보유지분%"])
+
+         out.append(
+             {
+                 "주요주주": name,
+                 "보유주식수": to_int(shares_raw),  # may be None when parsing fails
+                 "보유지분": to_float(ratio_raw),  # the parser must also accept "0.91%"
+             }
+         )
+
+     return out
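
A sketch of the candidate-key fallback in _pick_value_by_norm_key. The row below is hypothetical, and the exact behavior of normalize_key_label is not shown in this diff; the example relies only on identical strings normalizing identically:

    row = {"주요주주": "국민연금공단", "보유주식수(보통)": "1,000", "보유지분(%)": ""}
    _pick_value_by_norm_key(row, ["주요주주", "주요주주명"])                # -> "국민연금공단"
    _pick_value_by_norm_key(row, ["보유지분(%)", "보유지분", "보유지분%"])  # -> None (empty value skipped)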
@@ -0,0 +1,47 @@
+ # scraper2_hj3415/app/parsing/c101/sise.py
+ from __future__ import annotations
+
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from common_hj3415.utils import clean_text
+ from ._sise_normalizer import normalize_sise_kv_map
+
+ _SISE_TABLE = "#cTB11"
+
+ async def parse_c101_sise_table(browser: BrowserPort) -> dict[str, str]:
+     """
+     Extract the #cTB11 quote table as a th(label) -> td(value) dict.
+     - based on the text visible on screen (innerText)
+     """
+     await browser.wait_attached(_SISE_TABLE)
+
+     row_cnt = await browser.count_in_nth(
+         _SISE_TABLE,
+         scope_index=0,
+         inner_selector="tbody tr",
+     )
+
+     out: dict[str, str] = {}
+
+     for i in range(1, row_cnt + 1):  # nth-child is 1-based
+         row_sel = f"tbody tr:nth-child({i})"
+
+         key = await browser.inner_text_in_nth(
+             _SISE_TABLE,
+             scope_index=0,
+             inner_selector=f"{row_sel} th",
+             inner_index=0,
+         )
+         val = await browser.inner_text_in_nth(
+             _SISE_TABLE,
+             scope_index=0,
+             inner_selector=f"{row_sel} td",
+             inner_index=0,
+         )
+
+         k = clean_text(key)
+         v = clean_text(val)
+         if k:
+             out[k] = v
+
+     return normalize_sise_kv_map(out)
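
Illustrative only: for a hypothetical two-row #cTB11 table, the raw th -> td map handed to normalize_sise_kv_map would look like

    {"현재가": "71,200", "52주최고/최저": "88,800/49,900"}

(the normalizer itself lives in _sise_normalizer.py, file 29 in the list above; its hunk is not shown here).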
@@ -0,0 +1,87 @@
+ # scraper2_hj3415/app/parsing/c101/summary_cmp.py
+ from __future__ import annotations
+
+ from typing import Any
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from common_hj3415.utils import clean_text
+ from scraper2_hj3415.app.parsing._normalize.values import to_number
+
+
+ async def parse_c101_summary_cmp_table(browser: BrowserPort) -> dict[str, Any]:
+     """
+     Extract the stock basics plus EPS/BPS/PER... from <table class="cmp-table">
+     (the company summary table).
+
+     Example return:
+         {
+             "종목명": "삼성전자",
+             "코드": "005930",
+             "영문명": "SamsungElec",
+             "시장": "KOSPI : 코스피 전기·전자",
+             "WICS": "WICS : 반도체와반도체장비",
+             "EPS": 4816,
+             "BPS": 60632,
+             "PER": 31.58,
+             "업종PER": 21.93,
+             "PBR": 2.51,
+             "현금배당수익률": 0.95,
+             "결산": "12월 결산",
+         }
+     """
+     out: dict[str, Any] = {}
+
+     # make sure the table exists
+     await browser.wait_attached("table.cmp-table")
+
+     # --- 1) td0101: name / code / English name / market / WICS ---
+     out["종목명"] = clean_text(
+         await browser.text_content_first("table.cmp-table td.td0101 span.name")
+     )
+     out["코드"] = clean_text(
+         await browser.text_content_first("table.cmp-table td.td0101 b.num")
+     )
+
+     # read the dt texts under td0101 and classify them
+     dt0101 = await browser.all_texts("table.cmp-table td.td0101 dl > dt")
+     for t in dt0101[1:] if dt0101 else []:
+         t = clean_text(t)
+         if not t:
+             continue
+         if t.startswith("KOSPI") or t.startswith("KOSDAQ"):
+             out["시장"] = t
+         elif t.startswith("WICS"):
+             out["WICS"] = t
+         else:
+             if "영문명" not in out:
+                 out["영문명"] = t
+
+     # --- 2) td0301: EPS/BPS/PER/업종PER/PBR/현금배당수익률/결산 ---
+     base_dl = "table.cmp-table td.td0301 dl"
+     dt_sel = f"{base_dl} > dt"
+
+     dt_texts = await browser.all_texts(dt_sel)  # full dt texts (numbers included)
+     if not dt_texts:
+         return out
+
+     # dt elements appear in DOM order 1..N
+     for i, raw_dt in enumerate(dt_texts, start=1):
+         dt_text = clean_text(raw_dt)
+         if not dt_text:
+             continue
+
+         num_sel = f"{base_dl} > dt:nth-child({i}) b.num"
+
+         # lines without a number: e.g. "12월 결산"
+         if not await browser.is_attached(num_sel):
+             if "결산" in dt_text:
+                 out["결산"] = dt_text
+             continue
+
+         num_text = clean_text(await browser.text_content_first(num_sel))
+         if not num_text:
+             continue
+
+         label = clean_text(dt_text.replace(num_text, "")).replace(":", "")
+         if label:
+             out[label] = to_number(num_text)
+
+     return out
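
The td0301 loop splits each dt into label and value by deleting the number text from the full dt text. A minimal trace, assuming clean_text trims the leftover whitespace:

    dt_text = "PER 31.58"   # full <dt> innerText, number included
    num_text = "31.58"      # text of the nested <b class="num">
    label = clean_text(dt_text.replace(num_text, "")).replace(":", "")  # -> "PER"
    out[label] = to_number(num_text)                                    # -> out["PER"] = 31.58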
@@ -0,0 +1,197 @@
+ # scraper2_hj3415/app/parsing/c101/yearly_consensus.py
+ from __future__ import annotations
+
+ from io import StringIO
+ import re
+ from typing import Any
+
+ import pandas as pd
+
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from scraper2_hj3415.app.parsing._normalize.values import to_float
+ from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+ from common_hj3415.utils import clean_text
+ from logging_hj3415 import logger
+
+ _YEARLY_CONSENSUS_TABLE = "#cTB25"
+
+
+ # -----------------------------
+ # column / period normalize
+ # -----------------------------
+ _COL_UNIT_RE = re.compile(r"\([^)]*\)")  # strips unit tags: (억원, %), (원), (배) ...
+ _PERIOD_RE = re.compile(r"^\s*(\d{4})\s*\(?([A-Za-z])?\)?\s*$")  # 2022(A), 2025(E)
+
+
+ def _flatten_col(col: Any) -> str:
+     """
+     Flatten a MultiIndex column produced by pd.read_html(header=[0,1]) into a
+     single key such as '매출액_금액'.
+     - ('매출액(억원, %)', '금액') -> '매출액_금액'
+     - ('매출액(억원, %)', 'YoY') -> '매출액_YoY'
+     - unit parentheses removed
+     """
+     if isinstance(col, tuple):
+         parts = [clean_text(str(p)) for p in col if clean_text(str(p))]
+         if len(parts) == 2 and parts[0] == parts[1]:
+             s = parts[0]
+         else:
+             s = "_".join(parts) if parts else ""
+     else:
+         s = clean_text(str(col))
+
+     # remove unit parentheses
+     s = _COL_UNIT_RE.sub("", s)
+     s = clean_text(s)
+
+     # repair a known broken column label
+     s = s.replace("주재 무제표", "주재무제표")
+
+     # drop spaces (stable keys)
+     s = s.replace(" ", "")
+     return s
+
+
+ def _normalize_period(
+     s: Any,
+     *,
+     keep_suffix: bool = False,
+ ) -> str | None:
+     """
+     Normalize a period string into the standard period key.
+
+     - handles "2022(A)", "2026(E)", "2022", etc.
+     - default policy: yearly = YYYY/12
+     """
+     t = normalize_text(s)
+     if not t:
+         return None
+
+     # guard against the header cell
+     if t == "재무년월":
+         return None
+
+     # already in the standard format
+     if re.fullmatch(r"\d{4}/\d{2}", t):
+         return t
+
+     m = _PERIOD_RE.match(t)
+     if not m:
+         return None
+
+     year, suffix = m.groups()  # suffix: "A" | "E" | None
+
+     if keep_suffix and suffix:
+         return f"{year}{suffix}"
+
+     return f"{year}/12"
+
+
+ def _normalize_metric_key(col_key: str) -> str:
+     """
+     Shape the final metric key into a human-friendly form.
+     """
+     k = col_key
+
+     # 매출액 is split into '금액'/'YoY', so pin its names explicitly
+     if k.startswith("매출액_금액"):
+         return "매출액"
+     if k.startswith("매출액_YoY"):
+         return "매출액YoY"
+
+     # everything else stays as-is (units/spaces already removed in _flatten_col)
+     # e.g. "영업이익", "당기순이익", "EPS", "PER", "PBR", "ROE", "EV/EBITDA", "순부채비율"
+     return k
+
+
+ def _html_to_df(html: str) -> pd.DataFrame | None:
+     """
+     The yearly consensus table has a two-row header, so read it with
+     header=[0, 1] and flatten the columns.
+     """
+     try:
+         dfs = pd.read_html(StringIO(html), header=[0, 1])
+     except Exception as e:
+         logger.exception("pd.read_html failed: {}", e)
+         return None
+     if not dfs:
+         return None
+     df = dfs[0]
+     if df is None or df.empty:
+         return None
+
+     df = df.copy()
+     df.columns = [_flatten_col(c) for c in df.columns]
+     return df
+
+
+ def _df_to_metric_map(df: pd.DataFrame) -> dict[str, dict[str, Any]]:
+     """
+     Pivot the DataFrame (row: period, col: metric) into {metric: {period: value}}.
+     """
+     if df is None or df.empty:
+         return {}
+
+     # NaN -> None
+     df = df.where(pd.notnull(df), None)
+
+     # find the '재무년월' column (defensively)
+     # it usually flattens to exactly "재무년월", but guard against breakage
+     period_col = None
+     for c in df.columns:
+         if "재무년월" == c or c.endswith("재무년월") or "재무년월" in c:
+             period_col = c
+             break
+     if not period_col:
+         logger.warning("[cTB25] period column not found")
+         return {}
+
+     out: dict[str, dict[str, Any]] = {}
+
+     for _, row in df.iterrows():
+         period = _normalize_period(row.get(period_col), keep_suffix=True)
+         if not period:
+             continue
+
+         for col, raw_val in row.items():
+             if col == period_col:
+                 continue
+             # keep 주재무제표 out of the metric map (it could be split out as meta instead)
+             if "주재무제표" in str(col):
+                 continue
+
+             metric = _normalize_metric_key(str(col))
+
+             num = to_float(raw_val)
+             val: Any = num if num is not None else (normalize_text(raw_val) or None)
+
+             out.setdefault(metric, {})[period] = val
+
+     return out
+
+
+ async def parse_c101_yearly_consensus_table(
+     browser: BrowserPort,
+ ) -> dict[str, dict[str, Any]]:
+     """
+     Return the #cTB25 table (3 actual years + 2 estimate years) as
+     {metric: {period: value}}.
+     """
+     await browser.wait_attached(_YEARLY_CONSENSUS_TABLE)
+     await browser.wait_table_nth_ready(
+         _YEARLY_CONSENSUS_TABLE,
+         index=0,
+         min_rows=5,
+         timeout_ms=30_000,
+         poll_ms=200,
+     )
+
+     html = await browser.outer_html_nth(_YEARLY_CONSENSUS_TABLE, 0)
+     if not html or "<table" not in html:
+         logger.warning("[cTB25] outerHTML invalid or empty")
+         return {}
+
+     df = _html_to_df(html)
+     if df is None:
+         logger.warning("[cTB25] df is empty/invalid")
+         return {}
+
+     return _df_to_metric_map(df)
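
A sketch of the normalizers above on the examples from their docstrings, assuming normalize_text leaves plain strings like "2022(A)" intact:

    _flatten_col(("매출액(억원, %)", "금액"))        # -> "매출액_금액"
    _normalize_metric_key("매출액_금액")              # -> "매출액"
    _normalize_period("2022(A)")                      # -> "2022/12" (default yearly policy)
    _normalize_period("2022(A)", keep_suffix=True)    # -> "2022A" (as used in _df_to_metric_map)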