scraper2-hj3415 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
  3. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  4. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  5. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  6. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  7. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
  8. scraper2_hj3415/app/composition.py +225 -0
  9. scraper2_hj3415/app/domain/blocks.py +61 -0
  10. scraper2_hj3415/app/domain/constants.py +33 -0
  11. scraper2_hj3415/app/domain/doc.py +16 -0
  12. scraper2_hj3415/app/domain/endpoint.py +11 -0
  13. scraper2_hj3415/app/domain/series.py +11 -0
  14. scraper2_hj3415/app/domain/types.py +19 -0
  15. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  16. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  17. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  18. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  19. scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
  20. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  21. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  22. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  23. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  24. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  25. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  26. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  27. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  28. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  29. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  30. scraper2_hj3415/app/parsing/c103_parser.py +19 -0
  31. scraper2_hj3415/app/parsing/c104_parser.py +23 -0
  32. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  33. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  34. scraper2_hj3415/app/ports/__init__.py +0 -0
  35. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  36. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  37. scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
  38. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  39. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  40. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  41. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  42. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
  44. scraper2_hj3415/app/services/__init__.py +0 -0
  45. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  46. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  47. scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
  48. scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
  49. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  50. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  51. scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
  52. scraper2_hj3415/app/usecases/__init__.py +0 -0
  53. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  55. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  56. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  57. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  58. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  59. scraper2/main.py → scraper2_hj3415/cli.py +40 -80
  60. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
  61. scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
  62. scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
  63. scraper2/.DS_Store +0 -0
  64. scraper2/adapters/out/.DS_Store +0 -0
  65. scraper2/adapters/out/playwright/browser.py +0 -102
  66. scraper2/adapters/out/sinks/.DS_Store +0 -0
  67. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  68. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  69. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  70. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  71. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  72. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  73. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  74. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  75. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  76. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  77. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  78. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  79. scraper2/app/composition.py +0 -204
  80. scraper2/app/parsing/_converters.py +0 -85
  81. scraper2/app/parsing/_normalize.py +0 -134
  82. scraper2/app/parsing/c101_parser.py +0 -143
  83. scraper2/app/parsing/c103_parser.py +0 -128
  84. scraper2/app/parsing/c104_parser.py +0 -143
  85. scraper2/app/parsing/c106_parser.py +0 -153
  86. scraper2/app/parsing/c108_parser.py +0 -65
  87. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  88. scraper2/app/ports/browser/browser_port.py +0 -22
  89. scraper2/app/ports/ingest_port.py +0 -14
  90. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  91. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  92. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  93. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  94. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  95. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  96. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  97. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  98. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  99. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  100. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  101. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  102. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  103. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  104. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  105. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  106. scraper2_hj3415-2.4.0.dist-info/RECORD +0 -63
  107. scraper2_hj3415-2.4.0.dist-info/entry_points.txt +0 -3
  108. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  109. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  110. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  111. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  112. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  113. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  114. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  115. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  116. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  117. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  118. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  119. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
  120. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,57 @@
+ # scraper2_hj3415/app/parsing/c101/major_shareholders.py
+ from __future__ import annotations
+
+ from typing import Any
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+ from scraper2_hj3415.app.parsing._normalize.label import normalize_key_label
+ from scraper2_hj3415.app.parsing._normalize.values import to_int, to_float
+
+ def _pick_value_by_norm_key(row: dict[str, Any], candidates: list[str]) -> Any:
+     # Build a map of normalized row keys, then look each candidate up by its normalized form.
+     norm_map: dict[str, str] = {
+         normalize_key_label(k): k for k in row.keys()
+     }
+     for cand in candidates:
+         rk = norm_map.get(normalize_key_label(cand))
+         if rk is None:
+             continue
+         v = row.get(rk)
+         # Key present but value empty: move on to the next candidate.
+         if v is None:
+             continue
+         if isinstance(v, str) and not v.strip():
+             continue
+         return v
+     return None
+
+
+ async def parse_c101_major_shareholders(browser: BrowserPort) -> list[dict[str, Any]]:
+     table_sel = "#cTB13"
+     await browser.wait_attached(table_sel)
+
+     records = await browser.table_records(table_sel, header=0)
+
+     if not records:
+         return []
+
+     out: list[dict[str, Any]] = []
+     for r in records:
+         name = normalize_text(_pick_value_by_norm_key(r, ["주요주주", "주요주주명"]))
+         if not name:
+             continue
+
+         shares_raw = _pick_value_by_norm_key(
+             r, ["보유주식수(보통)", "보유주식수", "보유주식수(보통주)"]
+         )
+         ratio_raw = _pick_value_by_norm_key(r, ["보유지분(%)", "보유지분", "보유지분%"])
+
+         out.append(
+             {
+                 "주요주주": name,
+                 "보유주식수": to_int(shares_raw),  # may be None when parsing fails
+                 "보유지분": to_float(ratio_raw),  # the parser must also accept values like "0.91%"
+             }
+         )
+
+     return out
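The candidate-key lookup above makes the parser tolerant of header variations by comparing normalized labels. A self-contained sketch of the same pattern, with a toy normalizer standing in for normalize_key_label (the real rules live in _normalize/label.py and may differ):

    from typing import Any

    def toy_normalize(label: str) -> str:
        # Stand-in for normalize_key_label: drop parenthesized suffixes and spaces.
        return label.split("(")[0].replace(" ", "")

    def pick(row: dict[str, Any], candidates: list[str]) -> Any:
        norm_map = {toy_normalize(k): k for k in row}
        for cand in candidates:
            key = norm_map.get(toy_normalize(cand))
            if key is not None and row.get(key) not in (None, ""):
                return row[key]
        return None

    row = {"보유주식수 (보통)": "1,000,000", "보유지분(%)": "0.91"}
    print(pick(row, ["보유주식수(보통)", "보유주식수"]))  # -> "1,000,000"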
@@ -0,0 +1,47 @@
+ # scraper2_hj3415/app/parsing/c101/sise.py
+ from __future__ import annotations
+
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from common_hj3415.utils import clean_text
+ from ._sise_normalizer import normalize_sise_kv_map
+
+ _SISE_TABLE = "#cTB11"
+
+ async def parse_c101_sise_table(browser: BrowserPort) -> dict[str, str]:
+     """
+     Extract the #cTB11 quote table as a dict of th (label) -> td (value).
+     - Based on the text visible on screen (innerText).
+     """
+     await browser.wait_attached(_SISE_TABLE)
+
+     row_cnt = await browser.count_in_nth(
+         _SISE_TABLE,
+         scope_index=0,
+         inner_selector="tbody tr",
+     )
+
+     out: dict[str, str] = {}
+
+     for i in range(1, row_cnt + 1):  # nth-child is 1-based
+         row_sel = f"tbody tr:nth-child({i})"
+
+         key = await browser.inner_text_in_nth(
+             _SISE_TABLE,
+             scope_index=0,
+             inner_selector=f"{row_sel} th",
+             inner_index=0,
+         )
+         val = await browser.inner_text_in_nth(
+             _SISE_TABLE,
+             scope_index=0,
+             inner_selector=f"{row_sel} td",
+             inner_index=0,
+         )
+
+         k = clean_text(key)
+         v = clean_text(val)
+         if k:
+             out[k] = v
+     return normalize_sise_kv_map(out)
+
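A minimal usage sketch for the parser above (hypothetical wiring: any BrowserPort implementation already positioned on a c101 page works; the real wiring lives in scraper2_hj3415/app/composition.py and may differ):

    from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
    from scraper2_hj3415.app.parsing.c101.sise import parse_c101_sise_table

    async def print_sise(browser: BrowserPort) -> None:
        kv = await parse_c101_sise_table(browser)  # normalized label -> value strings
        for label, value in kv.items():
            print(f"{label}: {value}")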
@@ -0,0 +1,87 @@
+ # scraper2_hj3415/app/parsing/c101/summary_cmp.py
+ from __future__ import annotations
+
+ from typing import Any
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from common_hj3415.utils import clean_text
+ from scraper2_hj3415.app.parsing._normalize.values import to_number
+
+
+ async def parse_c101_summary_cmp_table(browser: BrowserPort) -> dict[str, Any]:
+     """
+     Extract stock basics plus EPS/BPS/PER/etc. from <table class="cmp-table"> (the company summary table).
+
+     Example return value:
+     {
+         "종목명": "삼성전자",
+         "코드": "005930",
+         "영문명": "SamsungElec",
+         "시장": "KOSPI : 코스피 전기·전자",
+         "WICS": "WICS : 반도체와반도체장비",
+         "EPS": 4816,
+         "BPS": 60632,
+         "PER": 31.58,
+         "업종PER": 21.93,
+         "PBR": 2.51,
+         "현금배당수익률": 0.95,
+         "결산": "12월 결산",
+     }
+     """
+     out: dict[str, Any] = {}
+
+     # Make sure the table exists.
+     await browser.wait_attached("table.cmp-table")
+
+     # --- 1) td0101: name / code / English name / market / WICS ---
+     out["종목명"] = clean_text(
+         await browser.text_content_first("table.cmp-table td.td0101 span.name")
+     )
+     out["코드"] = clean_text(
+         await browser.text_content_first("table.cmp-table td.td0101 b.num")
+     )
+
+     # Read the dt texts in td0101 and classify them.
+     dt0101 = await browser.all_texts("table.cmp-table td.td0101 dl > dt")
+     for t in (dt0101 or [])[1:]:
+         t = clean_text(t)
+         if not t:
+             continue
+         if t.startswith("KOSPI") or t.startswith("KOSDAQ"):
+             out["시장"] = t
+         elif t.startswith("WICS"):
+             out["WICS"] = t
+         elif "영문명" not in out:
+             out["영문명"] = t
+
+     # --- 2) td0301: EPS/BPS/PER/sector PER/PBR/cash dividend yield/fiscal year-end ---
+     base_dl = "table.cmp-table td.td0301 dl"
+     dt_sel = f"{base_dl} > dt"
+
+     dt_texts = await browser.all_texts(dt_sel)  # full dt texts (numbers included)
+     if not dt_texts:
+         return out
+
+     # dt elements appear in DOM order 1..N.
+     for i, raw_dt in enumerate(dt_texts, start=1):
+         dt_text = clean_text(raw_dt)
+         if not dt_text:
+             continue
+
+         num_sel = f"{base_dl} > dt:nth-child({i}) b.num"
+
+         # Lines without a number, e.g. "12월 결산"
+         if not await browser.is_attached(num_sel):
+             if "결산" in dt_text:
+                 out["결산"] = dt_text
+             continue
+
+         num_text = clean_text(await browser.text_content_first(num_sel))
+         if not num_text:
+             continue
+
+         label = clean_text(dt_text.replace(num_text, "")).replace(":", "")
+         if label:
+             out[label] = to_number(num_text)
+
+     return out
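The label derivation in step 2 works by deleting the numeric text from the full dt text; a standalone illustration of the idea with made-up dt strings (plain strip() stands in for clean_text):

    def derive_label(dt_text: str, num_text: str) -> str:
        return dt_text.replace(num_text, "").strip().replace(":", "").strip()

    print(derive_label("EPS : 4,816", "4,816"))   # -> "EPS"
    print(derive_label("PER : 31.58", "31.58"))   # -> "PER"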
@@ -0,0 +1,197 @@
+ # scraper2_hj3415/app/parsing/c101/yearly_consensus.py
+ from __future__ import annotations
+
+ from io import StringIO
+ import re
+ from typing import Any
+
+ import pandas as pd
+
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from scraper2_hj3415.app.parsing._normalize.values import to_float
+ from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+ from common_hj3415.utils import clean_text
+ from logging_hj3415 import logger
+
+ _YEARLY_CONSENSUS_TABLE = "#cTB25"
+
+
+ # -----------------------------
+ # column / period normalization
+ # -----------------------------
+ _COL_UNIT_RE = re.compile(r"\([^)]*\)")  # strips unit suffixes such as (억원, %), (원), (배)
+ _PERIOD_RE = re.compile(r"^\s*(\d{4})\s*\(?([A-Za-z])?\)?\s*$")  # 2022(A), 2025(E)
+
+
+ def _flatten_col(col: Any) -> str:
+     """
+     Flatten a MultiIndex column from pd.read_html(header=[0,1]) into a single key like '매출액_금액'.
+     - ('매출액(억원, %)', '금액') -> '매출액_금액'
+     - ('매출액(억원, %)', 'YoY') -> '매출액_YoY'
+     - removes the unit parentheses
+     """
+     if isinstance(col, tuple):
+         parts = [clean_text(str(p)) for p in col if clean_text(str(p))]
+         if len(parts) == 2 and parts[0] == parts[1]:
+             s = parts[0]
+         else:
+             s = "_".join(parts) if parts else ""
+     else:
+         s = clean_text(str(col))
+
+     # Remove the unit parentheses.
+     s = _COL_UNIT_RE.sub("", s)
+     s = clean_text(s)
+
+     # Repair a known mangled column label.
+     s = s.replace("주재 무제표", "주재무제표")
+
+     # Remove spaces to stabilize the key.
+     s = s.replace(" ", "")
+     return s
+
+
+ def _normalize_period(
+     s: Any,
+     *,
+     keep_suffix: bool = False,
+ ) -> str | None:
+     """
+     Normalize a period string into the standard period key.
+
+     - Handles "2022(A)", "2026(E)", "2022", and similar
+     - Default policy: annual = YYYY/12
+     """
+     t = normalize_text(s)
+     if not t:
+         return None
+
+     # Guard against the header cell.
+     if t == "재무년월":
+         return None
+
+     # Already in the standard format: return as-is.
+     if re.fullmatch(r"\d{4}/\d{2}", t):
+         return t
+
+     m = _PERIOD_RE.match(t)
+     if not m:
+         return None
+
+     year, suffix = m.groups()  # suffix: "A" | "E" | None
+
+     if keep_suffix and suffix:
+         return f"{year}{suffix}"
+
+     return f"{year}/12"
+
+
+ def _normalize_metric_key(col_key: str) -> str:
+     """
+     Tidy the final metric key into a human-friendly form.
+     """
+     k = col_key
+
+     # 매출액 is split into '금액'/'YoY' columns, so pin those names explicitly.
+     if k.startswith("매출액_금액"):
+         return "매출액"
+     if k.startswith("매출액_YoY"):
+         return "매출액YoY"
+
+     # Everything else passes through (units/spaces already removed in _flatten_col).
+     # e.g. "영업이익", "당기순이익", "EPS", "PER", "PBR", "ROE", "EV/EBITDA", "순부채비율"
+     return k
+
+
+ def _html_to_df(html: str) -> pd.DataFrame | None:
+     """
+     The yearly consensus table has a two-row header, so read it with header=[0, 1] and flatten.
+     """
+     try:
+         dfs = pd.read_html(StringIO(html), header=[0, 1])
+     except Exception as e:
+         logger.exception("pd.read_html failed: {}", e)
+         return None
+     if not dfs:
+         return None
+     df = dfs[0]
+     if df is None or df.empty:
+         return None
+
+     df = df.copy()
+     df.columns = [_flatten_col(c) for c in df.columns]
+     return df
+
+
+ def _df_to_metric_map(df: pd.DataFrame) -> dict[str, dict[str, Any]]:
+     """
+     Pivot a DataFrame (row: period, col: metric) into {metric: {period: value}}.
+     """
+     if df is None or df.empty:
+         return {}
+
+     # NaN -> None
+     df = df.where(pd.notnull(df), None)
+
+     # Locate the '재무년월' (period) column. It normally flattens to exactly
+     # "재무년월"; fall back to a substring match in case the label is mangled.
+     period_col = None
+     for c in df.columns:
+         if "재무년월" in c:
+             period_col = c
+             break
+     if not period_col:
+         logger.warning("[cTB25] period column not found")
+         return {}
+
+     out: dict[str, dict[str, Any]] = {}
+
+     for _, row in df.iterrows():
+         period = _normalize_period(row.get(period_col), keep_suffix=True)
+         if not period:
+             continue
+
+         for col, raw_val in row.items():
+             if col == period_col:
+                 continue
+             # Exclude 주재무제표 from the metric map (it could be kept separately as meta).
+             if "주재무제표" in str(col):
+                 continue
+
+             metric = _normalize_metric_key(str(col))
+
+             num = to_float(raw_val)
+             val: Any = num if num is not None else (normalize_text(raw_val) or None)
+
+             out.setdefault(metric, {})[period] = val
+
+     return out
+
+
+ async def parse_c101_yearly_consensus_table(
+     browser: BrowserPort,
+ ) -> dict[str, dict[str, Any]]:
+     """
+     Parse the #cTB25 table (3 actual years + 2 estimated years) into
+     {metric: {period: value}}.
+     """
+     await browser.wait_attached(_YEARLY_CONSENSUS_TABLE)
+     await browser.wait_table_nth_ready(
+         _YEARLY_CONSENSUS_TABLE,
+         index=0,
+         min_rows=5,
+         timeout_ms=30_000,
+         poll_ms=200,
+     )
+
+     html = await browser.outer_html_nth(_YEARLY_CONSENSUS_TABLE, 0)
+     if not html or "<table" not in html:
+         logger.warning("[cTB25] outerHTML invalid or empty")
+         return {}
+
+     df = _html_to_df(html)
+     if df is None:
+         logger.warning("[cTB25] df is empty/invalid")
+         return {}
+
+     return _df_to_metric_map(df)
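How the period normalization behaves, shown standalone with the same regex (keep_suffix=True is what _df_to_metric_map uses, so estimates stay distinguishable from actuals; the default policy maps annual periods to YYYY/12):

    import re

    _PERIOD_RE = re.compile(r"^\s*(\d{4})\s*\(?([A-Za-z])?\)?\s*$")

    def demo(s: str, keep_suffix: bool = False) -> str | None:
        m = _PERIOD_RE.match(s)
        if not m:
            return None  # no 4-digit year, e.g. the "재무년월" header cell
        year, suffix = m.groups()
        if keep_suffix and suffix:
            return f"{year}{suffix}"
        return f"{year}/12"

    print(demo("2022(A)"))                    # -> "2022/12"
    print(demo("2026(E)", keep_suffix=True))  # -> "2026E"
    print(demo("재무년월"))                   # -> None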
@@ -0,0 +1,45 @@
+ from __future__ import annotations
+
+ from typing import Any
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from logging_hj3415 import logger
+
+ from .c101.sise import parse_c101_sise_table
+ from .c101.earning_surprise import parse_c101_earnings_surprise_table
+ from .c101.fundamentals import parse_c101_fundamentals_table
+ from .c101.major_shareholders import parse_c101_major_shareholders
+ from .c101.company_overview import parse_c101_company_overview
+ from .c101.summary_cmp import parse_c101_summary_cmp_table
+ from .c101.yearly_consensus import parse_c101_yearly_consensus_table
+
+ async def parse_c101_to_dict(browser: BrowserPort) -> dict[str, Any]:
+     parsed_summary_cmp = await parse_c101_summary_cmp_table(browser)
+     logger.debug(f"parsed_summary_cmp data: {parsed_summary_cmp}")
+
+     parsed_sise = await parse_c101_sise_table(browser)
+     logger.debug(f"parsed_sise data: {parsed_sise}")
+
+     parsed_company_overview = await parse_c101_company_overview(browser)
+     logger.debug(f"parsed_company_overview data: {parsed_company_overview}")
+
+     parsed_major_shareholders = await parse_c101_major_shareholders(browser)
+     logger.debug(f"parsed_major_shareholders data: {parsed_major_shareholders}")
+
+     parsed_fundamentals = await parse_c101_fundamentals_table(browser)
+     logger.debug(f"parsed_fundamentals data: {parsed_fundamentals}")
+
+     parsed_earnings_surprise = await parse_c101_earnings_surprise_table(browser)
+     logger.debug(f"parsed_earnings_surprise data: {parsed_earnings_surprise}")
+
+     parsed_yearly_consensus = await parse_c101_yearly_consensus_table(browser)
+     logger.debug(f"parsed_yearly_consensus data: {parsed_yearly_consensus}")
+
+     return {
+         "요약": parsed_summary_cmp,
+         "시세": parsed_sise,
+         "주주현황": parsed_major_shareholders,
+         "기업개요": parsed_company_overview,
+         "펀더멘털": parsed_fundamentals,
+         "어닝서프라이즈": parsed_earnings_surprise,
+         "연간컨센서스": parsed_yearly_consensus,
+     }
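A top-level usage sketch (hypothetical: assumes the browser has already navigated to the target ticker's c101 page, which the fetch_c101 service appears to handle in the real pipeline):

    from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
    from scraper2_hj3415.app.parsing.c101_parser import parse_c101_to_dict

    async def snapshot_c101(browser: BrowserPort) -> None:
        doc = await parse_c101_to_dict(browser)
        print(doc["요약"].get("종목명"))  # the summary section is a flat dict
        print(len(doc["주주현황"]))       # the shareholder section is a list of records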
@@ -0,0 +1,19 @@
+ # scraper2_hj3415/app/parsing/c103_parser.py
+ from __future__ import annotations
+ from typing import Any
+
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from scraper2_hj3415.app.parsing._tables.html_table import try_html_table_to_df, df_to_c1034_metric_list
+
+ TABLE_XPATH = "xpath=//div[@id='wrapper']//div//table"
+ TABLE_INDEX = 2
+
+
+ async def parse_c103_current_table(browser: BrowserPort) -> list[dict[str, Any]]:
+     """
+     ✅ Assumes the current page state (tab / annual / quarterly / search result) is already prepared.
+     Reads only the table at TABLE_INDEX in that state and converts it to rows.
+     """
+     html = await browser.outer_html_nth(TABLE_XPATH, TABLE_INDEX)
+     df = try_html_table_to_df(html)
+     return df_to_c1034_metric_list(df)
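Usage sketch (hypothetical wiring; in the real pipeline the fetch_c103 service is what prepares the tab/annual/quarterly state before this parser runs):

    from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
    from scraper2_hj3415.app.parsing.c103_parser import parse_c103_current_table

    async def read_visible_c103(browser: BrowserPort) -> None:
        rows = await parse_c103_current_table(browser)  # one dict per metric row
        print(f"{len(rows)} metric rows")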
@@ -0,0 +1,23 @@
+ # scraper2_hj3415/app/parsing/c104_parser.py
+ from __future__ import annotations
+
+ from typing import Any
+
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from scraper2_hj3415.app.parsing._tables.html_table import try_html_table_to_df, df_to_c1034_metric_list
+
+ TABLE_XPATH = 'xpath=//table[@class="gHead01 all-width data-list"]'
+
+
+ async def parse_c104_current_table(
+     browser: BrowserPort,
+     *,
+     table_index: int,
+ ) -> list[dict[str, Any]]:
+     """
+     ✅ Assumes the current page state (tab / annual / quarterly / search result) is already prepared.
+     Reads only the table at the given table_index in that state and converts it to rows.
+     """
+     html = await browser.outer_html_nth(TABLE_XPATH, table_index)
+     df = try_html_table_to_df(html)
+     return df_to_c1034_metric_list(df)
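Unlike c103, the table index here is caller-supplied, presumably because several tables match the class selector on the c104 page. A sketch under that assumption:

    from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
    from scraper2_hj3415.app.parsing.c104_parser import parse_c104_current_table

    async def read_c104(browser: BrowserPort) -> None:
        first = await parse_c104_current_table(browser, table_index=0)
        second = await parse_c104_current_table(browser, table_index=1)
        print(len(first), len(second))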
@@ -0,0 +1,137 @@
+ # scraper2_hj3415/app/parsing/c106_parser.py
+ from __future__ import annotations
+
+ from io import StringIO
+ import re
+ import numpy as np
+ import pandas as pd
+ from typing import Any
+
+ from common_hj3415.utils import clean_text
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from scraper2_hj3415.app.parsing._normalize.label import (
+     normalize_metric_label,
+     sanitize_label,
+ )
+ from logging_hj3415 import logger
+
+ _CODE_RE = re.compile(r"\b\d{6}\b")
+
+
+ async def parse_c106_header_codes(browser: BrowserPort) -> list[str]:
+     """
+     From the current page, extract only the six-digit ticker codes from the
+     '기업간비교자료' header (the company names). (No goto/sleep.)
+     """
+     selector = (
+         'xpath=//caption[contains(normalize-space(.), "기업간비교자료")]'
+         "/following-sibling::thead//th[not(@colspan)]"
+     )
+     await browser.wait_attached(selector)
+     th_texts = await browser.all_texts(selector)
+
+     codes: list[str] = []
+     for t in th_texts:
+         text = (t or "").strip()
+         if not text:
+             continue
+         m = _CODE_RE.search(text)
+         if not m:
+             continue
+         codes.append(m.group(0))
+
+     # De-duplicate while preserving order.
+     seen: set[str] = set()
+     uniq: list[str] = []
+     for c in codes:
+         if c not in seen:
+             seen.add(c)
+             uniq.append(c)
+     logger.debug(f"c106 header codes: {uniq}")
+     return uniq
+
+
+ def html_table_to_df(html: str, codes: list[str]) -> pd.DataFrame:
+     df = pd.read_html(StringIO(html), header=None)[0]
+     if df is None or df.empty:
+         return pd.DataFrame()
+
+     df.columns = ["항목_group", "항목"] + codes
+     df["항목_group"] = df["항목_group"].ffill()
+
+     # Inject the '주가데이터' group into the first two rows (preserves the prior logic).
+     for i in range(min(2, len(df))):
+         row = df.loc[i].tolist()
+         new_row = ["주가데이터"] + row
+         df.loc[i] = new_row[: len(df.columns)]
+
+     df = df[df["항목"].notna()].reset_index(drop=True)
+     df.loc[df["항목"].isin(["투자의견", "목표주가(원)"]), "항목_group"] = "기타지표"
+     df = df[df["항목"] != "재무연월"].reset_index(drop=True)
+
+     for col in df.columns[2:]:
+         df[col] = df[col].replace("-", "0")
+         df[col] = pd.to_numeric(df[col], errors="coerce")
+
+     df["항목_group"] = df["항목_group"].astype("string").map(clean_text)
+     df["항목"] = df["항목"].astype("string").map(clean_text)
+
+     return df.replace({np.nan: None})
+
+
+ def df_to_c106_metric_list(df: pd.DataFrame) -> list[dict[str, Any]]:
+     """
+     C106 DataFrame -> records (list[dict])
+
+     Approach:
+     - The key ('항목') is strongly normalized via normalize_metric_label
+       (parentheses, asterisks, etc. removed).
+     - '항목_raw' stores the pre-normalization label (UI noise stripped only).
+     - The '항목_group' columns are dropped here when present.
+     """
+     if df is None or df.empty:
+         return []
+
+     df = df.copy()
+
+     # Preserve the raw label (pre-normalization; only UI noise removed).
+     raw = df["항목"].where(df["항목"].notna(), None)
+     df["항목_raw"] = raw.map(
+         lambda x: sanitize_label(str(x)) if x is not None else None
+     )
+
+     # Drop the 항목_group columns (only if present).
+     drop_cols = [c for c in ("항목_group", "항목_group_raw") if c in df.columns]
+     if drop_cols:
+         df = df.drop(columns=drop_cols)
+
+     # Normalize the key.
+     df["항목"] = df["항목"].map(
+         lambda x: normalize_metric_label(str(x)) if x is not None else ""
+     )
+
+     # Keep only valid rows.
+     df = df[df["항목"].astype(str).str.strip() != ""].reset_index(drop=True)
+
+     # NaN -> None
+     df = df.where(pd.notnull(df), None)
+
+     return df.to_dict(orient="records")
+
+
+ async def parse_c106_current_table(
+     browser: BrowserPort,
+     *,
+     columns: list[str],
+     table_selector: str = "#cTB611",
+     table_index: int = 0,
+     timeout_ms: int = 10_000,
+ ) -> list[dict[str, Any]]:
+     """
+     ✅ Parses only the comparison table from the current page
+     (goto/waits assumed already done).
+     """
+     await browser.wait_table_nth_ready(
+         table_selector, index=table_index, min_rows=3, timeout_ms=timeout_ms
+     )
+     html = await browser.outer_html_nth(table_selector, table_index)
+     df = html_table_to_df(html, columns)
+     return df_to_c106_metric_list(df)
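The two functions compose naturally: the header codes double as the value-column names for the comparison table. A usage sketch (hypothetical wiring; the browser is assumed to be on the c106 page already):

    from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
    from scraper2_hj3415.app.parsing.c106_parser import (
        parse_c106_header_codes,
        parse_c106_current_table,
    )

    async def read_c106(browser: BrowserPort) -> None:
        codes = await parse_c106_header_codes(browser)  # e.g. ["005930", "000660", ...]
        rows = await parse_c106_current_table(browser, columns=codes)
        print(len(rows), "comparison rows for", codes)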