scraper2_hj3415-2.4.1-py3-none-any.whl → scraper2_hj3415-2.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
  3. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  4. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  5. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  6. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  7. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
  8. scraper2_hj3415/app/composition.py +225 -0
  9. scraper2_hj3415/app/domain/blocks.py +61 -0
  10. scraper2_hj3415/app/domain/constants.py +33 -0
  11. scraper2_hj3415/app/domain/doc.py +16 -0
  12. scraper2_hj3415/app/domain/endpoint.py +11 -0
  13. scraper2_hj3415/app/domain/series.py +11 -0
  14. scraper2_hj3415/app/domain/types.py +19 -0
  15. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  16. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  17. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  18. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  19. scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
  20. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  21. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  22. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  23. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  24. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  25. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  26. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  27. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  28. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  29. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  30. scraper2_hj3415/app/parsing/c103_parser.py +19 -0
  31. scraper2_hj3415/app/parsing/c104_parser.py +23 -0
  32. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  33. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  34. scraper2_hj3415/app/ports/__init__.py +0 -0
  35. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  36. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  37. scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
  38. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  39. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  40. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  41. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  42. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
  44. scraper2_hj3415/app/services/__init__.py +0 -0
  45. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  46. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  47. scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
  48. scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
  49. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  50. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  51. scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
  52. scraper2_hj3415/app/usecases/__init__.py +0 -0
  53. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  55. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  56. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  57. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  58. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  59. scraper2/main.py → scraper2_hj3415/cli.py +40 -70
  60. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
  61. scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
  62. scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
  63. scraper2/.DS_Store +0 -0
  64. scraper2/adapters/out/.DS_Store +0 -0
  65. scraper2/adapters/out/playwright/browser.py +0 -102
  66. scraper2/adapters/out/sinks/.DS_Store +0 -0
  67. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  68. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  69. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  70. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  71. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  72. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  73. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  74. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  75. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  76. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  77. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  78. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  79. scraper2/app/composition.py +0 -204
  80. scraper2/app/parsing/_converters.py +0 -85
  81. scraper2/app/parsing/_normalize.py +0 -134
  82. scraper2/app/parsing/c101_parser.py +0 -143
  83. scraper2/app/parsing/c103_parser.py +0 -128
  84. scraper2/app/parsing/c104_parser.py +0 -143
  85. scraper2/app/parsing/c106_parser.py +0 -153
  86. scraper2/app/parsing/c108_parser.py +0 -65
  87. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  88. scraper2/app/ports/browser/browser_port.py +0 -22
  89. scraper2/app/ports/ingest_port.py +0 -14
  90. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  91. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  92. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  93. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  94. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  95. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  96. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  97. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  98. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  99. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  100. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  101. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  102. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  103. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  104. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  105. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  106. scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
  107. scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
  108. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  109. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  110. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  111. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  112. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  113. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  114. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  115. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  116. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  117. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  118. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  119. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
  120. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,31 @@
+ # scraper2_hj3415/app/parsing/_normalize/text.py
+ from __future__ import annotations
+
+ from typing import Any
+
+ from common_hj3415.utils import clean_text
+
+
+ def normalize_text(x: object | None) -> str:
+     """
+     Normalize an arbitrary value to a string.
+     - None → ""
+     - apply the string-representation rules (clean_text)
+     """
+     s = "" if x is None else str(x)
+     return clean_text(s)
+
+
+ _NUM_EMPTY = {"", "-", "N/A", "NA", "null", "None"}
+
+
+ def display_text(x: Any) -> str:
+     """
+     Normalize to a display string.
+     - drop meaningless placeholders such as '-' and 'N/A'
+     """
+     s = normalize_text(x)
+     if not s or s in _NUM_EMPTY:
+         return ""
+     return s
+
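A minimal usage sketch of these two helpers (assuming common_hj3415 is installed and its clean_text only trims and collapses whitespace):

    # Illustrative sketch; assumes clean_text is whitespace normalization only.
    from scraper2_hj3415.app.parsing._normalize.text import display_text, normalize_text

    print(normalize_text(None))     # "" — None becomes an empty string
    print(normalize_text(12.5))     # "12.5" — str() first, then clean_text
    print(display_text("N/A"))      # "" — placeholder tokens are dropped
    print(display_text("삼성전자"))  # "삼성전자" — real text passes through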
@@ -0,0 +1,70 @@
+ # scraper2_hj3415/app/parsing/_normalize/values.py
+ from __future__ import annotations
+ import re
+ from typing import Any
+ from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+
+
+ def parse_numeric(
+     x: Any,
+     *,
+     strip_units: bool = False,
+     keep_text: bool = False,
+ ) -> int | float | str | None:
+     """
+     Try to parse the value as a number.
+
+     - strip_units=True:
+         strip units such as '원', '%', '억' before parsing
+     - strip_units=False:
+         parse pure numbers only
+     """
+     s = normalize_text(x)
+     if not s:
+         return None
+
+     t = s.replace(",", "")
+     if strip_units:
+         t = (
+             t.replace("원", "")
+             .replace("억원", "")
+             .replace("억", "")
+             .replace("%", "")
+             .strip()
+         )
+
+     # integer
+     if re.fullmatch(r"-?\d+", t):
+         return int(t)
+
+     # float
+     if re.fullmatch(r"-?\d+(\.\d+)?", t):
+         return float(t)
+
+     return s if keep_text else None
+
+
+ def to_number(x: Any) -> int | float | None:
+     """Numbers only (None on failure)."""
+     return parse_numeric(x, strip_units=True, keep_text=False)
+
+ def to_number_or_text(x: Any) -> float | str | None:
+     """A number if numeric, otherwise the text."""
+     return parse_numeric(x, strip_units=True, keep_text=True)
+
+ def to_num_or_text(x: Any) -> int | float | str | None:
+     """General-purpose cell normalization."""
+     return parse_numeric(x, strip_units=False, keep_text=True)
+
+ def to_int(x: Any) -> int | None:
+     v = parse_numeric(x, strip_units=True, keep_text=False)
+     if isinstance(v, (int, float)):
+         return int(v)
+     return None
+
+ def to_float(x: Any) -> float | None:
+     v = parse_numeric(x, strip_units=True, keep_text=False)
+     if isinstance(v, (int, float)):
+         return float(v)
+     return None
+
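A sketch of how the wrapper family behaves, again assuming clean_text amounts to whitespace normalization:

    # Illustrative sketch; assumes scraper2_hj3415 and its deps are importable.
    from scraper2_hj3415.app.parsing._normalize.values import (
        to_num_or_text,
        to_number,
        to_number_or_text,
    )

    print(to_number("1,234원"))           # 1234 — commas and units stripped
    print(to_number("12.5%"))             # 12.5 — float after '%' removal
    print(to_number("흑자전환"))           # None — keep_text=False drops text
    print(to_number_or_text("흑자전환"))   # "흑자전환" — keep_text=True keeps it
    print(to_num_or_text("1,234원"))       # "1,234원" — strip_units=False, falls back to the text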
@@ -0,0 +1,88 @@
+ from typing import Any, Sequence
+
+ from io import StringIO
+ import pandas as pd
+
+ from logging_hj3415 import logger
+ from scraper2_hj3415.app.parsing._normalize.table import normalize_metrics_df
+ from common_hj3415.utils import clean_text
+
+
+ def _flatten_col(col: Any) -> str:
+     """
+     Convert a pandas MultiIndex column (tuple) into a single human-friendly key.
+     - collapse duplicates such as ('재무년월','재무년월') into one
+     - strip unit strings
+     - repair broken labels such as '주재 무제표'
+     """
+     if isinstance(col, tuple):
+         parts = [clean_text(p) for p in col if clean_text(p)]
+         if not parts:
+             s = ""
+         elif len(parts) == 2 and parts[0] == parts[1]:
+             s = parts[0]
+         else:
+             s = "_".join(parts)
+     else:
+         s = clean_text(col)
+
+     s = (
+         s.replace("(억원, %)", "")
+         .replace("(원)", "")
+         .replace("(배)", "")
+         .replace("(%)", "")
+         .strip()
+     )
+     s = s.replace("주재 무제표", "주재무제표")
+     return clean_text(s)
+
+
+ def try_html_table_to_df(
+     html: str, *, flatten_cols: bool = False, header: int | Sequence[int] = 0
+ ) -> pd.DataFrame | None:
+     try:
+         dfs = pd.read_html(StringIO(html), header=header)
+     except Exception as e:
+         logger.exception("pd.read_html failed: {}", e)
+         return None
+     if not dfs:
+         return None
+     df = dfs[0]
+     if df is None or df.empty:
+         return None
+
+     if flatten_cols:
+         df = df.copy()
+         df.columns = [_flatten_col(c) for c in df.columns]
+     return df
+
+
+ def df_to_c1034_metric_list(df: pd.DataFrame) -> list[dict[str, Any]]:
+     """
+     C103 table DataFrame -> normalized records (list[dict]).
+     - drop rows whose 항목 (item) is empty
+     - preserve 항목_raw (the label before normalization)
+     """
+     if df is None or df.empty:
+         return []
+
+     df = df.copy()
+
+     # preserve the original item label before normalization
+     if "항목" in df.columns:
+         df["항목_raw"] = (
+             df["항목"]
+             .where(df["항목"].notna(), None)
+             .map(lambda x: str(x) if x is not None else None)
+         )
+
+     df = normalize_metrics_df(df)
+
+     records: list[dict[str, Any]] = []
+     for r in df.to_dict(orient="records"):
+         item = r.get("항목")
+         if not item:
+             continue
+         records.append(r)
+     return records
+
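A small round-trip sketch for try_html_table_to_df (assuming pandas plus an HTML parser backend such as lxml, and the package's logging/normalize deps, are installed; the table content is hypothetical):

    # Illustrative sketch; flatten_cols is a no-op here since the
    # single-row header does not produce a MultiIndex.
    from scraper2_hj3415.app.parsing._tables.html_table import try_html_table_to_df

    html = (
        "<table>"
        "<tr><th>항목</th><th>2023/12</th><th>2024/12</th></tr>"
        "<tr><td>매출액</td><td>1,000</td><td>1,200</td></tr>"
        "</table>"
    )
    df = try_html_table_to_df(html, flatten_cols=True)
    if df is not None:
        print(df.columns.tolist())  # expected: ['항목', '2023/12', '2024/12']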
File without changes
@@ -0,0 +1,103 @@
+ # scraper2_hj3415/app/parsing/c101/_sise_normalizer.py
+ from __future__ import annotations
+
+ import re
+ from typing import Mapping
+
+ # common separator: both values and keys are split on it
+ _DEFAULT_SEP = "/"
+
+ _UNIT_REPLACEMENTS = {
+     "Weeks": "주",
+     "Week": "주",
+     # add more here as needed
+     # "Days": "일",
+     # "Months": "개월",
+ }
+
+
+ def _clean_token(s: str) -> str:
+     # drop parentheses/whitespace and collapse repeated spaces
+     s = s.strip()
+     s = s.replace("(", " ").replace(")", " ")
+     s = re.sub(r"\s+", " ", s).strip()
+     return s
+
+
+ def _compact_key(s: str) -> str:
+     s = _clean_token(s)
+     s = _replace_units(s)  # ✅ Weeks → 주 happens here
+     return s.replace(" ", "")
+
+ def _split_slash(s: str) -> list[str]:
+     return [p.strip() for p in s.split(_DEFAULT_SEP)]
+
+ def _replace_units(s: str) -> str:
+     for src, dst in _UNIT_REPLACEMENTS.items():
+         s = s.replace(src, dst)
+     return s
+
+
+ def _maybe_expand_pair_key_value(key: str, value: str) -> dict[str, str] | None:
+     ks = _split_slash(key)
+     vs = _split_slash(value)
+     if len(ks) <= 1 or len(ks) != len(vs):
+         return None
+
+     out: dict[str, str] = {}
+
+     # 1) special case: "수익률 (1M/3M/6M/1Y)"
+     first = _clean_token(ks[0])
+     m = re.match(r"^(?P<prefix>.+?)\s+(?P<token>[0-9A-Za-z]+)$", first)
+     if m:
+         prefix = m.group("prefix").strip()
+         token0 = m.group("token").strip()
+         tokens = [token0] + [_clean_token(x) for x in ks[1:]]
+         for tok, v in zip(tokens, vs):
+             out[_compact_key(f"{prefix}{tok}")] = v
+         return out
+
+     # 2) general case + "prefix propagation" (patterns like 52Weeks 최고/최저)
+     # If the first token is "prefix + label" like "52Weeks 최고",
+     # prepend the prefix to later tokens like "최저" that omit it.
+     first_tok = _clean_token(ks[0])
+     m2 = re.match(r"^(?P<prefix>[0-9A-Za-z]+)\s+(?P<label>.+)$", first_tok)
+     if m2:
+         prefix = m2.group("prefix").strip()
+         label0 = m2.group("label").strip()
+         labels = [label0] + [_clean_token(x) for x in ks[1:]]
+         for lab, v in zip(labels, vs):
+             out[_compact_key(f"{prefix}{lab}")] = v
+         return out
+
+     # 3) fully general: match positionally as-is
+     for k, v in zip(ks, vs):
+         out[_compact_key(k)] = v
+     return out
+
+
+ def normalize_sise_kv_map(src: Mapping[str, str]) -> dict[str, str]:
+     """
+     Convert a c101 quote ('시세') block (dict[str, str]) into a dict with normalized keys.
+
+     Normalization rules:
+     - if key/value contain "/" and the lengths line up, split and expand into several entries
+       e.g. "거래량/거래대금" -> "거래량", "거래대금"
+       e.g. "52Weeks 최고/최저" -> "52주최고", "52주최저" (units replaced by _compact_key)
+       e.g. "수익률 (1M/3M/6M/1Y)" -> "수익률1M", "수익률3M", ...
+     - otherwise keep the entry, only stripping spaces from the key
+     """
+     out: dict[str, str] = {}
+
+     for k, v in src.items():
+         k = k.strip()
+         v = v.strip()
+
+         expanded = _maybe_expand_pair_key_value(k, v)
+         if expanded:
+             out.update(expanded)
+             continue
+
+         out[_compact_key(k)] = v
+
+     return out
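A sketch of the three expansion rules on hypothetical quote-block entries (assuming the package is importable):

    # Illustrative sketch; input keys/values are made up for the demo.
    from scraper2_hj3415.app.parsing.c101._sise_normalizer import normalize_sise_kv_map

    src = {
        "거래량/거래대금": "1,000/2,000",       # rule 3: positional split
        "52Weeks 최고/최저": "100/50",          # rule 2: prefix propagation
        "수익률 (1M/3M/6M/1Y)": "1/2/3/4",      # rule 1: trailing-token expansion
    }
    print(normalize_sise_kv_map(src))
    # expected:
    # {"거래량": "1,000", "거래대금": "2,000",
    #  "52주최고": "100", "52주최저": "50",
    #  "수익률1M": "1", "수익률3M": "2", "수익률6M": "3", "수익률1Y": "4"}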
@@ -0,0 +1,47 @@
+ # scraper2_hj3415/app/parsing/c101/company_overview.py
+ from __future__ import annotations
+
+ import re
+ from typing import Any
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from common_hj3415.utils import clean_text
+
+ # regular expressions
+ _DATE_RE = re.compile(r"(\d{4}\.\d{2}\.\d{2})")  # YYYY.MM.DD
+
+ async def parse_c101_company_overview(browser: BrowserPort) -> dict[str, Any]:
+     """
+     From the '기업개요' (company overview) section, extract
+     - the as-of date ([기준:YYYY.MM.DD])
+     - the overview sentences (li.dot_cmp)
+     and return them as a dict.
+     """
+     out: dict[str, Any] = {}
+
+     기준_sel = "div.header-table p"
+     개요_ul_sel = "div.cmp_comment ul.dot_cmp"
+     개요_li_sel = "div.cmp_comment ul.dot_cmp > li.dot_cmp"
+
+     # 1) as-of date
+     await browser.wait_attached(기준_sel)
+     raw = clean_text(await browser.text_content_first(기준_sel))
+
+     m = _DATE_RE.search(raw)
+     out["기준일자"] = m.group(1) if m else raw
+
+     # 2) overview sentences
+     await browser.wait_attached(개요_ul_sel)
+     li_texts = await browser.all_texts(개요_li_sel)
+
+     lines: list[str] = []
+     for t in li_texts:
+         ct = clean_text(t)
+         if ct:
+             lines.append(ct)
+
+     # out["개요_리스트"] = lines  # not needed for now
+     out["개요"] = "".join(
+         lines
+     )  # policy: keep join("") for storage; consider "\n".join for display
+
+     return out
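The as-of-date extraction is independent of the BrowserPort, so it can be exercised standalone (the header text below is hypothetical):

    import re

    _DATE_RE = re.compile(r"(\d{4}\.\d{2}\.\d{2})")  # same pattern as above

    raw = "[기준:2025.06.05]"  # hypothetical header-table text
    m = _DATE_RE.search(raw)
    print(m.group(1) if m else raw)  # 2025.06.05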
@@ -0,0 +1,217 @@
+ from __future__ import annotations
+
+ import re
+ from typing import Any
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from common_hj3415.utils import clean_text
+
+ _EARNING_SURPRISE_TABLE = "#earning_list"
+
+ def _strip_bullets_commas(s: str) -> str:
+     """
+     Remove anything that blocks numeric parsing from text such as
+     "● 120,064.0" / "101,922.8".
+     """
+     s = clean_text(s)
+     s = s.replace(",", "")
+     s = s.replace("●", "")
+     s = s.replace("○", "")
+     s = s.replace("▲", "")
+     s = s.replace("▼", "")
+     return clean_text(s)
+
+
+ def _to_number_like(x: Any) -> Any:
+     """
+     float/int if numeric, otherwise the string as-is.
+     """
+     if x is None:
+         return None
+     if isinstance(x, (int, float)):
+         return x
+     s = _strip_bullets_commas(str(x))
+     if not s:
+         return None
+
+     # numeric pattern -> number
+     # - "65.00" "209.17" "-123.4"
+     if re.fullmatch(r"[-+]?\d+(\.\d+)?", s):
+         # branch here if ints should stay int
+         try:
+             f = float(s)
+             # policy choice: whether to turn "65.0"-style floats back into int
+             return f
+         except Exception:
+             return s
+
+     return s
+
+
+ def _norm_item_label(item: str) -> str:
+     """
+     Normalize the item (th text):
+     - "전분기대비보기 전년동기대비" -> "전년동기대비"
+     - "Surprise" etc. stay as-is
+     """
+     t = clean_text(item)
+
+     # button text mixed in: "전분기대비보기 전년동기대비"
+     if ("전분기대비" in t) and ("전년동기대비" in t):
+         return "전년동기대비"
+     if "전분기대비" in t:
+         return "전분기대비"
+     if "전년동기대비" in t:
+         return "전년동기대비"
+     if "컨센서스" in t:
+         return "컨센서스"
+     if "잠정치" in t:
+         return "잠정치"
+     if "Surprise" in t or "SURPRISE" in t or "surprise" in t:
+         return "Surprise"
+
+     return t
+
+
+ async def _row_cells_texts(
+     browser: BrowserPort,
+     *,
+     row_sel: str,
+ ) -> list[str]:
+     """
+     From a given tr in tbody, collect every th/td text left to right.
+     """
+     # total number of th/td cells
+     n = await browser.count_in_nth(
+         _EARNING_SURPRISE_TABLE,
+         scope_index=0,
+         inner_selector=f"{row_sel} th, {row_sel} td",
+     )
+
+     out: list[str] = []
+     for j in range(n):
+         txt = await browser.inner_text_in_nth(
+             _EARNING_SURPRISE_TABLE,
+             scope_index=0,
+             inner_selector=f"{row_sel} th, {row_sel} td",
+             inner_index=j,
+         )
+         out.append(clean_text(txt))
+     return out
+
+
+ async def parse_c101_earnings_surprise_table(
+     browser: BrowserPort,
+     *,
+     debug_rows: bool = False,
+ ) -> dict[str, Any]:
+     """
+     Stable parser tailored to the earning_list HTML structure (as provided).
+
+     Returns:
+         {
+             "periods": [...],
+             "metrics": { section: { item: {period: value} } },
+             "meta": {...},
+             ...(plus "rows": raw_cells_rows when debug_rows is set)
+         }
+     """
+     await browser.wait_attached(_EARNING_SURPRISE_TABLE)
+
+     row_cnt = await browser.count_in_nth(
+         _EARNING_SURPRISE_TABLE,
+         scope_index=0,
+         inner_selector="tbody tr",
+     )
+     if not row_cnt:
+         out = {"periods": [], "metrics": {}, "meta": {}}
+         if debug_rows:
+             out["rows"] = []
+         return out
+
+     raw_cells_rows: list[list[str]] = []
+
+     periods: list[str] = []
+     period_count = 0
+
+     metrics: dict[str, dict[str, dict[str, Any]]] = {}
+     meta: dict[str, dict[str, Any]] = {}
+
+     current_section: str | None = None
+
+     for i in range(1, row_cnt + 1):  # nth-child is 1-based
+         row_sel = f"tbody tr:nth-child({i})"
+         cells = await _row_cells_texts(browser, row_sel=row_sel)
+         raw_cells_rows.append(cells)
+
+         if not cells:
+             continue
+
+         joined = " ".join([c for c in cells if c])
+
+         # 1) extract periods from the "재무연월" header row
+         # HTML: <th colspan="2">재무연월</th> + <th>2025/09</th> + <th>2025/12</th>
+         if ("재무연월" in joined) and not periods:
+             # cells e.g.: ["재무연월", "2025/09", "2025/12"], or 3-4 cells depending on the table
+             # picking only "YYYY/NN" patterns is the safest here
+             periods = [c for c in cells if re.fullmatch(r"\d{4}/\d{2}", c)]
+             period_count = len(periods)
+             continue
+
+         # without periods the body rows cannot be interpreted
+         if not periods:
+             continue
+
+         # 2) meta row: "잠정치발표(예정)일/회계기준"
+         if "잠정치발표(예정)일/회계기준" in joined:
+             # typically cells: ["잠정치발표(예정)일/회계기준", "2025/10/14(연결)", "2026/01/08(연결)"]
+             vals = [c for c in cells if c and "잠정치발표" not in c]
+             vals = vals[-period_count:] if period_count else vals
+             meta["잠정치발표(예정)일/회계기준"] = {
+                 periods[idx]: vals[idx] if idx < len(vals) else None
+                 for idx in range(period_count)
+             }
+             continue
+
+         # 3) normalize body rows: always reshape to [section, item, v1, v2, ...]
+         # HTML cases:
+         # - section start row (영업이익/당기순이익): cells = ["영업이익", "컨센서스", v1, v2]
+         # - next row inside a rowspan: cells = ["잠정치", v1, v2] (no section → pad on the left)
+         # - ext0 row (전분기대비): cells = ["", "전분기대비", v1, v2] (empty first th)
+         #
+         # with period_count == 2 the canonical length is 2 + period_count = 4
+         want_len = 2 + period_count
+
+         norm = cells[:]
+         if len(norm) == want_len - 1:
+             # section th missing: ["잠정치", v1, v2] -> ["", "잠정치", v1, v2]
+             norm = [""] + norm
+         elif len(norm) < want_len:
+             # ambiguous case: pad on the left with empty strings
+             norm = ([""] * (want_len - len(norm))) + norm
+         norm = norm[-want_len:]
+
+         section_cell = clean_text(norm[0])
+         item_cell = clean_text(norm[1])
+         value_cells = norm[2 : 2 + period_count]
+
+         # update the section
+         if section_cell:
+             current_section = section_cell
+             metrics.setdefault(current_section, {})
+         if not current_section:
+             # skip rows seen before any section
+             continue
+
+         item = _norm_item_label(item_cell)
+         if not item:
+             continue
+
+         # map values to periods
+         bucket = metrics[current_section].setdefault(item, {})
+         for idx, p in enumerate(periods):
+             raw_v = value_cells[idx] if idx < len(value_cells) else None
+             bucket[p] = _to_number_like(raw_v)
+
+     out: dict[str, Any] = {"periods": periods, "metrics": metrics, "meta": meta}
+     if debug_rows:
+         out["rows"] = raw_cells_rows
+     return out
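The row reshaping in step 3 is pure list logic and can be sketched standalone; the cell values below are hypothetical:

    # Illustrative sketch of the left-padding rule: every body row becomes
    # [section, item, v1, ..., vN] before values are mapped to periods.
    period_count = 2
    want_len = 2 + period_count

    rows = [
        ["영업이익", "컨센서스", "120,064.0", "101,922.8"],  # section start row
        ["잠정치", "118,000.0", "100,500.0"],                # rowspan row, section th missing
        ["", "전분기대비", "65.00", "209.17"],               # ext0 row with an empty first th
    ]
    for cells in rows:
        norm = cells[:]
        if len(norm) == want_len - 1:
            norm = [""] + norm
        elif len(norm) < want_len:
            norm = [""] * (want_len - len(norm)) + norm
        norm = norm[-want_len:]
        print(norm)  # always 4 cells: [section, item, v1, v2]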
@@ -0,0 +1,95 @@
+ # scraper2_hj3415/app/parsing/c101/fundamentals.py
+ from __future__ import annotations
+
+ import re
+ from typing import Any
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from common_hj3415.utils import clean_text
+ from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+ from scraper2_hj3415.app.parsing._normalize.values import to_number_or_text
+
+ _FUNDAMENTALS_TABLE = "div.fund.fl_le table.gHead03"
+
+
+ def _normalize_period_key(s: str) -> str:
+     """
+     e.g.
+         "2024/12(A)" -> "2024/12"
+         "2025/12(E)" -> "2025/12"
+         "2025/12"    -> "2025/12"
+     """
+     s = s.strip()
+     # drop the trailing parenthesized annotation: (A) (E) (P) etc.
+     s = re.sub(r"\([^)]*\)$", "", s).strip()
+     return s
+
+ EXCLUDED_METRICS = {"회계기준"}
+
+ async def parse_c101_fundamentals_table(
+     browser: BrowserPort,
+ ) -> dict[str, dict[str, Any]]:
+     """
+     Return the '펀더멘털 주요지표' (fundamentals key indicators, actual/consensus)
+     table as metric_key -> {period_key -> value}.
+
+     Example return value:
+     {
+         "PBR": {"2024/12": 13.62, "2025/12": None},
+         ...
+     }
+     (metrics in EXCLUDED_METRICS, e.g. "회계기준", are not included)
+     """
+     await browser.wait_attached(_FUNDAMENTALS_TABLE)
+
+     rows = await browser.table_records(_FUNDAMENTALS_TABLE, header=0)
+     if not rows:
+         return {}
+
+     cleaned_rows: list[dict[str, Any]] = []
+     for r in rows:
+         rr: dict[str, Any] = {}
+         for k, v in r.items():
+             kk = clean_text(k)
+             if not kk:
+                 continue
+             rr[kk] = normalize_text(v) if kk == "주요지표" else to_number_or_text(v)
+
+         if rr.get("주요지표"):
+             cleaned_rows.append(rr)
+
+     if not cleaned_rows:
+         return {}
+
+     # columns: merge while preserving order
+     seen: set[str] = set()
+     columns: list[str] = []
+     for rr in cleaned_rows:
+         for kk in rr.keys():
+             if kk not in seen:
+                 seen.add(kk)
+                 columns.append(kk)
+
+     metric_col = "주요지표" if "주요지표" in columns else columns[0]
+     raw_value_cols = [c for c in columns if c != metric_col]
+
+     # normalize period columns (drop parentheses)
+     # ⚠️ rr.get(...) needs the original column names like "2024/12(A)" / "2025/12",
+     # so carry (original column, normalized column) pairs.
+     col_pairs: list[tuple[str, str]] = [(c, _normalize_period_key(c)) for c in raw_value_cols]
+
+     metrics: dict[str, dict[str, Any]] = {}
+
+     for rr in cleaned_rows:
+         name = rr.get(metric_col)
+         if not name:
+             continue
+
+         metric_key = str(name).strip()
+         if metric_key in EXCLUDED_METRICS:
+             continue  # ⬅️ excluded here
+
+         bucket = metrics.setdefault(metric_key, {})
+         for raw_c, norm_c in col_pairs:
+             bucket[norm_c] = rr.get(raw_c)
+
+     return metrics
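The period-key normalization is a single regex and can be checked in isolation (standalone re-statement of the rule, not an import of the private helper):

    import re

    def normalize_period_key(s: str) -> str:
        # same rule as _normalize_period_key above: drop a trailing "(...)"
        return re.sub(r"\([^)]*\)$", "", s.strip()).strip()

    print(normalize_period_key("2024/12(A)"))  # 2024/12
    print(normalize_period_key("2025/12(E)"))  # 2025/12
    print(normalize_period_key("2025/12"))     # 2025/12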