scraper2-hj3415 2.4.1__py3-none-any.whl → 2.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +26 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +7 -7
  3. scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py +18 -0
  4. scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py +19 -0
  5. scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py +37 -0
  6. scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py +24 -0
  7. scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py +84 -0
  8. scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py +90 -0
  9. scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py +25 -0
  10. scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py +96 -0
  11. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  12. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  13. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  14. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  15. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +379 -0
  16. scraper2_hj3415/app/composition.py +225 -0
  17. scraper2_hj3415/app/domain/blocks.py +61 -0
  18. scraper2_hj3415/app/domain/constants.py +33 -0
  19. scraper2_hj3415/app/domain/doc.py +16 -0
  20. scraper2_hj3415/app/domain/endpoint.py +11 -0
  21. scraper2_hj3415/app/domain/series.py +11 -0
  22. scraper2_hj3415/app/domain/types.py +19 -0
  23. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  24. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  25. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  26. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  27. scraper2_hj3415/app/parsing/_tables/html_table.py +89 -0
  28. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  29. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  30. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  31. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  32. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  33. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  34. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  35. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  36. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  37. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  38. scraper2_hj3415/app/parsing/c103_parser.py +22 -0
  39. scraper2_hj3415/app/parsing/c104_parser.py +26 -0
  40. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  41. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  42. scraper2_hj3415/app/ports/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  44. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  45. scraper2_hj3415/app/ports/browser/browser_port.py +32 -0
  46. scraper2_hj3415/app/ports/browser/capabilities/__init__.py +15 -0
  47. scraper2_hj3415/app/ports/browser/capabilities/interaction.py +27 -0
  48. scraper2_hj3415/app/ports/browser/capabilities/navigation.py +18 -0
  49. scraper2_hj3415/app/ports/browser/capabilities/scope.py +66 -0
  50. scraper2_hj3415/app/ports/browser/capabilities/table.py +28 -0
  51. scraper2_hj3415/app/ports/browser/capabilities/text.py +16 -0
  52. scraper2_hj3415/app/ports/browser/capabilities/wait.py +51 -0
  53. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  55. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  56. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  57. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  58. scraper2_hj3415/app/ports/site/wisereport_port.py +30 -0
  59. scraper2_hj3415/app/services/__init__.py +0 -0
  60. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  61. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  62. scraper2_hj3415/app/services/fetch/fetch_c103.py +121 -0
  63. scraper2_hj3415/app/services/fetch/fetch_c104.py +160 -0
  64. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  65. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  66. scraper2_hj3415/app/services/nfs_doc_builders.py +304 -0
  67. scraper2_hj3415/app/usecases/__init__.py +0 -0
  68. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  69. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  70. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  71. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  72. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  73. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  74. scraper2/main.py → scraper2_hj3415/cli.py +45 -72
  75. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/METADATA +3 -1
  76. scraper2_hj3415-2.7.0.dist-info/RECORD +93 -0
  77. scraper2_hj3415-2.7.0.dist-info/entry_points.txt +3 -0
  78. scraper2/adapters/out/playwright/browser.py +0 -102
  79. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  80. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  81. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  82. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  83. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  84. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  85. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  86. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  87. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  88. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  89. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  90. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  91. scraper2/app/composition.py +0 -204
  92. scraper2/app/parsing/_converters.py +0 -85
  93. scraper2/app/parsing/_normalize.py +0 -134
  94. scraper2/app/parsing/c101_parser.py +0 -143
  95. scraper2/app/parsing/c103_parser.py +0 -128
  96. scraper2/app/parsing/c104_parser.py +0 -143
  97. scraper2/app/parsing/c106_parser.py +0 -153
  98. scraper2/app/parsing/c108_parser.py +0 -65
  99. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  100. scraper2/app/ports/browser/browser_port.py +0 -22
  101. scraper2/app/ports/ingest_port.py +0 -14
  102. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  103. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  104. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  105. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  106. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  107. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  108. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  109. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  110. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  111. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  112. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  113. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  114. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  115. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  116. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  117. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  118. scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
  119. scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
  120. {scraper2 → scraper2_hj3415}/.DS_Store +0 -0
  121. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  122. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  123. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  124. {scraper2 → scraper2_hj3415/app}/adapters/out/.DS_Store +0 -0
  125. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  126. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  127. {scraper2 → scraper2_hj3415/app}/adapters/out/sinks/.DS_Store +0 -0
  128. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  129. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  130. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  131. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  132. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  133. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  134. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/WHEEL +0 -0
  135. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,16 @@
+ # scraper2_hj3415/app/domain/doc.py
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Mapping
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.types import BlockKey, LabelsMap
+ from scraper2_hj3415.app.domain.blocks import BlockData
+
+
+ @dataclass(frozen=True)
+ class NfsDoc:
+     code: str
+     endpoint_kind: EndpointKind
+     blocks: Mapping[BlockKey, BlockData]
+     labels: Mapping[BlockKey, LabelsMap]
@@ -0,0 +1,11 @@
+ # scraper2_hj3415/app/domain/endpoint.py
+ from __future__ import annotations
+
+ from enum import StrEnum
+
+ class EndpointKind(StrEnum):
+     C101 = "c101"
+     C103 = "c103"
+     C104 = "c104"
+     C106 = "c106"
+     C108 = "c108"
@@ -0,0 +1,11 @@
+ # scraper2_hj3415/app/domain/series.py
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Mapping
+ from scraper2_hj3415.app.domain.types import MetricKey, Period, Num
+
+ @dataclass(frozen=True)
+ class MetricSeries:
+     key: MetricKey
+     values: Mapping[Period, Num]
@@ -0,0 +1,19 @@
+ # scraper2_hj3415/app/domain/types.py
+ from __future__ import annotations
+
+ from typing import Mapping, Any, Sequence, TypeAlias, Literal
+
+ BlockKey = str
+ MetricKey = str
+ Period = str
+ Num = float | int | None
+
+ Record: TypeAlias = Mapping[str, Any]
+ Records: TypeAlias = Sequence[Record]
+ RawLabel = str
+ LabelsMap = dict[MetricKey, RawLabel]
+
+ Sink = Literal["memory", "mongo"]
+
+
+
@@ -0,0 +1,92 @@
+ # scraper2_hj3415/app/parsing/_normalize/label.py
+ from __future__ import annotations
+
+ import re
+ from typing import Any
+
+ from common_hj3415.utils import clean_text
+ from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+
+ # -----------------------------
+ # General label normalization
+ # -----------------------------
+
+ UI_LABEL_NOISE = (
+     "펼치기",
+     "접기",
+     "더보기",
+ )
+
+
+ def sanitize_label(x: Any) -> str:
+     """
+     From a raw label (including 항목_raw):
+     - strip UI text such as '펼치기' (expand)
+     - collapse excessive whitespace
+     - trim leading/trailing whitespace
+     """
+     s = normalize_text(x)
+
+     # Remove UI noise words
+     for w in UI_LABEL_NOISE:
+         s = s.replace(w, " ")
+
+     return clean_text(s)
+
+
+ # -----------------------------
+ # Metric label normalization
+ # -----------------------------
+ _BRACKET_PATTERN = re.compile(r"\[[^\]]*\]")  # e.g. [구K-IFRS]
+ _EXTRA_WORDS_PATTERN = re.compile(r"(펼치기|연간컨센서스보기|연간컨센서스닫기)")
+ _ALL_PAREN_PATTERN = re.compile(r"\([^)]*\)")  # ★ remove every parenthesized chunk
+
+
+ def normalize_label_base(text: str | None) -> str:
+     s = sanitize_label(text)
+     s = _EXTRA_WORDS_PATTERN.sub("", s)
+     s = _BRACKET_PATTERN.sub("", s)
+     s = _ALL_PAREN_PATTERN.sub("", s)
+     s = s.replace("*", "")
+     return clean_text(s)
+
+
+ def normalize_metric_label(text: str | None) -> str:
+     # "보유 지분 (%)" → "보유 지분" (inner spaces kept)
+     return normalize_label_base(text)
+
+
+ def normalize_key_label(text: str | None) -> str:
+     # "보유 지분 (%)" → "보유지분"
+     s = normalize_label_base(text)
+     return s.replace(" ", "").replace("\xa0", "").replace("%", "").strip()
+
+
+ # -----------------------------
+ # Column-name normalization
+ # -----------------------------
+ _COL_PAREN_PATTERN = re.compile(r"\((IFRS[^)]*|E|YoY|QoQ)[^)]*\)")
+ _COL_EXTRA_WORDS = re.compile(r"(연간컨센서스보기|연간컨센서스닫기)")
+ _COL_DOTNUM = re.compile(r"\.\d+$")  # strips pandas duplicate-column suffixes (.1, .2 ...)
+
+
+ def normalize_col_label(col: str | None) -> str:
+     """
+     Normalize a column name.
+     Examples:
+       "2024/12 (IFRS연결) 연간컨센서스보기" -> "2024/12"
+       "2025/12(E) (IFRS연결) 연간컨센서스닫기" -> "2025/12"
+       "전년대비 (YoY)" -> "전년대비"
+       "전년대비 (YoY).1" -> "전년대비" (duplicates are split into _2/_3 in a later step)
+     """
+     s = normalize_text(col)
+     # 1) strip pandas-added suffixes such as .1 (normalization collisions are handled downstream)
+     s = _COL_DOTNUM.sub("", s)
+
+     # 2) remove the consensus wording
+     s = _COL_EXTRA_WORDS.sub("", s)
+
+     # 3) remove parenthesized annotations: (IFRS...), (E), (YoY), (QoQ)
+     s = _COL_PAREN_PATTERN.sub("", s)
+
+     return clean_text(s)
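A minimal usage sketch of the new label helpers (illustrative only, not part of the diff; assumes the 2.7.0 wheel and its common_hj3415 dependency are installed). Expected outputs follow the docstring examples above:

    from scraper2_hj3415.app.parsing._normalize.label import (
        normalize_col_label,
        normalize_key_label,
    )

    # Consensus wording, (IFRS...)/(E)/(YoY) annotations and pandas ".1" suffixes are dropped
    print(normalize_col_label("2024/12 (IFRS연결) 연간컨센서스보기"))  # expected: "2024/12"
    print(normalize_col_label("전년대비 (YoY).1"))                      # expected: "전년대비"

    # Key-style labels additionally lose spaces and "%"
    print(normalize_key_label("보유 지분 (%)"))                         # expected: "보유지분"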
@@ -0,0 +1,53 @@
+ # scraper2_hj3415/app/parsing/_normalize/table.py
+ from __future__ import annotations
+
+ from collections import Counter
+
+ import numpy as np
+ import pandas as pd
+ from .label import normalize_col_label, normalize_metric_label
+
+
+ def _dedupe_columns(cols: list[str]) -> list[str]:
+     """
+     If normalization yields duplicate column names, append _2, _3, ... automatically to keep them unique.
+     e.g. ["전년대비", "전년대비"] -> ["전년대비", "전년대비_2"]
+     """
+     seen: Counter[str] = Counter()
+     out: list[str] = []
+     for c in cols:
+         c = c or ""
+         seen[c] += 1
+         if seen[c] == 1:
+             out.append(c)
+         else:
+             out.append(f"{c}_{seen[c]}")
+     return out
+
+
+ # -----------------------------
+ # 3) Whole-DataFrame normalization + records conversion
+ # -----------------------------
+ def normalize_metrics_df(df: pd.DataFrame) -> pd.DataFrame:
+     """
+     - normalize all column names
+     - normalize the '항목' (metric) values
+     - NaN -> None
+     - split duplicate column names automatically (_2/_3)
+     """
+     if df is None or df.empty:
+         return df
+
+     df = df.copy()
+
+     # Normalize column names and avoid duplicates
+     norm_cols = [normalize_col_label(c) for c in df.columns.astype(str).tolist()]
+     df.columns = _dedupe_columns(norm_cols)
+
+     # Normalize the 항목 values
+     if "항목" in df.columns:
+         df["항목"] = df["항목"].map(normalize_metric_label)
+
+     # NaN -> None
+     df = df.replace({np.nan: None})
+     return df
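A small sketch of normalize_metrics_df (illustrative only; assumes pandas plus the installed wheel). Column names are normalized, the duplicate gets a _2 suffix, and NaN becomes None:

    import numpy as np
    import pandas as pd
    from scraper2_hj3415.app.parsing._normalize.table import normalize_metrics_df

    df = pd.DataFrame({
        "항목": ["매출액 펼치기"],
        "전년대비 (YoY)": [1.5],
        "전년대비 (YoY).1": [np.nan],
    })
    out = normalize_metrics_df(df)
    print(list(out.columns))      # expected: ['항목', '전년대비', '전년대비_2']
    print(out.iloc[0].to_dict())  # expected: {'항목': '매출액', '전년대비': 1.5, '전년대비_2': None}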
@@ -0,0 +1,31 @@
+ # scraper2_hj3415/app/parsing/_normalize/text.py
+ from __future__ import annotations
+
+ from typing import Any
+
+ from common_hj3415.utils import clean_text
+
+
+ def normalize_text(x: object | None) -> str:
+     """
+     Normalize an arbitrary value to a string.
+     - None → ""
+     - apply the string formatting rules (clean_text)
+     """
+     s = "" if x is None else str(x)
+     return clean_text(s)
+
+
+ _NUM_EMPTY = {"", "-", "N/A", "NA", "null", "None"}
+
+
+ def display_text(x: Any) -> str:
+     """
+     Normalize a value to a display string.
+     - drop meaningless values such as '-', 'N/A'
+     """
+     s = normalize_text(x)
+     if not s or s in _NUM_EMPTY:
+         return ""
+     return s
+
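A quick sketch of the text helpers (illustrative only; assumes clean_text from common_hj3415 trims and collapses whitespace, as its usage here suggests):

    from scraper2_hj3415.app.parsing._normalize.text import display_text, normalize_text

    print(normalize_text(None))         # expected: ""
    print(display_text("N/A"))          # expected: "" (placeholder values are dropped)
    print(display_text("  삼성전자 "))  # expected: "삼성전자"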
@@ -0,0 +1,70 @@
+ # scraper2_hj3415/app/parsing/_normalize/values.py
+ from __future__ import annotations
+ import re
+ from typing import Any
+ from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+
+
+ def parse_numeric(
+     x: Any,
+     *,
+     strip_units: bool = False,
+     keep_text: bool = False,
+ ) -> int | float | str | None:
+     """
+     Try to parse a string as a number.
+
+     - strip_units=True:
+         strip units such as '원', '%', '억' before parsing
+     - strip_units=False:
+         parse plain numbers only
+     """
+     s = normalize_text(x)
+     if not s:
+         return None
+
+     t = s.replace(",", "")
+     if strip_units:
+         t = (
+             t.replace("원", "")
+             .replace("억원", "")
+             .replace("억", "")
+             .replace("%", "")
+             .strip()
+         )
+
+     # Integer
+     if re.fullmatch(r"-?\d+", t):
+         return int(t)
+
+     # Float
+     if re.fullmatch(r"-?\d+(\.\d+)?", t):
+         return float(t)
+
+     return s if keep_text else None
+
+
+ def to_number(x: Any) -> int | float | None:
+     """Numbers only (None on failure)"""
+     return parse_numeric(x, strip_units=True, keep_text=False)
+
+ def to_number_or_text(x: Any) -> float | str | None:
+     """A number if numeric, otherwise the text"""
+     return parse_numeric(x, strip_units=True, keep_text=True)
+
+ def to_num_or_text(x: Any) -> int | float | str | None:
+     """General-purpose cell normalization"""
+     return parse_numeric(x, strip_units=False, keep_text=True)
+
+ def to_int(x: Any) -> int | None:
+     v = parse_numeric(x, strip_units=True, keep_text=False)
+     if isinstance(v, (int, float)):
+         return int(v)
+     return None
+
+ def to_float(x: Any) -> float | None:
+     v = parse_numeric(x, strip_units=True, keep_text=False)
+     if isinstance(v, (int, float)):
+         return float(v)
+     return None
+
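A quick sketch of the unit-stripping numeric helpers (illustrative only, not part of the diff):

    from scraper2_hj3415.app.parsing._normalize.values import to_int, to_number, to_number_or_text

    print(to_number("1,234억원"))        # expected: 1234  (commas and units stripped)
    print(to_number("12.5%"))            # expected: 12.5
    print(to_number("-"))                # expected: None  (not numeric)
    print(to_number_or_text("흑자전환")) # expected: "흑자전환"  (text kept)
    print(to_int("3.9"))                 # expected: 3  (truncated by int())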
@@ -0,0 +1,89 @@
+ from typing import Any, Sequence
+
+ from io import StringIO
+ import pandas as pd
+
+ from logging_hj3415 import logger
+ from scraper2_hj3415.app.parsing._normalize.table import normalize_metrics_df
+ from scraper2_hj3415.app.parsing._normalize.label import sanitize_label
+ from common_hj3415.utils import clean_text
+
+
+ def _flatten_col(col: Any) -> str:
+     """
+     Flatten a pandas MultiIndex column (tuple) into a single human-friendly key.
+     - collapse duplicates such as ('재무년월','재무년월') into one
+     - strip unit strings
+     - repair broken labels such as '주재 무제표'
+     """
+     if isinstance(col, tuple):
+         parts = [clean_text(p) for p in col if clean_text(p)]
+         if not parts:
+             s = ""
+         elif len(parts) == 2 and parts[0] == parts[1]:
+             s = parts[0]
+         else:
+             s = "_".join(parts)
+     else:
+         s = clean_text(col)
+
+     s = (
+         s.replace("(억원, %)", "")
+         .replace("(원)", "")
+         .replace("(배)", "")
+         .replace("(%)", "")
+         .strip()
+     )
+     s = s.replace("주재 무제표", "주재무제표")
+     return clean_text(s)
+
+
+ def try_html_table_to_df(
+     html: str, *, flatten_cols: bool = False, header: int | Sequence[int] = 0
+ ) -> pd.DataFrame | None:
+     try:
+         dfs = pd.read_html(StringIO(html), header=header)
+     except Exception as e:
+         logger.exception("pd.read_html failed: {}", e)
+         return None
+     if not dfs:
+         return None
+     df = dfs[0]
+     if df is None or df.empty:
+         return None
+
+     if flatten_cols:
+         df = df.copy()
+         df.columns = [_flatten_col(c) for c in df.columns]
+     return df
+
+
+ def df_to_c1034_metric_list(df: pd.DataFrame) -> list[dict[str, Any]]:
+     """
+     C103 table DataFrame -> normalized records (list[dict])
+     - drop rows whose 항목 is empty
+     - preserve 항목_raw (the pre-normalization label)
+     """
+     if df is None or df.empty:
+         return []
+
+     df = df.copy()
+
+     # Preserve the original 항목 label before normalization (minus whitespace and UI text such as 펼치기/닫기)
+     if "항목" in df.columns:
+         df["항목_raw"] = (
+             df["항목"]
+             .where(df["항목"].notna(), None)
+             .map(lambda x: sanitize_label(x) if x is not None else None)
+         )
+
+     df = normalize_metrics_df(df)
+
+     records: list[dict[str, Any]] = []
+     for r in df.to_dict(orient="records"):
+         item = r.get("항목")
+         if not item:
+             continue
+         records.append(r)
+     return records
+
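A sketch of the HTML-to-records pipeline (illustrative only; assumes pandas with an HTML parser such as lxml, and relies on read_html's default thousands-separator handling):

    from scraper2_hj3415.app.parsing._tables.html_table import (
        df_to_c1034_metric_list,
        try_html_table_to_df,
    )

    html = """
    <table>
      <tr><th>항목</th><th>2023/12 (IFRS연결)</th><th>2024/12 (IFRS연결)</th></tr>
      <tr><td>매출액 펼치기</td><td>1,000</td><td>1,200</td></tr>
    </table>
    """
    df = try_html_table_to_df(html)        # first table, header row 0
    records = df_to_c1034_metric_list(df)
    # expected (roughly): [{'항목': '매출액', '2023/12': 1000, '2024/12': 1200, '항목_raw': '매출액'}]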
File without changes
@@ -0,0 +1,103 @@
+ # scraper2/app/parsing/_sise_normalize.py
+ from __future__ import annotations
+
+ import re
+ from typing import Mapping
+
+ # Common separator: both keys and values are split on it
+ _DEFAULT_SEP = "/"
+
+ _UNIT_REPLACEMENTS = {
+     "Weeks": "주",
+     "Week": "주",
+     # add more here when needed
+     # "Days": "일",
+     # "Months": "개월",
+ }
+
+
+ def _clean_token(s: str) -> str:
+     # Strip parentheses/whitespace and collapse repeated spaces
+     s = s.strip()
+     s = s.replace("(", " ").replace(")", " ")
+     s = re.sub(r"\s+", " ", s).strip()
+     return s
+
+
+ def _compact_key(s: str) -> str:
+     s = _clean_token(s)
+     s = _replace_units(s)  # ✅ Weeks → 주 happens here
+     return s.replace(" ", "")
+
+ def _split_slash(s: str) -> list[str]:
+     return [p.strip() for p in s.split(_DEFAULT_SEP)]
+
+ def _replace_units(s: str) -> str:
+     for src, dst in _UNIT_REPLACEMENTS.items():
+         s = s.replace(src, dst)
+     return s
+
+
+ def _maybe_expand_pair_key_value(key: str, value: str) -> dict[str, str] | None:
+     ks = _split_slash(key)
+     vs = _split_slash(value)
+     if len(ks) <= 1 or len(ks) != len(vs):
+         return None
+
+     out: dict[str, str] = {}
+
+     # 1) Special case: "수익률 (1M/3M/6M/1Y)"
+     first = _clean_token(ks[0])
+     m = re.match(r"^(?P<prefix>.+?)\s+(?P<token>[0-9A-Za-z]+)$", first)
+     if m:
+         prefix = m.group("prefix").strip()
+         token0 = m.group("token").strip()
+         tokens = [token0] + [_clean_token(x) for x in ks[1:]]
+         for tok, v in zip(tokens, vs):
+             out[_compact_key(f"{prefix}{tok}")] = v
+         return out
+
+     # 2) General case with "prefix propagation" (patterns like "52Weeks 최고/최저")
+     # If the first token is "prefix + label" (e.g. "52Weeks 최고") and later
+     # tokens omit the prefix (e.g. "최저"), re-attach the prefix to them.
+     first_tok = _clean_token(ks[0])
+     m2 = re.match(r"^(?P<prefix>[0-9A-Za-z]+)\s+(?P<label>.+)$", first_tok)
+     if m2:
+         prefix = m2.group("prefix").strip()
+         label0 = m2.group("label").strip()
+         labels = [label0] + [_clean_token(x) for x in ks[1:]]
+         for lab, v in zip(labels, vs):
+             out[_compact_key(f"{prefix}{lab}")] = v
+         return out
+
+     # 3) Fully general: match positionally
+     for k, v in zip(ks, vs):
+         out[_compact_key(k)] = v
+     return out
+
+
+ def normalize_sise_kv_map(src: Mapping[str, str]) -> dict[str, str]:
+     """
+     Convert a c101 quote block (dict[str, str]) into a dict with normalized keys.
+
+     Normalization rules:
+     - If both key and value contain "/" and their lengths match, split and expand into several entries
+       e.g. "거래량/거래대금" -> "거래량", "거래대금"
+       e.g. "52Weeks 최고/최저" -> "52Weeks최고", "52Weeks최저"
+       e.g. "수익률 (1M/3M/6M/1Y)" -> "수익률1M", "수익률3M", ...
+     - Otherwise keep the entry, only removing whitespace from the key
+     """
+     out: dict[str, str] = {}
+
+     for k, v in src.items():
+         k = k.strip()
+         v = v.strip()
+
+         expanded = _maybe_expand_pair_key_value(k, v)
+         if expanded:
+             out.update(expanded)
+             continue
+
+         out[_compact_key(k)] = v
+
+     return out
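A sketch of the slash-expansion rules (illustrative only; the import path follows the file list above, _sise_normalizer.py, not the path in the file's header comment). Note that _compact_key also maps "Weeks" to "주", so the 52-week keys come out slightly differently from the docstring example:

    from scraper2_hj3415.app.parsing.c101._sise_normalizer import normalize_sise_kv_map

    src = {
        "거래량/거래대금": "1,234주/5,678백만",
        "수익률 (1M/3M/6M/1Y)": "+1.0/+2.0/+3.0/+4.0",
        "52Weeks 최고/최저": "90,000/60,000",
        "시가총액": "500,000억원",
    }
    print(normalize_sise_kv_map(src))
    # expected:
    # {'거래량': '1,234주', '거래대금': '5,678백만',
    #  '수익률1M': '+1.0', '수익률3M': '+2.0', '수익률6M': '+3.0', '수익률1Y': '+4.0',
    #  '52주최고': '90,000', '52주최저': '60,000',
    #  '시가총액': '500,000억원'}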
@@ -0,0 +1,47 @@
+ # scraper2_hj3415/app/parsing/c101/company_overview.py
+ from __future__ import annotations
+
+ import re
+ from typing import Any
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from common_hj3415.utils import clean_text
+
+ # Regular expressions
+ _DATE_RE = re.compile(r"(\d{4}\.\d{2}\.\d{2})")  # YYYY.MM.DD
+
+ async def parse_c101_company_overview(browser: BrowserPort) -> dict[str, Any]:
+     """
+     From the '기업개요' (company overview) section, extract
+     - the reference date ([기준:YYYY.MM.DD])
+     - the overview sentences (li.dot_cmp)
+     """
+     out: dict[str, Any] = {}
+
+     기준_sel = "div.header-table p"
+     개요_ul_sel = "div.cmp_comment ul.dot_cmp"
+     개요_li_sel = "div.cmp_comment ul.dot_cmp > li.dot_cmp"
+
+     # 1) Reference date
+     await browser.wait_attached(기준_sel)
+     raw = clean_text(await browser.text_content_first(기준_sel))
+
+     m = _DATE_RE.search(raw)
+     out["기준일자"] = m.group(1) if m else raw
+
+     # 2) Overview sentences
+     await browser.wait_attached(개요_ul_sel)
+     li_texts = await browser.all_texts(개요_li_sel)
+
+     lines: list[str] = []
+     for t in li_texts:
+         ct = clean_text(t)
+         if ct:
+             lines.append(ct)
+
+     # out["개요_리스트"] = lines  # not needed for now
+     out["개요"] = "".join(
+         lines
+     )  # policy: keep join("") for storage; consider "\n".join for display
+
+     return out