scraper2-hj3415 2.4.1__py3-none-any.whl → 2.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraper2_hj3415/app/adapters/out/playwright/browser.py +26 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +7 -7
- scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py +18 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py +19 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py +37 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py +24 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py +84 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py +90 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py +25 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py +96 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
- scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
- scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
- {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
- scraper2_hj3415/app/adapters/site/wisereport_playwright.py +379 -0
- scraper2_hj3415/app/composition.py +225 -0
- scraper2_hj3415/app/domain/blocks.py +61 -0
- scraper2_hj3415/app/domain/constants.py +33 -0
- scraper2_hj3415/app/domain/doc.py +16 -0
- scraper2_hj3415/app/domain/endpoint.py +11 -0
- scraper2_hj3415/app/domain/series.py +11 -0
- scraper2_hj3415/app/domain/types.py +19 -0
- scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
- scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
- scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
- scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
- scraper2_hj3415/app/parsing/_tables/html_table.py +89 -0
- scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
- scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
- scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
- scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
- scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
- scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
- scraper2_hj3415/app/parsing/c101/sise.py +47 -0
- scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
- scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
- scraper2_hj3415/app/parsing/c101_parser.py +45 -0
- scraper2_hj3415/app/parsing/c103_parser.py +22 -0
- scraper2_hj3415/app/parsing/c104_parser.py +26 -0
- scraper2_hj3415/app/parsing/c106_parser.py +137 -0
- scraper2_hj3415/app/parsing/c108_parser.py +254 -0
- scraper2_hj3415/app/ports/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
- scraper2_hj3415/app/ports/browser/browser_port.py +32 -0
- scraper2_hj3415/app/ports/browser/capabilities/__init__.py +15 -0
- scraper2_hj3415/app/ports/browser/capabilities/interaction.py +27 -0
- scraper2_hj3415/app/ports/browser/capabilities/navigation.py +18 -0
- scraper2_hj3415/app/ports/browser/capabilities/scope.py +66 -0
- scraper2_hj3415/app/ports/browser/capabilities/table.py +28 -0
- scraper2_hj3415/app/ports/browser/capabilities/text.py +16 -0
- scraper2_hj3415/app/ports/browser/capabilities/wait.py +51 -0
- scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
- scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
- scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
- scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
- scraper2_hj3415/app/ports/site/__init__.py +0 -0
- scraper2_hj3415/app/ports/site/wisereport_port.py +30 -0
- scraper2_hj3415/app/services/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
- scraper2_hj3415/app/services/fetch/fetch_c103.py +121 -0
- scraper2_hj3415/app/services/fetch/fetch_c104.py +160 -0
- scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
- scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
- scraper2_hj3415/app/services/nfs_doc_builders.py +304 -0
- scraper2_hj3415/app/usecases/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
- scraper2/main.py → scraper2_hj3415/cli.py +45 -72
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/METADATA +3 -1
- scraper2_hj3415-2.7.0.dist-info/RECORD +93 -0
- scraper2_hj3415-2.7.0.dist-info/entry_points.txt +3 -0
- scraper2/adapters/out/playwright/browser.py +0 -102
- scraper2/adapters/out/sinks/memory/__init__.py +0 -15
- scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
- scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
- scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
- scraper2/app/composition.py +0 -204
- scraper2/app/parsing/_converters.py +0 -85
- scraper2/app/parsing/_normalize.py +0 -134
- scraper2/app/parsing/c101_parser.py +0 -143
- scraper2/app/parsing/c103_parser.py +0 -128
- scraper2/app/parsing/c104_parser.py +0 -143
- scraper2/app/parsing/c106_parser.py +0 -153
- scraper2/app/parsing/c108_parser.py +0 -65
- scraper2/app/ports/browser/browser_factory_port.py +0 -11
- scraper2/app/ports/browser/browser_port.py +0 -22
- scraper2/app/ports/ingest_port.py +0 -14
- scraper2/app/ports/sinks/base_sink_port.py +0 -14
- scraper2/app/ports/sinks/c101_sink_port.py +0 -9
- scraper2/app/ports/sinks/c103_sink_port.py +0 -9
- scraper2/app/ports/sinks/c104_sink_port.py +0 -9
- scraper2/app/ports/sinks/c106_sink_port.py +0 -9
- scraper2/app/ports/sinks/c108_sink_port.py +0 -9
- scraper2/app/usecases/fetch/fetch_c101.py +0 -43
- scraper2/app/usecases/fetch/fetch_c103.py +0 -103
- scraper2/app/usecases/fetch/fetch_c104.py +0 -76
- scraper2/app/usecases/fetch/fetch_c106.py +0 -90
- scraper2/app/usecases/fetch/fetch_c108.py +0 -49
- scraper2/app/usecases/ingest/ingest_c101.py +0 -36
- scraper2/app/usecases/ingest/ingest_c103.py +0 -37
- scraper2/app/usecases/ingest/ingest_c104.py +0 -37
- scraper2/app/usecases/ingest/ingest_c106.py +0 -38
- scraper2/app/usecases/ingest/ingest_c108.py +0 -39
- scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
- scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
- {scraper2 → scraper2_hj3415}/.DS_Store +0 -0
- {scraper2 → scraper2_hj3415}/__init__.py +0 -0
- {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
- {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/.DS_Store +0 -0
- {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
- {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/sinks/.DS_Store +0 -0
- {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
- {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
- {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
- {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
- {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
- {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/licenses/LICENSE +0 -0
scraper2_hj3415/app/domain/doc.py
@@ -0,0 +1,16 @@
+# scraper2_hj3415/app/domain/doc.py
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Mapping
+from scraper2_hj3415.app.domain.endpoint import EndpointKind
+from scraper2_hj3415.app.domain.types import BlockKey, LabelsMap
+from scraper2_hj3415.app.domain.blocks import BlockData
+
+
+@dataclass(frozen=True)
+class NfsDoc:
+    code: str
+    endpoint_kind: EndpointKind
+    blocks: Mapping[BlockKey, BlockData]
+    labels: Mapping[BlockKey, LabelsMap]

scraper2_hj3415/app/domain/series.py
@@ -0,0 +1,11 @@
+# scraper2_hj3415/app/domain/series.py
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Mapping
+from scraper2_hj3415.app.domain.types import MetricKey, Period, Num
+
+@dataclass(frozen=True)
+class MetricSeries:
+    key: MetricKey
+    values: Mapping[Period, Num]

scraper2_hj3415/app/domain/types.py
@@ -0,0 +1,19 @@
+# scraper2_hj3415/app/domain/types.py
+from __future__ import annotations
+
+from typing import Mapping, Any, Sequence, TypeAlias, Literal
+
+BlockKey = str
+MetricKey = str
+Period = str
+Num = float | int | None
+
+Record: TypeAlias = Mapping[str, Any]
+Records: TypeAlias = Sequence[Record]
+RawLabel = str
+LabelsMap = dict[MetricKey, RawLabel]
+
+Sink = Literal["memory", "mongo"]
+
+
+

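Taken together, these three modules are the new domain vocabulary: `types.py` supplies the aliases, `MetricSeries` keys one metric's values by period, and `NfsDoc` bundles blocks plus label maps per endpoint. A minimal sketch of the frozen-dataclass behavior (sample values invented; `NfsDoc` is omitted here because its `BlockData` and `EndpointKind` dependencies are not expanded in this diff):

```python
import dataclasses

from scraper2_hj3415.app.domain.series import MetricSeries

# Hypothetical sample data; periods follow the "YYYY/MM" style used by the parsers.
eps = MetricSeries(key="EPS", values={"2023/12": 1200.0, "2024/12": None})

assert eps.values["2024/12"] is None  # Num = float | int | None allows gaps

try:
    eps.key = "PER"  # frozen=True makes instances immutable
except dataclasses.FrozenInstanceError:
    pass
```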
scraper2_hj3415/app/parsing/_normalize/label.py
@@ -0,0 +1,92 @@
+# scraper2_hj3415/app/parsing/_normalize/label.py
+from __future__ import annotations
+
+import re
+from typing import Any
+
+from common_hj3415.utils import clean_text
+from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+
+# -----------------------------
+# General label normalization
+# -----------------------------
+
+UI_LABEL_NOISE = (
+    "펼치기",
+    "접기",
+    "더보기",
+)
+
+
+def sanitize_label(x: Any) -> str:
+    """
+    From a raw label (including 항목_raw):
+    - strip UI text such as '펼치기'
+    - collapse excessive whitespace
+    - trim leading/trailing whitespace
+    """
+    s = normalize_text(x)
+
+    # Remove UI noise words
+    for w in UI_LABEL_NOISE:
+        s = s.replace(w, " ")
+
+    return clean_text(s)
+
+
+# -----------------------------
+# Metric label normalization
+# -----------------------------
+_BRACKET_PATTERN = re.compile(r"\[[^\]]*\]")  # e.g. [구K-IFRS]
+_EXTRA_WORDS_PATTERN = re.compile(r"(펼치기|연간컨센서스보기|연간컨센서스닫기)")
+_ALL_PAREN_PATTERN = re.compile(r"\([^)]*\)")  # strip every parenthesized segment
+
+
+def normalize_label_base(text: str | None) -> str:
+    s = sanitize_label(text)
+    s = _EXTRA_WORDS_PATTERN.sub("", s)
+    s = _BRACKET_PATTERN.sub("", s)
+    s = _ALL_PAREN_PATTERN.sub("", s)
+    s = s.replace("*", "")
+    return clean_text(s)
+
+
+def normalize_metric_label(text: str | None) -> str:
+    # "보유 지분 (%)" → "보유 지분" (inner spaces preserved)
+    return normalize_label_base(text)
+
+
+def normalize_key_label(text: str | None) -> str:
+    # "보유 지분 (%)" → "보유지분"
+    s = normalize_label_base(text)
+    return s.replace(" ", "").replace("\xa0", "").replace("%", "").strip()
+
+
+# -----------------------------
+# Column-name normalization
+# -----------------------------
+_COL_PAREN_PATTERN = re.compile(r"\((IFRS[^)]*|E|YoY|QoQ)[^)]*\)")
+_COL_EXTRA_WORDS = re.compile(r"(연간컨센서스보기|연간컨센서스닫기)")
+_COL_DOTNUM = re.compile(r"\.\d+$")  # strips pandas duplicate-column suffixes (.1, .2 ...)
+
+
+def normalize_col_label(col: str | None) -> str:
+    """
+    Normalize a column name.
+    Examples:
+        "2024/12 (IFRS연결) 연간컨센서스보기" -> "2024/12"
+        "2025/12(E) (IFRS연결) 연간컨센서스닫기" -> "2025/12"
+        "전년대비 (YoY)" -> "전년대비"
+        "전년대비 (YoY).1" -> "전년대비" (duplicates are split into _2/_3 downstream)
+    """
+    s = normalize_text(col)
+    # 1) drop pandas-added suffixes such as .1 (collisions are resolved downstream)
+    s = _COL_DOTNUM.sub("", s)
+
+    # 2) drop consensus banner text
+    s = _COL_EXTRA_WORDS.sub("", s)
+
+    # 3) drop parenthesized annotations: (IFRS...), (E), (YoY), (QoQ)
+    s = _COL_PAREN_PATTERN.sub("", s)
+
+    return clean_text(s)

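The docstring examples above can be checked directly; a short sketch, assuming `common_hj3415.utils.clean_text` trims and collapses whitespace (its source is not part of this diff):

```python
from scraper2_hj3415.app.parsing._normalize.label import (
    normalize_col_label,
    normalize_key_label,
    normalize_metric_label,
)

# Column headers: pandas .N suffixes, consensus banners and the
# (IFRS...)/(E)/(YoY)/(QoQ) annotations are stripped in that order.
assert normalize_col_label("2024/12 (IFRS연결) 연간컨센서스보기") == "2024/12"
assert normalize_col_label("전년대비 (YoY).1") == "전년대비"

# Metric labels keep inner spaces; key labels drop them entirely.
assert normalize_metric_label("보유 지분 (%)") == "보유 지분"
assert normalize_key_label("보유 지분 (%)") == "보유지분"
```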
scraper2_hj3415/app/parsing/_normalize/table.py
@@ -0,0 +1,53 @@
+# scraper2_hj3415/app/parsing/_normalize/table.py
+from __future__ import annotations
+
+from collections import Counter
+
+import numpy as np
+import pandas as pd
+from .label import normalize_col_label, normalize_metric_label
+
+
+def _dedupe_columns(cols: list[str]) -> list[str]:
+    """
+    If normalization produces duplicate column names, append _2, _3, ...
+    automatically so every name stays unique.
+    e.g. ["전년대비", "전년대비"] -> ["전년대비", "전년대비_2"]
+    """
+    seen: Counter[str] = Counter()
+    out: list[str] = []
+    for c in cols:
+        c = c or ""
+        seen[c] += 1
+        if seen[c] == 1:
+            out.append(c)
+        else:
+            out.append(f"{c}_{seen[c]}")
+    return out
+
+
+# -----------------------------
+# 3) Whole-DataFrame normalization + records conversion
+# -----------------------------
+def normalize_metrics_df(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    - normalize every column name
+    - normalize the '항목' values
+    - NaN -> None
+    - split duplicate column names automatically (_2/_3)
+    """
+    if df is None or df.empty:
+        return df
+
+    df = df.copy()
+
+    # Normalize column names and guard against duplicates
+    norm_cols = [normalize_col_label(c) for c in df.columns.astype(str).tolist()]
+    df.columns = _dedupe_columns(norm_cols)
+
+    # Normalize the 항목 values
+    if "항목" in df.columns:
+        df["항목"] = df["항목"].map(normalize_metric_label)
+
+    # NaN -> None
+    df = df.replace({np.nan: None})
+    return df

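A round trip through `normalize_metrics_df` shows its three guarantees at once: normalized headers, deduped collisions, and NaN converted to None. A sketch with invented sample data, under the same `clean_text` assumption as above:

```python
import pandas as pd

from scraper2_hj3415.app.parsing._normalize.table import normalize_metrics_df

df = pd.DataFrame(
    {
        "항목": ["매출액 펼치기"],
        "전년대비 (YoY)": [1.5],
        "전년대비 (YoY).1": [None],
    }
)
out = normalize_metrics_df(df)

# Both YoY headers normalize to "전년대비"; the collision becomes "전년대비_2".
assert list(out.columns) == ["항목", "전년대비", "전년대비_2"]
assert out.loc[0, "항목"] == "매출액"    # UI noise stripped from the value
assert out.loc[0, "전년대비_2"] is None  # NaN -> None
```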
scraper2_hj3415/app/parsing/_normalize/text.py
@@ -0,0 +1,31 @@
+# scraper2_hj3415/app/parsing/_normalize/text.py
+from __future__ import annotations
+
+from typing import Any
+
+from common_hj3415.utils import clean_text
+
+
+def normalize_text(x: object | None) -> str:
+    """
+    Normalize an arbitrary value to a string.
+    - None → ""
+    - applies the shared string rules (clean_text)
+    """
+    s = "" if x is None else str(x)
+    return clean_text(s)
+
+
+_NUM_EMPTY = {"", "-", "N/A", "NA", "null", "None"}
+
+
+def display_text(x: Any) -> str:
+    """
+    Normalize a value for display.
+    - meaningless placeholders such as '-' and 'N/A' become ""
+    """
+    s = normalize_text(x)
+    if not s or s in _NUM_EMPTY:
+        return ""
+    return s
+

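The split is deliberate: `normalize_text` is the lossless base step, while `display_text` additionally swallows placeholder cells. A quick sketch (assuming `clean_text` leaves plain ASCII content untouched):

```python
from scraper2_hj3415.app.parsing._normalize.text import display_text, normalize_text

assert normalize_text(None) == ""
assert normalize_text(123) == "123"        # any value is stringified first
assert display_text("-") == ""             # placeholder -> empty
assert display_text("N/A") == ""
assert display_text("12,345") == "12,345"  # real content passes through
```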
scraper2_hj3415/app/parsing/_normalize/values.py
@@ -0,0 +1,70 @@
+# scraper2_hj3415/app/parsing/_normalize/values.py
+from __future__ import annotations
+import re
+from typing import Any
+from scraper2_hj3415.app.parsing._normalize.text import normalize_text
+
+
+def parse_numeric(
+    x: Any,
+    *,
+    strip_units: bool = False,
+    keep_text: bool = False,
+) -> int | float | str | None:
+    """
+    Try to parse a string as a number.
+
+    - strip_units=True:
+        remove units such as '원', '%', '억' before parsing
+    - strip_units=False:
+        parse plain numbers only
+    """
+    s = normalize_text(x)
+    if not s:
+        return None
+
+    t = s.replace(",", "")
+    if strip_units:
+        t = (
+            t.replace("원", "")
+            .replace("억원", "")
+            .replace("억", "")
+            .replace("%", "")
+            .strip()
+        )
+
+    # integer
+    if re.fullmatch(r"-?\d+", t):
+        return int(t)
+
+    # float
+    if re.fullmatch(r"-?\d+(\.\d+)?", t):
+        return float(t)
+
+    return s if keep_text else None
+
+
+def to_number(x: Any) -> int | float | None:
+    """Numbers only (None on failure)."""
+    return parse_numeric(x, strip_units=True, keep_text=False)
+
+def to_number_or_text(x: Any) -> float | str | None:
+    """Number if parseable, otherwise the text."""
+    return parse_numeric(x, strip_units=True, keep_text=True)
+
+def to_num_or_text(x: Any) -> int | float | str | None:
+    """General-purpose cell normalization."""
+    return parse_numeric(x, strip_units=False, keep_text=True)
+
+def to_int(x: Any) -> int | None:
+    v = parse_numeric(x, strip_units=True, keep_text=False)
+    if isinstance(v, (int, float)):
+        return int(v)
+    return None
+
+def to_float(x: Any) -> float | None:
+    v = parse_numeric(x, strip_units=True, keep_text=False)
+    if isinstance(v, (int, float)):
+        return float(v)
+    return None
+

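All five helpers funnel into `parse_numeric`; they differ only in the `strip_units`/`keep_text` combination and the final cast. A sketch with invented inputs:

```python
from scraper2_hj3415.app.parsing._normalize.values import (
    to_float,
    to_num_or_text,
    to_number,
    to_number_or_text,
)

assert to_number("1,234억") == 1234          # commas and units removed
assert to_number("상향") is None             # non-numeric -> None
assert to_number_or_text("상향") == "상향"   # non-numeric -> original text
assert to_num_or_text("12.5%") == "12.5%"    # strip_units=False: '%' blocks parsing
assert to_float("42") == 42.0                # int result cast to float
```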
scraper2_hj3415/app/parsing/_tables/html_table.py
@@ -0,0 +1,89 @@
+from typing import Any, Sequence
+
+from io import StringIO
+import pandas as pd
+
+from logging_hj3415 import logger
+from scraper2_hj3415.app.parsing._normalize.table import normalize_metrics_df
+from scraper2_hj3415.app.parsing._normalize.label import sanitize_label
+from common_hj3415.utils import clean_text
+
+
+def _flatten_col(col: Any) -> str:
+    """
+    Flatten a pandas MultiIndex column (tuple) into a single usable key.
+    - collapse duplicates such as ('재무년월','재무년월') into one part
+    - strip unit strings
+    - repair broken labels such as '주재 무제표'
+    """
+    if isinstance(col, tuple):
+        parts = [clean_text(p) for p in col if clean_text(p)]
+        if not parts:
+            s = ""
+        elif len(parts) == 2 and parts[0] == parts[1]:
+            s = parts[0]
+        else:
+            s = "_".join(parts)
+    else:
+        s = clean_text(col)
+
+    s = (
+        s.replace("(억원, %)", "")
+        .replace("(원)", "")
+        .replace("(배)", "")
+        .replace("(%)", "")
+        .strip()
+    )
+    s = s.replace("주재 무제표", "주재무제표")
+    return clean_text(s)
+
+
+def try_html_table_to_df(
+    html: str, *, flatten_cols: bool = False, header: int | Sequence[int] = 0
+) -> pd.DataFrame | None:
+    try:
+        dfs = pd.read_html(StringIO(html), header=header)
+    except Exception as e:
+        logger.exception("pd.read_html failed: {}", e)
+        return None
+    if not dfs:
+        return None
+    df = dfs[0]
+    if df is None or df.empty:
+        return None
+
+    if flatten_cols:
+        df = df.copy()
+        df.columns = [_flatten_col(c) for c in df.columns]
+    return df
+
+
+def df_to_c1034_metric_list(df: pd.DataFrame) -> list[dict[str, Any]]:
+    """
+    C103 table DataFrame -> normalized records (list[dict])
+    - drop rows whose 항목 is empty
+    - keep 항목_raw (the pre-normalization label)
+    """
+    if df is None or df.empty:
+        return []
+
+    df = df.copy()
+
+    # Preserve the original label before normalization (minus whitespace and UI text such as 펼치기/접기)
+    if "항목" in df.columns:
+        df["항목_raw"] = (
+            df["항목"]
+            .where(df["항목"].notna(), None)
+            .map(lambda x: sanitize_label(x) if x is not None else None)
+        )
+
+    df = normalize_metrics_df(df)
+
+    records: list[dict[str, Any]] = []
+    for r in df.to_dict(orient="records"):
+        item = r.get("항목")
+        if not item:
+            continue
+        records.append(r)
+    return records

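End to end, a scraped table goes HTML → DataFrame → normalized records. A sketch over an invented two-column table, assuming `clean_text` only trims and collapses whitespace and an HTML parser backend (e.g. lxml) is installed for `pd.read_html`:

```python
from scraper2_hj3415.app.parsing._tables.html_table import (
    df_to_c1034_metric_list,
    try_html_table_to_df,
)

html = """
<table>
  <tr><th>항목</th><th>2024/12 (IFRS연결)</th></tr>
  <tr><td>영업이익 (%)</td><td>1,234</td></tr>
</table>
"""

df = try_html_table_to_df(html)
assert df is not None
records = df_to_c1034_metric_list(df)

assert records[0]["항목"] == "영업이익"          # fully normalized
assert records[0]["항목_raw"] == "영업이익 (%)"  # sanitized, parentheses kept
assert records[0]["2024/12"] == 1234             # header annotation stripped
```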
scraper2_hj3415/app/parsing/c101/__init__.py (file without changes)
scraper2_hj3415/app/parsing/c101/_sise_normalizer.py
@@ -0,0 +1,103 @@
+# scraper2/app/parsing/_sise_normalize.py
+from __future__ import annotations
+
+import re
+from typing import Mapping
+
+# Common separator: both keys and values are split on it
+_DEFAULT_SEP = "/"
+
+_UNIT_REPLACEMENTS = {
+    "Weeks": "주",
+    "Week": "주",
+    # add more here as they become necessary
+    # "Days": "일",
+    # "Months": "개월",
+}
+
+
+def _clean_token(s: str) -> str:
+    # strip parentheses/whitespace and collapse repeated spaces
+    s = s.strip()
+    s = s.replace("(", " ").replace(")", " ")
+    s = re.sub(r"\s+", " ", s).strip()
+    return s
+
+
+def _compact_key(s: str) -> str:
+    s = _clean_token(s)
+    s = _replace_units(s)  # Weeks → 주 happens here
+    return s.replace(" ", "")
+
+def _split_slash(s: str) -> list[str]:
+    return [p.strip() for p in s.split(_DEFAULT_SEP)]
+
+def _replace_units(s: str) -> str:
+    for src, dst in _UNIT_REPLACEMENTS.items():
+        s = s.replace(src, dst)
+    return s
+
+
+def _maybe_expand_pair_key_value(key: str, value: str) -> dict[str, str] | None:
+    ks = _split_slash(key)
+    vs = _split_slash(value)
+    if len(ks) <= 1 or len(ks) != len(vs):
+        return None
+
+    out: dict[str, str] = {}
+
+    # 1) special case: "수익률 (1M/3M/6M/1Y)"
+    first = _clean_token(ks[0])
+    m = re.match(r"^(?P<prefix>.+?)\s+(?P<token>[0-9A-Za-z]+)$", first)
+    if m:
+        prefix = m.group("prefix").strip()
+        token0 = m.group("token").strip()
+        tokens = [token0] + [_clean_token(x) for x in ks[1:]]
+        for tok, v in zip(tokens, vs):
+            out[_compact_key(f"{prefix}{tok}")] = v
+        return out
+
+    # 2) general case + prefix propagation (patterns like "52Weeks 최고/최저"):
+    #    if the first token is "prefix + label" (e.g. "52Weeks 최고") and later
+    #    tokens omit the prefix (e.g. "최저"), re-attach the prefix.
+    first_tok = _clean_token(ks[0])
+    m2 = re.match(r"^(?P<prefix>[0-9A-Za-z]+)\s+(?P<label>.+)$", first_tok)
+    if m2:
+        prefix = m2.group("prefix").strip()
+        label0 = m2.group("label").strip()
+        labels = [label0] + [_clean_token(x) for x in ks[1:]]
+        for lab, v in zip(labels, vs):
+            out[_compact_key(f"{prefix}{lab}")] = v
+        return out
+
+    # 3) fully general: pair up as-is
+    for k, v in zip(ks, vs):
+        out[_compact_key(k)] = v
+    return out
+
+
+def normalize_sise_kv_map(src: Mapping[str, str]) -> dict[str, str]:
+    """
+    Convert a c101 sise block (dict[str, str]) into a dict with normalized keys.
+
+    Normalization rules:
+    - if key/value both contain "/" and the part counts match, expand into
+      multiple entries
+      e.g. "거래량/거래대금" -> "거래량", "거래대금"
+      e.g. "52Weeks 최고/최저" -> "52Weeks최고", "52Weeks최저"
+      e.g. "수익률 (1M/3M/6M/1Y)" -> "수익률1M", "수익률3M", ...
+    - otherwise keep the entry, only stripping whitespace from the key
+    """
+    out: dict[str, str] = {}
+
+    for k, v in src.items():
+        k = k.strip()
+        v = v.strip()
+
+        expanded = _maybe_expand_pair_key_value(k, v)
+        if expanded:
+            out.update(expanded)
+            continue
+
+        out[_compact_key(k)] = v
+
+    return out

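The three expansion rules from the docstring, exercised on an invented sise block. One subtlety against the docstring's own example: `_compact_key` runs `_replace_units`, so "52Weeks 최고/최저" actually comes out as `52주최고`/`52주최저`, not `52Weeks최고`:

```python
from scraper2_hj3415.app.parsing.c101._sise_normalizer import normalize_sise_kv_map

src = {
    "거래량/거래대금": "1,000주/5억원",
    "52Weeks 최고/최저": "90,000/60,000",
    "수익률 (1M/3M/6M/1Y)": "+1.2/-0.5/+3.1/+10.0",
}
out = normalize_sise_kv_map(src)

assert out["거래량"] == "1,000주"   # plain 1:1 split
assert out["거래대금"] == "5억원"
assert out["52주최고"] == "90,000"  # prefix propagated, Weeks -> 주
assert out["52주최저"] == "60,000"
assert out["수익률1M"] == "+1.2"    # token list distributed over the prefix
assert out["수익률1Y"] == "+10.0"
```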
scraper2_hj3415/app/parsing/c101/company_overview.py
@@ -0,0 +1,47 @@
+# scraper2_hj3415/app/parsing/c101/company_overview.py
+from __future__ import annotations
+
+import re
+from typing import Any
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+from common_hj3415.utils import clean_text
+
+# regular expressions
+_DATE_RE = re.compile(r"(\d{4}\.\d{2}\.\d{2})")  # YYYY.MM.DD
+
+async def parse_c101_company_overview(browser: BrowserPort) -> dict[str, Any]:
+    """
+    From the '기업개요' section, extract
+    - the as-of date ([기준:YYYY.MM.DD])
+    - the overview sentences (li.dot_cmp)
+    """
+    out: dict[str, Any] = {}
+
+    기준_sel = "div.header-table p"
+    개요_ul_sel = "div.cmp_comment ul.dot_cmp"
+    개요_li_sel = "div.cmp_comment ul.dot_cmp > li.dot_cmp"
+
+    # 1) as-of date
+    await browser.wait_attached(기준_sel)
+    raw = clean_text(await browser.text_content_first(기준_sel))
+
+    m = _DATE_RE.search(raw)
+    out["기준일자"] = m.group(1) if m else raw
+
+    # 2) overview sentences
+    await browser.wait_attached(개요_ul_sel)
+    li_texts = await browser.all_texts(개요_li_sel)
+
+    lines: list[str] = []
+    for t in li_texts:
+        ct = clean_text(t)
+        if ct:
+            lines.append(ct)
+
+    # out["개요_리스트"] = lines  # not needed for now
+    out["개요"] = "".join(
+        lines
+    )  # policy: keep join("") for storage; consider "\n".join for display
+
+    return out
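Because the parser only touches three `BrowserPort` methods (`wait_attached`, `text_content_first`, `all_texts`), it can be exercised against an in-memory stub without Playwright. A sketch with invented page text; `StubBrowser` is hypothetical and only structurally matches the port:

```python
import asyncio

from scraper2_hj3415.app.parsing.c101.company_overview import parse_c101_company_overview


class StubBrowser:
    """Hypothetical stand-in for BrowserPort; selectors are ignored."""

    async def wait_attached(self, selector: str) -> None:
        return None

    async def text_content_first(self, selector: str) -> str:
        return "[기준:2025.06.30]"

    async def all_texts(self, selector: str) -> list[str]:
        return ["국내 1위 사업자.", "", "해외 매출 비중 확대."]


out = asyncio.run(parse_c101_company_overview(StubBrowser()))  # duck-typed port
assert out["기준일자"] == "2025.06.30"
assert out["개요"] == "국내 1위 사업자.해외 매출 비중 확대."  # join("") policy
```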