scraper2-hj3415 1.0.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. scraper2/.DS_Store +0 -0
  2. scraper2/adapters/out/.DS_Store +0 -0
  3. scraper2/adapters/out/playwright/browser.py +103 -0
  4. scraper2/adapters/out/playwright/browser_factory.py +112 -0
  5. scraper2/adapters/out/playwright/session.py +121 -0
  6. scraper2/adapters/out/sinks/.DS_Store +0 -0
  7. scraper2/adapters/out/sinks/memory/__init__.py +15 -0
  8. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +20 -0
  9. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +20 -0
  10. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +20 -0
  11. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +20 -0
  12. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +20 -0
  13. scraper2/adapters/out/sinks/memory/store.py +74 -0
  14. scraper2/adapters/out/sinks/mongo/__init__.py +14 -0
  15. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +43 -0
  16. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +41 -0
  17. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +41 -0
  18. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +41 -0
  19. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +41 -0
  20. scraper2/app/composition.py +195 -0
  21. scraper2/app/parsing/_converters.py +85 -0
  22. scraper2/app/parsing/_normalize.py +134 -0
  23. scraper2/app/parsing/c101_parser.py +143 -0
  24. scraper2/app/parsing/c103_parser.py +128 -0
  25. scraper2/app/parsing/c104_parser.py +143 -0
  26. scraper2/app/parsing/c106_parser.py +153 -0
  27. scraper2/app/parsing/c108_parser.py +65 -0
  28. scraper2/app/ports/browser/browser_factory_port.py +11 -0
  29. scraper2/app/ports/browser/browser_port.py +22 -0
  30. scraper2/app/ports/ingest_port.py +13 -0
  31. scraper2/app/ports/sinks/base_sink_port.py +14 -0
  32. scraper2/app/ports/sinks/c101_sink_port.py +9 -0
  33. scraper2/app/ports/sinks/c103_sink_port.py +9 -0
  34. scraper2/app/ports/sinks/c104_sink_port.py +9 -0
  35. scraper2/app/ports/sinks/c106_sink_port.py +9 -0
  36. scraper2/app/ports/sinks/c108_sink_port.py +9 -0
  37. scraper2/app/usecases/fetch/fetch_c101.py +43 -0
  38. scraper2/app/usecases/fetch/fetch_c103.py +103 -0
  39. scraper2/app/usecases/fetch/fetch_c104.py +76 -0
  40. scraper2/app/usecases/fetch/fetch_c106.py +90 -0
  41. scraper2/app/usecases/fetch/fetch_c108.py +49 -0
  42. scraper2/app/usecases/ingest/ingest_c101.py +36 -0
  43. scraper2/app/usecases/ingest/ingest_c103.py +37 -0
  44. scraper2/app/usecases/ingest/ingest_c104.py +37 -0
  45. scraper2/app/usecases/ingest/ingest_c106.py +38 -0
  46. scraper2/app/usecases/ingest/ingest_c108.py +39 -0
  47. scraper2/main.py +257 -0
  48. scraper2_hj3415-2.0.0.dist-info/METADATA +164 -0
  49. scraper2_hj3415-2.0.0.dist-info/RECORD +63 -0
  50. scraper2_hj3415-2.0.0.dist-info/entry_points.txt +3 -0
  51. scraper2_hj3415/__main__.py +0 -6
  52. scraper2_hj3415/adapters/_shared/utils.py +0 -29
  53. scraper2_hj3415/adapters/clients/browser.py +0 -124
  54. scraper2_hj3415/adapters/clients/http.py +0 -51
  55. scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py +0 -55
  56. scraper2_hj3415/adapters/nfs/pipelines/normalize_c1034.py +0 -109
  57. scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py +0 -51
  58. scraper2_hj3415/adapters/nfs/sinks/df_to_dto_mappers.py +0 -106
  59. scraper2_hj3415/adapters/nfs/sources/bundle_source.py +0 -24
  60. scraper2_hj3415/adapters/nfs/sources/c1034_fetch.py +0 -117
  61. scraper2_hj3415/adapters/nfs/sources/c1034_session.py +0 -90
  62. scraper2_hj3415/core/constants.py +0 -47
  63. scraper2_hj3415/core/ports/sink_port.py +0 -16
  64. scraper2_hj3415/core/ports/source_port.py +0 -13
  65. scraper2_hj3415/core/types.py +0 -11
  66. scraper2_hj3415/core/usecases/c1034_ingest.py +0 -139
  67. scraper2_hj3415/di.py +0 -103
  68. scraper2_hj3415/entrypoints/cli.py +0 -226
  69. scraper2_hj3415/entrypoints/main.py +0 -20
  70. scraper2_hj3415-1.0.1.dist-info/METADATA +0 -66
  71. scraper2_hj3415-1.0.1.dist-info/RECORD +0 -35
  72. scraper2_hj3415-1.0.1.dist-info/entry_points.txt +0 -3
  73. {scraper2_hj3415 → scraper2}/__init__.py +0 -0
  74. {scraper2_hj3415/adapters → scraper2/adapters/out}/__init__.py +0 -0
  75. {scraper2_hj3415/adapters/_shared → scraper2/adapters/out/playwright}/__init__.py +0 -0
  76. {scraper2_hj3415/adapters/clients → scraper2/app}/__init__.py +0 -0
  77. {scraper2_hj3415/adapters/nfs/pipelines → scraper2/app/parsing}/__init__.py +0 -0
  78. {scraper2_hj3415/adapters/nfs/sinks → scraper2/app/ports}/__init__.py +0 -0
  79. {scraper2_hj3415/adapters/nfs/sources → scraper2/app/ports/browser}/__init__.py +0 -0
  80. {scraper2_hj3415/core → scraper2/app/ports/sinks}/__init__.py +0 -0
  81. {scraper2_hj3415/core/ports → scraper2/app/usecases}/__init__.py +0 -0
  82. {scraper2_hj3415/core/usecases → scraper2/app/usecases/fetch}/__init__.py +0 -0
  83. {scraper2_hj3415/entrypoints → scraper2/app/usecases/ingest}/__init__.py +0 -0
  84. {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.0.0.dist-info}/WHEEL +0 -0
  85. {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.0.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,41 @@
1
+ # scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py
2
+ from __future__ import annotations
3
+
4
+ from datetime import datetime
5
+ from typing import Iterable, Optional
6
+
7
+ from pymongo.asynchronous.database import AsyncDatabase
8
+
9
+ from contracts.nfs.c106 import C106DTO
10
+ from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
11
+
12
+ from db2.nfs import (
13
+ upsert_latest,
14
+ upsert_latest_many,
15
+ insert_snapshot,
16
+ insert_snapshots_many,
17
+ )
18
+
19
+ _ENDPOINT = "c106"
20
+
21
+
22
class MongoC106Sink(C106SinkPort):
    """MongoDB-backed sink for C106 DTOs.

    Writes go through the db2.nfs helpers: each write updates the
    "latest" record and appends a snapshot (presumably an immutable
    history entry — confirm semantics in db2.nfs).
    """

    def __init__(self, db: AsyncDatabase):
        # Async database handle; collection routing is handled by db2.nfs.
        self._db = db

    async def write(self, dto: C106DTO, *, asof: Optional[datetime] = None) -> None:
        """Persist one DTO: upsert the latest record, then add a snapshot."""
        await upsert_latest(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
        await insert_snapshot(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)

    async def write_many(
        self,
        dtos: Iterable[C106DTO],
        *,
        asof: Optional[datetime] = None,
    ) -> None:
        """Persist a batch of DTOs; no-op when the iterable is empty."""
        batch = list(dtos)
        if not batch:
            return

        await upsert_latest_many(self._db, endpoint=_ENDPOINT, dtos=batch, asof=asof)
        await insert_snapshots_many(self._db, endpoint=_ENDPOINT, dtos=batch, asof=asof)
@@ -0,0 +1,41 @@
1
+ # scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py
2
+ from __future__ import annotations
3
+
4
+ from datetime import datetime
5
+ from typing import Iterable, Optional
6
+
7
+ from pymongo.asynchronous.database import AsyncDatabase
8
+
9
+ from contracts.nfs.c108 import C108DTO
10
+ from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
11
+
12
+ from db2.nfs import (
13
+ upsert_latest,
14
+ upsert_latest_many,
15
+ insert_snapshot,
16
+ insert_snapshots_many,
17
+ )
18
+
19
+ _ENDPOINT = "c108"
20
+
21
+
22
class MongoC108Sink(C108SinkPort):
    """MongoDB-backed sink for C108 DTOs.

    Writes go through the db2.nfs helpers: each write updates the
    "latest" record and appends a snapshot (presumably an immutable
    history entry — confirm semantics in db2.nfs).
    """

    def __init__(self, db: AsyncDatabase):
        # Async database handle; collection routing is handled by db2.nfs.
        self._db = db

    async def write(self, dto: C108DTO, *, asof: Optional[datetime] = None) -> None:
        """Persist one DTO: upsert the latest record, then add a snapshot."""
        await upsert_latest(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
        await insert_snapshot(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)

    async def write_many(
        self,
        dtos: Iterable[C108DTO],
        *,
        asof: Optional[datetime] = None,
    ) -> None:
        """Persist a batch of DTOs; no-op when the iterable is empty."""
        batch = list(dtos)
        if not batch:
            return

        await upsert_latest_many(self._db, endpoint=_ENDPOINT, dtos=batch, asof=asof)
        await insert_snapshots_many(self._db, endpoint=_ENDPOINT, dtos=batch, asof=asof)
@@ -0,0 +1,195 @@
1
+ # scraper2/app/composition.py
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ from dataclasses import dataclass
6
+ from typing import Literal
7
+
8
+ from pymongo.asynchronous.database import AsyncDatabase
9
+
10
+ from scraper2.app.ports.browser.browser_factory_port import BrowserFactoryPort
11
+ from scraper2.adapters.out.playwright.browser_factory import PlaywrightBrowserFactory
12
+
13
+ from scraper2.app.usecases.fetch.fetch_c101 import FetchC101
14
+ from scraper2.app.usecases.fetch.fetch_c103 import FetchC103
15
+ from scraper2.app.usecases.fetch.fetch_c104 import FetchC104
16
+ from scraper2.app.usecases.fetch.fetch_c106 import FetchC106
17
+ from scraper2.app.usecases.fetch.fetch_c108 import FetchC108
18
+
19
+ from scraper2.app.ports.ingest_port import IngestPort
20
+ from scraper2.app.usecases.ingest.ingest_c101 import IngestC101
21
+ from scraper2.app.usecases.ingest.ingest_c103 import IngestC103
22
+ from scraper2.app.usecases.ingest.ingest_c104 import IngestC104
23
+ from scraper2.app.usecases.ingest.ingest_c106 import IngestC106
24
+ from scraper2.app.usecases.ingest.ingest_c108 import IngestC108
25
+
26
+ from scraper2.adapters.out.sinks.memory.store import InMemoryStore
27
+ from scraper2.adapters.out.sinks.memory.c101_memory_sink import MemoryC101Sink
28
+ from scraper2.adapters.out.sinks.memory.c103_memory_sink import MemoryC103Sink
29
+ from scraper2.adapters.out.sinks.memory.c104_memory_sink import MemoryC104Sink
30
+ from scraper2.adapters.out.sinks.memory.c106_memory_sink import MemoryC106Sink
31
+ from scraper2.adapters.out.sinks.memory.c108_memory_sink import MemoryC108Sink
32
+
33
+ from scraper2.adapters.out.sinks.mongo.c101_mongo_sink import MongoC101Sink
34
+ from scraper2.adapters.out.sinks.mongo.c103_mongo_sink import MongoC103Sink
35
+ from scraper2.adapters.out.sinks.mongo.c104_mongo_sink import MongoC104Sink
36
+ from scraper2.adapters.out.sinks.mongo.c106_mongo_sink import MongoC106Sink
37
+ from scraper2.adapters.out.sinks.mongo.c108_mongo_sink import MongoC108Sink
38
+
39
+ from scraper2.app.ports.sinks.c101_sink_port import C101SinkPort
40
+ from scraper2.app.ports.sinks.c103_sink_port import C103SinkPort
41
+ from scraper2.app.ports.sinks.c104_sink_port import C104SinkPort
42
+ from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
43
+ from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
44
+
45
+ from db2.mongo import Mongo
46
+
47
+ SinkKind = Literal["memory", "mongo"]
48
+
49
+
50
def _env_bool(key: str, default: bool) -> bool:
    """Read a boolean from environment variable *key*; *default* when unset."""
    raw = os.getenv(key)
    if raw is None:
        return default
    # Accept common truthy spellings, case-insensitively.
    return raw.strip().lower() in {"1", "true", "yes", "y", "on"}
53
+
54
+
55
def _env_int(key: str, default: int) -> int:
    """Read an integer from environment variable *key*; *default* when unset or unparsable."""
    try:
        return int(os.environ[key])
    except (KeyError, ValueError):
        # Missing variable or non-numeric content both fall back silently.
        return default
63
+
64
+
65
def build_browser_factory() -> BrowserFactoryPort:
    """Create the Playwright-backed browser factory, configured from env vars."""
    headless = _env_bool("SCRAPER_HEADLESS", True)
    timeout_ms = _env_int("SCRAPER_TIMEOUT_MS", 20_000)
    max_concurrency = _env_int("SCRAPER_MAX_CONCURRENCY", 2)
    return PlaywrightBrowserFactory(
        headless=headless,
        timeout_ms=timeout_ms,
        max_concurrency=max_concurrency,
    )
71
+
72
+
73
+ # -------------------------
74
+ # Bundles
75
+ # -------------------------
76
+
77
@dataclass(frozen=True)
class FetchUsecases:
    """Bundle of fetch usecases, one per scraped endpoint (c101..c108)."""
    c101: FetchC101
    c103: FetchC103
    c104: FetchC104
    c106: FetchC106
    c108: FetchC108
84
+
85
+
86
@dataclass(frozen=True)
class Sinks:
    """Bundle of sink ports, one per endpoint; backend-agnostic (memory or mongo)."""
    c101: C101SinkPort
    c103: C103SinkPort
    c104: C104SinkPort
    c106: C106SinkPort
    c108: C108SinkPort
93
+
94
+
95
@dataclass(frozen=True)
class IngestUsecases:
    """Bundle of ingest usecases (fetch + sink pairs), one per endpoint."""
    c101: IngestPort
    c103: IngestPort
    c104: IngestPort
    c106: IngestPort
    c108: IngestPort
102
+
103
+
104
@dataclass(frozen=True)
class Usecases:
    """Fully wired application usecases plus the backing sink resources."""
    fetch: FetchUsecases
    ingest: IngestUsecases
    sinks: Sinks
    store: InMemoryStore | None = None  # set only when sink_kind == "memory"
    mongo: Mongo | None = None  # set only when sink_kind == "mongo"
    db: AsyncDatabase | None = None  # set only when sink_kind == "mongo"
112
+
113
+
114
+ # -------------------------
115
+ # builders
116
+ # -------------------------
117
+
118
def build_fetch_usecases(*, factory: BrowserFactoryPort) -> FetchUsecases:
    """Wire every fetch usecase to the shared browser *factory*."""
    c101 = FetchC101(factory=factory)
    c103 = FetchC103(factory=factory)
    c104 = FetchC104(factory=factory)
    c106 = FetchC106(factory=factory)
    c108 = FetchC108(factory=factory)
    return FetchUsecases(c101=c101, c103=c103, c104=c104, c106=c106, c108=c108)
126
+
127
+
128
@dataclass(frozen=True)
class MemoryBundle:
    """In-memory store plus the sinks bound to it."""
    store: InMemoryStore
    sinks: Sinks
132
+
133
+
134
def build_memory_bundle() -> MemoryBundle:
    """Build one in-memory store and the five sinks that write into it."""
    backing = InMemoryStore()
    return MemoryBundle(
        store=backing,
        sinks=Sinks(
            c101=MemoryC101Sink(backing),
            c103=MemoryC103Sink(backing),
            c104=MemoryC104Sink(backing),
            c106=MemoryC106Sink(backing),
            c108=MemoryC108Sink(backing),
        ),
    )
144
+
145
+ # ---- mongo bundle ----
146
+
147
@dataclass(frozen=True)
class MongoBundle:
    """Mongo client, its database handle, and the sinks bound to it."""
    mongo: Mongo
    db: AsyncDatabase
    sinks: Sinks
152
+
153
+
154
def build_mongo_bundle() -> MongoBundle:
    """Build a Mongo client/database and the five sinks that write into it."""
    # db2 reads its own settings from the environment (DB2_MONGO_URI etc.).
    client = Mongo()
    database = client.get_db()
    return MongoBundle(
        mongo=client,
        db=database,
        sinks=Sinks(
            c101=MongoC101Sink(database),
            c103=MongoC103Sink(database),
            c104=MongoC104Sink(database),
            c106=MongoC106Sink(database),
            c108=MongoC108Sink(database),
        ),
    )
165
+
166
+
167
def build_ingest_usecases(*, fetch: FetchUsecases, sinks: Sinks) -> IngestUsecases:
    """Pair each fetch usecase with its matching sink to form ingest usecases."""
    c101 = IngestC101(fetch=fetch.c101, sink=sinks.c101)
    c103 = IngestC103(fetch=fetch.c103, sink=sinks.c103)
    c104 = IngestC104(fetch=fetch.c104, sink=sinks.c104)
    c106 = IngestC106(fetch=fetch.c106, sink=sinks.c106)
    c108 = IngestC108(fetch=fetch.c108, sink=sinks.c108)
    return IngestUsecases(c101=c101, c103=c103, c104=c104, c106=c106, c108=c108)
175
+
176
+
177
+ def build_usecases(
178
+ *,
179
+ factory: BrowserFactoryPort | None = None,
180
+ sink_kind: SinkKind = "memory",
181
+ ) -> Usecases:
182
+ factory = factory or build_browser_factory()
183
+ fetch = build_fetch_usecases(factory=factory)
184
+
185
+ if sink_kind == "memory":
186
+ bundle = build_memory_bundle()
187
+ ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
188
+ return Usecases(fetch=fetch, ingest=ingest, sinks=bundle.sinks, store=bundle.store)
189
+
190
+ if sink_kind == "mongo":
191
+ bundle = build_mongo_bundle()
192
+ ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
193
+ return Usecases(fetch=fetch, ingest=ingest, sinks=bundle.sinks, mongo=bundle.mongo, db=bundle.db)
194
+
195
+ raise ValueError(f"Unknown sink_kind: {sink_kind}")
@@ -0,0 +1,85 @@
1
+ from __future__ import annotations
2
+ import re
3
+ from typing import Iterable
4
+
5
+
6
+ _EMPTY_VALUES = {"", "-", "N/A", "NA", "null", "None"}
7
+
8
+
9
+ def normalize(s: str | None) -> str:
10
+ if s is None:
11
+ return ""
12
+ return s.strip()
13
+
14
+
15
+ def _is_empty(s: str) -> bool:
16
+ return s in _EMPTY_VALUES
17
+
18
+
19
+ def to_int(s: str | None) -> int | None:
20
+ """
21
+ C101 파서용 정수 변환기
22
+
23
+ 처리 규칙:
24
+ - None / '' / '-' / 'N/A' → None
25
+ - ',', '주', '원', '주식' 제거
26
+ - '1,234' → 1234
27
+ """
28
+ s = normalize(s)
29
+ if _is_empty(s):
30
+ return None
31
+
32
+ for ch in (",", "원", "주", "주식"):
33
+ s = s.replace(ch, "")
34
+
35
+ try:
36
+ return int(s)
37
+ except ValueError:
38
+ return None
39
+
40
+
41
+ def to_float(s: str | None) -> float | None:
42
+ """
43
+ C101 파서용 실수 변환기
44
+
45
+ 처리 규칙:
46
+ - None / '' / '-' / 'N/A' → None
47
+ - ',', '%', '원' 제거
48
+ - '12.34%' → 12.34
49
+ """
50
+ s = normalize(s)
51
+ if _is_empty(s):
52
+ return None
53
+
54
+ for ch in (",", "%", "원"):
55
+ s = s.replace(ch, "")
56
+
57
+ try:
58
+ return float(s)
59
+ except ValueError:
60
+ return None
61
+
62
+
63
+ def parse_won(text: str) -> int:
64
+ """
65
+ 한국 화폐 표현 문자열을 숫자로 변환 (조원, 억원, 만원, 원, 억 등 처리)
66
+ """
67
+ units = {
68
+ "조원": 1_000_000_000_000,
69
+ "억원": 100_000_000,
70
+ "억": 100_000_000,
71
+ "만원": 10_000,
72
+ "원": 1,
73
+ }
74
+
75
+ text = text.replace(",", "").strip()
76
+ match = re.match(r"([-+]?[0-9]*\.?[0-9]+)([가-힣]+)", text)
77
+
78
+ if not match:
79
+ raise ValueError(f"형식이 잘못된 금액 문자열: {text}")
80
+
81
+ number, unit = match.groups()
82
+ if unit not in units:
83
+ raise ValueError(f"알 수 없는 단위: {unit}")
84
+
85
+ return int(float(number) * units[unit])
@@ -0,0 +1,134 @@
1
+ # scraper2/app/parsing/_normalize.py
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from collections import Counter
6
+ from typing import Any
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+
12
# -----------------------------
# 1) Item-name normalization (values of the "항목" row)
# -----------------------------
# NOTE: _IFRS_PATTERN / _ETC_PAREN_PATTERN are not used below; kept in case
# other modules import them.
_IFRS_PATTERN = re.compile(r"\(IFRS[^)]*\)")
_ETC_PAREN_PATTERN = re.compile(r"\((E|YoY|QoQ)[^)]*\)")
_BRACKET_PATTERN = re.compile(r"\[[^\]]*\]")  # e.g. [구K-IFRS]
_EXTRA_WORDS_PATTERN = re.compile(r"(펼치기|연간컨센서스보기|연간컨센서스닫기)")
_ALL_PAREN_PATTERN = re.compile(r"\([^)]*\)")  # strips every parenthesised note

def normalize_c1034_item(text: str | None) -> str:
    """
    Normalize a C103 item name (row value).

    Removes UI noise words (펼치기 / consensus toggles), bracketed notes
    such as [구K-IFRS], every parenthesised annotation, and '*', then
    collapses whitespace. Returns '' for None/empty input.
    """
    if not text:
        return ""

    cleaned = str(text)
    # Noise words and brackets go first; then all remaining parentheses
    # are stripped wholesale (발표기준, 연결, 개별, ...).
    for pattern in (_EXTRA_WORDS_PATTERN, _BRACKET_PATTERN, _ALL_PAREN_PATTERN):
        cleaned = pattern.sub("", cleaned)
    cleaned = cleaned.replace("*", "")
    return re.sub(r"\s+", " ", cleaned).strip()
51
+
52
+
53
# -----------------------------
# 2) Column-name normalization
# -----------------------------
_COL_PAREN_PATTERN = re.compile(r"\((IFRS[^)]*|E|YoY|QoQ)[^)]*\)")
_COL_EXTRA_WORDS = re.compile(r"(연간컨센서스보기|연간컨센서스닫기)")
_COL_DOTNUM = re.compile(r"\.\d+$")  # pandas duplicate-column suffixes (.1, .2, ...)
_COL_MULTI_SPACE = re.compile(r"\s+")

def normalize_c1034_col(col: str | None) -> str:
    """
    Normalize a C103 column header.

    Examples:
        "2024/12 (IFRS연결) 연간컨센서스보기"      -> "2024/12"
        "2025/12(E) (IFRS연결) 연간컨센서스닫기"  -> "2025/12"
        "전년대비 (YoY)"   -> "전년대비"
        "전년대비 (YoY).1" -> "전년대비"  (duplicates resolved later to _2/_3)
    """
    if col is None:
        return ""

    cleaned = _COL_DOTNUM.sub("", str(col))        # drop pandas ".1" suffix
    cleaned = _COL_EXTRA_WORDS.sub("", cleaned)    # drop consensus toggle text
    cleaned = _COL_PAREN_PATTERN.sub("", cleaned)  # drop (IFRS...)/(E)/(YoY)/(QoQ)
    return _COL_MULTI_SPACE.sub(" ", cleaned).strip()
88
+
89
+
90
def _dedupe_columns(cols: list[str]) -> list[str]:
    """
    Make normalized column names unique by suffixing repeats with _2, _3, ...

    e.g. ["전년대비", "전년대비"] -> ["전년대비", "전년대비_2"]
    """
    occurrences: Counter[str] = Counter()
    result: list[str] = []
    for name in cols:
        name = name or ""
        occurrences[name] += 1
        count = occurrences[name]
        result.append(name if count == 1 else f"{name}_{count}")
    return result
105
+
106
+
107
# -----------------------------
# 3) Whole-DataFrame normalization + records conversion
# -----------------------------
def normalize_c1034_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize an entire C103/C104 DataFrame:
    - normalize every column name (collisions become _2/_3, ...)
    - normalize the values of the '항목' column
    - replace NaN with None

    Returns the input unchanged when it is None or empty.
    """
    if df is None or df.empty:
        return df

    out = df.copy()

    # Column headers: normalize, then make unique.
    raw_cols = out.columns.astype(str).tolist()
    out.columns = _dedupe_columns([normalize_c1034_col(c) for c in raw_cols])

    # Row item names.
    if "항목" in out.columns:
        out["항목"] = out["항목"].map(normalize_c1034_item)

    # NaN -> None so downstream layers see proper nulls.
    return out.replace({np.nan: None})
133
+
134
+
@@ -0,0 +1,143 @@
1
+ # scraper2/app/parsing/c101_parser.py
2
+ from __future__ import annotations
3
+
4
+ from scraper2.app.parsing._converters import to_int, to_float, normalize, parse_won
5
+ from scraper2.app.ports.browser.browser_port import BrowserPort
6
+ from typing import Any
7
+
8
+
9
class C101ParseError(RuntimeError):
    """Raised when the C101 page cannot be parsed as expected."""
    pass
11
+
12
+
13
def _after_colon(s: str) -> str:
    """Return the stripped text after the first ':' (e.g. '업종: XXX' -> 'XXX');
    the whole string, stripped, when there is no colon."""
    head, *tail = s.split(":")
    return tail[0].strip() if tail else s.strip()
17
+
18
+
19
async def parse_c101_to_dict(browser: BrowserPort) -> dict[str, Any] | None:
    """
    Parse the C101 company-summary page into a plain dict.

    - Talks to the page only through the BrowserPort abstraction.
    - Returns a dict of parsed fields, or None when the reference-date
      marker ("[기준:") cannot be found on the page.

    NOTE(review): only the missing date marker yields None; later selector
    or split failures raise instead (e.g. ValueError from tuple unpacking
    of a malformed cell) — confirm callers accept that, given the implied
    "None on failure" contract.
    """

    # Reference date: located by text search rather than a CSS selector.
    raw_date_str = await browser.text_first_by_text("[기준:")
    if not raw_date_str:
        return None
    날짜 = raw_date_str.replace("[기준:", "").replace("]", "").strip()

    # 1) Financial summary (first table).
    # Waiting for the tbody to exist is sufficient here.
    await browser.wait("#pArea > div.wrapper-table > div > table > tbody")

    종목명 = normalize(await browser.text(
        "#pArea > div.wrapper-table > div > table > tbody "
        "tr:nth-child(1) > td > dl > dt:nth-child(1) > span"
    ))
    코드 = normalize(await browser.text(
        "#pArea > div.wrapper-table > div > table > tbody "
        "tr:nth-child(1) > td > dl > dt:nth-child(1) > b"
    ))
    업종_raw = await browser.text(
        "#pArea > div.wrapper-table > div > table > tbody "
        "tr:nth-child(1) > td > dl > dt:nth-child(4)"
    )
    # Cell looks like "업종: XXX"; keep only the part after the colon.
    업종 = _after_colon(업종_raw)

    eps = to_int(await browser.text(
        "#pArea > div.wrapper-table > div > table > tbody "
        "tr:nth-child(3) > td > dl > dt:nth-child(1) > b"
    ))
    bps = to_int(await browser.text(
        "#pArea > div.wrapper-table > div > table > tbody "
        "tr:nth-child(3) > td > dl > dt:nth-child(2) > b"
    ))
    per = to_float(await browser.text(
        "#pArea > div.wrapper-table > div > table > tbody "
        "tr:nth-child(3) > td > dl > dt:nth-child(3) > b"
    ))
    업종per = to_float(await browser.text(
        "#pArea > div.wrapper-table > div > table > tbody "
        "tr:nth-child(3) > td > dl > dt:nth-child(4) > b"
    ))
    pbr = to_float(await browser.text(
        "#pArea > div.wrapper-table > div > table > tbody "
        "tr:nth-child(3) > td > dl > dt:nth-child(5) > b"
    ))
    배당수익률 = to_float(await browser.text(
        "#pArea > div.wrapper-table > div > table > tbody "
        "tr:nth-child(3) > td > dl > dt:nth-child(6) > b"
    ))

    # 2) Price information (second table).
    await browser.wait("#cTB11 > tbody")

    주가 = to_int(await browser.text("#cTB11 > tbody tr:nth-child(1) > td > strong"))

    # assumes browser.text returns a str, never None — TODO confirm port contract
    전일대비_raw = await browser.text("#cTB11 > tbody tr:nth-child(1) > td > span:nth-child(2)")
    전일대비 = to_int(전일대비_raw.replace("원", ""))

    수익률_raw = await browser.text("#cTB11 > tbody tr:nth-child(1) > td > span:nth-child(3)")
    수익률 = to_float(수익률_raw.replace("%", ""))

    # 52-week "high / low" pair; raises ValueError if the cell is not "x / y".
    최고최저52 = await browser.text("#cTB11 > tbody tr:nth-child(2) > td")
    최고52, 최저52 = (to_int(x.strip().replace("원", "")) for x in 최고최저52.split("/"))

    # Volume / trade value pair.
    거래량거래대금 = await browser.text("#cTB11 > tbody tr:nth-child(4) > td")
    거래량_str, 거래대금_str = (x.strip() for x in 거래량거래대금.split("/"))
    거래량 = to_int(거래량_str.replace("주", ""))
    거래대금 = parse_won(거래대금_str)

    시가총액 = parse_won(await browser.text("#cTB11 > tbody tr:nth-child(5) > td"))
    베타52주 = to_float(await browser.text("#cTB11 > tbody tr:nth-child(6) > td"))

    # Issued shares / float ratio pair.
    발행주식유동비율 = await browser.text("#cTB11 > tbody tr:nth-child(7) > td")
    발행주식_str, 유동비율_str = (x.strip() for x in 발행주식유동비율.split("/"))
    발행주식 = to_int(발행주식_str.replace("주", ""))
    유동비율 = to_float(유동비율_str.replace("%", ""))

    외국인지분율 = to_float((await browser.text("#cTB11 > tbody tr:nth-child(8) > td")).replace("%", ""))

    # Four slash-separated returns: 1M / 3M / 6M / 1Y.
    수익률1M3M6M1Y = await browser.text("#cTB11 > tbody tr:nth-child(9) > td")
    수익률1M, 수익률3M, 수익률6M, 수익률1Y = (
        to_float(x.strip().replace("%", "")) for x in 수익률1M3M6M1Y.split("/")
    )

    # 3) Company overview.
    # Read every <li> under the comment list and join them together.
    await browser.wait("#wrapper > div:nth-child(6) > div.cmp_comment > ul")
    li_texts = await browser.texts("#wrapper > div:nth-child(6) > div.cmp_comment > ul li")
    개요 = "".join(t.strip() for t in li_texts if t and t.strip())

    return {
        "종목명": 종목명,
        "코드": 코드,
        "날짜": 날짜,
        "업종": 업종,
        "eps": eps,
        "bps": bps,
        "per": per,
        "업종per": 업종per,
        "pbr": pbr,
        "배당수익률": 배당수익률,
        "주가": 주가,
        "전일대비": 전일대비,
        "수익률": 수익률,
        "최고52": 최고52,
        "최저52": 최저52,
        "거래량": 거래량,
        "거래대금": 거래대금,
        "시가총액": 시가총액,
        "베타52주": 베타52주,
        "발행주식": 발행주식,
        "유동비율": 유동비율,
        "외국인지분율": 외국인지분율,
        "수익률1M": 수익률1M,
        "수익률3M": 수익률3M,
        "수익률6M": 수익률6M,
        "수익률1Y": 수익률1Y,
        "개요": 개요,
    }