scraper2-hj3415 1.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff shows the contents of two publicly released versions of this package, as they appear in their respective public registries, and is provided for informational purposes only.
- scraper2/.DS_Store +0 -0
- scraper2/adapters/out/.DS_Store +0 -0
- scraper2/adapters/out/playwright/browser.py +103 -0
- scraper2/adapters/out/playwright/browser_factory.py +112 -0
- scraper2/adapters/out/playwright/session.py +121 -0
- scraper2/adapters/out/sinks/.DS_Store +0 -0
- scraper2/adapters/out/sinks/memory/__init__.py +15 -0
- scraper2/adapters/out/sinks/memory/c101_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c103_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c104_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c106_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c108_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/store.py +74 -0
- scraper2/adapters/out/sinks/mongo/__init__.py +14 -0
- scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +43 -0
- scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +41 -0
- scraper2/app/composition.py +195 -0
- scraper2/app/parsing/_converters.py +85 -0
- scraper2/app/parsing/_normalize.py +134 -0
- scraper2/app/parsing/c101_parser.py +143 -0
- scraper2/app/parsing/c103_parser.py +128 -0
- scraper2/app/parsing/c104_parser.py +143 -0
- scraper2/app/parsing/c106_parser.py +153 -0
- scraper2/app/parsing/c108_parser.py +65 -0
- scraper2/app/ports/browser/browser_factory_port.py +11 -0
- scraper2/app/ports/browser/browser_port.py +22 -0
- scraper2/app/ports/ingest_port.py +13 -0
- scraper2/app/ports/sinks/base_sink_port.py +14 -0
- scraper2/app/ports/sinks/c101_sink_port.py +9 -0
- scraper2/app/ports/sinks/c103_sink_port.py +9 -0
- scraper2/app/ports/sinks/c104_sink_port.py +9 -0
- scraper2/app/ports/sinks/c106_sink_port.py +9 -0
- scraper2/app/ports/sinks/c108_sink_port.py +9 -0
- scraper2/app/usecases/fetch/fetch_c101.py +43 -0
- scraper2/app/usecases/fetch/fetch_c103.py +103 -0
- scraper2/app/usecases/fetch/fetch_c104.py +76 -0
- scraper2/app/usecases/fetch/fetch_c106.py +90 -0
- scraper2/app/usecases/fetch/fetch_c108.py +49 -0
- scraper2/app/usecases/ingest/ingest_c101.py +36 -0
- scraper2/app/usecases/ingest/ingest_c103.py +37 -0
- scraper2/app/usecases/ingest/ingest_c104.py +37 -0
- scraper2/app/usecases/ingest/ingest_c106.py +38 -0
- scraper2/app/usecases/ingest/ingest_c108.py +39 -0
- scraper2/main.py +257 -0
- scraper2_hj3415-2.1.0.dist-info/METADATA +164 -0
- scraper2_hj3415-2.1.0.dist-info/RECORD +63 -0
- scraper2_hj3415-2.1.0.dist-info/entry_points.txt +3 -0
- scraper2_hj3415/__main__.py +0 -6
- scraper2_hj3415/adapters/_shared/utils.py +0 -29
- scraper2_hj3415/adapters/clients/browser.py +0 -124
- scraper2_hj3415/adapters/clients/http.py +0 -51
- scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py +0 -55
- scraper2_hj3415/adapters/nfs/pipelines/normalize_c1034.py +0 -109
- scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py +0 -51
- scraper2_hj3415/adapters/nfs/sinks/df_to_dto_mappers.py +0 -106
- scraper2_hj3415/adapters/nfs/sources/bundle_source.py +0 -24
- scraper2_hj3415/adapters/nfs/sources/c1034_fetch.py +0 -117
- scraper2_hj3415/adapters/nfs/sources/c1034_session.py +0 -90
- scraper2_hj3415/core/constants.py +0 -47
- scraper2_hj3415/core/ports/sink_port.py +0 -16
- scraper2_hj3415/core/ports/source_port.py +0 -13
- scraper2_hj3415/core/types.py +0 -11
- scraper2_hj3415/core/usecases/c1034_ingest.py +0 -139
- scraper2_hj3415/di.py +0 -103
- scraper2_hj3415/entrypoints/cli.py +0 -226
- scraper2_hj3415/entrypoints/main.py +0 -20
- scraper2_hj3415-1.0.1.dist-info/METADATA +0 -66
- scraper2_hj3415-1.0.1.dist-info/RECORD +0 -35
- scraper2_hj3415-1.0.1.dist-info/entry_points.txt +0 -3
- {scraper2_hj3415 → scraper2}/__init__.py +0 -0
- {scraper2_hj3415/adapters → scraper2/adapters/out}/__init__.py +0 -0
- {scraper2_hj3415/adapters/_shared → scraper2/adapters/out/playwright}/__init__.py +0 -0
- {scraper2_hj3415/adapters/clients → scraper2/app}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/pipelines → scraper2/app/parsing}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/sinks → scraper2/app/ports}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/sources → scraper2/app/ports/browser}/__init__.py +0 -0
- {scraper2_hj3415/core → scraper2/app/ports/sinks}/__init__.py +0 -0
- {scraper2_hj3415/core/ports → scraper2/app/usecases}/__init__.py +0 -0
- {scraper2_hj3415/core/usecases → scraper2/app/usecases/fetch}/__init__.py +0 -0
- {scraper2_hj3415/entrypoints → scraper2/app/usecases/ingest}/__init__.py +0 -0
- {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/licenses/LICENSE +0 -0
--- /dev/null
+++ scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py
@@ -0,0 +1,41 @@
+# scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Iterable, Optional
+
+from pymongo.asynchronous.database import AsyncDatabase
+
+from contracts.nfs.c106 import C106DTO
+from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
+
+from db2.nfs import (
+    upsert_latest,
+    upsert_latest_many,
+    insert_snapshot,
+    insert_snapshots_many,
+)
+
+_ENDPOINT = "c106"
+
+
+class MongoC106Sink(C106SinkPort):
+    def __init__(self, db: AsyncDatabase):
+        self._db = db
+
+    async def write(self, dto: C106DTO, *, asof: Optional[datetime] = None) -> None:
+        await upsert_latest(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
+        await insert_snapshot(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
+
+    async def write_many(
+        self,
+        dtos: Iterable[C106DTO],
+        *,
+        asof: Optional[datetime] = None,
+    ) -> None:
+        dtos_list = list(dtos)
+        if not dtos_list:
+            return
+
+        await upsert_latest_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
+        await insert_snapshots_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)

--- /dev/null
+++ scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py
@@ -0,0 +1,41 @@
+# scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Iterable, Optional
+
+from pymongo.asynchronous.database import AsyncDatabase
+
+from contracts.nfs.c108 import C108DTO
+from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
+
+from db2.nfs import (
+    upsert_latest,
+    upsert_latest_many,
+    insert_snapshot,
+    insert_snapshots_many,
+)
+
+_ENDPOINT = "c108"
+
+
+class MongoC108Sink(C108SinkPort):
+    def __init__(self, db: AsyncDatabase):
+        self._db = db
+
+    async def write(self, dto: C108DTO, *, asof: Optional[datetime] = None) -> None:
+        await upsert_latest(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
+        await insert_snapshot(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
+
+    async def write_many(
+        self,
+        dtos: Iterable[C108DTO],
+        *,
+        asof: Optional[datetime] = None,
+    ) -> None:
+        dtos_list = list(dtos)
+        if not dtos_list:
+            return
+
+        await upsert_latest_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
+        await insert_snapshots_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)

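Both Mongo sinks above share one write pattern: every DTO lands twice, as an upsert into the endpoint's "latest" view and as an append-only snapshot, both delegated to db2.nfs. Below is a minimal sketch of driving one of them, assuming a reachable MongoDB instance and a C106DTO already produced by the fetch layer; the URI and the "nfs" database name are illustrative, not part of the package.

import asyncio

from pymongo import AsyncMongoClient

from contracts.nfs.c106 import C106DTO
from scraper2.adapters.out.sinks.mongo.c106_mongo_sink import MongoC106Sink


async def persist_one(dto: C106DTO) -> None:
    client = AsyncMongoClient("mongodb://localhost:27017")  # illustrative URI
    try:
        sink = MongoC106Sink(client["nfs"])  # database name is an assumption
        await sink.write(dto)           # upserts "latest" and appends a snapshot
        await sink.write_many([dto])    # batch path; returns early on an empty iterable
    finally:
        await client.close()
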
--- /dev/null
+++ scraper2/app/composition.py
@@ -0,0 +1,195 @@
+# scraper2/app/composition.py
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass
+from typing import Literal
+
+from pymongo.asynchronous.database import AsyncDatabase
+
+from scraper2.app.ports.browser.browser_factory_port import BrowserFactoryPort
+from scraper2.adapters.out.playwright.browser_factory import PlaywrightBrowserFactory
+
+from scraper2.app.usecases.fetch.fetch_c101 import FetchC101
+from scraper2.app.usecases.fetch.fetch_c103 import FetchC103
+from scraper2.app.usecases.fetch.fetch_c104 import FetchC104
+from scraper2.app.usecases.fetch.fetch_c106 import FetchC106
+from scraper2.app.usecases.fetch.fetch_c108 import FetchC108
+
+from scraper2.app.ports.ingest_port import IngestPort
+from scraper2.app.usecases.ingest.ingest_c101 import IngestC101
+from scraper2.app.usecases.ingest.ingest_c103 import IngestC103
+from scraper2.app.usecases.ingest.ingest_c104 import IngestC104
+from scraper2.app.usecases.ingest.ingest_c106 import IngestC106
+from scraper2.app.usecases.ingest.ingest_c108 import IngestC108
+
+from scraper2.adapters.out.sinks.memory.store import InMemoryStore
+from scraper2.adapters.out.sinks.memory.c101_memory_sink import MemoryC101Sink
+from scraper2.adapters.out.sinks.memory.c103_memory_sink import MemoryC103Sink
+from scraper2.adapters.out.sinks.memory.c104_memory_sink import MemoryC104Sink
+from scraper2.adapters.out.sinks.memory.c106_memory_sink import MemoryC106Sink
+from scraper2.adapters.out.sinks.memory.c108_memory_sink import MemoryC108Sink
+
+from scraper2.adapters.out.sinks.mongo.c101_mongo_sink import MongoC101Sink
+from scraper2.adapters.out.sinks.mongo.c103_mongo_sink import MongoC103Sink
+from scraper2.adapters.out.sinks.mongo.c104_mongo_sink import MongoC104Sink
+from scraper2.adapters.out.sinks.mongo.c106_mongo_sink import MongoC106Sink
+from scraper2.adapters.out.sinks.mongo.c108_mongo_sink import MongoC108Sink
+
+from scraper2.app.ports.sinks.c101_sink_port import C101SinkPort
+from scraper2.app.ports.sinks.c103_sink_port import C103SinkPort
+from scraper2.app.ports.sinks.c104_sink_port import C104SinkPort
+from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
+from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
+
+from db2.mongo import Mongo
+
+SinkKind = Literal["memory", "mongo"]
+
+
+def _env_bool(key: str, default: bool) -> bool:
+    v = os.getenv(key)
+    return default if v is None else v.strip().lower() in {"1", "true", "yes", "y", "on"}
+
+
+def _env_int(key: str, default: int) -> int:
+    v = os.getenv(key)
+    if v is None:
+        return default
+    try:
+        return int(v)
+    except ValueError:
+        return default
+
+
+def build_browser_factory() -> BrowserFactoryPort:
+    return PlaywrightBrowserFactory(
+        headless=_env_bool("SCRAPER_HEADLESS", True),
+        timeout_ms=_env_int("SCRAPER_TIMEOUT_MS", 20_000),
+        max_concurrency=_env_int("SCRAPER_MAX_CONCURRENCY", 2),
+    )
+
+
+# -------------------------
+# Bundles
+# -------------------------
+
+@dataclass(frozen=True)
+class FetchUsecases:
+    c101: FetchC101
+    c103: FetchC103
+    c104: FetchC104
+    c106: FetchC106
+    c108: FetchC108
+
+
+@dataclass(frozen=True)
+class Sinks:
+    c101: C101SinkPort
+    c103: C103SinkPort
+    c104: C104SinkPort
+    c106: C106SinkPort
+    c108: C108SinkPort
+
+
+@dataclass(frozen=True)
+class IngestUsecases:
+    c101: IngestPort
+    c103: IngestPort
+    c104: IngestPort
+    c106: IngestPort
+    c108: IngestPort
+
+
+@dataclass(frozen=True)
+class Usecases:
+    fetch: FetchUsecases
+    ingest: IngestUsecases
+    sinks: Sinks
+    store: InMemoryStore | None = None  # ✅ only when sink_kind == "memory"
+    mongo: Mongo | None = None  # ✅ only when sink_kind == "mongo"
+    db: AsyncDatabase | None = None  # ✅ only when sink_kind == "mongo"
+
+
+# -------------------------
+# builders
+# -------------------------
+
+def build_fetch_usecases(*, factory: BrowserFactoryPort) -> FetchUsecases:
+    return FetchUsecases(
+        c101=FetchC101(factory=factory),
+        c103=FetchC103(factory=factory),
+        c104=FetchC104(factory=factory),
+        c106=FetchC106(factory=factory),
+        c108=FetchC108(factory=factory),
+    )
+
+
+@dataclass(frozen=True)
+class MemoryBundle:
+    store: InMemoryStore
+    sinks: Sinks
+
+
+def build_memory_bundle() -> MemoryBundle:
+    store = InMemoryStore()
+    sinks = Sinks(
+        c101=MemoryC101Sink(store),
+        c103=MemoryC103Sink(store),
+        c104=MemoryC104Sink(store),
+        c106=MemoryC106Sink(store),
+        c108=MemoryC108Sink(store),
+    )
+    return MemoryBundle(store=store, sinks=sinks)
+
+# ---- mongo bundle ----
+
+@dataclass(frozen=True)
+class MongoBundle:
+    mongo: Mongo
+    db: AsyncDatabase
+    sinks: Sinks
+
+
+def build_mongo_bundle() -> MongoBundle:
+    mongo = Mongo()  # db2 reads its settings from the environment (DB2_MONGO_URI, etc.)
+    db = mongo.get_db()
+    sinks = Sinks(
+        c101=MongoC101Sink(db),
+        c103=MongoC103Sink(db),
+        c104=MongoC104Sink(db),
+        c106=MongoC106Sink(db),
+        c108=MongoC108Sink(db),
+    )
+    return MongoBundle(mongo=mongo, db=db, sinks=sinks)
+
+
+def build_ingest_usecases(*, fetch: FetchUsecases, sinks: Sinks) -> IngestUsecases:
+    return IngestUsecases(
+        c101=IngestC101(fetch=fetch.c101, sink=sinks.c101),
+        c103=IngestC103(fetch=fetch.c103, sink=sinks.c103),
+        c104=IngestC104(fetch=fetch.c104, sink=sinks.c104),
+        c106=IngestC106(fetch=fetch.c106, sink=sinks.c106),
+        c108=IngestC108(fetch=fetch.c108, sink=sinks.c108),
+    )
+
+
+def build_usecases(
+    *,
+    factory: BrowserFactoryPort | None = None,
+    sink_kind: SinkKind = "memory",
+) -> Usecases:
+    factory = factory or build_browser_factory()
+    fetch = build_fetch_usecases(factory=factory)
+
+    if sink_kind == "memory":
+        bundle = build_memory_bundle()
+        ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
+        return Usecases(fetch=fetch, ingest=ingest, sinks=bundle.sinks, store=bundle.store)
+
+    if sink_kind == "mongo":
+        bundle = build_mongo_bundle()
+        ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
+        return Usecases(fetch=fetch, ingest=ingest, sinks=bundle.sinks, mongo=bundle.mongo, db=bundle.db)
+
+    raise ValueError(f"Unknown sink_kind: {sink_kind}")

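composition.py is the composition root: build_usecases wires the browser factory, fetchers, sinks, and ingest usecases behind a single call, selected by sink_kind. A minimal sketch of the in-memory wiring (the "mongo" path additionally requires db2's environment settings such as DB2_MONGO_URI):

from scraper2.app.composition import build_usecases

usecases = build_usecases(sink_kind="memory")

# only the fields matching the chosen sink kind are populated
assert usecases.store is not None
assert usecases.mongo is None and usecases.db is None

# browser knobs read from the environment by build_browser_factory:
#   SCRAPER_HEADLESS (default true), SCRAPER_TIMEOUT_MS (default 20000),
#   SCRAPER_MAX_CONCURRENCY (default 2)
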
--- /dev/null
+++ scraper2/app/parsing/_converters.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+import re
+from typing import Iterable
+
+
+_EMPTY_VALUES = {"", "-", "N/A", "NA", "null", "None"}
+
+
+def normalize(s: str | None) -> str:
+    if s is None:
+        return ""
+    return s.strip()
+
+
+def _is_empty(s: str) -> bool:
+    return s in _EMPTY_VALUES
+
+
+def to_int(s: str | None) -> int | None:
+    """
+    Integer converter for the C101 parser.
+
+    Rules:
+    - None / '' / '-' / 'N/A' → None
+    - strip ',', '주', '원', '주식'
+    - '1,234' → 1234
+    """
+    s = normalize(s)
+    if _is_empty(s):
+        return None
+
+    for ch in (",", "원", "주", "주식"):
+        s = s.replace(ch, "")
+
+    try:
+        return int(s)
+    except ValueError:
+        return None
+
+
+def to_float(s: str | None) -> float | None:
+    """
+    Float converter for the C101 parser.
+
+    Rules:
+    - None / '' / '-' / 'N/A' → None
+    - strip ',', '%', '원'
+    - '12.34%' → 12.34
+    """
+    s = normalize(s)
+    if _is_empty(s):
+        return None
+
+    for ch in (",", "%", "원"):
+        s = s.replace(ch, "")
+
+    try:
+        return float(s)
+    except ValueError:
+        return None
+
+
+def parse_won(text: str) -> int:
+    """
+    Convert a Korean currency string to a number (handles units such as 조원, 억원, 만원, 원, 억).
+    """
+    units = {
+        "조원": 1_000_000_000_000,
+        "억원": 100_000_000,
+        "억": 100_000_000,
+        "만원": 10_000,
+        "원": 1,
+    }
+
+    text = text.replace(",", "").strip()
+    match = re.match(r"([-+]?[0-9]*\.?[0-9]+)([가-힣]+)", text)
+
+    if not match:
+        raise ValueError(f"Malformed currency string: {text}")
+
+    number, unit = match.groups()
+    if unit not in units:
+        raise ValueError(f"Unknown unit: {unit}")
+
+    return int(float(number) * units[unit])

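Worked examples for the three converters, following the rules in the docstrings above:

from scraper2.app.parsing._converters import to_int, to_float, parse_won

assert to_int("1,234주") == 1234                  # comma and 주 stripped
assert to_int("-") is None                        # empty marker → None
assert to_float("12.34%") == 12.34                # percent sign stripped
assert parse_won("864억원") == 86_400_000_000     # 억원 = 10**8
assert parse_won("5.5조원") == 5_500_000_000_000  # 조원 = 10**12
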
--- /dev/null
+++ scraper2/app/parsing/_normalize.py
@@ -0,0 +1,134 @@
+# scraper2/app/parsing/_normalize.py
+from __future__ import annotations
+
+import re
+from collections import Counter
+from typing import Any
+
+import numpy as np
+import pandas as pd
+
+
+# -----------------------------
+# 1) Normalize item names (the row's "항목" value)
+# -----------------------------
+_IFRS_PATTERN = re.compile(r"\(IFRS[^)]*\)")
+_ETC_PAREN_PATTERN = re.compile(r"\((E|YoY|QoQ)[^)]*\)")
+_BRACKET_PATTERN = re.compile(r"\[[^\]]*\]")  # e.g. [구K-IFRS]
+_EXTRA_WORDS_PATTERN = re.compile(r"(펼치기|연간컨센서스보기|연간컨센서스닫기)")
+_ALL_PAREN_PATTERN = re.compile(r"\([^)]*\)")  # ★ strips every parenthesized segment
+
+def normalize_c1034_item(text: str | None) -> str:
+    """
+    Normalize a C103 item name (row value):
+    - drop the expand/consensus widget text
+    - drop all parenthesized content (incl. disclosure basis, consolidated, separate)
+    - drop [구K-IFRS]
+    - drop '*'
+    - collapse whitespace
+    """
+    if not text:
+        return ""
+
+    s = str(text)
+
+    # 1) unwanted keywords
+    s = _EXTRA_WORDS_PATTERN.sub("", s)
+
+    # 2) strip bracketed segments
+    s = _BRACKET_PATTERN.sub("", s)
+
+    # 3) strip all parentheses (important)
+    s = _ALL_PAREN_PATTERN.sub("", s)
+
+    # 4) strip asterisks
+    s = s.replace("*", "")
+
+    # 5) collapse whitespace
+    s = re.sub(r"\s+", " ", s).strip()
+
+    return s
+
+
+# -----------------------------
+# 2) Normalize column names
+# -----------------------------
+_COL_PAREN_PATTERN = re.compile(r"\((IFRS[^)]*|E|YoY|QoQ)[^)]*\)")
+_COL_EXTRA_WORDS = re.compile(r"(연간컨센서스보기|연간컨센서스닫기)")
+_COL_DOTNUM = re.compile(r"\.\d+$")  # strips pandas duplicate-column suffixes (.1, .2 ...)
+_COL_MULTI_SPACE = re.compile(r"\s+")
+
+def normalize_c1034_col(col: str | None) -> str:
+    """
+    Normalize a C103 column name.
+    Examples:
+    "2024/12 (IFRS연결) 연간컨센서스보기" -> "2024/12"
+    "2025/12(E) (IFRS연결) 연간컨센서스닫기" -> "2025/12"
+    "전년대비 (YoY)" -> "전년대비"
+    "전년대비 (YoY).1" -> "전년대비" (duplicates are split into _2/_3 downstream)
+    """
+    if col is None:
+        return ""
+
+    s = str(col)
+
+    # 1) strip pandas suffixes like .1 (normalization collisions are handled downstream)
+    s = _COL_DOTNUM.sub("", s)
+
+    # 2) strip the consensus wording
+    s = _COL_EXTRA_WORDS.sub("", s)
+
+    # 3) strip parenthesized annotations: (IFRS...), (E), (YoY), (QoQ)
+    s = _COL_PAREN_PATTERN.sub("", s)
+
+    # 4) collapse whitespace
+    s = _COL_MULTI_SPACE.sub(" ", s).strip()
+
+    return s
+
+
+def _dedupe_columns(cols: list[str]) -> list[str]:
+    """
+    If normalization yields duplicate column names, append _2, _3, ... to keep them unique.
+    e.g. ["전년대비", "전년대비"] -> ["전년대비", "전년대비_2"]
+    """
+    seen: Counter[str] = Counter()
+    out: list[str] = []
+    for c in cols:
+        c = c or ""
+        seen[c] += 1
+        if seen[c] == 1:
+            out.append(c)
+        else:
+            out.append(f"{c}_{seen[c]}")
+    return out
+
+
+# -----------------------------
+# 3) Normalize a whole DataFrame + convert to records
+# -----------------------------
+def normalize_c1034_df(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    - normalize every column name
+    - normalize the "항목" values
+    - NaN -> None
+    - split duplicate column names automatically (_2/_3)
+    """
+    if df is None or df.empty:
+        return df
+
+    df = df.copy()
+
+    # normalize column names + avoid duplicates
+    norm_cols = [normalize_c1034_col(c) for c in df.columns.astype(str).tolist()]
+    df.columns = _dedupe_columns(norm_cols)
+
+    # normalize item values
+    if "항목" in df.columns:
+        df["항목"] = df["항목"].map(normalize_c1034_item)
+
+    # NaN -> None
+    df = df.replace({np.nan: None})
+    return df
+
+

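A small worked example for normalize_c1034_df, with column strings that mirror the docstring cases:

import pandas as pd

from scraper2.app.parsing._normalize import normalize_c1034_df

df = pd.DataFrame(
    {
        "항목": ["매출액(IFRS연결) 펼치기"],
        "2024/12 (IFRS연결) 연간컨센서스보기": [100.0],
        "전년대비 (YoY)": [1.0],
        "전년대비 (YoY).1": [2.0],
    }
)
out = normalize_c1034_df(df)
assert list(out.columns) == ["항목", "2024/12", "전년대비", "전년대비_2"]
assert out.loc[0, "항목"] == "매출액"
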
--- /dev/null
+++ scraper2/app/parsing/c101_parser.py
@@ -0,0 +1,143 @@
+# scraper2/app/parsing/c101_parser.py
+from __future__ import annotations
+
+from scraper2.app.parsing._converters import to_int, to_float, normalize, parse_won
+from scraper2.app.ports.browser.browser_port import BrowserPort
+from typing import Any
+
+
+class C101ParseError(RuntimeError):
+    pass
+
+
+def _after_colon(s: str) -> str:
+    # handles strings like "업종: XXX"
+    parts = s.split(":")
+    return parts[1].strip() if len(parts) > 1 else s.strip()
+
+
+async def parse_c101_to_dict(browser: BrowserPort) -> dict[str, Any] | None:
+    """
+    - uses only BrowserPort
+    - returns a plain dict
+    - returns None on failure (preserves the previous behavior)
+    """
+
+    # date parsing, based on a text search
+    raw_date_str = await browser.text_first_by_text("[기준:")
+    if not raw_date_str:
+        return None
+    날짜 = raw_date_str.replace("[기준:", "").replace("]", "").strip()
+
+    # 1) financial info (1st table)
+    # here it is enough to wait for the tbody to exist
+    await browser.wait("#pArea > div.wrapper-table > div > table > tbody")
+
+    종목명 = normalize(await browser.text(
+        "#pArea > div.wrapper-table > div > table > tbody "
+        "tr:nth-child(1) > td > dl > dt:nth-child(1) > span"
+    ))
+    코드 = normalize(await browser.text(
+        "#pArea > div.wrapper-table > div > table > tbody "
+        "tr:nth-child(1) > td > dl > dt:nth-child(1) > b"
+    ))
+    업종_raw = await browser.text(
+        "#pArea > div.wrapper-table > div > table > tbody "
+        "tr:nth-child(1) > td > dl > dt:nth-child(4)"
+    )
+    업종 = _after_colon(업종_raw)
+
+    eps = to_int(await browser.text(
+        "#pArea > div.wrapper-table > div > table > tbody "
+        "tr:nth-child(3) > td > dl > dt:nth-child(1) > b"
+    ))
+    bps = to_int(await browser.text(
+        "#pArea > div.wrapper-table > div > table > tbody "
+        "tr:nth-child(3) > td > dl > dt:nth-child(2) > b"
+    ))
+    per = to_float(await browser.text(
+        "#pArea > div.wrapper-table > div > table > tbody "
+        "tr:nth-child(3) > td > dl > dt:nth-child(3) > b"
+    ))
+    업종per = to_float(await browser.text(
+        "#pArea > div.wrapper-table > div > table > tbody "
+        "tr:nth-child(3) > td > dl > dt:nth-child(4) > b"
+    ))
+    pbr = to_float(await browser.text(
+        "#pArea > div.wrapper-table > div > table > tbody "
+        "tr:nth-child(3) > td > dl > dt:nth-child(5) > b"
+    ))
+    배당수익률 = to_float(await browser.text(
+        "#pArea > div.wrapper-table > div > table > tbody "
+        "tr:nth-child(3) > td > dl > dt:nth-child(6) > b"
+    ))
+
+    # 2) price info (2nd table)
+    await browser.wait("#cTB11 > tbody")
+
+    주가 = to_int(await browser.text("#cTB11 > tbody tr:nth-child(1) > td > strong"))
+
+    전일대비_raw = await browser.text("#cTB11 > tbody tr:nth-child(1) > td > span:nth-child(2)")
+    전일대비 = to_int(전일대비_raw.replace("원", ""))
+
+    수익률_raw = await browser.text("#cTB11 > tbody tr:nth-child(1) > td > span:nth-child(3)")
+    수익률 = to_float(수익률_raw.replace("%", ""))
+
+    최고최저52 = await browser.text("#cTB11 > tbody tr:nth-child(2) > td")
+    최고52, 최저52 = (to_int(x.strip().replace("원", "")) for x in 최고최저52.split("/"))
+
+    거래량거래대금 = await browser.text("#cTB11 > tbody tr:nth-child(4) > td")
+    거래량_str, 거래대금_str = (x.strip() for x in 거래량거래대금.split("/"))
+    거래량 = to_int(거래량_str.replace("주", ""))
+    거래대금 = parse_won(거래대금_str)
+
+    시가총액 = parse_won(await browser.text("#cTB11 > tbody tr:nth-child(5) > td"))
+    베타52주 = to_float(await browser.text("#cTB11 > tbody tr:nth-child(6) > td"))
+
+    발행주식유동비율 = await browser.text("#cTB11 > tbody tr:nth-child(7) > td")
+    발행주식_str, 유동비율_str = (x.strip() for x in 발행주식유동비율.split("/"))
+    발행주식 = to_int(발행주식_str.replace("주", ""))
+    유동비율 = to_float(유동비율_str.replace("%", ""))
+
+    외국인지분율 = to_float((await browser.text("#cTB11 > tbody tr:nth-child(8) > td")).replace("%", ""))
+
+    수익률1M3M6M1Y = await browser.text("#cTB11 > tbody tr:nth-child(9) > td")
+    수익률1M, 수익률3M, 수익률6M, 수익률1Y = (
+        to_float(x.strip().replace("%", "")) for x in 수익률1M3M6M1Y.split("/")
+    )
+
+    # 3) company overview
+    # read every li under the ul and join them
+    await browser.wait("#wrapper > div:nth-child(6) > div.cmp_comment > ul")
+    li_texts = await browser.texts("#wrapper > div:nth-child(6) > div.cmp_comment > ul li")
+    개요 = "".join(t.strip() for t in li_texts if t and t.strip())
+
+    return {
+        "종목명": 종목명,
+        "코드": 코드,
+        "날짜": 날짜,
+        "업종": 업종,
+        "eps": eps,
+        "bps": bps,
+        "per": per,
+        "업종per": 업종per,
+        "pbr": pbr,
+        "배당수익률": 배당수익률,
+        "주가": 주가,
+        "전일대비": 전일대비,
+        "수익률": 수익률,
+        "최고52": 최고52,
+        "최저52": 최저52,
+        "거래량": 거래량,
+        "거래대금": 거래대금,
+        "시가총액": 시가총액,
+        "베타52주": 베타52주,
+        "발행주식": 발행주식,
+        "유동비율": 유동비율,
+        "외국인지분율": 외국인지분율,
+        "수익률1M": 수익률1M,
+        "수익률3M": 수익률3M,
+        "수익률6M": 수익률6M,
+        "수익률1Y": 수익률1Y,
+        "개요": 개요,
+    }

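Because the parser depends only on the BrowserPort surface it actually calls (wait, text, texts, text_first_by_text), it can be exercised without Playwright. The stub below is a sketch for unit testing with canned values chosen to satisfy the selectors above; neither the stub nor its data is part of the package, and duck typing makes explicit BrowserPort subclassing unnecessary:

import asyncio

from scraper2.app.parsing.c101_parser import parse_c101_to_dict


class StubBrowser:
    async def wait(self, selector: str) -> None:
        return None  # pretend every selector is already present

    async def text_first_by_text(self, needle: str) -> str:
        return "[기준:2025.01.02]"

    async def texts(self, selector: str) -> list[str]:
        return ["삼성전자는 반도체를 주력으로 하는 기업이다."]

    async def text(self, selector: str) -> str:
        if "#cTB11" in selector:  # price table: match on a distinguishing fragment
            canned = {
                "span:nth-child(2)": "500원",
                "span:nth-child(3)": "0.72%",
                "tr:nth-child(1)": "70,000",
                "tr:nth-child(2)": "88,000원 / 49,900원",
                "tr:nth-child(4)": "12,345주 / 864억원",
                "tr:nth-child(5)": "417조원",
                "tr:nth-child(6)": "0.95",
                "tr:nth-child(7)": "5,969,782,550주 / 75.9%",
                "tr:nth-child(8)": "51.2%",
                "tr:nth-child(9)": "1.1% / 2.2% / 3.3% / 4.4%",
            }
            for fragment, value in canned.items():
                if fragment in selector:
                    return value
        if "span" in selector:
            return "삼성전자"  # 종목명
        if "> b" in selector and "tr:nth-child(1)" in selector:
            return "005930"  # 코드
        if "> b" in selector:
            return "1234"  # eps/bps/per/업종per/pbr/배당수익률 row
        return "업종: 반도체"  # sector cell


async def main() -> None:
    data = await parse_c101_to_dict(StubBrowser())
    assert data is not None and data["주가"] == 70_000
    print(data["종목명"], data["시가총액"])


asyncio.run(main())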