scraper2-hj3415 1.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their public registries.
- scraper2/.DS_Store +0 -0
- scraper2/adapters/out/.DS_Store +0 -0
- scraper2/adapters/out/playwright/browser.py +103 -0
- scraper2/adapters/out/playwright/browser_factory.py +112 -0
- scraper2/adapters/out/playwright/session.py +121 -0
- scraper2/adapters/out/sinks/.DS_Store +0 -0
- scraper2/adapters/out/sinks/memory/__init__.py +15 -0
- scraper2/adapters/out/sinks/memory/c101_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c103_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c104_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c106_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c108_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/store.py +74 -0
- scraper2/adapters/out/sinks/mongo/__init__.py +14 -0
- scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +43 -0
- scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +41 -0
- scraper2/app/composition.py +195 -0
- scraper2/app/parsing/_converters.py +85 -0
- scraper2/app/parsing/_normalize.py +134 -0
- scraper2/app/parsing/c101_parser.py +143 -0
- scraper2/app/parsing/c103_parser.py +128 -0
- scraper2/app/parsing/c104_parser.py +143 -0
- scraper2/app/parsing/c106_parser.py +153 -0
- scraper2/app/parsing/c108_parser.py +65 -0
- scraper2/app/ports/browser/browser_factory_port.py +11 -0
- scraper2/app/ports/browser/browser_port.py +22 -0
- scraper2/app/ports/ingest_port.py +13 -0
- scraper2/app/ports/sinks/base_sink_port.py +14 -0
- scraper2/app/ports/sinks/c101_sink_port.py +9 -0
- scraper2/app/ports/sinks/c103_sink_port.py +9 -0
- scraper2/app/ports/sinks/c104_sink_port.py +9 -0
- scraper2/app/ports/sinks/c106_sink_port.py +9 -0
- scraper2/app/ports/sinks/c108_sink_port.py +9 -0
- scraper2/app/usecases/fetch/fetch_c101.py +43 -0
- scraper2/app/usecases/fetch/fetch_c103.py +103 -0
- scraper2/app/usecases/fetch/fetch_c104.py +76 -0
- scraper2/app/usecases/fetch/fetch_c106.py +90 -0
- scraper2/app/usecases/fetch/fetch_c108.py +49 -0
- scraper2/app/usecases/ingest/ingest_c101.py +36 -0
- scraper2/app/usecases/ingest/ingest_c103.py +37 -0
- scraper2/app/usecases/ingest/ingest_c104.py +37 -0
- scraper2/app/usecases/ingest/ingest_c106.py +38 -0
- scraper2/app/usecases/ingest/ingest_c108.py +39 -0
- scraper2/main.py +257 -0
- scraper2_hj3415-2.1.0.dist-info/METADATA +164 -0
- scraper2_hj3415-2.1.0.dist-info/RECORD +63 -0
- scraper2_hj3415-2.1.0.dist-info/entry_points.txt +3 -0
- scraper2_hj3415/__main__.py +0 -6
- scraper2_hj3415/adapters/_shared/utils.py +0 -29
- scraper2_hj3415/adapters/clients/browser.py +0 -124
- scraper2_hj3415/adapters/clients/http.py +0 -51
- scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py +0 -55
- scraper2_hj3415/adapters/nfs/pipelines/normalize_c1034.py +0 -109
- scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py +0 -51
- scraper2_hj3415/adapters/nfs/sinks/df_to_dto_mappers.py +0 -106
- scraper2_hj3415/adapters/nfs/sources/bundle_source.py +0 -24
- scraper2_hj3415/adapters/nfs/sources/c1034_fetch.py +0 -117
- scraper2_hj3415/adapters/nfs/sources/c1034_session.py +0 -90
- scraper2_hj3415/core/constants.py +0 -47
- scraper2_hj3415/core/ports/sink_port.py +0 -16
- scraper2_hj3415/core/ports/source_port.py +0 -13
- scraper2_hj3415/core/types.py +0 -11
- scraper2_hj3415/core/usecases/c1034_ingest.py +0 -139
- scraper2_hj3415/di.py +0 -103
- scraper2_hj3415/entrypoints/cli.py +0 -226
- scraper2_hj3415/entrypoints/main.py +0 -20
- scraper2_hj3415-1.0.1.dist-info/METADATA +0 -66
- scraper2_hj3415-1.0.1.dist-info/RECORD +0 -35
- scraper2_hj3415-1.0.1.dist-info/entry_points.txt +0 -3
- {scraper2_hj3415 → scraper2}/__init__.py +0 -0
- {scraper2_hj3415/adapters → scraper2/adapters/out}/__init__.py +0 -0
- {scraper2_hj3415/adapters/_shared → scraper2/adapters/out/playwright}/__init__.py +0 -0
- {scraper2_hj3415/adapters/clients → scraper2/app}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/pipelines → scraper2/app/parsing}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/sinks → scraper2/app/ports}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/sources → scraper2/app/ports/browser}/__init__.py +0 -0
- {scraper2_hj3415/core → scraper2/app/ports/sinks}/__init__.py +0 -0
- {scraper2_hj3415/core/ports → scraper2/app/usecases}/__init__.py +0 -0
- {scraper2_hj3415/core/usecases → scraper2/app/usecases/fetch}/__init__.py +0 -0
- {scraper2_hj3415/entrypoints → scraper2/app/usecases/ingest}/__init__.py +0 -0
- {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/licenses/LICENSE +0 -0
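The listing amounts to a full rewrite: the 1.0.1 layout (a shared core/ with a single c1034 ingest use case and di.py wiring) is replaced by a ports-and-adapters layout with one fetch and one ingest use case per endpoint, sink ports with memory and mongo adapters, and composition.py doing the wiring. As rough orientation, here is a hypothetical sketch of how one sink-port/adapter pair could fit together; only the module paths come from the diff, every name in the code is a guess, since the viewer does not expand those files:

# Hypothetical sketch of the port/adapter split implied by the new file layout.
# Only the module paths are taken from the diff; class and method names are guesses.
from typing import Protocol

class C101SinkPort(Protocol):          # scraper2/app/ports/sinks/c101_sink_port.py
    async def save(self, dto: dict) -> None: ...

class C101MemorySink:                  # scraper2/adapters/out/sinks/memory/c101_memory_sink.py
    def __init__(self) -> None:
        self.rows: list[dict] = []
    async def save(self, dto: dict) -> None:
        self.rows.append(dto)          # keep DTOs in process memory for --sink memory

class IngestC101:                      # scraper2/app/usecases/ingest/ingest_c101.py
    def __init__(self, sink: C101SinkPort) -> None:
        self.sink = sink               # the use case depends only on the port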
scraper2/main.py
ADDED
@@ -0,0 +1,257 @@
+# scraper2/main.py
+from __future__ import annotations
+
+import asyncio
+import time
+from datetime import datetime, timezone
+from typing import Any, Literal, cast
+
+import typer
+
+from scraper2.app.composition import build_usecases
+from scraper2.app.ports.ingest_port import IngestPort
+
+from logging_hj3415 import setup_logging
+
+setup_logging()
+
+Endpoint = Literal["c101", "c103", "c104", "c106", "c108", "all"]
+Sink = Literal["memory", "mongo"]
+
+app = typer.Typer(no_args_is_help=True)
+
+nfs_app = typer.Typer(no_args_is_help=True, help="NFS collection/storage")
+mi_app = typer.Typer(no_args_is_help=True, help="(reserved) MI commands")
+
+app.add_typer(nfs_app, name="nfs")
+app.add_typer(mi_app, name="mi")
+
+
+# -------------------------
+# small helpers
+# -------------------------
+
+def _endpoint_list(ep: Endpoint) -> list[str]:
+    return ["c101", "c103", "c104", "c106", "c108"] if ep == "all" else [ep]
+
+
+async def _maybe_await_close(obj: Any) -> None:
+    # Close sync or async resources uniformly: await only if close() returns a coroutine.
+    close = getattr(obj, "close", None)
+    if close is None:
+        return
+    out = close()
+    if asyncio.iscoroutine(out):
+        await out
+
+
+async def _mongo_bootstrap(db) -> None:
+    from db2.nfs import ensure_indexes
+    from db2.settings import get_settings
+
+    s = get_settings()
+    await ensure_indexes(db, snapshot_ttl_days=s.SNAPSHOT_TTL_DAYS)
+
+
+async def _load_codes_from_universe(db, *, universe: str) -> list[str]:
+    """
+    Load codes from the universe_latest document stored via db2.universe.
+    (Adjust to match your actual db2 API names.)
+    """
+    from db2.universe import get_universe_latest  # your existing API
+
+    doc = await get_universe_latest(db, universe=universe)
+    if not doc:
+        return []
+
+    # The doc may look like {"items": [{code, name, ...}, ...]} or
+    # {"payload": {"items": [...]}}, so guard against both shapes.
+    data = doc
+    if isinstance(data, dict) and "payload" in data and isinstance(data["payload"], dict):
+        data = data["payload"]
+    if isinstance(data, dict) and "items" in data:
+        data = data["items"]
+
+    if not isinstance(data, list):
+        return []
+
+    codes: list[str] = []
+    for row in data:
+        if not isinstance(row, dict):
+            continue
+        code = str(row.get("code") or "").strip()
+        if code:
+            codes.append(code)
+    return codes
+
+
+async def _run_ingest_with_progress(
+    *,
+    ucs: Any,
+    endpoint: Endpoint,
+    codes: list[str],
+    sleep_sec: float,
+    show: bool,
+    show_n: int,
+    chunk_size: int = 10,
+    progress_every: int = 1,
+) -> None:
+    total = len(codes)
+    if total == 0:
+        typer.echo("(no codes)")
+        return
+
+    t0 = time.perf_counter()  # start time
+    run_asof = datetime.now(timezone.utc)
+
+    def _chunks(xs: list[str], n: int):
+        for i in range(0, len(xs), n):
+            yield xs[i:i + n]
+
+    async def _run_one_endpoint(ep: str) -> None:
+        ingest_uc = cast(IngestPort, getattr(ucs.ingest, ep))
+
+        ok = 0
+        fail = 0
+
+        typer.echo(f"\n=== START: {ep} === total={total}, chunk_size={chunk_size}")
+
+        for idx, batch in enumerate(_chunks(codes, chunk_size), start=1):
+            try:
+                results = await ingest_uc.execute_many(batch, sleep_sec=sleep_sec, asof=run_asof)
+                ok += sum(1 for r in results if r is not None)
+            except Exception as e:
+                fail += len(batch)
+                typer.echo(f"[WARN] batch failed: {e!r}")
+
+            done = min(idx * chunk_size, total)
+
+            if progress_every > 0 and (idx % progress_every == 0 or done == total):
+                typer.echo(f"progress: {done}/{total} (ok={ok}, fail={fail})")
+
+        typer.echo(f"=== DONE: {ep} === ok={ok}, fail={fail}, total={total}")
+
+    # --- actual run ---
+    for ep in _endpoint_list(endpoint):
+        await _run_one_endpoint(ep)
+
+    elapsed = time.perf_counter() - t0  # stop time
+    typer.echo(f"\n⏱ elapsed time: {_format_elapsed(elapsed)}")
+
+
+def _dto_to_pretty(obj: Any) -> str:
+    # prefer pydantic v2
+    if hasattr(obj, "model_dump_json"):
+        return obj.model_dump_json(indent=2, by_alias=False)
+    if hasattr(obj, "model_dump"):
+        import json
+        return json.dumps(obj.model_dump(), ensure_ascii=False, indent=2)
+    # dict fallback
+    if isinstance(obj, dict):
+        import json
+        return json.dumps(obj, ensure_ascii=False, indent=2, default=str)
+    return str(obj)
+
+
+def _format_elapsed(sec: float) -> str:
+    if sec < 60:
+        return f"{sec:.1f}s"
+    m, s = divmod(int(sec), 60)
+    if m < 60:
+        return f"{m}m {s}s"
+    h, m = divmod(m, 60)
+    return f"{h}h {m}m {s}s"
+
+
+# -------------------------
+# nfs subcommands: one / all
+# -------------------------
+
+@nfs_app.command("one")
+def nfs_one(
+    endpoint: Endpoint = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
+    code: str = typer.Argument(..., help="ticker code (e.g. 005930)"),
+    sleep_sec: float = typer.Option(2.0, "--sleep"),
+    sink: Sink = typer.Option("memory", "--sink"),
+    show: bool = typer.Option(True, "--show/--no-show", help="print the resulting DTO"),
+):
+    code = code.strip()
+    if not code:
+        raise typer.BadParameter("code must not be empty.")
+
+    async def _run():
+        ucs = build_usecases(sink_kind=sink)
+
+        if sink == "mongo":
+            if ucs.db is None:
+                raise RuntimeError("mongo sink selected but ucs.db is missing; expose db from composition.")
+            await _mongo_bootstrap(ucs.db)
+
+        try:
+            run_asof = datetime.now(timezone.utc)
+            for ep in _endpoint_list(endpoint):
+                ingest_uc = cast(IngestPort, getattr(ucs.ingest, ep))
+                results = await ingest_uc.execute_many([code], sleep_sec=sleep_sec, asof=run_asof)
+                dto = results[0] if results else None
+
+                typer.echo(f"\n=== ONE DONE: {ep} {code} ===")
+                if not show:
+                    continue
+                if dto is None:
+                    typer.echo("(no result)")
+                else:
+                    typer.echo(_dto_to_pretty(dto))
+        finally:
+            if getattr(ucs, "mongo", None) is not None:
+                await _maybe_await_close(ucs.mongo)
+
+    asyncio.run(_run())
+
+
+@nfs_app.command("all")
+def nfs_all(
+    endpoint: Endpoint = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
+    universe: str = typer.Option("krx300", "--universe"),
+    limit: int = typer.Option(0, "--limit", help="0 = no limit"),
+    sleep_sec: float = typer.Option(2.0, "--sleep"),
+    sink: Sink = typer.Option("mongo", "--sink"),
+    chunk_size: int = typer.Option(5, "--chunk", help="batch size for progress reporting"),
+    show: bool = typer.Option(False, "--show/--no-show", help="print only a few codes"),
+    show_n: int = typer.Option(3, "--show-n"),
+):
+    async def _run():
+        ucs = build_usecases(sink_kind=sink)
+        if ucs.db is None:
+            raise RuntimeError("all mode requires a DB; use the mongo sink so ucs.db is exposed.")
+        await _mongo_bootstrap(ucs.db)
+
+        codes = await _load_codes_from_universe(ucs.db, universe=universe)
+        if not codes:
+            raise RuntimeError(f"universe='{universe}' has no codes; run krx sync first to populate it.")
+
+        if limit and limit > 0:
+            codes = codes[:limit]
+
+        typer.echo(f"\n=== NFS ALL === universe={universe}, endpoint={endpoint}, codes={len(codes)}, sink={sink}")
+
+        try:
+            await _run_ingest_with_progress(
+                ucs=ucs,
+                endpoint=endpoint,
+                codes=codes,
+                sleep_sec=sleep_sec,
+                show=show,
+                show_n=show_n,
+                chunk_size=chunk_size,
+                progress_every=1,  # report every chunk
+            )
+        finally:
+            if getattr(ucs, "mongo", None) is not None:
+                await _maybe_await_close(ucs.mongo)

+    asyncio.run(_run())
+
+
+@mi_app.callback(invoke_without_command=True)
+def mi():
+    pass
+
+
+if __name__ == "__main__":
+    app()
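main.py depends on the ingest use cases only through IngestPort, cast at the call sites above. Judging from those call sites — execute_many(batch, sleep_sec=..., asof=...) returning a list whose None entries mark failures — the 13-line port file plausibly looks like the sketch below; treat it as an inference, not the actual file content:

# Inferred shape of scraper2/app/ports/ingest_port.py, reconstructed from the
# call sites in main.py; the real file may differ in names and return types.
from datetime import datetime
from typing import Any, Protocol, Sequence

class IngestPort(Protocol):
    async def execute_many(
        self,
        codes: Sequence[str],
        *,
        sleep_sec: float,
        asof: datetime,
    ) -> list[Any | None]:
        """Fetch, parse, and persist each code; None marks a per-code failure."""
        ...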
scraper2_hj3415-2.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,164 @@
+Metadata-Version: 2.4
+Name: scraper2-hj3415
+Version: 2.1.0
+Summary: Naver WiseReport scraper
+Keywords: example,demo
+Author-email: Hyungjin Kim <hj3415@gmail.com>
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Typing :: Typed
+License-File: LICENSE
+Requires-Dist: playwright>=1.55
+Requires-Dist: pandas>=2.3.3
+Requires-Dist: pandas-stubs>=2.3.3
+Requires-Dist: lxml>=6.0.2
+Requires-Dist: typer>=0.21.0
+Requires-Dist: db2-hj3415
+
+# scraper2
+
+A CLI tool for scraping Korean equity financial (NFS) data, driven by a KRX universe.
+Single-ticker collection (one) and full-universe collection (all) are kept strictly separate.
+
+⸻
+
+Installation
+
+pip install -e .
+
+Verify the CLI after installing:
+
+scraper2 --help
+
+⸻
+
+CLI structure
+
+scraper2
+├─ nfs
+│   ├─ one   # collect a single ticker
+│   └─ all   # collect the whole universe
+└─ mi        # (reserved)
+
+⸻
+
+1️⃣ Single-ticker collection (one)
+
+Usage
+
+scraper2 nfs one <endpoint> <code> [options]
+
+Examples
+
+# collect c101 for Samsung Electronics (in-memory sink)
+scraper2 nfs one c101 005930
+
+# collect several endpoints at once
+scraper2 nfs one all 005930
+
+# store in MongoDB
+scraper2 nfs one c101 005930 --sink mongo
+
+# suppress DTO output
+scraper2 nfs one c101 005930 --no-show
+
+Characteristics
+	•	Always processes exactly one ticker
+	•	Prints the resulting DTO immediately (for debugging and verification)
+	•	Meant for manual testing and verification rather than automated operation
+
+⸻
+
+2️⃣ Full-universe collection (all)
+
+Required beforehand
+
+# load the universe into the DB first
+krx sync
+
+⸻
+
+Usage
+
+scraper2 nfs all <endpoint> [options]
+
+Examples
+
+# collect c101 for every krx300 ticker
+scraper2 nfs all c101
+
+# collect every endpoint
+scraper2 nfs all all
+
+# target a specific universe
+scraper2 nfs all c103 --universe krx300
+
+# cap the number of tickers (for testing)
+scraper2 nfs all c101 --limit 10
+
+# tune the progress batch size
+scraper2 nfs all c101 --chunk 5
+
+Sample output during a run
+
+=== NFS ALL === universe=krx300, endpoint=c101, codes=298, sink=mongo
+
+=== START: c101 === total=298, chunk_size=5
+progress: 25/298 (ok=25, fail=0)
+progress: 50/298 (ok=50, fail=0)
+...
+=== DONE: c101 === ok=297, fail=1, total=298
+
+⏱ elapsed time: 6m 42s
+
+Characteristics
+	•	Driven by the universe_latest stored in the DB
+	•	Live progress plus ok/fail counts
+	•	Total elapsed time printed at the end of the run
+	•	The main mode for operations and schedulers
+
+⸻
+
+3️⃣ Scheduler examples (cron)
+
+Full collection every day at 02:00
+
+0 2 * * * /path/to/venv/bin/scraper2 nfs all all >> /var/log/scraper2_nfs.log 2>&1
+
+Per-endpoint runs (recommended)
+
+0 2 * * * /path/to/venv/bin/scraper2 nfs all c101 >> /var/log/nfs_c101.log 2>&1
+10 2 * * * /path/to/venv/bin/scraper2 nfs all c103 >> /var/log/nfs_c103.log 2>&1
+
+Limited test run
+
+0 1 * * * /path/to/venv/bin/scraper2 nfs all c101 --limit 20 >> /var/log/nfs_test.log 2>&1
+
+⸻
+
+Operational recommendations
+	•	Always use the mongo sink in all mode
+	•	Manage the universe with krx sync
+	•	Logs are designed to capture progress, ok/fail counts, and total elapsed time
+	•	After a failure, the logs alone show how far a run got
+
+⸻
+
+Summary
+
+Purpose              Command
+Single-ticker check  scraper2 nfs one c101 005930
+Full collection      scraper2 nfs all all
+Scheduler            scraper2 nfs all <endpoint>
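The README drives everything through the installed scraper2 entry point. The same Typer app can also be exercised in-process with Typer's test runner; a minimal sketch, assuming the package is installed (note that even the memory sink performs a live Playwright fetch):

# Minimal sketch: invoking the CLI in-process via typer.testing.CliRunner.
from typer.testing import CliRunner
from scraper2.main import app

runner = CliRunner()
result = runner.invoke(app, ["nfs", "one", "c101", "005930", "--sink", "memory", "--no-show"])
print(result.exit_code)   # 0 on success
print(result.output)      # includes the "=== ONE DONE: c101 005930 ===" banner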
scraper2_hj3415-2.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,63 @@
+scraper2/.DS_Store,sha256=miZHKI70yhXtniLCotYbxAAOTB3ML9T6Tb8xQYPeK8w,6148
+scraper2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2/main.py,sha256=UTm9RcYpdBP8dMyvsJG-y0_9AINqOVgJfIkdFitMLfs,8081
+scraper2/adapters/out/.DS_Store,sha256=nUqwRB5F2DM82P8BALYvDI0YoD1UbmngfSi8ukKkY7E,6148
+scraper2/adapters/out/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2/adapters/out/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2/adapters/out/playwright/browser.py,sha256=XomDThsVhsd5MdwhatgXyaCocs1XCjYEKC_YHmo1jsk,3988
+scraper2/adapters/out/playwright/browser_factory.py,sha256=y008Dbu6VmzrEe7zn7CbODndQEVJtv_EBu04GsMbPGM,3697
+scraper2/adapters/out/playwright/session.py,sha256=hQDmYpi7pIVDjkymaTKQzJVWUsRRlvJg1V777V8q44M,3727
+scraper2/adapters/out/sinks/.DS_Store,sha256=c6VOGBl6bMmFzab335dcT09WinGd4BCZXZiPjrZjd7o,6148
+scraper2/adapters/out/sinks/memory/__init__.py,sha256=djvn50E0dBZr-H6Xmh9FMYalG2Zc0jL5kTXaBjnRaRo,400
+scraper2/adapters/out/sinks/memory/c101_memory_sink.py,sha256=klVgT_ra8EQYjJCk6ilWDkrNG3dj_ITP-eqdwX07nY4,682
+scraper2/adapters/out/sinks/memory/c103_memory_sink.py,sha256=hov3yH3uQqFie1BzxHa3GKsHjm_e5011ItZ4AiqaJIg,683
+scraper2/adapters/out/sinks/memory/c104_memory_sink.py,sha256=-cmNhK17OdsFfVklJjQFF4IWJvmH5ppGcjnpLf8VOns,683
+scraper2/adapters/out/sinks/memory/c106_memory_sink.py,sha256=xyrMDjIpPZJ2CHc8687weEF72wKcVl_flFp7PU6GNdo,682
+scraper2/adapters/out/sinks/memory/c108_memory_sink.py,sha256=HYxZ1YymdSlGzPpaNsdQGZhpzVr-ArgNgM0PyJdfubA,682
+scraper2/adapters/out/sinks/memory/store.py,sha256=h4dwiCF5gne5kloRdD78NWlqtcaailIqoId9xAjtBk4,2738
+scraper2/adapters/out/sinks/mongo/__init__.py,sha256=YmEZqNqh7S4PFTufxd5sCF2k24rTOsxY3ZFrFVyQzh8,382
+scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py,sha256=CpcafwjBJ-Jo-sm02hRC1H214B4aKfvV5MNoNP1AfxQ,1270
+scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py,sha256=gKoAOL3Dj4_JVjhdc5QAZObk49pFT-ERCyJCpUF9j2k,1203
+scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py,sha256=IicVnc2fyeBXoBbgMasB7utzF7f1S6upgHV4g3sjs4g,1203
+scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py,sha256=FMdCp8WVjPwidnh7tIPUoViQWr48O16xtB34O_iCtJI,1203
+scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py,sha256=eSvIRtofWvNKVPchglwL1mOw5hsKDpUfNz5EOum-H3Y,1203
+scraper2/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2/app/composition.py,sha256=HijMEE-2lMEMiYKlVN9ZWJD-E4vwgzIf9-plYzoSwGc,6229
+scraper2/app/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2/app/parsing/_converters.py,sha256=z0kSL4nQnqq5w7QfJZBSbjZOLo1JhoqqDLpqlEAN4bo,1815
+scraper2/app/parsing/_normalize.py,sha256=2qqbXxTbzbuYlu7ttzQjyKgatFnwopme2b_fd0zahio,3738
+scraper2/app/parsing/c101_parser.py,sha256=QybZcd_7om4u3w5BWzbVNMcj50WrHtKr8hDOmkBMZGw,5537
+scraper2/app/parsing/c103_parser.py,sha256=BIHJ0OHUaGbVu3kyfgYQQIKf4O_lj4ZTXXPk6vl7Iok,3744
+scraper2/app/parsing/c104_parser.py,sha256=NGnzdVbhdqXFqJphwEDSlJPnM18RU759FCgry20a4ko,4193
+scraper2/app/parsing/c106_parser.py,sha256=JCml8HHnczgMUVnUkRI8AMEJ9mog1dOJfdd6hQKtv9I,4505
+scraper2/app/parsing/c108_parser.py,sha256=VEzzXliatoRdxR2_uSnHMHLNvV5h2irYiyoXAMQm8jc,1961
+scraper2/app/ports/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2/app/ports/ingest_port.py,sha256=RHaRXJanq4VdDUXr42pEJr0vWru7zlIHNOTv7k940os,316
+scraper2/app/ports/browser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2/app/ports/browser/browser_factory_port.py,sha256=dJ3JCc38MVF7wPmCH4MO3BdnHIXE5wSmfsV7cLysJ54,401
+scraper2/app/ports/browser/browser_port.py,sha256=tFYkdCeTCF-5XHvT1duioj31ytsc-ATQpmEgposi1X4,1133
+scraper2/app/ports/sinks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2/app/ports/sinks/base_sink_port.py,sha256=1fJ0fCZ-uDOwfCy2MJLPOV5b_twsfOKGphlsxM2-uOw,414
+scraper2/app/ports/sinks/c101_sink_port.py,sha256=dO_A4AR-7lbPO_MlYz_CnumhO5SM_aJfC-09I2nTr2U,297
+scraper2/app/ports/sinks/c103_sink_port.py,sha256=xjRkgu_mxCMwnMHMyJy5dTHw8AxwrXmiWzT6cUCujXg,297
+scraper2/app/ports/sinks/c104_sink_port.py,sha256=0k_47ZZFTCt9jsFhWhDhhvtfdyZa3hVfF65bcLJX1AU,297
+scraper2/app/ports/sinks/c106_sink_port.py,sha256=cge47IiMoFGC_wmHAcHn2nTiH0h65df5N8HLwqzBuY4,297
+scraper2/app/ports/sinks/c108_sink_port.py,sha256=RLZRHJTvdZRsHcs18J0H98XQirW6xRmuMDx2XhiB3ac,297
+scraper2/app/usecases/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2/app/usecases/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2/app/usecases/fetch/fetch_c101.py,sha256=uqvnH_D8Jp2_BtoiEUcbwkPulv7M9qcq5WI77vsOzCc,1480
+scraper2/app/usecases/fetch/fetch_c103.py,sha256=NHXkUQxM2-Z7N7oW0uW88G40j57eGHX7RQtXCnZVWcY,3321
+scraper2/app/usecases/fetch/fetch_c104.py,sha256=ZAja-G3hCEqLCzVDba2iuu1EFN_wUiDnm9iMcG5nsO4,2518
+scraper2/app/usecases/fetch/fetch_c106.py,sha256=cIFNJ1-_MgyOCIGtVSkEKbiVRBFIhwtKiv0C9uKvrB0,3049
+scraper2/app/usecases/fetch/fetch_c108.py,sha256=okVbNmCcXcgy9-9GOhvgqrHd6ujXv_lL0uogPr9POEs,1685
+scraper2/app/usecases/ingest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scraper2/app/usecases/ingest/ingest_c101.py,sha256=feaasz-Tx_CCdcs_4Wh3KBQxj3oMefacbq3Ds6UDpmk,1149
+scraper2/app/usecases/ingest/ingest_c103.py,sha256=T_IKs5ikckin_05FnL-dxirAW1PPav867AuQsVsrZ5Y,1150
+scraper2/app/usecases/ingest/ingest_c104.py,sha256=2rGTcFbsATsn3d2KsSEkL5x4fmkGo7x9MHrysoxiICM,1150
+scraper2/app/usecases/ingest/ingest_c106.py,sha256=mQrbASbKVUQyVcIWiXzIczbX-1mNR5NGk760enVCfvo,1190
+scraper2/app/usecases/ingest/ingest_c108.py,sha256=49ULzdl0dN6z3istAKg29PcD5wHTxYholqAZiIEmUzU,1191
+scraper2_hj3415-2.1.0.dist-info/entry_points.txt,sha256=jUNx7ZJQedQ3QnsDN1ompQ0PjwdvVmnKdHHFMfQQPlI,46
+scraper2_hj3415-2.1.0.dist-info/licenses/LICENSE,sha256=QBiVGQuKAESeCfQE344Ik2ex6g2zfYdu9WqrRWydxIs,1068
+scraper2_hj3415-2.1.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+scraper2_hj3415-2.1.0.dist-info/METADATA,sha256=BM0PNscoPswXr_CaQCCas_f8VZ_QSLxiq1KL8oGTpZk,3425
+scraper2_hj3415-2.1.0.dist-info/RECORD,,
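Each RECORD row is path,sha256=<digest>,size, where the digest is the URL-safe, unpadded base64 encoding of the file's SHA-256, per the wheel spec. A quick way to check an entry; note that all the empty __init__.py files above share the well-known empty-file digest:

# Sketch: computing a wheel RECORD digest (urlsafe base64 of SHA-256, padding stripped).
import base64, hashlib

def record_hash(data: bytes) -> str:
    digest = hashlib.sha256(data).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# An empty file (size 0) hashes to the digest seen on every empty __init__.py above:
assert record_hash(b"") == "sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU"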
scraper2_hj3415/__main__.py
DELETED
scraper2_hj3415/adapters/_shared/utils.py
DELETED
@@ -1,29 +0,0 @@
-# scraper2_hj3415/adapters/_shared/utils.py
-
-from loguru import logger
-import pandas as pd
-from typing import Iterable, Iterator, TypeVar, List
-
-T = TypeVar("T")
-
-def chunked(iterable: Iterable[T], size: int) -> Iterator[List[T]]:
-    # Generator that yields a long sequence in chunks of `size` items, each as a list.
-    buf: List[T] = []
-    for item in iterable:
-        buf.append(item)
-        if len(buf) >= size:
-            yield buf
-            buf = []
-    if buf:
-        yield buf
-
-def log_df(df: pd.DataFrame, name: str = "DataFrame", max_rows: int = 10):
-    """Log a DataFrame briefly and readably."""
-    if df.empty:
-        logger.info(f"[{name}] DataFrame is empty.")
-        return
-
-    head = df.head(max_rows)
-    msg = head.to_markdown(index=False)
-    more = "" if len(df) <= max_rows else f"\n... ({len(df)} rows total)"
-    logger.debug(f"\n[{name}] shape={df.shape}\n{msg}{more}")
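For reference, the deleted chunked() helper above is plain batching without itertools; an illustration of its behavior (not part of the diff):

# chunked() emits full batches of `size`, then whatever remains.
assert list(chunked(range(7), 3)) == [[0, 1, 2], [3, 4, 5], [6]]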
scraper2_hj3415/adapters/clients/browser.py
DELETED
@@ -1,124 +0,0 @@
-# src/scraper2_hj3415/adapters/clients/browser.py
-
-from contextlib import asynccontextmanager
-from typing import Any, AsyncGenerator
-import subprocess
-import sys
-import os
-
-from playwright.async_api import async_playwright, Browser, Error as PWError
-
-def _install_playwright_browsers(*names: str) -> None:
-    """
-    Run `playwright install [names...]` from code.
-    On macOS/Windows only `install` is needed; on Linux also run `install-deps` when required.
-    """
-    args = [sys.executable, "-m", "playwright", "install", *names]
-    subprocess.run(args, check=True)
-
-    if sys.platform.startswith("linux"):
-        try:
-            subprocess.run(
-                [sys.executable, "-m", "playwright", "install-deps"], check=True
-            )
-        except Exception:
-            pass
-
-class PlaywrightSession:
-    def __init__(self, browser, pw):
-        self.browser = browser
-        self.pw = pw
-
-    @classmethod
-    async def create(cls, headless=True):
-        browser, pw = await PlaywrightSession.create_browser(headless=headless)
-        return cls(browser, pw)
-
-    @staticmethod
-    async def create_browser(headless: bool = True) -> tuple[Browser, Any]:
-        """
-        Create and return a Playwright Browser instance.
-        (Plain version that does not use asynccontextmanager.)
-
-        Usage:
-            browser, pw = await PlaywrightSession.create_browser()
-            page = await browser.new_page()
-            await page.goto("https://example.com")
-            html = await page.content()
-            await browser.close()
-            await pw.stop()
-
-        Args:
-            headless: whether to run the browser headless (default True)
-        Returns:
-            (browser, pw): (Browser object, async_playwright instance)
-        """
-        pw = await async_playwright().start()
-        try:
-            try:
-                browser = await pw.chromium.launch(headless=headless)
-            except PWError as e:
-                msg = str(e)
-                need_install = (
-                    "Executable doesn't exist" in msg
-                    or "Please run the following command to download new browsers"
-                    in msg
-                )
-                if need_install and os.getenv("PW_SKIP_AUTO_INSTALL") != "1":
-                    await pw.stop()
-                    _install_playwright_browsers("chromium")
-                    pw = await async_playwright().start()
-                    browser = await pw.chromium.launch(headless=headless)
-                else:
-                    raise
-            return browser, pw
-        except Exception:
-            await pw.stop()
-            raise
-
-    async def close(self):
-        await self.browser.close()
-        await self.pw.stop()
-
-
-@asynccontextmanager
-async def browser_context(headless: bool = True) -> AsyncGenerator[Browser, None]:
-    """
-    Create and yield a Playwright Browser instance.
-    It is shut down automatically when the block exits.
-
-    Usage:
-        async with browser_context() as browser:
-            page = await browser.new_page()
-            await page.goto("https://example.com")
-            html = await page.content()
-
-    Args:
-        headless: whether to run the browser headless (default True)
-    """
-    pw = await async_playwright().start()
-    try:
-        try:
-            browser = await pw.chromium.launch(headless=headless)
-        except PWError as e:
-            # detect missing browser binaries
-            msg = str(e)
-            need_install = "Executable doesn't exist" in msg or "Please run the following command to download new browsers" in msg
-            if need_install and os.getenv("PW_SKIP_AUTO_INSTALL") != "1":
-                # clean up first
-                await pw.stop()
-                # install the browser
-                _install_playwright_browsers("chromium")
-                # restart and retry
-                pw = await async_playwright().start()
-                browser = await pw.chromium.launch(headless=headless)
-            else:
-                raise
-
-        try:
-            yield browser
-        finally:
-            await browser.close()
-    finally:
-        await pw.stop()