scraper2-hj3415 1.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. scraper2/.DS_Store +0 -0
  2. scraper2/adapters/out/.DS_Store +0 -0
  3. scraper2/adapters/out/playwright/browser.py +103 -0
  4. scraper2/adapters/out/playwright/browser_factory.py +112 -0
  5. scraper2/adapters/out/playwright/session.py +121 -0
  6. scraper2/adapters/out/sinks/.DS_Store +0 -0
  7. scraper2/adapters/out/sinks/memory/__init__.py +15 -0
  8. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +20 -0
  9. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +20 -0
  10. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +20 -0
  11. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +20 -0
  12. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +20 -0
  13. scraper2/adapters/out/sinks/memory/store.py +74 -0
  14. scraper2/adapters/out/sinks/mongo/__init__.py +14 -0
  15. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +43 -0
  16. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +41 -0
  17. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +41 -0
  18. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +41 -0
  19. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +41 -0
  20. scraper2/app/composition.py +195 -0
  21. scraper2/app/parsing/_converters.py +85 -0
  22. scraper2/app/parsing/_normalize.py +134 -0
  23. scraper2/app/parsing/c101_parser.py +143 -0
  24. scraper2/app/parsing/c103_parser.py +128 -0
  25. scraper2/app/parsing/c104_parser.py +143 -0
  26. scraper2/app/parsing/c106_parser.py +153 -0
  27. scraper2/app/parsing/c108_parser.py +65 -0
  28. scraper2/app/ports/browser/browser_factory_port.py +11 -0
  29. scraper2/app/ports/browser/browser_port.py +22 -0
  30. scraper2/app/ports/ingest_port.py +13 -0
  31. scraper2/app/ports/sinks/base_sink_port.py +14 -0
  32. scraper2/app/ports/sinks/c101_sink_port.py +9 -0
  33. scraper2/app/ports/sinks/c103_sink_port.py +9 -0
  34. scraper2/app/ports/sinks/c104_sink_port.py +9 -0
  35. scraper2/app/ports/sinks/c106_sink_port.py +9 -0
  36. scraper2/app/ports/sinks/c108_sink_port.py +9 -0
  37. scraper2/app/usecases/fetch/fetch_c101.py +43 -0
  38. scraper2/app/usecases/fetch/fetch_c103.py +103 -0
  39. scraper2/app/usecases/fetch/fetch_c104.py +76 -0
  40. scraper2/app/usecases/fetch/fetch_c106.py +90 -0
  41. scraper2/app/usecases/fetch/fetch_c108.py +49 -0
  42. scraper2/app/usecases/ingest/ingest_c101.py +36 -0
  43. scraper2/app/usecases/ingest/ingest_c103.py +37 -0
  44. scraper2/app/usecases/ingest/ingest_c104.py +37 -0
  45. scraper2/app/usecases/ingest/ingest_c106.py +38 -0
  46. scraper2/app/usecases/ingest/ingest_c108.py +39 -0
  47. scraper2/main.py +257 -0
  48. scraper2_hj3415-2.1.0.dist-info/METADATA +164 -0
  49. scraper2_hj3415-2.1.0.dist-info/RECORD +63 -0
  50. scraper2_hj3415-2.1.0.dist-info/entry_points.txt +3 -0
  51. scraper2_hj3415/__main__.py +0 -6
  52. scraper2_hj3415/adapters/_shared/utils.py +0 -29
  53. scraper2_hj3415/adapters/clients/browser.py +0 -124
  54. scraper2_hj3415/adapters/clients/http.py +0 -51
  55. scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py +0 -55
  56. scraper2_hj3415/adapters/nfs/pipelines/normalize_c1034.py +0 -109
  57. scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py +0 -51
  58. scraper2_hj3415/adapters/nfs/sinks/df_to_dto_mappers.py +0 -106
  59. scraper2_hj3415/adapters/nfs/sources/bundle_source.py +0 -24
  60. scraper2_hj3415/adapters/nfs/sources/c1034_fetch.py +0 -117
  61. scraper2_hj3415/adapters/nfs/sources/c1034_session.py +0 -90
  62. scraper2_hj3415/core/constants.py +0 -47
  63. scraper2_hj3415/core/ports/sink_port.py +0 -16
  64. scraper2_hj3415/core/ports/source_port.py +0 -13
  65. scraper2_hj3415/core/types.py +0 -11
  66. scraper2_hj3415/core/usecases/c1034_ingest.py +0 -139
  67. scraper2_hj3415/di.py +0 -103
  68. scraper2_hj3415/entrypoints/cli.py +0 -226
  69. scraper2_hj3415/entrypoints/main.py +0 -20
  70. scraper2_hj3415-1.0.1.dist-info/METADATA +0 -66
  71. scraper2_hj3415-1.0.1.dist-info/RECORD +0 -35
  72. scraper2_hj3415-1.0.1.dist-info/entry_points.txt +0 -3
  73. {scraper2_hj3415 → scraper2}/__init__.py +0 -0
  74. {scraper2_hj3415/adapters → scraper2/adapters/out}/__init__.py +0 -0
  75. {scraper2_hj3415/adapters/_shared → scraper2/adapters/out/playwright}/__init__.py +0 -0
  76. {scraper2_hj3415/adapters/clients → scraper2/app}/__init__.py +0 -0
  77. {scraper2_hj3415/adapters/nfs/pipelines → scraper2/app/parsing}/__init__.py +0 -0
  78. {scraper2_hj3415/adapters/nfs/sinks → scraper2/app/ports}/__init__.py +0 -0
  79. {scraper2_hj3415/adapters/nfs/sources → scraper2/app/ports/browser}/__init__.py +0 -0
  80. {scraper2_hj3415/core → scraper2/app/ports/sinks}/__init__.py +0 -0
  81. {scraper2_hj3415/core/ports → scraper2/app/usecases}/__init__.py +0 -0
  82. {scraper2_hj3415/core/usecases → scraper2/app/usecases/fetch}/__init__.py +0 -0
  83. {scraper2_hj3415/entrypoints → scraper2/app/usecases/ingest}/__init__.py +0 -0
  84. {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/WHEEL +0 -0
  85. {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/licenses/LICENSE +0 -0
scraper2/main.py ADDED
@@ -0,0 +1,257 @@
+ # scraper2/main.py
+ from __future__ import annotations
+
+ import asyncio
+ from typing import Any, Literal, cast
+
+ import typer
+ from datetime import datetime, timezone
+
+ from scraper2.app.composition import build_usecases
+ from scraper2.app.ports.ingest_port import IngestPort
+
+ from logging_hj3415 import setup_logging
+
+ setup_logging()
+
+ Endpoint = Literal["c101", "c103", "c104", "c106", "c108", "all"]
+ Sink = Literal["memory", "mongo"]
+
+ app = typer.Typer(no_args_is_help=True)
+
+ nfs_app = typer.Typer(no_args_is_help=True, help="NFS collection/storage")
+ mi_app = typer.Typer(no_args_is_help=True, help="(reserved) MI commands")
+
+ app.add_typer(nfs_app, name="nfs")
+ app.add_typer(mi_app, name="mi")
+
+
+ # -------------------------
+ # small helpers
+ # -------------------------
+
+ def _endpoint_list(ep: Endpoint) -> list[str]:
+     return ["c101", "c103", "c104", "c106", "c108"] if ep == "all" else [ep]
+
+
+ async def _maybe_await_close(obj: Any) -> None:
+     close = getattr(obj, "close", None)
+     if close is None:
+         return
+     out = close()
+     if asyncio.iscoroutine(out):
+         await out
+
+
+ async def _mongo_bootstrap(db) -> None:
+     from db2.nfs import ensure_indexes
+     from db2.settings import get_settings
+
+     s = get_settings()
+     await ensure_indexes(db, snapshot_ttl_days=s.SNAPSHOT_TTL_DAYS)
+
+
+ async def _load_codes_from_universe(db, *, universe: str) -> list[str]:
+     """
+     Load codes from the universe_latest document stored via db2.universe.
+     (Adjust to match the actual db2 API names if they differ.)
+     """
+     from db2.universe import get_universe_latest  # existing db2 API
+
+     doc = await get_universe_latest(db, universe=universe)
+     if not doc:
+         return []
+
+     # The doc may be {"items": [{code, name, ...}, ...]} or
+     # {"payload": {"items": [...]}}, so handle both shapes defensively.
+     data = doc
+     if isinstance(data, dict) and "payload" in data and isinstance(data["payload"], dict):
+         data = data["payload"]
+     if isinstance(data, dict) and "items" in data:
+         data = data["items"]
+
+     if not isinstance(data, list):
+         return []
+
+     codes: list[str] = []
+     for row in data:
+         if not isinstance(row, dict):
+             continue
+         code = str(row.get("code") or "").strip()
+         if code:
+             codes.append(code)
+     return codes
+
+
+ import time
+
+ async def _run_ingest_with_progress(
+     *,
+     ucs: Any,
+     endpoint: Endpoint,
+     codes: list[str],
+     sleep_sec: float,
+     show: bool,
+     show_n: int,
+     chunk_size: int = 10,
+     progress_every: int = 1,
+ ) -> None:
+     total = len(codes)
+     if total == 0:
+         typer.echo("(no codes)")
+         return
+
+     t0 = time.perf_counter()  # start time
+     run_asof = datetime.now(timezone.utc)
+
+     def _chunks(xs: list[str], n: int):
+         for i in range(0, len(xs), n):
+             yield xs[i:i + n]
+
+     async def _run_one_endpoint(ep: str) -> None:
+         ingest_uc = cast(IngestPort, getattr(ucs.ingest, ep))
+
+         ok = 0
+         fail = 0
+
+         typer.echo(f"\n=== START: {ep} === total={total}, chunk_size={chunk_size}")
+
+         for idx, batch in enumerate(_chunks(codes, chunk_size), start=1):
+             try:
+                 results = await ingest_uc.execute_many(batch, sleep_sec=sleep_sec, asof=run_asof)
+                 ok += sum(1 for r in results if r is not None)
+             except Exception as e:
+                 fail += len(batch)
+                 typer.echo(f"[WARN] batch failed: {e!r}")
+
+             done = min(idx * chunk_size, total)
+
+             if progress_every > 0 and (idx % progress_every == 0 or done == total):
+                 typer.echo(f"progress: {done}/{total} (ok={ok}, fail={fail})")
+
+         typer.echo(f"=== DONE: {ep} === ok={ok}, fail={fail}, total={total}")
+
+     # --- actual run ---
+     for ep in _endpoint_list(endpoint):
+         await _run_one_endpoint(ep)
+
+     elapsed = time.perf_counter() - t0  # end time
+     typer.echo(f"\n⏱ elapsed time: {_format_elapsed(elapsed)}")
+
+ def _dto_to_pretty(obj: Any) -> str:
+     # prefer pydantic v2
+     if hasattr(obj, "model_dump_json"):
+         return obj.model_dump_json(indent=2, by_alias=False)
+     if hasattr(obj, "model_dump"):
+         import json
+         return json.dumps(obj.model_dump(), ensure_ascii=False, indent=2)
+     # dict fallback
+     if isinstance(obj, dict):
+         import json
+         return json.dumps(obj, ensure_ascii=False, indent=2, default=str)
+     return str(obj)
+
+ def _format_elapsed(sec: float) -> str:
+     if sec < 60:
+         return f"{sec:.1f}s"
+     m, s = divmod(int(sec), 60)
+     if m < 60:
+         return f"{m}m {s}s"
+     h, m = divmod(m, 60)
+     return f"{h}h {m}m {s}s"
+
+ # -------------------------
+ # nfs subcommands: one / all
+ # -------------------------
+
+ @nfs_app.command("one")
+ def nfs_one(
+     endpoint: Endpoint = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
+     code: str = typer.Argument(..., help="ticker code (e.g. 005930)"),
+     sleep_sec: float = typer.Option(2.0, "--sleep"),
+     sink: Sink = typer.Option("memory", "--sink"),
+     show: bool = typer.Option(True, "--show/--no-show", help="print the resulting DTO"),
+ ):
+     code = code.strip()
+     if not code:
+         raise typer.BadParameter("code must not be empty.")
+
+     async def _run():
+         ucs = build_usecases(sink_kind=sink)
+
+         if sink == "mongo":
+             if ucs.db is None:
+                 raise RuntimeError("mongo sink selected but ucs.db is None; expose db from composition.")
+             await _mongo_bootstrap(ucs.db)
+
+         try:
+             run_asof = datetime.now(timezone.utc)
+             for ep in _endpoint_list(endpoint):
+                 ingest_uc = cast(IngestPort, getattr(ucs.ingest, ep))
+                 results = await ingest_uc.execute_many([code], sleep_sec=sleep_sec, asof=run_asof)
+                 dto = results[0] if results else None
+
+                 typer.echo(f"\n=== ONE DONE: {ep} {code} ===")
+                 if not show:
+                     continue
+                 if dto is None:
+                     typer.echo("(no result)")
+                 else:
+                     typer.echo(_dto_to_pretty(dto))
+         finally:
+             if getattr(ucs, "mongo", None) is not None:
+                 await _maybe_await_close(ucs.mongo)
+
+     asyncio.run(_run())
+
+
+ @nfs_app.command("all")
+ def nfs_all(
+     endpoint: Endpoint = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
+     universe: str = typer.Option("krx300", "--universe"),
+     limit: int = typer.Option(0, "--limit", help="0 = no limit"),
+     sleep_sec: float = typer.Option(2.0, "--sleep"),
+     sink: Sink = typer.Option("mongo", "--sink"),
+     chunk_size: int = typer.Option(5, "--chunk", help="batch size for progress reporting"),
+     show: bool = typer.Option(False, "--show/--no-show", help="print only the first few codes"),
+     show_n: int = typer.Option(3, "--show-n"),
+ ):
+     async def _run():
+         ucs = build_usecases(sink_kind=sink)
+         if ucs.db is None:
+             raise RuntimeError("'all' mode requires a DB; use the mongo sink so ucs.db is exposed.")
+         await _mongo_bootstrap(ucs.db)
+
+         codes = await _load_codes_from_universe(ucs.db, universe=universe)
+         if not codes:
+             raise RuntimeError(f"universe='{universe}' has no codes; run krx sync first to populate the universe.")
+
+         if limit and limit > 0:
+             codes = codes[:limit]
+
+         typer.echo(f"\n=== NFS ALL === universe={universe}, endpoint={endpoint}, codes={len(codes)}, sink={sink}")
+
+         try:
+             await _run_ingest_with_progress(
+                 ucs=ucs,
+                 endpoint=endpoint,
+                 codes=codes,
+                 sleep_sec=sleep_sec,
+                 show=show,
+                 show_n=show_n,
+                 chunk_size=chunk_size,
+                 progress_every=1,  # every chunk
+             )
+         finally:
+             if getattr(ucs, "mongo", None) is not None:
+                 await _maybe_await_close(ucs.mongo)
+
+     asyncio.run(_run())
+
+
+ @mi_app.callback(invoke_without_command=True)
+ def mi():
+     pass
+
+
+ if __name__ == "__main__":
+     app()
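Since the console script (entry_points.txt below) maps `scraper2` to `scraper2.main:app`, the command tree above can be smoke-tested with `typer.testing.CliRunner`. This is a minimal sketch, not part of the package; it assumes the package and its imports are installed, and the commented-out invocation would actually scrape the network:

    # smoke_test_cli.py — illustrative only, not shipped with the package
    from typer.testing import CliRunner

    from scraper2.main import app

    runner = CliRunner()

    # Verify that the nfs subcommands (one / all) are wired up.
    result = runner.invoke(app, ["nfs", "--help"])
    assert result.exit_code == 0
    assert "one" in result.stdout and "all" in result.stdout

    # A real single-ticker run (memory sink) would look like this; it hits the
    # network, so it is left commented out here:
    # result = runner.invoke(app, ["nfs", "one", "c101", "005930", "--sink", "memory"])
    # print(result.stdout)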
scraper2_hj3415-2.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,164 @@
+ Metadata-Version: 2.4
+ Name: scraper2-hj3415
+ Version: 2.1.0
+ Summary: Naver WiseReport scraper
+ Keywords: example,demo
+ Author-email: Hyungjin Kim <hj3415@gmail.com>
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Typing :: Typed
+ License-File: LICENSE
+ Requires-Dist: playwright>=1.55
+ Requires-Dist: pandas>=2.3.3
+ Requires-Dist: pandas-stubs>=2.3.3
+ Requires-Dist: lxml>=6.0.2
+ Requires-Dist: typer>=0.21.0
+ Requires-Dist: db2-hj3415
+
+ # scraper2
+
+ A CLI tool that scrapes financial statement (NFS) data for KRX-listed Korean equities.
+ Single-ticker collection (one) and full-universe collection (all) are run as clearly separated modes.
+
+ Installation
+
+     pip install -e .
+
+ Check the CLI after installing:
+
+     scraper2 --help
+
+ CLI structure
+
+     scraper2
+     ├─ nfs
+     │   ├─ one   # collect a single ticker
+     │   └─ all   # collect the whole universe
+     └─ mi        # (reserved)
+
+ 1️⃣ Single-ticker collection (one)
+
+ Usage
+
+     scraper2 nfs one <endpoint> <code> [options]
+
+ Examples
+
+     # Collect c101 for Samsung Electronics (memory sink)
+     scraper2 nfs one c101 005930
+
+     # Collect every endpoint at once
+     scraper2 nfs one all 005930
+
+     # Store in MongoDB
+     scraper2 nfs one c101 005930 --sink mongo
+
+     # Disable printing of the resulting DTO
+     scraper2 nfs one c101 005930 --no-show
+
+ Characteristics
+ • Always processes exactly one ticker
+ • Prints the resulting DTO immediately (for debugging and verification)
+ • Meant for manual testing rather than automated operation
+
+ 2️⃣ Full-universe collection (all)
+
+ Prerequisite
+
+     # Load the universe into the DB first
+     krx sync
+
+ Usage
+
+     scraper2 nfs all <endpoint> [options]
+
+ Examples
+
+     # Collect c101 for every krx300 ticker
+     scraper2 nfs all c101
+
+     # Collect every endpoint
+     scraper2 nfs all all
+
+     # Target a specific universe
+     scraper2 nfs all c103 --universe krx300
+
+     # Limit the number of tickers (for testing)
+     scraper2 nfs all c101 --limit 10
+
+     # Adjust the progress batch size
+     scraper2 nfs all c101 --chunk 5
+
+ Sample output during a run
+
+     === NFS ALL === universe=krx300, endpoint=c101, codes=298, sink=mongo
+
+     === START: c101 === total=298, chunk_size=5
+     progress: 25/298 (ok=25, fail=0)
+     progress: 50/298 (ok=50, fail=0)
+     ...
+     === DONE: c101 === ok=297, fail=1, total=298
+
+     ⏱ elapsed time: 6m 42s
+
+ Characteristics
+ • Driven by the universe_latest document stored in the DB
+ • Prints progress and ok/fail counts in real time
+ • Reports the total elapsed time when the run finishes
+ • The main mode for operations and schedulers
+
+ 3️⃣ Scheduler example (cron)
+
+ Full collection every day at 02:00
+
+     0 2 * * * /path/to/venv/bin/scraper2 nfs all all >> /var/log/scraper2_nfs.log 2>&1
+
+ Per-endpoint runs (recommended)
+
+     0 2 * * * /path/to/venv/bin/scraper2 nfs all c101 >> /var/log/nfs_c101.log 2>&1
+     10 2 * * * /path/to/venv/bin/scraper2 nfs all c103 >> /var/log/nfs_c103.log 2>&1
+
+ Limited run for testing
+
+     0 1 * * * /path/to/venv/bin/scraper2 nfs all c101 --limit 20 >> /var/log/nfs_test.log 2>&1
+
+ Operational recommendations
+ • Always use the mongo sink in all mode
+ • Manage the universe with krx sync
+ • Logs are designed to record progress, ok/fail counts, and total elapsed time
+ • After a failure, the logs alone show how far the run progressed
+
+ Summary
+
+ | Purpose             | Command                      |
+ | ------------------- | ---------------------------- |
+ | Single-ticker check | scraper2 nfs one c101 005930 |
+ | Full collection     | scraper2 nfs all all         |
+ | Scheduler           | scraper2 nfs all <endpoint>  |
+
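The all mode reads its ticker codes from the universe_latest document that krx sync writes, and `_load_codes_from_universe` in scraper2/main.py tolerates two document shapes. The sketch below is illustrative only (the standalone `extract_codes` helper is hypothetical) and mirrors that defensive logic:

    # Illustrative: mirrors _load_codes_from_universe in scraper2/main.py.
    from typing import Any

    def extract_codes(doc: Any) -> list[str]:
        # Accept either {"items": [...]} or {"payload": {"items": [...]}}.
        data = doc
        if isinstance(data, dict) and isinstance(data.get("payload"), dict):
            data = data["payload"]
        if isinstance(data, dict) and "items" in data:
            data = data["items"]
        if not isinstance(data, list):
            return []
        return [str(r.get("code") or "").strip() for r in data
                if isinstance(r, dict) and str(r.get("code") or "").strip()]

    flat = {"items": [{"code": "005930", "name": "Samsung Electronics"}]}
    wrapped = {"payload": {"items": [{"code": "000660"}]}}
    assert extract_codes(flat) == ["005930"]
    assert extract_codes(wrapped) == ["000660"]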
scraper2_hj3415-2.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,63 @@
+ scraper2/.DS_Store,sha256=miZHKI70yhXtniLCotYbxAAOTB3ML9T6Tb8xQYPeK8w,6148
+ scraper2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2/main.py,sha256=UTm9RcYpdBP8dMyvsJG-y0_9AINqOVgJfIkdFitMLfs,8081
+ scraper2/adapters/out/.DS_Store,sha256=nUqwRB5F2DM82P8BALYvDI0YoD1UbmngfSi8ukKkY7E,6148
+ scraper2/adapters/out/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2/adapters/out/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2/adapters/out/playwright/browser.py,sha256=XomDThsVhsd5MdwhatgXyaCocs1XCjYEKC_YHmo1jsk,3988
+ scraper2/adapters/out/playwright/browser_factory.py,sha256=y008Dbu6VmzrEe7zn7CbODndQEVJtv_EBu04GsMbPGM,3697
+ scraper2/adapters/out/playwright/session.py,sha256=hQDmYpi7pIVDjkymaTKQzJVWUsRRlvJg1V777V8q44M,3727
+ scraper2/adapters/out/sinks/.DS_Store,sha256=c6VOGBl6bMmFzab335dcT09WinGd4BCZXZiPjrZjd7o,6148
+ scraper2/adapters/out/sinks/memory/__init__.py,sha256=djvn50E0dBZr-H6Xmh9FMYalG2Zc0jL5kTXaBjnRaRo,400
+ scraper2/adapters/out/sinks/memory/c101_memory_sink.py,sha256=klVgT_ra8EQYjJCk6ilWDkrNG3dj_ITP-eqdwX07nY4,682
+ scraper2/adapters/out/sinks/memory/c103_memory_sink.py,sha256=hov3yH3uQqFie1BzxHa3GKsHjm_e5011ItZ4AiqaJIg,683
+ scraper2/adapters/out/sinks/memory/c104_memory_sink.py,sha256=-cmNhK17OdsFfVklJjQFF4IWJvmH5ppGcjnpLf8VOns,683
+ scraper2/adapters/out/sinks/memory/c106_memory_sink.py,sha256=xyrMDjIpPZJ2CHc8687weEF72wKcVl_flFp7PU6GNdo,682
+ scraper2/adapters/out/sinks/memory/c108_memory_sink.py,sha256=HYxZ1YymdSlGzPpaNsdQGZhpzVr-ArgNgM0PyJdfubA,682
+ scraper2/adapters/out/sinks/memory/store.py,sha256=h4dwiCF5gne5kloRdD78NWlqtcaailIqoId9xAjtBk4,2738
+ scraper2/adapters/out/sinks/mongo/__init__.py,sha256=YmEZqNqh7S4PFTufxd5sCF2k24rTOsxY3ZFrFVyQzh8,382
+ scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py,sha256=CpcafwjBJ-Jo-sm02hRC1H214B4aKfvV5MNoNP1AfxQ,1270
+ scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py,sha256=gKoAOL3Dj4_JVjhdc5QAZObk49pFT-ERCyJCpUF9j2k,1203
+ scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py,sha256=IicVnc2fyeBXoBbgMasB7utzF7f1S6upgHV4g3sjs4g,1203
+ scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py,sha256=FMdCp8WVjPwidnh7tIPUoViQWr48O16xtB34O_iCtJI,1203
+ scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py,sha256=eSvIRtofWvNKVPchglwL1mOw5hsKDpUfNz5EOum-H3Y,1203
+ scraper2/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2/app/composition.py,sha256=HijMEE-2lMEMiYKlVN9ZWJD-E4vwgzIf9-plYzoSwGc,6229
+ scraper2/app/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2/app/parsing/_converters.py,sha256=z0kSL4nQnqq5w7QfJZBSbjZOLo1JhoqqDLpqlEAN4bo,1815
+ scraper2/app/parsing/_normalize.py,sha256=2qqbXxTbzbuYlu7ttzQjyKgatFnwopme2b_fd0zahio,3738
+ scraper2/app/parsing/c101_parser.py,sha256=QybZcd_7om4u3w5BWzbVNMcj50WrHtKr8hDOmkBMZGw,5537
+ scraper2/app/parsing/c103_parser.py,sha256=BIHJ0OHUaGbVu3kyfgYQQIKf4O_lj4ZTXXPk6vl7Iok,3744
+ scraper2/app/parsing/c104_parser.py,sha256=NGnzdVbhdqXFqJphwEDSlJPnM18RU759FCgry20a4ko,4193
+ scraper2/app/parsing/c106_parser.py,sha256=JCml8HHnczgMUVnUkRI8AMEJ9mog1dOJfdd6hQKtv9I,4505
+ scraper2/app/parsing/c108_parser.py,sha256=VEzzXliatoRdxR2_uSnHMHLNvV5h2irYiyoXAMQm8jc,1961
+ scraper2/app/ports/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2/app/ports/ingest_port.py,sha256=RHaRXJanq4VdDUXr42pEJr0vWru7zlIHNOTv7k940os,316
+ scraper2/app/ports/browser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2/app/ports/browser/browser_factory_port.py,sha256=dJ3JCc38MVF7wPmCH4MO3BdnHIXE5wSmfsV7cLysJ54,401
+ scraper2/app/ports/browser/browser_port.py,sha256=tFYkdCeTCF-5XHvT1duioj31ytsc-ATQpmEgposi1X4,1133
+ scraper2/app/ports/sinks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2/app/ports/sinks/base_sink_port.py,sha256=1fJ0fCZ-uDOwfCy2MJLPOV5b_twsfOKGphlsxM2-uOw,414
+ scraper2/app/ports/sinks/c101_sink_port.py,sha256=dO_A4AR-7lbPO_MlYz_CnumhO5SM_aJfC-09I2nTr2U,297
+ scraper2/app/ports/sinks/c103_sink_port.py,sha256=xjRkgu_mxCMwnMHMyJy5dTHw8AxwrXmiWzT6cUCujXg,297
+ scraper2/app/ports/sinks/c104_sink_port.py,sha256=0k_47ZZFTCt9jsFhWhDhhvtfdyZa3hVfF65bcLJX1AU,297
+ scraper2/app/ports/sinks/c106_sink_port.py,sha256=cge47IiMoFGC_wmHAcHn2nTiH0h65df5N8HLwqzBuY4,297
+ scraper2/app/ports/sinks/c108_sink_port.py,sha256=RLZRHJTvdZRsHcs18J0H98XQirW6xRmuMDx2XhiB3ac,297
+ scraper2/app/usecases/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2/app/usecases/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2/app/usecases/fetch/fetch_c101.py,sha256=uqvnH_D8Jp2_BtoiEUcbwkPulv7M9qcq5WI77vsOzCc,1480
+ scraper2/app/usecases/fetch/fetch_c103.py,sha256=NHXkUQxM2-Z7N7oW0uW88G40j57eGHX7RQtXCnZVWcY,3321
+ scraper2/app/usecases/fetch/fetch_c104.py,sha256=ZAja-G3hCEqLCzVDba2iuu1EFN_wUiDnm9iMcG5nsO4,2518
+ scraper2/app/usecases/fetch/fetch_c106.py,sha256=cIFNJ1-_MgyOCIGtVSkEKbiVRBFIhwtKiv0C9uKvrB0,3049
+ scraper2/app/usecases/fetch/fetch_c108.py,sha256=okVbNmCcXcgy9-9GOhvgqrHd6ujXv_lL0uogPr9POEs,1685
+ scraper2/app/usecases/ingest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2/app/usecases/ingest/ingest_c101.py,sha256=feaasz-Tx_CCdcs_4Wh3KBQxj3oMefacbq3Ds6UDpmk,1149
+ scraper2/app/usecases/ingest/ingest_c103.py,sha256=T_IKs5ikckin_05FnL-dxirAW1PPav867AuQsVsrZ5Y,1150
+ scraper2/app/usecases/ingest/ingest_c104.py,sha256=2rGTcFbsATsn3d2KsSEkL5x4fmkGo7x9MHrysoxiICM,1150
+ scraper2/app/usecases/ingest/ingest_c106.py,sha256=mQrbASbKVUQyVcIWiXzIczbX-1mNR5NGk760enVCfvo,1190
+ scraper2/app/usecases/ingest/ingest_c108.py,sha256=49ULzdl0dN6z3istAKg29PcD5wHTxYholqAZiIEmUzU,1191
+ scraper2_hj3415-2.1.0.dist-info/entry_points.txt,sha256=jUNx7ZJQedQ3QnsDN1ompQ0PjwdvVmnKdHHFMfQQPlI,46
+ scraper2_hj3415-2.1.0.dist-info/licenses/LICENSE,sha256=QBiVGQuKAESeCfQE344Ik2ex6g2zfYdu9WqrRWydxIs,1068
+ scraper2_hj3415-2.1.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+ scraper2_hj3415-2.1.0.dist-info/METADATA,sha256=BM0PNscoPswXr_CaQCCas_f8VZ_QSLxiq1KL8oGTpZk,3425
+ scraper2_hj3415-2.1.0.dist-info/RECORD,,
scraper2_hj3415-2.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
+ [console_scripts]
+ scraper2=scraper2.main:app
+
scraper2_hj3415/__main__.py DELETED
@@ -1,6 +0,0 @@
- # scraper2_hj3415/__main__.py
-
- from scraper2_hj3415.entrypoints.cli import app
-
- if __name__ == "__main__":
-     app()
scraper2_hj3415/adapters/_shared/utils.py DELETED
@@ -1,29 +0,0 @@
- # scraper2_hj3415/adapters/_shared/utils.py
-
- from loguru import logger
- import pandas as pd
- from typing import Iterable, Iterator, TypeVar, List
-
- T = TypeVar("T")
-
- def chunked(iterable: Iterable[T], size: int) -> Iterator[List[T]]:
-     # Generator that yields a long sequence in lists of `size` items each.
-     buf: List[T] = []
-     for item in iterable:
-         buf.append(item)
-         if len(buf) >= size:
-             yield buf
-             buf = []
-     if buf:
-         yield buf
-
- def log_df(df: pd.DataFrame, name: str = "DataFrame", max_rows: int = 10):
-     """Log a DataFrame briefly and cleanly."""
-     if df.empty:
-         logger.info(f"[{name}] DataFrame is empty.")
-         return
-
-     head = df.head(max_rows)
-     msg = head.to_markdown(index=False)
-     more = "" if len(df) <= max_rows else f"\n... ({len(df)} rows total)"
-     logger.debug(f"\n[{name}] shape={df.shape}\n{msg}{more}")
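The removed `chunked` helper has no direct replacement module; the new scraper2/main.py re-implements the same batching inline as `_chunks`. A minimal sketch of that shared behavior (copied from the new `_chunks`, shown here only for comparison):

    # Both the removed chunked() and the new _chunks() split a sequence into
    # fixed-size batches, with a shorter final batch.
    def _chunks(xs: list[str], n: int):
        for i in range(0, len(xs), n):
            yield xs[i:i + n]

    assert list(_chunks(["a", "b", "c", "d", "e"], 2)) == [["a", "b"], ["c", "d"], ["e"]]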
scraper2_hj3415/adapters/clients/browser.py DELETED
@@ -1,124 +0,0 @@
- # src/scraper2_hj3415/adapters/clients/browser.py
-
- from contextlib import asynccontextmanager
- from typing import AsyncGenerator
- import subprocess
- import sys
- import os
-
- from playwright.async_api import async_playwright, Browser, Error as PWError
-
- def _install_playwright_browsers(*names: str) -> None:
-     """
-     Run `playwright install [names...]` from code.
-     On macOS/Windows install only; on Linux also run install-deps if needed.
-     """
-     args = [sys.executable, "-m", "playwright", "install", *names]
-     subprocess.run(args, check=True)
-
-     if sys.platform.startswith("linux"):
-         try:
-             subprocess.run(
-                 [sys.executable, "-m", "playwright", "install-deps"], check=True
-             )
-         except Exception:
-             pass
-
- class PlaywrightSession:
-     def __init__(self, browser, pw):
-         self.browser = browser
-         self.pw = pw
-
-     @classmethod
-     async def create(cls, headless=True):
-         browser, pw = await PlaywrightSession.create_browser(headless=headless)
-         return cls(browser, pw)
-
-     @staticmethod
-     async def create_browser(headless: bool = True) -> tuple[Browser, any]:
-         """
-         Create and return a Playwright Browser instance.
-         (Plain version that does not use asynccontextmanager.)
-
-         Example:
-             browser, pw = await create_browser()
-             page = await browser.new_page()
-             await page.goto("https://example.com")
-             html = await page.content()
-             await browser.close()
-             await pw.stop()
-
-         Args:
-             headless: whether to run the browser in headless mode (default True)
-         Returns:
-             (browser, pw): (Browser object, async_playwright instance)
-         """
-         pw = await async_playwright().start()
-         try:
-             try:
-                 browser = await pw.chromium.launch(headless=headless)
-             except PWError as e:
-                 msg = str(e)
-                 need_install = (
-                     "Executable doesn't exist" in msg
-                     or "Please run the following command to download new browsers"
-                     in msg
-                 )
-                 if need_install and os.getenv("PW_SKIP_AUTO_INSTALL") != "1":
-                     await pw.stop()
-                     _install_playwright_browsers("chromium")
-                     pw = await async_playwright().start()
-                     browser = await pw.chromium.launch(headless=headless)
-                 else:
-                     raise
-             return browser, pw
-         except Exception:
-             await pw.stop()
-             raise
-
-     async def close(self):
-         await self.browser.close()
-         await self.pw.stop()
-
-
-
- @asynccontextmanager
- async def browser_context(headless: bool = True) -> AsyncGenerator[Browser, None]:
-     """
-     Create a Playwright Browser instance and yield it.
-     It is shut down automatically when the block exits.
-
-     Usage:
-         async with browser_context() as browser:
-             page = await browser.new_page()
-             await page.goto("https://example.com")
-             html = await page.content()
-
-     Args:
-         headless: whether to run the browser in headless mode (default True)
-     """
-     pw = await async_playwright().start()
-     try:
-         try:
-             browser = await pw.chromium.launch(headless=headless)
-         except PWError as e:
-             # detect a missing-browser-binary situation
-             msg = str(e)
-             need_install = "Executable doesn't exist" in msg or "Please run the following command to download new browsers" in msg
-             if need_install and os.getenv("PW_SKIP_AUTO_INSTALL") != "1":
-                 # clean up first
-                 await pw.stop()
-                 # install the browser
-                 _install_playwright_browsers("chromium")
-                 # restart and retry
-                 pw = await async_playwright().start()
-                 browser = await pw.chromium.launch(headless=headless)
-             else:
-                 raise
-
-         try:
-             yield browser
-         finally:
-             await browser.close()
-     finally:
-         await pw.stop()
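The auto-install-and-retry launch logic removed here presumably moves into the new scraper2/adapters/out/playwright modules (browser.py, browser_factory.py, session.py in the file list), whose contents are not shown in this diff. For reference, a minimal usage sketch of the removed 1.x `browser_context` helper, using the old 1.0.1 import path:

    # Illustrative only; this import path exists in 1.0.1 and is removed in 2.1.0.
    import asyncio

    from scraper2_hj3415.adapters.clients.browser import browser_context

    async def main() -> None:
        # Launches chromium (auto-installing it on first use unless
        # PW_SKIP_AUTO_INSTALL=1), then closes the browser and Playwright on exit.
        async with browser_context(headless=True) as browser:
            page = await browser.new_page()
            await page.goto("https://example.com")
            print(await page.title())

    asyncio.run(main())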