scraper2-hj3415 1.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (85)
  1. scraper2/.DS_Store +0 -0
  2. scraper2/adapters/out/.DS_Store +0 -0
  3. scraper2/adapters/out/playwright/browser.py +103 -0
  4. scraper2/adapters/out/playwright/browser_factory.py +112 -0
  5. scraper2/adapters/out/playwright/session.py +121 -0
  6. scraper2/adapters/out/sinks/.DS_Store +0 -0
  7. scraper2/adapters/out/sinks/memory/__init__.py +15 -0
  8. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +20 -0
  9. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +20 -0
  10. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +20 -0
  11. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +20 -0
  12. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +20 -0
  13. scraper2/adapters/out/sinks/memory/store.py +74 -0
  14. scraper2/adapters/out/sinks/mongo/__init__.py +14 -0
  15. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +43 -0
  16. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +41 -0
  17. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +41 -0
  18. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +41 -0
  19. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +41 -0
  20. scraper2/app/composition.py +195 -0
  21. scraper2/app/parsing/_converters.py +85 -0
  22. scraper2/app/parsing/_normalize.py +134 -0
  23. scraper2/app/parsing/c101_parser.py +143 -0
  24. scraper2/app/parsing/c103_parser.py +128 -0
  25. scraper2/app/parsing/c104_parser.py +143 -0
  26. scraper2/app/parsing/c106_parser.py +153 -0
  27. scraper2/app/parsing/c108_parser.py +65 -0
  28. scraper2/app/ports/browser/browser_factory_port.py +11 -0
  29. scraper2/app/ports/browser/browser_port.py +22 -0
  30. scraper2/app/ports/ingest_port.py +13 -0
  31. scraper2/app/ports/sinks/base_sink_port.py +14 -0
  32. scraper2/app/ports/sinks/c101_sink_port.py +9 -0
  33. scraper2/app/ports/sinks/c103_sink_port.py +9 -0
  34. scraper2/app/ports/sinks/c104_sink_port.py +9 -0
  35. scraper2/app/ports/sinks/c106_sink_port.py +9 -0
  36. scraper2/app/ports/sinks/c108_sink_port.py +9 -0
  37. scraper2/app/usecases/fetch/fetch_c101.py +43 -0
  38. scraper2/app/usecases/fetch/fetch_c103.py +103 -0
  39. scraper2/app/usecases/fetch/fetch_c104.py +76 -0
  40. scraper2/app/usecases/fetch/fetch_c106.py +90 -0
  41. scraper2/app/usecases/fetch/fetch_c108.py +49 -0
  42. scraper2/app/usecases/ingest/ingest_c101.py +36 -0
  43. scraper2/app/usecases/ingest/ingest_c103.py +37 -0
  44. scraper2/app/usecases/ingest/ingest_c104.py +37 -0
  45. scraper2/app/usecases/ingest/ingest_c106.py +38 -0
  46. scraper2/app/usecases/ingest/ingest_c108.py +39 -0
  47. scraper2/main.py +257 -0
  48. scraper2_hj3415-2.1.0.dist-info/METADATA +164 -0
  49. scraper2_hj3415-2.1.0.dist-info/RECORD +63 -0
  50. scraper2_hj3415-2.1.0.dist-info/entry_points.txt +3 -0
  51. scraper2_hj3415/__main__.py +0 -6
  52. scraper2_hj3415/adapters/_shared/utils.py +0 -29
  53. scraper2_hj3415/adapters/clients/browser.py +0 -124
  54. scraper2_hj3415/adapters/clients/http.py +0 -51
  55. scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py +0 -55
  56. scraper2_hj3415/adapters/nfs/pipelines/normalize_c1034.py +0 -109
  57. scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py +0 -51
  58. scraper2_hj3415/adapters/nfs/sinks/df_to_dto_mappers.py +0 -106
  59. scraper2_hj3415/adapters/nfs/sources/bundle_source.py +0 -24
  60. scraper2_hj3415/adapters/nfs/sources/c1034_fetch.py +0 -117
  61. scraper2_hj3415/adapters/nfs/sources/c1034_session.py +0 -90
  62. scraper2_hj3415/core/constants.py +0 -47
  63. scraper2_hj3415/core/ports/sink_port.py +0 -16
  64. scraper2_hj3415/core/ports/source_port.py +0 -13
  65. scraper2_hj3415/core/types.py +0 -11
  66. scraper2_hj3415/core/usecases/c1034_ingest.py +0 -139
  67. scraper2_hj3415/di.py +0 -103
  68. scraper2_hj3415/entrypoints/cli.py +0 -226
  69. scraper2_hj3415/entrypoints/main.py +0 -20
  70. scraper2_hj3415-1.0.1.dist-info/METADATA +0 -66
  71. scraper2_hj3415-1.0.1.dist-info/RECORD +0 -35
  72. scraper2_hj3415-1.0.1.dist-info/entry_points.txt +0 -3
  73. {scraper2_hj3415 → scraper2}/__init__.py +0 -0
  74. {scraper2_hj3415/adapters → scraper2/adapters/out}/__init__.py +0 -0
  75. {scraper2_hj3415/adapters/_shared → scraper2/adapters/out/playwright}/__init__.py +0 -0
  76. {scraper2_hj3415/adapters/clients → scraper2/app}/__init__.py +0 -0
  77. {scraper2_hj3415/adapters/nfs/pipelines → scraper2/app/parsing}/__init__.py +0 -0
  78. {scraper2_hj3415/adapters/nfs/sinks → scraper2/app/ports}/__init__.py +0 -0
  79. {scraper2_hj3415/adapters/nfs/sources → scraper2/app/ports/browser}/__init__.py +0 -0
  80. {scraper2_hj3415/core → scraper2/app/ports/sinks}/__init__.py +0 -0
  81. {scraper2_hj3415/core/ports → scraper2/app/usecases}/__init__.py +0 -0
  82. {scraper2_hj3415/core/usecases → scraper2/app/usecases/fetch}/__init__.py +0 -0
  83. {scraper2_hj3415/entrypoints → scraper2/app/usecases/ingest}/__init__.py +0 -0
  84. {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/WHEEL +0 -0
  85. {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/licenses/LICENSE +0 -0
scraper2_hj3415/core/ports/source_port.py DELETED
@@ -1,13 +0,0 @@
- # scraper2_hj3415/core/ports/source_port.py
-
- from typing import Literal, Protocol, List
- from scraper2_hj3415.core.types import NormalizedBundle
-
- class C1034BundleSourcePort(Protocol):
-     async def list_bundles(
-         self,
-         cmp_cd: str,
-         page: Literal["c103", "c104"],
-         *,
-         concurrency: int = 2,
-     ) -> List[NormalizedBundle]: ...
scraper2_hj3415/core/types.py DELETED
@@ -1,11 +0,0 @@
- # scraper2_hj3415/core/types.py
-
- from dataclasses import dataclass
- import pandas as pd
-
- @dataclass(frozen=True)
- class NormalizedBundle:
-     fact: pd.DataFrame
-     dim_account: pd.DataFrame
-     dim_period: pd.DataFrame
-     delta: pd.DataFrame
scraper2_hj3415/core/usecases/c1034_ingest.py DELETED
@@ -1,139 +0,0 @@
- # scraper2_hj3415/core/usecases/c1034_ingest.py
-
- from __future__ import annotations
- from dataclasses import dataclass, asdict
- from typing import Iterable, Literal, Sequence, Callable, Awaitable
-
- from loguru import logger
- from scraper2_hj3415.core.types import NormalizedBundle
- from scraper2_hj3415.core.ports.source_port import C1034BundleSourcePort
- from scraper2_hj3415.core.ports.sink_port import C1034SinkPort
-
-
- @dataclass(frozen=True)
- class IngestStats:
-     dim_account_rows: int = 0
-     dim_period_rows: int = 0
-     fact_rows: int = 0
-     delta_rows: int = 0
-
-     def __add__(self, other: "IngestStats") -> "IngestStats":
-         return IngestStats(
-             dim_account_rows=self.dim_account_rows + other.dim_account_rows,
-             dim_period_rows=self.dim_period_rows + other.dim_period_rows,
-             fact_rows=self.fact_rows + other.fact_rows,
-             delta_rows=self.delta_rows + other.delta_rows,
-         )
-
- class C1034IngestUseCase:
-     """
-     Use case that owns the whole collect (Source) → normalize (Pipeline) → store (Sink) flow.
-
-     How to create:
-         Do not call the constructor directly; always use the factory method below.
-
-         uc = await C1034IngestUseCase.create(repo)
-
-     It creates a Playwright browser session internally,
-     and must be cleaned up with `await uc.close()` when finished.
-     """
-
-     def __init__(
-         self,
-         *,
-         source: C1034BundleSourcePort,
-         sink: C1034SinkPort,
-         on_bundle: Callable[[str, NormalizedBundle], Awaitable[None]] | None = None,
-     ) -> None:
-         self.source = source
-         self.sink = sink
-         self.on_bundle = on_bundle
-
-     async def _save_bundle(
-         self, page_label: str, bundle: NormalizedBundle
-     ) -> IngestStats:
-         await self.sink.save_all(
-             dim_account_df=bundle.dim_account,
-             dim_period_df=bundle.dim_period,
-             fact_df=bundle.fact,
-             delta_df=bundle.delta,
-         )
-         stats = IngestStats(
-             dim_account_rows=len(bundle.dim_account),
-             dim_period_rows=len(bundle.dim_period),
-             fact_rows=len(bundle.fact),
-             delta_rows=len(bundle.delta),
-         )
-         logger.debug(f"[{page_label}] saved bundle stats={asdict(stats)}")
-         return stats
-
-     async def ingest_c103(
-         self, cmp_cd: str, *, save: bool = True, collect_only: bool = False
-     ):
-         logger.info(f"[c103] ingest start cmp_cd={cmp_cd} save={save}")
-         bundles = await self.source.list_bundles(cmp_cd, page="c103", concurrency=2)
-         if self.on_bundle:
-             for b in bundles:
-                 await self.on_bundle("c103", b)
-         if save and not collect_only:
-             stats = IngestStats()
-             for b in bundles:
-                 stats += await self._save_bundle("c103", b)
-             logger.info(f"[c103] ingest done cmp_cd={cmp_cd} result={asdict(stats)}")
-             return stats
-         else:
-             logger.info(
-                 f"[c103] ingest done cmp_cd={cmp_cd} result={len(bundles)} bundles"
-             )
-             return bundles
-
-     async def ingest_c104(self, cmp_cd: str, *, save: bool = True, collect_only: bool = False):
-         logger.info(f"[c104] ingest start cmp_cd={cmp_cd} save={save}")
-         bundles = await self.source.list_bundles(cmp_cd, page="c104", concurrency=2)
-         if self.on_bundle:
-             for b in bundles:
-                 await self.on_bundle("c104", b)
-         if save and not collect_only:
-             stats = IngestStats()
-             for b in bundles:
-                 stats += await self._save_bundle("c104", b)
-             logger.info(f"[c104] ingest done cmp_cd={cmp_cd} result={asdict(stats)}")
-             return stats
-         else:
-             logger.info(f"[c104] ingest done cmp_cd={cmp_cd} result={len(bundles)} bundles")
-             return bundles
-
-     async def ingest_all(self, cmp_cd: str, pages: Sequence[Literal["c103","c104"]] = ("c103","c104"), *, save=True, collect_only=False):
-         total = IngestStats()
-         collected: list[NormalizedBundle] = []
-         if "c103" in pages:
-             r = await self.ingest_c103(cmp_cd, save=save, collect_only=collect_only)
-             total = total + r if isinstance(r, IngestStats) else total; collected += ([] if isinstance(r, IngestStats) else r)
-         if "c104" in pages:
-             r = await self.ingest_c104(cmp_cd, save=save, collect_only=collect_only)
-             total = total + r if isinstance(r, IngestStats) else total; collected += ([] if isinstance(r, IngestStats) else r)
-         return total if save and not collect_only else collected
-
-     async def ingest_many(self, cmp_cds: Iterable[str], pages: Sequence[Literal["c103","c104"]] = ("c103","c104"), *, concurrency=3, save=True, collect_only=False):
-         import asyncio
-         sem = asyncio.Semaphore(max(1, concurrency))
-         agg = IngestStats()
-         collected: dict[str, list[NormalizedBundle]] = {}
-
-         async def _worker(code: str):
-             async with sem:
-                 try:
-                     res = await self.ingest_all(code, pages=pages, save=save, collect_only=collect_only)
-                     return ("stats", res, code) if isinstance(res, IngestStats) else ("bundles", res, code)
-                 except Exception as e:
-                     logger.exception(f"ingest error cmp_cd={code}: {e}")
-                     return ("error", e, code)
-
-         tasks = [_worker(c) for c in cmp_cds]
-         for coro in asyncio.as_completed(tasks):
-             kind, payload, code = await coro
-             if kind == "stats":
-                 agg = agg + payload  # type: ignore[operator]
-             elif kind == "bundles":
-                 collected[code] = payload  # type: ignore[assignment]
-         return agg if save and not collect_only else collected
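
For orientation, the removed use case above depends only on the two Protocol ports, so it can be driven without the real Playwright source or Mongo sink. The sketch below is illustrative only and targets the 1.0.1 release documented in this diff (these modules no longer exist in 2.1.0); `FakeSource` and `FakeSink` are hypothetical stand-ins, not package code.

```python
# Illustrative only: wiring the (removed) 1.0.1 C1034IngestUseCase against
# in-memory fakes that structurally satisfy its ports.
import asyncio
import pandas as pd

from scraper2_hj3415.core.types import NormalizedBundle
from scraper2_hj3415.core.usecases.c1034_ingest import C1034IngestUseCase


class FakeSource:
    async def list_bundles(self, cmp_cd, page, *, concurrency=2):
        empty = pd.DataFrame()
        # One empty bundle is enough to exercise the save path.
        return [NormalizedBundle(fact=empty, dim_account=empty,
                                 dim_period=empty, delta=empty)]


class FakeSink:
    async def save_all(self, *, dim_account_df, dim_period_df, fact_df, delta_df):
        print(f"saving {len(fact_df)} fact rows")


async def main():
    uc = C1034IngestUseCase(source=FakeSource(), sink=FakeSink())
    stats = await uc.ingest_all("005930", pages=("c103",), save=True)
    print(stats)  # IngestStats(dim_account_rows=0, ...)


if __name__ == "__main__":
    asyncio.run(main())
```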
scraper2_hj3415/di.py DELETED
@@ -1,103 +0,0 @@
- # scraper2_hj3415/di.py
- from __future__ import annotations
-
- import os
- from contextlib import asynccontextmanager
- from typing import AsyncIterator, Callable, Awaitable
-
- from scraper2_hj3415.core.types import NormalizedBundle
- from scraper2_hj3415.core.usecases.c1034_ingest import C1034IngestUseCase
- from scraper2_hj3415.core.ports.sink_port import C1034SinkPort
- from scraper2_hj3415.core.ports.source_port import C1034BundleSourcePort
- from contracts_hj3415.ports.c1034_write_repo import C1034WriteRepoPort
-
- # ── Adapters (concrete implementations) ─────────────────────────────────────
- from scraper2_hj3415.adapters.clients.browser import PlaywrightSession
- from scraper2_hj3415.adapters.nfs.sources.bundle_source import NfsBundleSource
- from scraper2_hj3415.adapters.nfs.sinks.c1034_sink import C1034Sink
-
-
- # ── Config loading helpers ───────────────────────────────────────────────────
- def _env_bool(key: str, default: bool) -> bool:
-     v = os.getenv(key)
-     if v is None:
-         return default
-     return v.lower() in {"1", "true", "yes", "on"}
-
- def _env_int(key: str, default: int) -> int:
-     try:
-         return int(os.getenv(key, "").strip() or default)
-     except Exception:
-         return default
-
-
- # ── DI: assemble Playwright session + Source + Sink + UseCase ───────────────
- @asynccontextmanager
- async def provide_ingest_usecase(
-     *,
-     repo: C1034WriteRepoPort,  # ← a concrete implementation (e.g. db2_hj3415) must be injected
-     headless: bool | None = None,  # default: controlled via ENV
-     chunk: int | None = None,  # default: controlled via ENV
-     on_bundle: Callable[[str, NormalizedBundle], Awaitable[None]] | None = None,
- ) -> AsyncIterator[C1034IngestUseCase]:
-     """
-     Async context manager that builds a C1034IngestUseCase from the real adapters.
-
-     Lifecycle:
-         - create PlaywrightSession → create NfsBundleSource → create C1034Sink(repo)
-         - the session is cleaned up when the with-block exits
-
-     Args:
-         repo: C1034RepositoryPort implementation (e.g. the Mongo/Beanie repository from db2_hj3415)
-         headless: whether to run the browser headless (decided by env var if None)
-         chunk: sink batch size (decided by env var if None)
-         on_bundle: optional hook invoked before each bundle is saved
-
-     ENV:
-         SCRAPER_HEADLESS=true|false (default: true)
-         SCRAPER_SINK_CHUNK=1000 (default: 1000)
-     """
-     # Defaults based on environment variables
-     headless = _env_bool("SCRAPER_HEADLESS", True) if headless is None else headless
-     chunk = _env_int("SCRAPER_SINK_CHUNK", 1000) if chunk is None else chunk
-
-     # 1) Browser session
-     session = await PlaywrightSession.create(headless=headless)
-
-     try:
-         # 2) Source adapter (uses the real network/browser)
-         source: C1034BundleSourcePort = NfsBundleSource(session)
-
-         # 3) Sink adapter (persists via the injected repo implementation)
-         sink: C1034SinkPort = C1034Sink(repo, chunk=chunk)
-
-         # 4) Assemble the use case (the core depends only on ports, never on adapter types)
-         uc = C1034IngestUseCase(source=source, sink=sink, on_bundle=on_bundle)
-         yield uc
-     finally:
-         await session.close()
-
-
- # ── Session-reuse variant (inject an already-open PlaywrightSession) ─────────
- @asynccontextmanager
- async def provide_ingest_usecase_with_session(
-     *,
-     session: PlaywrightSession,  # reuse a session that is already running
-     repo: C1034WriteRepoPort,
-     chunk: int | None = None,
-     on_bundle: Callable[[str, NormalizedBundle], Awaitable[None]] | None = None,
- ) -> AsyncIterator[C1034IngestUseCase]:
-     """
-     Use this when the PlaywrightSession is managed externally.
-     (This context does not close the session.)
-     """
-     chunk = _env_int("SCRAPER_SINK_CHUNK", 1000) if chunk is None else chunk
-
-     source: C1034BundleSourcePort = NfsBundleSource(session)
-     sink: C1034SinkPort = C1034Sink(repo, chunk=chunk)
-     uc = C1034IngestUseCase(source=source, sink=sink, on_bundle=on_bundle)
-     try:
-         yield uc
-     finally:
-         # The session is owned by the caller, so it is not closed here
-         pass
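
As a usage note on the session-reuse helper above, here is a minimal sketch of processing several tickers with one shared browser session under the 1.0.1 API. It is illustrative only: the Beanie/Mongo initialisation that the deleted CLI performs via `init_beanie_async()` is omitted, and `MongoC1034WriteRepo` is simply the repository class the deleted CLI imports from db2_hj3415.

```python
# Illustrative sketch of reusing one PlaywrightSession across company codes
# with the (removed) 1.0.1 DI helpers. DB initialisation is omitted here.
import asyncio

from scraper2_hj3415.adapters.clients.browser import PlaywrightSession
from scraper2_hj3415.di import provide_ingest_usecase_with_session
from db2_hj3415.adapters.nfs.repo_impls.c1034_write_repo_impl import MongoC1034WriteRepo


async def main() -> None:
    session = await PlaywrightSession.create(headless=True)
    repo = MongoC1034WriteRepo()
    try:
        for code in ("005930", "000660"):
            async with provide_ingest_usecase_with_session(session=session, repo=repo) as uc:
                stats = await uc.ingest_all(code)  # defaults: pages=("c103", "c104"), save=True
                print(code, stats)
    finally:
        # The helper never closes an injected session; the caller owns it.
        await session.close()


if __name__ == "__main__":
    asyncio.run(main())
```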
scraper2_hj3415/entrypoints/cli.py DELETED
@@ -1,226 +0,0 @@
- # scraper2_hj3415/entrypoints/cli.py
- from __future__ import annotations
-
- import os
- import asyncio
- import json
- from pathlib import Path
- from typing import List, Optional, Sequence
-
- import typer
- from loguru import logger
-
- from scraper2_hj3415.di import provide_ingest_usecase
- from scraper2_hj3415.core.usecases.c1034_ingest import IngestStats
-
- # Beanie v2 (PyMongo AsyncMongoClient)
- from db2_hj3415.adapters.mongo.db import init_beanie_async, close_client
- # Persistence adapter (concrete implementation)
- from db2_hj3415.adapters.nfs.repo_impls.c1034_write_repo_impl import MongoC1034WriteRepo
-
- app = typer.Typer(
-     add_completion=False,
-     no_args_is_help=True,
-     help="scraper2_hj3415 - CLI for collecting/normalizing/storing C103/C104",
- )
-
- # ────────────────────────────────────────────────
- # Environment-variable helpers
- # ────────────────────────────────────────────────
- def get_env_or_fail(key: str) -> str:
-     value = os.getenv(key)
-     if not value:
-         typer.echo(f"❌ Environment variable {key} is not set.")
-         raise typer.Exit(code=1)
-     return value
-
- # -----------------------
- # Shared logging setup
- # -----------------------
- def _setup_logging(verbose: int, quiet: bool) -> None:
-     level = "INFO"
-     if quiet:
-         level = "WARNING"
-     if verbose >= 2:
-         level = "DEBUG"
-     elif verbose == 1 and not quiet:
-         level = "INFO"
-
-     logger.remove()
-     logger.add(typer.echo, colorize=True, level=level,
-                format="<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
-                       "<level>{level: <8}</level> | "
-                       "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
-                       "<level>{message}</level>")
-
-
- # -----------------------
- # Shared run utilities
- # -----------------------
- async def _with_beanie(mongo_uri: str, db_name: str, coro):
-     client = await init_beanie_async(uri=mongo_uri, db_name=db_name)
-     try:
-         return await coro()
-     finally:
-         await close_client(client)
-
-
- def _parse_pages(pages: Sequence[str]) -> Sequence[str]:
-     pages = [p.lower() for p in pages]
-     for p in pages:
-         if p not in {"c103", "c104"}:
-             raise typer.BadParameter("pages must be chosen from c103 and c104.")
-     return tuple(pages)
-
-
- def _print_stats_or_bundles(result):
-     if isinstance(result, IngestStats):
-         typer.echo(json.dumps({
-             "dim_account_rows": result.dim_account_rows,
-             "dim_period_rows": result.dim_period_rows,
-             "fact_rows": result.fact_rows,
-             "delta_rows": result.delta_rows,
-         }, ensure_ascii=False, indent=2))
-     else:
-         # collect_only=True case
-         typer.echo(f"Collected bundles: {len(result)}")
-
-
- # -----------------------
- # Commands
- # -----------------------
- @app.command(help="Collect/store a single ticker (c103/c104)")
- def ingest_one(
-     cmp_cd: str = typer.Argument(..., help="Ticker code (6 digits)"),
-     pages: List[str] = typer.Option(["c103", "c104"], "--pages", "-p", help="Pages to collect (repeatable option)"),
-     save: bool = typer.Option(True, help="Whether to save to the DB"),
-     collect_only: bool = typer.Option(False, help="Collect only (do not save)"),
-     concurrency: int = typer.Option(2, help="Per-page concurrency (bundle collection)"),
-     verbose: int = typer.Option(0, "--verbose", "-v", count=True),
-     quiet: bool = typer.Option(False, "--quiet", "-q"),
- ):
-     """
-     Collects the C103/C104 pages for the given ticker and saves them to MongoDB.
-     The Mongo connection must be configured through environment variables:
-
-         export MONGO_URI="mongodb://localhost:27017"
-         export MONGO_DB="nfs_db"
-     """
-
-     headless = get_env_or_fail("SCRAPER_HEADLESS")
-     chunk = get_env_or_fail("SCRAPER_SINK_CHUNK")
-     mongo_uri = get_env_or_fail("MONGO_URI")
-     mongo_db = get_env_or_fail("MONGO_DB")
-
-     _setup_logging(verbose, quiet)
-     pages_ = _parse_pages(pages)
-
-     if collect_only and save:
-         typer.secho(
-             "[warn] --collect-only was given, so nothing will be saved (--no-save applied).",
-             fg=typer.colors.YELLOW,
-         )
-         save = False
-
-     async def run():
-         repo = MongoC1034WriteRepo()
-         async with provide_ingest_usecase(repo=repo, headless=headless, chunk=chunk) as uc:
-             # Note: the internal source is fixed at concurrency=2; pass a parameter through di if you need to adjust it.
-             result = await uc.ingest_all(cmp_cd, pages=pages_, save=save, collect_only=collect_only)
-             _print_stats_or_bundles(result)
-
-     asyncio.run(_with_beanie(mongo_uri, mongo_db, run))
-
-
- @app.command(help="Collect/store multiple tickers taken from a file/arguments")
- def ingest_many(
-     cmp_cds: Optional[List[str]] = typer.Argument(None, help="Ticker codes (space-separated)"),
-     file: Optional[Path] = typer.Option(None, "--file", "-f", help="One ticker code per line"),
-     pages: List[str] = typer.Option(["c103", "c104"], "--pages", "-p"),
-     save: bool = typer.Option(True, help="Whether to save to the DB"),
-     collect_only: bool = typer.Option(False, help="Collect only (do not save)"),
-     concurrency: int = typer.Option(3, help="Concurrency across tickers"),
-     verbose: int = typer.Option(0, "--verbose", "-v", count=True),
-     quiet: bool = typer.Option(False, "--quiet", "-q"),
- ):
-
-     headless = get_env_or_fail("SCRAPER_HEADLESS")
-     chunk = get_env_or_fail("SCRAPER_SINK_CHUNK")
-     mongo_uri = get_env_or_fail("MONGO_URI")
-     mongo_db = get_env_or_fail("MONGO_DB")
-
-     _setup_logging(verbose, quiet)
-     pages_ = _parse_pages(pages)
-
-     if collect_only and save:
-         typer.secho(
-             "[warn] --collect-only was given, so nothing will be saved (--no-save applied).",
-             fg=typer.colors.YELLOW,
-         )
-         save = False
-
-     codes: List[str] = []
-     if file:
-         codes.extend([line.strip() for line in file.read_text().splitlines() if line.strip()])
-     if cmp_cds:
-         codes.extend(cmp_cds)
-     if not codes:
-         raise typer.BadParameter("Provide ticker codes as arguments or via --file.")
-
-     async def run():
-         repo = MongoC1034WriteRepo()
-         async with provide_ingest_usecase(repo=repo, headless=headless, chunk=chunk) as uc:
-             result = await uc.ingest_many(
-                 codes,
-                 pages=pages_,
-                 concurrency=concurrency,
-                 save=save,
-                 collect_only=collect_only,
-             )
-             if isinstance(result, IngestStats):
-                 _print_stats_or_bundles(result)
-             else:
-                 # collect_only=True case: dict[str, list[NormalizedBundle]]
-                 typer.echo(f"Collected codes: {list(result.keys())}")
-
-     asyncio.run(_with_beanie(mongo_uri, mongo_db, run))
-
-
- @app.command(help="Check the DB connection (ping) and ensure model indexes")
- def healthcheck(
-     verbose: int = typer.Option(0, "--verbose", "-v", count=True),
-     quiet: bool = typer.Option(False, "--quiet", "-q"),
- ):
-     mongo_uri = get_env_or_fail("MONGO_URI")
-     mongo_db = get_env_or_fail("MONGO_DB")
-
-     _setup_logging(verbose, quiet)
-
-     async def run():
-         client = await init_beanie_async(uri=mongo_uri, db_name=mongo_db)
-         try:
-             await client.admin.command("ping")
-             typer.echo("OK: Mongo ping success & Beanie init done.")
-         finally:
-             await close_client(client)
-
-     asyncio.run(run())
-
-
- @app.command(help="Print version/environment info")
- def info():
-     import platform
-     import scraper2_hj3415
-     typer.echo(json.dumps({
-         "python": platform.python_version(),
-         "package": "scraper2_hj3415",
-         "version": getattr(scraper2_hj3415, "__version__", "0.0.0"),
-     }, ensure_ascii=False, indent=2))
-
-
- def main():
-     app()
-
-
- if __name__ == "__main__":
-     main()
scraper2_hj3415/entrypoints/main.py DELETED
@@ -1,20 +0,0 @@
- # Example: entrypoints/main.py
- import asyncio
- from scraper2_hj3415.di import provide_ingest_usecase
-
- from db2_hj3415.adapters.mongo.db import init_beanie_async, close_client
- from db2_hj3415.adapters.nfs.repo_impls.c1034_write_repo_impl import MongoC1034WriteRepo  # ← example
-
- async def main():
-     client = await init_beanie_async(
-         uri="mongodb://192.168.100.172:27017",
-         db_name="nfs_db",
-     )
-     repo = MongoC1034WriteRepo()
-     async with provide_ingest_usecase(repo=repo) as uc:
-         stats = await uc.ingest_all("005930", pages=("c103", "c104"), save=True)
-         print(stats)
-     await close_client(client)
-
- if __name__ == "__main__":
-     asyncio.run(main())
scraper2_hj3415-1.0.1.dist-info/METADATA DELETED
@@ -1,66 +0,0 @@
- Metadata-Version: 2.4
- Name: scraper2-hj3415
- Version: 1.0.1
- Summary: Naver WiseReport C103/C104 scraper + ingestion orchestrator
- Keywords: example,demo
- Author-email: Hyungjin Kim <hj3415@gmail.com>
- Requires-Python: >=3.11
- Description-Content-Type: text/markdown
- Classifier: Programming Language :: Python :: 3
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Typing :: Typed
- License-File: LICENSE
- Requires-Dist: logging-hj3415>=0.1
- Requires-Dist: httpx>=0.28
- Requires-Dist: tenacity>=9.1
- Requires-Dist: playwright>=1.55
- Requires-Dist: pandas>=2.3
- Requires-Dist: tabulate>=0.9
- Requires-Dist: contracts-hj3415>=0.1
- Requires-Dist: db2-hj3415>=0.1
- Requires-Dist: pytest>=8 ; extra == "dev"
- Requires-Dist: pytest-cov>=7.0 ; extra == "dev"
- Requires-Dist: pytest-asyncio>=1.2 ; extra == "dev"
- Requires-Dist: ruff>=0.5 ; extra == "dev"
- Requires-Dist: mypy>=1.10 ; extra == "dev"
- Provides-Extra: dev
-
- ## Collect and store a single ticker
-
- ### Defaults can be controlled via environment variables
- export SCRAPER_HEADLESS=true
- export SCRAPER_SINK_CHUNK=1000
-
- ### Mongo connection: if the db2-hj3415 init logic runs inside the CLI, set it via options or env vars
- export MONGO_URI="mongodb://localhost:27017"
- export MONGO_DB="nfs_db"
-
- ### Samsung Electronics: store both c103 and c104
- scraper2 ingest one 005930 --pages c103 c104
-
- ### Example: collect bundles only, without saving
-
- scraper2 ingest one 005930 --pages c103 c104 --no-save --collect-only
-
- #### Available options:
- • --pages c103 c104 : select the pages to process
- • --save/--no-save : whether to save to the DB (default: --save)
- • --collect-only : collect only, do not save (default: False)
-
- ---
-
- ## Collect multiple tickers concurrently
-
- ### Comma-separated
- scraper2 ingest many 005930,000660 --pages c103 c104 --concurrency 2
-
- ### File input (one code per line)
- scraper2 ingest many --file ./codes.txt --pages c103 c104 --concurrency 3
-
- ---
-
- ## Healthcheck / version
-
- scraper2 health
- scraper2 version
-
scraper2_hj3415-1.0.1.dist-info/RECORD DELETED
@@ -1,35 +0,0 @@
- scraper2_hj3415/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scraper2_hj3415/__main__.py,sha256=9jwz7jroPXW1UYiTEp3aODJ3oisPFOOpKaBBUOiGNJ8,116
- scraper2_hj3415/di.py,sha256=e-L-YB7b9OQPnvuEOcVA52ju52EMIbRIgEAI7_jQleQ,4522
- scraper2_hj3415/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scraper2_hj3415/adapters/_shared/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scraper2_hj3415/adapters/_shared/utils.py,sha256=5mQAojWzevva_8WAAZEOOJAmRQkRpD5f7IPudvQulDY,946
- scraper2_hj3415/adapters/clients/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scraper2_hj3415/adapters/clients/browser.py,sha256=gCl3XODiRR_SGDHbT1f23-coqX2u3NcPP_pkL8l3Z9M,4241
- scraper2_hj3415/adapters/clients/http.py,sha256=lX6Cgp9kwRa6iEGYyO4VPqxYzzhEMlih0vWMHxYDSQM,1532
- scraper2_hj3415/adapters/nfs/pipelines/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py,sha256=SO_SToGg1bdjHjfw9Bqoxv8KE1IBAuUEURwIcEMT0RE,2143
- scraper2_hj3415/adapters/nfs/pipelines/normalize_c1034.py,sha256=Gm5KBgHG1Omm1L2EDgSKzPd5g47gszZMw3hKBOQ-7es,4693
- scraper2_hj3415/adapters/nfs/sinks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py,sha256=oTYKx7hW4zcDYdmi7NY9DX2ynd4PwqICqsUqGLFhPVA,1887
- scraper2_hj3415/adapters/nfs/sinks/df_to_dto_mappers.py,sha256=l6MZq_DGUHqmPaDmlumY_SWasirVphxdT1UvUHrPoug,4446
- scraper2_hj3415/adapters/nfs/sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scraper2_hj3415/adapters/nfs/sources/bundle_source.py,sha256=zkApPi6qXkqEvQlg8ISuHnRc2wrFY2lRq4T8PLQASik,1009
- scraper2_hj3415/adapters/nfs/sources/c1034_fetch.py,sha256=qwCMOf2F0lJ3XxDuh6hOZSf9VqIPaAbasxLqXmGmjHQ,3487
- scraper2_hj3415/adapters/nfs/sources/c1034_session.py,sha256=imoUYxn5ExA3Wrum_CFBGyh1B8M0oxu9JdzGxcer5-0,3285
- scraper2_hj3415/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scraper2_hj3415/core/constants.py,sha256=I52IOPV0Fco7L3nlqoesJ3SqJKCJ-kqDjNlRc4Pp9GE,1049
- scraper2_hj3415/core/types.py,sha256=JSwswF0jtpwK_ZTi-nCs0OOeCI3Sv8ZJNbxGiAqvF-w,241
- scraper2_hj3415/core/ports/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scraper2_hj3415/core/ports/sink_port.py,sha256=YNWSPWWfRVcCKKOybXgK93twMo4sJgre2u6H9lZeG0Q,331
- scraper2_hj3415/core/ports/source_port.py,sha256=dH7Nr96EZ_VH-msFA8msC8If72k1nQjJ-FXnvtS975w,363
- scraper2_hj3415/core/usecases/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scraper2_hj3415/core/usecases/c1034_ingest.py,sha256=3dnrC9SMztTn4gHbn3NMEYXNte0rvLqumxKmK6ywrT4,5860
- scraper2_hj3415/entrypoints/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scraper2_hj3415/entrypoints/cli.py,sha256=B40LjRBkNRB8-moB_xgpYhr9BJHhDBXRQ0hPZroPktg,8107
- scraper2_hj3415/entrypoints/main.py,sha256=Ge5VajXIkeFPlLUdDIRSx0XvS7gJ74jTu0NC5krOm1k,682
- scraper2_hj3415-1.0.1.dist-info/entry_points.txt,sha256=a6Ti2iKJQUOzRtaZ1iq3WuKyzkBuX51JJ6wnBqWcGcU,64
- scraper2_hj3415-1.0.1.dist-info/licenses/LICENSE,sha256=QBiVGQuKAESeCfQE344Ik2ex6g2zfYdu9WqrRWydxIs,1068
- scraper2_hj3415-1.0.1.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
- scraper2_hj3415-1.0.1.dist-info/METADATA,sha256=WAy8R_KDj8xH8z9WinuO0IOYIzJ2jI5Eqb6rPpVjgr0,1994
- scraper2_hj3415-1.0.1.dist-info/RECORD,,
scraper2_hj3415-1.0.1.dist-info/entry_points.txt DELETED
@@ -1,3 +0,0 @@
- [console_scripts]
- scraper2=scraper2_hj3415.entrypoints.cli:app
-