scraper2-hj3415 1.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraper2/.DS_Store +0 -0
- scraper2/adapters/out/.DS_Store +0 -0
- scraper2/adapters/out/playwright/browser.py +103 -0
- scraper2/adapters/out/playwright/browser_factory.py +112 -0
- scraper2/adapters/out/playwright/session.py +121 -0
- scraper2/adapters/out/sinks/.DS_Store +0 -0
- scraper2/adapters/out/sinks/memory/__init__.py +15 -0
- scraper2/adapters/out/sinks/memory/c101_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c103_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c104_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c106_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/c108_memory_sink.py +20 -0
- scraper2/adapters/out/sinks/memory/store.py +74 -0
- scraper2/adapters/out/sinks/mongo/__init__.py +14 -0
- scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +43 -0
- scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +41 -0
- scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +41 -0
- scraper2/app/composition.py +195 -0
- scraper2/app/parsing/_converters.py +85 -0
- scraper2/app/parsing/_normalize.py +134 -0
- scraper2/app/parsing/c101_parser.py +143 -0
- scraper2/app/parsing/c103_parser.py +128 -0
- scraper2/app/parsing/c104_parser.py +143 -0
- scraper2/app/parsing/c106_parser.py +153 -0
- scraper2/app/parsing/c108_parser.py +65 -0
- scraper2/app/ports/browser/browser_factory_port.py +11 -0
- scraper2/app/ports/browser/browser_port.py +22 -0
- scraper2/app/ports/ingest_port.py +13 -0
- scraper2/app/ports/sinks/base_sink_port.py +14 -0
- scraper2/app/ports/sinks/c101_sink_port.py +9 -0
- scraper2/app/ports/sinks/c103_sink_port.py +9 -0
- scraper2/app/ports/sinks/c104_sink_port.py +9 -0
- scraper2/app/ports/sinks/c106_sink_port.py +9 -0
- scraper2/app/ports/sinks/c108_sink_port.py +9 -0
- scraper2/app/usecases/fetch/fetch_c101.py +43 -0
- scraper2/app/usecases/fetch/fetch_c103.py +103 -0
- scraper2/app/usecases/fetch/fetch_c104.py +76 -0
- scraper2/app/usecases/fetch/fetch_c106.py +90 -0
- scraper2/app/usecases/fetch/fetch_c108.py +49 -0
- scraper2/app/usecases/ingest/ingest_c101.py +36 -0
- scraper2/app/usecases/ingest/ingest_c103.py +37 -0
- scraper2/app/usecases/ingest/ingest_c104.py +37 -0
- scraper2/app/usecases/ingest/ingest_c106.py +38 -0
- scraper2/app/usecases/ingest/ingest_c108.py +39 -0
- scraper2/main.py +257 -0
- scraper2_hj3415-2.1.0.dist-info/METADATA +164 -0
- scraper2_hj3415-2.1.0.dist-info/RECORD +63 -0
- scraper2_hj3415-2.1.0.dist-info/entry_points.txt +3 -0
- scraper2_hj3415/__main__.py +0 -6
- scraper2_hj3415/adapters/_shared/utils.py +0 -29
- scraper2_hj3415/adapters/clients/browser.py +0 -124
- scraper2_hj3415/adapters/clients/http.py +0 -51
- scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py +0 -55
- scraper2_hj3415/adapters/nfs/pipelines/normalize_c1034.py +0 -109
- scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py +0 -51
- scraper2_hj3415/adapters/nfs/sinks/df_to_dto_mappers.py +0 -106
- scraper2_hj3415/adapters/nfs/sources/bundle_source.py +0 -24
- scraper2_hj3415/adapters/nfs/sources/c1034_fetch.py +0 -117
- scraper2_hj3415/adapters/nfs/sources/c1034_session.py +0 -90
- scraper2_hj3415/core/constants.py +0 -47
- scraper2_hj3415/core/ports/sink_port.py +0 -16
- scraper2_hj3415/core/ports/source_port.py +0 -13
- scraper2_hj3415/core/types.py +0 -11
- scraper2_hj3415/core/usecases/c1034_ingest.py +0 -139
- scraper2_hj3415/di.py +0 -103
- scraper2_hj3415/entrypoints/cli.py +0 -226
- scraper2_hj3415/entrypoints/main.py +0 -20
- scraper2_hj3415-1.0.1.dist-info/METADATA +0 -66
- scraper2_hj3415-1.0.1.dist-info/RECORD +0 -35
- scraper2_hj3415-1.0.1.dist-info/entry_points.txt +0 -3
- {scraper2_hj3415 → scraper2}/__init__.py +0 -0
- {scraper2_hj3415/adapters → scraper2/adapters/out}/__init__.py +0 -0
- {scraper2_hj3415/adapters/_shared → scraper2/adapters/out/playwright}/__init__.py +0 -0
- {scraper2_hj3415/adapters/clients → scraper2/app}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/pipelines → scraper2/app/parsing}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/sinks → scraper2/app/ports}/__init__.py +0 -0
- {scraper2_hj3415/adapters/nfs/sources → scraper2/app/ports/browser}/__init__.py +0 -0
- {scraper2_hj3415/core → scraper2/app/ports/sinks}/__init__.py +0 -0
- {scraper2_hj3415/core/ports → scraper2/app/usecases}/__init__.py +0 -0
- {scraper2_hj3415/core/usecases → scraper2/app/usecases/fetch}/__init__.py +0 -0
- {scraper2_hj3415/entrypoints → scraper2/app/usecases/ingest}/__init__.py +0 -0
- {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.1.0.dist-info}/licenses/LICENSE +0 -0
scraper2/.DS_Store
ADDED
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# src/scraper2/adapters/out/playwright/browser.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
from typing import Any
|
|
4
|
+
from io import StringIO
|
|
5
|
+
import pandas as pd
|
|
6
|
+
from playwright.async_api import Page
|
|
7
|
+
|
|
8
|
+
class PlaywrightBrowser:
    """Async convenience wrapper around a single Playwright ``Page``.

    Selector arguments are forwarded unchanged to ``page.locator`` /
    ``page.wait_for_selector``.
    """

    def __init__(self, page: Page):
        self.page = page

    async def goto(self, url: str, timeout_ms: int = 10_000) -> None:
        """Navigate the page to ``url``.

        ``timeout_ms`` bounds the navigation and is also installed as the
        page's default timeout.
        """
        # A hard-coded 10_000 used to be applied here, which silently ignored
        # the caller's ``timeout_ms`` for the page default.
        self.page.set_default_timeout(timeout_ms)
        await self.page.goto(url, timeout=timeout_ms)

    async def title(self) -> str:
        """Return the page title."""
        return await self.page.title()

    async def current_url(self) -> str:
        """Return the URL the page is currently on."""
        return self.page.url

    async def wait(self, selector: str, timeout_ms: int = 10_000) -> None:
        """Wait until ``selector`` is attached to the DOM."""
        await self.page.wait_for_selector(selector, timeout=timeout_ms, state="attached")

    async def text(self, selector: str) -> str:
        """Return the text content of the first match ("" when it has none)."""
        await self.wait(selector)
        return (await self.page.locator(selector).first.text_content()) or ""

    async def texts(self, selector: str) -> list[str]:
        """Return the text content of every match, with "" for empty ones."""
        await self.wait(selector)
        out: list[str] = []
        for element in await self.page.locator(selector).all():
            out.append((await element.text_content()) or "")
        return out

    async def text_first_by_text(self, needle: str) -> str:
        """Return the text content of the first element matched by ``get_by_text(needle)``."""
        return (await self.page.get_by_text(needle).first.text_content()) or ""

    async def inner_text(self, selector: str) -> str:
        """Return the inner text of the first match."""
        await self.wait(selector)
        return await self.page.locator(selector).first.inner_text()

    async def click(self, selector: str) -> None:
        """Wait for ``selector`` and click it."""
        await self.wait(selector)
        await self.page.locator(selector).click()

    async def table_records(
        self,
        table_selector: str,
        *,
        header: int | list[int] = 0
    ) -> list[dict[str, Any]]:
        """Parse the first table matching ``table_selector`` into row dicts.

        Args:
            table_selector: Selector of the table element.
            header: Forwarded to ``pd.read_html`` as ``header``.

        Returns:
            One dict per table row; missing cells are ``None``.

        Raises:
            RuntimeError: If ``pd.read_html`` fails for any reason; the
                original exception is chained as the cause.
        """
        await self.wait(table_selector)

        table = self.page.locator(table_selector).first
        html = await table.evaluate("el => el.outerHTML")  # includes the <table> tag itself

        try:
            df = pd.read_html(StringIO(html), header=header)[0]
        except Exception as e:
            # Covers ImportError (e.g. no lxml), ValueError, ... so the root
            # cause is surfaced in one place.
            raise RuntimeError(f"pd.read_html failed: {type(e).__name__}: {e}") from e

        if header == 0:
            if "항목" in df.columns:
                df["항목"] = (
                    df["항목"].astype(str).str.replace("펼치기", "", regex=False).str.strip()
                )

            # Strip UI labels / accounting-basis suffixes from the column names.
            noise_tokens = (
                "연간컨센서스보기",
                "연간컨센서스닫기",
                "(IFRS연결)",
                "(IFRS별도)",
                "(GAAP개별)",
                "(YoY)",
                "(QoQ)",
                "(E)",
                ".",
            )
            columns = df.columns.astype(str)
            for token in noise_tokens:
                columns = columns.str.replace(token, "", regex=False)
            df.columns = columns.str.strip()

        # NaN -> None. Cast to object first: on float columns recent pandas
        # coerces a ``None`` replacement straight back to NaN.
        records: list[dict[str, Any]] = (
            df.astype(object).where(pd.notnull(df), None).to_dict(orient="records")
        )
        return records

    async def outer_html(self, selector: str) -> str:
        """Return the outer HTML of the first match."""
        loc = self.page.locator(selector).first
        return await loc.evaluate("el => el.outerHTML")

    async def all_texts(self, selector: str) -> list[str]:
        """Return the text contents of every match."""
        # The selector may be CSS or an "xpath=..." expression.
        loc = self.page.locator(selector)
        return await loc.all_text_contents()

    async def outer_html_nth(self, selector: str, index: int) -> str:
        """Return the outer HTML of the ``index``-th match."""
        loc = self.page.locator(selector).nth(index)
        # Per the original author's note, Playwright raises when the index is
        # out of range; wrap it here if a friendlier error is ever needed.
        return await loc.evaluate("el => el.outerHTML")
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# scraper2/adapters/out/playwright/browser_factory.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import asyncio
|
|
5
|
+
from contextlib import asynccontextmanager
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import AsyncIterator
|
|
8
|
+
|
|
9
|
+
from scraper2.app.ports.browser.browser_factory_port import BrowserFactoryPort
|
|
10
|
+
from scraper2.app.ports.browser.browser_port import BrowserPort
|
|
11
|
+
from scraper2.adapters.out.playwright.session import PlaywrightPageSession
|
|
12
|
+
from scraper2.adapters.out.playwright.browser import PlaywrightBrowser
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
|
|
16
|
+
class _LeaseItem:
|
|
17
|
+
session: PlaywrightPageSession
|
|
18
|
+
browser: BrowserPort
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PlaywrightBrowserFactory(BrowserFactoryPort):
    """Fixed-size pool of Playwright-backed browsers.

    - ``astart()`` creates ``max_concurrency`` session/page/browser sets up front.
    - ``lease()`` lends one item from the queue and takes it back afterwards.
    - ``aclose()`` closes every session.
    """

    def __init__(self, *, headless: bool, timeout_ms: int, max_concurrency: int = 2):
        self.headless = headless
        self.timeout_ms = timeout_ms
        self.max_concurrency = max_concurrency

        self._pool: asyncio.Queue[_LeaseItem] = asyncio.Queue(maxsize=max_concurrency)
        self._items: list[_LeaseItem] = []  # references kept for shutdown
        self._started = False
        self._start_lock = asyncio.Lock()
        self._closed = False

    def _drain_pool(self) -> None:
        """Drop every item currently queued in the pool (references only)."""
        try:
            while True:
                self._pool.get_nowait()
        except asyncio.QueueEmpty:
            pass

    async def astart(self) -> None:
        """Fill the pool.

        Safe to call repeatedly: initialisation happens only once. If creating
        any session fails, everything created by this call is closed and
        removed again so a later retry starts from an empty pool.

        Raises:
            RuntimeError: If the factory has already been closed.
        """
        if self._started:
            return

        async with self._start_lock:
            if self._started:
                return
            if self._closed:
                raise RuntimeError("Factory is closed; cannot start again.")

            created: list[_LeaseItem] = []
            try:
                for _ in range(self.max_concurrency):
                    session = PlaywrightPageSession(headless=self.headless, timeout_ms=self.timeout_ms)
                    page = await session.start()
                    browser = PlaywrightBrowser(page)

                    item = _LeaseItem(session=session, browser=browser)
                    created.append(item)
                    self._items.append(item)
                    await self._pool.put(item)
            except BaseException:
                # Roll back a partial start. Without this, a retry would put
                # more than ``maxsize`` items and block forever on the bounded
                # queue while still holding the start lock.
                for item in created:
                    try:
                        await item.session.close()
                    except Exception:
                        pass
                self._items.clear()
                self._drain_pool()
                raise

            self._started = True

    @asynccontextmanager
    async def lease(self) -> AsyncIterator[BrowserPort]:
        """Lend one browser from the pool; it is returned to the pool on exit.

        Raises:
            RuntimeError: If the factory has already been closed.
        """
        if self._closed:
            raise RuntimeError("Factory is closed; cannot lease.")
        if not self._started:
            await self.astart()

        item = await self._pool.get()
        try:
            yield item.browser
        finally:
            # While closing, do not hand the item back; shutdown owns it.
            if not self._closed:
                await self._pool.put(item)

    async def aclose(self) -> None:
        """Close every session created by this factory.

        If called while a lease is still active, that lease's session is
        closed too (it is the same session object), and the lease will not
        put its item back because ``_closed`` is already set.
        """
        if self._closed:
            return
        self._closed = True

        for item in self._items:
            try:
                await item.session.close()
            except Exception:
                # Best-effort shutdown: one failing session must not stop the rest.
                pass

        self._items.clear()

        # Empty the queue so no references linger.
        self._drain_pool()
|
|
111
|
+
|
|
112
|
+
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# src/scraper2/adapters/out/playwright/session.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import subprocess
|
|
6
|
+
import sys
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from playwright.async_api import (
|
|
11
|
+
async_playwright,
|
|
12
|
+
Browser,
|
|
13
|
+
BrowserContext,
|
|
14
|
+
Page,
|
|
15
|
+
Error as PWError,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _install_playwright_browsers(*names: str) -> None:
    """Run ``python -m playwright install [names...]`` from code.

    On Linux, ``playwright install-deps`` is attempted afterwards.

    Raises:
        subprocess.CalledProcessError: If the ``install`` command fails.
    """
    playwright_cli = [sys.executable, "-m", "playwright"]
    subprocess.run([*playwright_cli, "install", *names], check=True)

    if not sys.platform.startswith("linux"):
        return

    # install-deps is best-effort: depending on the environment it may be
    # unnecessary or fail for lack of permissions, so failures are ignored.
    try:
        subprocess.run([*playwright_cli, "install-deps"], check=True)
    except Exception:
        pass
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _need_install(e: Exception) -> bool:
|
|
34
|
+
msg = str(e)
|
|
35
|
+
return (
|
|
36
|
+
"Executable doesn't exist" in msg
|
|
37
|
+
or "download new browsers" in msg
|
|
38
|
+
or "playwright install" in msg
|
|
39
|
+
or "Please run the following command" in msg
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@dataclass
class PlaywrightPageSession:
    """Small session wrapper that is easy to drive from ``main``.

    Usage::

        s = PlaywrightPageSession(headless=True)
        page = await s.start()
        ...
        await s.close()
    """

    headless: bool = True
    browser_name: str = "chromium"
    timeout_ms: int = 10_000
    auto_install: bool = True  # also disabled when env PW_SKIP_AUTO_INSTALL=1

    # runtime resources
    pw: Optional[object] = None
    browser: Optional[Browser] = None
    context: Optional[BrowserContext] = None
    page: Optional[Page] = None

    async def _launch(self) -> None:
        """Launch the configured browser type on the current Playwright driver."""
        launcher = getattr(self.pw, self.browser_name)
        self.browser = await launcher.launch(headless=self.headless)

    async def start(self) -> Page:
        """Start Playwright, launch the browser and return a fresh page.

        If a page already exists it is returned as-is. When the launch fails
        with a "browser not installed" style error and auto-install is
        enabled, the browser is installed and the launch retried once. Any
        failure along the way releases whatever was acquired, then re-raises.
        """
        if self.page is not None:
            return self.page  # already started: reuse (remove if undesired)

        self.pw = await async_playwright().start()
        try:
            try:
                await self._launch()
            except PWError as e:
                should_auto = self.auto_install and os.getenv("PW_SKIP_AUTO_INSTALL") != "1"
                if not (should_auto and _need_install(e)):
                    raise
                # stop pw -> install -> restart pw
                await self.pw.stop()
                _install_playwright_browsers(self.browser_name)
                self.pw = await async_playwright().start()
                await self._launch()

            self.context = await self.browser.new_context()
            self.page = await self.context.new_page()
            self.page.set_default_timeout(self.timeout_ms)
            return self.page

        except Exception:
            # Something failed mid-start: release what we hold.
            await self.close()
            raise

    async def close(self) -> None:
        """Release context, browser and Playwright driver, in that order.

        Errors raised while closing are ignored; each handle is reset to
        ``None`` regardless. ``page`` is reset together with ``context``.
        """
        # Reverse order of acquisition (per the original note, the page is
        # cleaned up when its context closes).
        teardown = (("context", "close"), ("browser", "close"), ("pw", "stop"))
        for field_name, method_name in teardown:
            resource = getattr(self, field_name)
            if resource is None:
                continue
            try:
                await getattr(resource, method_name)()
            except Exception:
                pass
            finally:
                setattr(self, field_name, None)
                if field_name == "context":
                    self.page = None
|
|
Binary file
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# scraper2/adapters/out/sinks/memory/__init__.py
|
|
2
|
+
from .c101_memory_sink import MemoryC101Sink
|
|
3
|
+
from .c103_memory_sink import MemoryC103Sink
|
|
4
|
+
from .c104_memory_sink import MemoryC104Sink
|
|
5
|
+
from .c106_memory_sink import MemoryC106Sink
|
|
6
|
+
from .c108_memory_sink import MemoryC108Sink
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"MemoryC101Sink",
|
|
10
|
+
"MemoryC103Sink",
|
|
11
|
+
"MemoryC104Sink",
|
|
12
|
+
"MemoryC106Sink",
|
|
13
|
+
"MemoryC108Sink",
|
|
14
|
+
]
|
|
15
|
+
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#scraper2/adapters/out/sinks/memory/c101_memory_sink.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Iterable
|
|
5
|
+
|
|
6
|
+
from contracts.nfs.c101 import C101DTO
|
|
7
|
+
from scraper2.adapters.out.sinks.memory.store import InMemoryStore
|
|
8
|
+
from scraper2.app.ports.sinks.c101_sink_port import C101SinkPort
|
|
9
|
+
|
|
10
|
+
_ENDPOINT = "c101"


class MemoryC101Sink(C101SinkPort):
    """C101 sink that stores DTOs in an ``InMemoryStore`` under the "c101" endpoint."""

    def __init__(self, store: InMemoryStore[C101DTO]):
        self._store = store

    async def write(self, dto: C101DTO) -> None:
        """Store a single DTO, keyed by its ``코드`` attribute."""
        key = dto.코드
        await self._store.put(_ENDPOINT, key, dto)

    async def write_many(self, dtos: Iterable[C101DTO]) -> None:
        """Store several DTOs in one ``put_many`` call, each keyed by its ``코드``."""
        keyed = ((dto.코드, dto) for dto in dtos)
        await self._store.put_many(_ENDPOINT, keyed)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# scraper2/adapters/out/sinks/memory/c103_memory_sink.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Iterable
|
|
5
|
+
|
|
6
|
+
from contracts.nfs.c103 import C103DTO
|
|
7
|
+
from scraper2.adapters.out.sinks.memory.store import InMemoryStore
|
|
8
|
+
from scraper2.app.ports.sinks.c103_sink_port import C103SinkPort
|
|
9
|
+
|
|
10
|
+
_ENDPOINT = "c103"


class MemoryC103Sink(C103SinkPort):
    """C103 sink that stores DTOs in an ``InMemoryStore`` under the "c103" endpoint."""

    def __init__(self, store: InMemoryStore[C103DTO]):
        self._store = store

    async def write(self, dto: C103DTO) -> None:
        """Store a single DTO, keyed by its ``코드`` attribute."""
        key = dto.코드
        await self._store.put(_ENDPOINT, key, dto)

    async def write_many(self, dtos: Iterable[C103DTO]) -> None:
        """Store several DTOs in one ``put_many`` call, each keyed by its ``코드``."""
        keyed = ((dto.코드, dto) for dto in dtos)
        await self._store.put_many(_ENDPOINT, keyed)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# scraper2/adapters/out/sinks/memory/c104_memory_sink.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Iterable
|
|
5
|
+
|
|
6
|
+
from contracts.nfs.c104 import C104DTO
|
|
7
|
+
from scraper2.adapters.out.sinks.memory.store import InMemoryStore
|
|
8
|
+
from scraper2.app.ports.sinks.c104_sink_port import C104SinkPort
|
|
9
|
+
|
|
10
|
+
_ENDPOINT = "c104"


class MemoryC104Sink(C104SinkPort):
    """C104 sink that stores DTOs in an ``InMemoryStore`` under the "c104" endpoint."""

    def __init__(self, store: InMemoryStore[C104DTO]):
        self._store = store

    async def write(self, dto: C104DTO) -> None:
        """Store a single DTO, keyed by its ``코드`` attribute."""
        key = dto.코드
        await self._store.put(_ENDPOINT, key, dto)

    async def write_many(self, dtos: Iterable[C104DTO]) -> None:
        """Store several DTOs in one ``put_many`` call, each keyed by its ``코드``."""
        keyed = ((dto.코드, dto) for dto in dtos)
        await self._store.put_many(_ENDPOINT, keyed)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#scraper2/adapters/out/sinks/memory/c106_memory_sink.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Iterable
|
|
5
|
+
|
|
6
|
+
from contracts.nfs.c106 import C106DTO
|
|
7
|
+
from scraper2.adapters.out.sinks.memory.store import InMemoryStore
|
|
8
|
+
from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
|
|
9
|
+
|
|
10
|
+
_ENDPOINT = "c106"


class MemoryC106Sink(C106SinkPort):
    """C106 sink that stores DTOs in an ``InMemoryStore`` under the "c106" endpoint."""

    def __init__(self, store: InMemoryStore[C106DTO]):
        self._store = store

    async def write(self, dto: C106DTO) -> None:
        """Store a single DTO, keyed by its ``코드`` attribute."""
        key = dto.코드
        await self._store.put(_ENDPOINT, key, dto)

    async def write_many(self, dtos: Iterable[C106DTO]) -> None:
        """Store several DTOs in one ``put_many`` call, each keyed by its ``코드``."""
        keyed = ((dto.코드, dto) for dto in dtos)
        await self._store.put_many(_ENDPOINT, keyed)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#scraper2/adapters/out/sinks/memory/c108_memory_sink.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Iterable
|
|
5
|
+
|
|
6
|
+
from contracts.nfs.c108 import C108DTO
|
|
7
|
+
from scraper2.adapters.out.sinks.memory.store import InMemoryStore
|
|
8
|
+
from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
|
|
9
|
+
|
|
10
|
+
_ENDPOINT = "c108"


class MemoryC108Sink(C108SinkPort):
    """C108 sink that stores DTOs in an ``InMemoryStore`` under the "c108" endpoint."""

    def __init__(self, store: InMemoryStore[C108DTO]):
        self._store = store

    async def write(self, dto: C108DTO) -> None:
        """Store a single DTO, keyed by its ``코드`` attribute."""
        key = dto.코드
        await self._store.put(_ENDPOINT, key, dto)

    async def write_many(self, dtos: Iterable[C108DTO]) -> None:
        """Store several DTOs in one ``put_many`` call, each keyed by its ``코드``."""
        keyed = ((dto.코드, dto) for dto in dtos)
        await self._store.put_many(_ENDPOINT, keyed)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# scraper2/adapters/out/sinks/memory/store.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import asyncio
|
|
5
|
+
from collections import defaultdict, deque
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any, Deque, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar
|
|
8
|
+
|
|
9
|
+
T = TypeVar("T") # DTO 타입
|
|
10
|
+
|
|
11
|
+
@dataclass(frozen=True)
class StoreStats:
    """Immutable size summary for one endpoint of an in-memory store."""

    endpoint: str  # endpoint the counts refer to
    latest_count: int  # number of entries in the endpoint's latest map
    history_count: int  # number of entries in the endpoint's history
|
|
16
|
+
|
|
17
|
+
class InMemoryStore(Generic[T]):
    """Store DTOs per endpoint.

    - latest: endpoint -> key (usually the code) -> dto
    - history: endpoint -> deque[dto] (the most recent ``max_history`` entries)

    Every operation runs under one ``asyncio.Lock``.
    """

    def __init__(self, *, max_history: int = 2000):
        def _new_history() -> Deque[T]:
            return deque(maxlen=max_history)

        self._lock = asyncio.Lock()
        self._max_history = max_history
        self._history: Dict[str, Deque[T]] = defaultdict(_new_history)
        self._latest: Dict[str, Dict[str, T]] = defaultdict(dict)

    def _record(self, endpoint: str, key: str, dto: T) -> None:
        """Append to history and overwrite latest; the caller must hold the lock."""
        self._history[endpoint].append(dto)
        self._latest[endpoint][key] = dto

    # ---------- write ----------

    async def put(self, endpoint: str, key: str, dto: T) -> None:
        """Record one DTO for ``endpoint`` under ``key``."""
        async with self._lock:
            self._record(endpoint, key, dto)

    async def put_many(self, endpoint: str, items: Iterable[Tuple[str, T]]) -> None:
        """Record every ``(key, dto)`` pair while holding the lock once."""
        async with self._lock:
            for key, dto in items:
                self._record(endpoint, key, dto)

    # ---------- read ----------

    async def get(self, endpoint: str, key: str) -> Optional[T]:
        """Return the latest DTO for ``key``, or ``None`` if absent."""
        async with self._lock:
            bucket = self._latest.get(endpoint, {})
            return bucket.get(key)

    async def all_latest(self, endpoint: str) -> Dict[str, T]:
        """Return a shallow copy of the endpoint's key -> latest DTO map."""
        async with self._lock:
            bucket = self._latest.get(endpoint, {})
            return dict(bucket)

    async def list_keys(self, endpoint: str) -> List[str]:
        """Return the endpoint's keys in sorted order."""
        async with self._lock:
            # Sorting is a matter of taste; sorted output is handier for the CLI.
            bucket = self._latest.get(endpoint, {})
            return sorted(bucket.keys())

    async def history(self, endpoint: str) -> List[T]:
        """Return the endpoint's retained history as a list, oldest first."""
        async with self._lock:
            entries = self._history.get(endpoint, [])
            return list(entries)

    async def stats(self, endpoint: str) -> StoreStats:
        """Return the latest/history entry counts for ``endpoint``."""
        async with self._lock:
            return StoreStats(
                endpoint=endpoint,
                latest_count=len(self._latest.get(endpoint, {})),
                history_count=len(self._history.get(endpoint, [])),
            )

    async def clear(self, endpoint: str | None = None) -> None:
        """Drop the data of one endpoint, or of all endpoints when ``endpoint`` is None."""
        async with self._lock:
            if endpoint is not None:
                self._history.pop(endpoint, None)
                self._latest.pop(endpoint, None)
                return
            self._history.clear()
            self._latest.clear()
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# scraper2/adapters/out/sinks/mongo/__init__.py
|
|
2
|
+
from .c101_mongo_sink import MongoC101Sink
|
|
3
|
+
from .c103_mongo_sink import MongoC103Sink
|
|
4
|
+
from .c104_mongo_sink import MongoC104Sink
|
|
5
|
+
from .c106_mongo_sink import MongoC106Sink
|
|
6
|
+
from .c108_mongo_sink import MongoC108Sink
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"MongoC101Sink",
|
|
10
|
+
"MongoC103Sink",
|
|
11
|
+
"MongoC104Sink",
|
|
12
|
+
"MongoC106Sink",
|
|
13
|
+
"MongoC108Sink",
|
|
14
|
+
]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Iterable, Optional
|
|
6
|
+
|
|
7
|
+
from pymongo.asynchronous.database import AsyncDatabase
|
|
8
|
+
|
|
9
|
+
from contracts.nfs.c101 import C101DTO
|
|
10
|
+
from scraper2.app.ports.sinks.c101_sink_port import C101SinkPort
|
|
11
|
+
|
|
12
|
+
from db2.nfs import (
|
|
13
|
+
upsert_latest,
|
|
14
|
+
upsert_latest_many,
|
|
15
|
+
insert_snapshot,
|
|
16
|
+
insert_snapshots_many,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
_ENDPOINT = "c101"


class MongoC101Sink(C101SinkPort):
    """C101 sink that writes to MongoDB through the ``db2.nfs`` helpers."""

    def __init__(self, db: AsyncDatabase):
        self._db = db

    async def write(self, dto: C101DTO, *, asof: Optional[datetime] = None) -> None:
        """Persist one DTO: upsert it as the latest state, then insert a snapshot."""
        shared = {"endpoint": _ENDPOINT, "dto": dto, "asof": asof}
        # The latest state is upserted ...
        await upsert_latest(self._db, **shared)
        # ... while history is inserted.
        await insert_snapshot(self._db, **shared)

    async def write_many(
        self,
        dtos: Iterable[C101DTO],
        *,
        asof: Optional[datetime] = None,
    ) -> None:
        """Persist several DTOs with the bulk helpers; an empty input is a no-op."""
        batch = list(dtos)  # materialised once: the iterable is consumed twice below
        if batch:
            shared = {"endpoint": _ENDPOINT, "dtos": batch, "asof": asof}
            await upsert_latest_many(self._db, **shared)
            await insert_snapshots_many(self._db, **shared)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Iterable, Optional
|
|
6
|
+
|
|
7
|
+
from pymongo.asynchronous.database import AsyncDatabase
|
|
8
|
+
|
|
9
|
+
from contracts.nfs.c103 import C103DTO
|
|
10
|
+
from scraper2.app.ports.sinks.c103_sink_port import C103SinkPort
|
|
11
|
+
|
|
12
|
+
from db2.nfs import (
|
|
13
|
+
upsert_latest,
|
|
14
|
+
upsert_latest_many,
|
|
15
|
+
insert_snapshot,
|
|
16
|
+
insert_snapshots_many,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
_ENDPOINT = "c103"


class MongoC103Sink(C103SinkPort):
    """C103 sink that writes to MongoDB through the ``db2.nfs`` helpers."""

    def __init__(self, db: AsyncDatabase):
        self._db = db

    async def write(self, dto: C103DTO, *, asof: Optional[datetime] = None) -> None:
        """Persist one DTO via ``upsert_latest`` followed by ``insert_snapshot``."""
        shared = {"endpoint": _ENDPOINT, "dto": dto, "asof": asof}
        await upsert_latest(self._db, **shared)
        await insert_snapshot(self._db, **shared)

    async def write_many(
        self,
        dtos: Iterable[C103DTO],
        *,
        asof: Optional[datetime] = None,
    ) -> None:
        """Persist several DTOs with the bulk helpers; an empty input is a no-op."""
        batch = list(dtos)  # materialised once: the iterable is consumed twice below
        if batch:
            shared = {"endpoint": _ENDPOINT, "dtos": batch, "asof": asof}
            await upsert_latest_many(self._db, **shared)
            await insert_snapshots_many(self._db, **shared)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from typing import Iterable, Optional
|
|
6
|
+
|
|
7
|
+
from pymongo.asynchronous.database import AsyncDatabase
|
|
8
|
+
|
|
9
|
+
from contracts.nfs.c104 import C104DTO
|
|
10
|
+
from scraper2.app.ports.sinks.c104_sink_port import C104SinkPort
|
|
11
|
+
|
|
12
|
+
from db2.nfs import (
|
|
13
|
+
upsert_latest,
|
|
14
|
+
upsert_latest_many,
|
|
15
|
+
insert_snapshot,
|
|
16
|
+
insert_snapshots_many,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
_ENDPOINT = "c104"


class MongoC104Sink(C104SinkPort):
    """C104 sink that writes to MongoDB through the ``db2.nfs`` helpers."""

    def __init__(self, db: AsyncDatabase):
        self._db = db

    async def write(self, dto: C104DTO, *, asof: Optional[datetime] = None) -> None:
        """Persist one DTO via ``upsert_latest`` followed by ``insert_snapshot``."""
        shared = {"endpoint": _ENDPOINT, "dto": dto, "asof": asof}
        await upsert_latest(self._db, **shared)
        await insert_snapshot(self._db, **shared)

    async def write_many(
        self,
        dtos: Iterable[C104DTO],
        *,
        asof: Optional[datetime] = None,
    ) -> None:
        """Persist several DTOs with the bulk helpers; an empty input is a no-op."""
        batch = list(dtos)  # materialised once: the iterable is consumed twice below
        if batch:
            shared = {"endpoint": _ENDPOINT, "dtos": batch, "asof": asof}
            await upsert_latest_many(self._db, **shared)
            await insert_snapshots_many(self._db, **shared)
|