scraper2-hj3415 2.4.1-py3-none-any.whl → 2.7.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraper2_hj3415/app/adapters/out/playwright/browser.py +26 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +7 -7
- scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py +18 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py +19 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py +37 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py +24 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py +84 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py +90 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py +25 -0
- scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py +96 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
- scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
- scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
- {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
- scraper2_hj3415/app/adapters/site/wisereport_playwright.py +379 -0
- scraper2_hj3415/app/composition.py +225 -0
- scraper2_hj3415/app/domain/blocks.py +61 -0
- scraper2_hj3415/app/domain/constants.py +33 -0
- scraper2_hj3415/app/domain/doc.py +16 -0
- scraper2_hj3415/app/domain/endpoint.py +11 -0
- scraper2_hj3415/app/domain/series.py +11 -0
- scraper2_hj3415/app/domain/types.py +19 -0
- scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
- scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
- scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
- scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
- scraper2_hj3415/app/parsing/_tables/html_table.py +89 -0
- scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
- scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
- scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
- scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
- scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
- scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
- scraper2_hj3415/app/parsing/c101/sise.py +47 -0
- scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
- scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
- scraper2_hj3415/app/parsing/c101_parser.py +45 -0
- scraper2_hj3415/app/parsing/c103_parser.py +22 -0
- scraper2_hj3415/app/parsing/c104_parser.py +26 -0
- scraper2_hj3415/app/parsing/c106_parser.py +137 -0
- scraper2_hj3415/app/parsing/c108_parser.py +254 -0
- scraper2_hj3415/app/ports/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
- scraper2_hj3415/app/ports/browser/browser_port.py +32 -0
- scraper2_hj3415/app/ports/browser/capabilities/__init__.py +15 -0
- scraper2_hj3415/app/ports/browser/capabilities/interaction.py +27 -0
- scraper2_hj3415/app/ports/browser/capabilities/navigation.py +18 -0
- scraper2_hj3415/app/ports/browser/capabilities/scope.py +66 -0
- scraper2_hj3415/app/ports/browser/capabilities/table.py +28 -0
- scraper2_hj3415/app/ports/browser/capabilities/text.py +16 -0
- scraper2_hj3415/app/ports/browser/capabilities/wait.py +51 -0
- scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
- scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
- scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
- scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
- scraper2_hj3415/app/ports/site/__init__.py +0 -0
- scraper2_hj3415/app/ports/site/wisereport_port.py +30 -0
- scraper2_hj3415/app/services/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
- scraper2_hj3415/app/services/fetch/fetch_c103.py +121 -0
- scraper2_hj3415/app/services/fetch/fetch_c104.py +160 -0
- scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
- scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
- scraper2_hj3415/app/services/nfs_doc_builders.py +304 -0
- scraper2_hj3415/app/usecases/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
- scraper2/main.py → scraper2_hj3415/cli.py +45 -72
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/METADATA +3 -1
- scraper2_hj3415-2.7.0.dist-info/RECORD +93 -0
- scraper2_hj3415-2.7.0.dist-info/entry_points.txt +3 -0
- scraper2/adapters/out/playwright/browser.py +0 -102
- scraper2/adapters/out/sinks/memory/__init__.py +0 -15
- scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
- scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
- scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
- scraper2/app/composition.py +0 -204
- scraper2/app/parsing/_converters.py +0 -85
- scraper2/app/parsing/_normalize.py +0 -134
- scraper2/app/parsing/c101_parser.py +0 -143
- scraper2/app/parsing/c103_parser.py +0 -128
- scraper2/app/parsing/c104_parser.py +0 -143
- scraper2/app/parsing/c106_parser.py +0 -153
- scraper2/app/parsing/c108_parser.py +0 -65
- scraper2/app/ports/browser/browser_factory_port.py +0 -11
- scraper2/app/ports/browser/browser_port.py +0 -22
- scraper2/app/ports/ingest_port.py +0 -14
- scraper2/app/ports/sinks/base_sink_port.py +0 -14
- scraper2/app/ports/sinks/c101_sink_port.py +0 -9
- scraper2/app/ports/sinks/c103_sink_port.py +0 -9
- scraper2/app/ports/sinks/c104_sink_port.py +0 -9
- scraper2/app/ports/sinks/c106_sink_port.py +0 -9
- scraper2/app/ports/sinks/c108_sink_port.py +0 -9
- scraper2/app/usecases/fetch/fetch_c101.py +0 -43
- scraper2/app/usecases/fetch/fetch_c103.py +0 -103
- scraper2/app/usecases/fetch/fetch_c104.py +0 -76
- scraper2/app/usecases/fetch/fetch_c106.py +0 -90
- scraper2/app/usecases/fetch/fetch_c108.py +0 -49
- scraper2/app/usecases/ingest/ingest_c101.py +0 -36
- scraper2/app/usecases/ingest/ingest_c103.py +0 -37
- scraper2/app/usecases/ingest/ingest_c104.py +0 -37
- scraper2/app/usecases/ingest/ingest_c106.py +0 -38
- scraper2/app/usecases/ingest/ingest_c108.py +0 -39
- scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
- scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
- {scraper2 → scraper2_hj3415}/.DS_Store +0 -0
- {scraper2 → scraper2_hj3415}/__init__.py +0 -0
- {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
- {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/.DS_Store +0 -0
- {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
- {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/sinks/.DS_Store +0 -0
- {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
- {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
- {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
- {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
- {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
- {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/licenses/LICENSE +0 -0
```diff
@@ -0,0 +1,26 @@
+# scraper2_hj3415/app/adapters/out/playwright/browser.py
+from __future__ import annotations
+
+from playwright.async_api import Page
+
+from .capabilities import (
+    _PlaywrightBase,
+    PlaywrightNavigation,
+    PlaywrightWait,
+    PlaywrightInteraction,
+    PlaywrightText,
+    PlaywrightScope,
+    PlaywrightTable,
+)
+
+
+class PlaywrightBrowser(
+    PlaywrightNavigation,
+    PlaywrightWait,
+    PlaywrightInteraction,
+    PlaywrightText,
+    PlaywrightScope,
+    PlaywrightTable,
+):
+    def __init__(self, page: Page):
+        _PlaywrightBase.__init__(self, page)
```
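The browser adapter is assembled purely from capability mixins that all share one `_PlaywrightBase`, so a single explicit base `__init__` call wires up every capability: Python's MRO resolves each mixin back to the same base. A minimal sketch of that layout, with illustrative stand-in classes rather than the package's own:

```python
# Sketch of the mixin layout above: every capability derives from one shared
# base holding the page handle, so one _Base.__init__ call covers all mixins.
class _Base:
    def __init__(self, handle):
        self._handle = handle  # stands in for the Playwright Page

class Nav(_Base):
    def where(self) -> str:
        return f"nav sees {self._handle}"

class Text(_Base):
    def read(self) -> str:
        return f"text sees {self._handle}"

class Browser(Nav, Text):
    def __init__(self, handle):
        _Base.__init__(self, handle)  # one init, every capability ready

b = Browser("page-0")
assert b.where() == "nav sees page-0" and b.read() == "text sees page-0"
```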
```diff
@@ -1,15 +1,15 @@
-#
+# scraper2_hj3415/app/adapters/out/playwright/browser_factory.py
 from __future__ import annotations
 
 import asyncio
 from contextlib import asynccontextmanager
 from dataclasses import dataclass
-from typing import AsyncIterator
+from typing import AsyncIterator, cast
 
-from
-from
-from
-from
+from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+from scraper2_hj3415.app.adapters.out.playwright.session import PlaywrightPageSession
+from scraper2_hj3415.app.adapters.out.playwright.browser import PlaywrightBrowser
@@ -54,7 +54,7 @@ class PlaywrightBrowserFactory(BrowserFactoryPort):
         for _ in range(self.max_concurrency):
             session = PlaywrightPageSession(headless=self.headless, timeout_ms=self.timeout_ms)
             page = await session.start()
-            browser = PlaywrightBrowser(page)
+            browser: BrowserPort = cast(BrowserPort, PlaywrightBrowser(page))
 
             item = _LeaseItem(session=session, browser=browser)
             self._items.append(item)
```
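The added `cast(BrowserPort, ...)` has no runtime effect; it only asserts to the type checker that the Playwright adapter satisfies the port. A hedged sketch of that port/adapter relationship, using a hypothetical two-method stand-in rather than the package's real `BrowserPort`:

```python
from typing import Protocol, cast

class BrowserPort(Protocol):          # hypothetical stand-in port
    async def title(self) -> str: ...

class StubBrowser:                    # structurally satisfies the port
    async def title(self) -> str:
        return "stub"

# cast() is a no-op at runtime; it only informs the type checker that the
# structurally compatible adapter may be used wherever the port is expected.
browser: BrowserPort = cast(BrowserPort, StubBrowser())
```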
```diff
@@ -0,0 +1,18 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py
+from ._base import _PlaywrightBase
+from .navigation import PlaywrightNavigation
+from .wait import PlaywrightWait
+from .interaction import PlaywrightInteraction
+from .text import PlaywrightText
+from .scope import PlaywrightScope
+from .table import PlaywrightTable
+
+__all__ = [
+    "_PlaywrightBase",
+    "PlaywrightNavigation",
+    "PlaywrightWait",
+    "PlaywrightInteraction",
+    "PlaywrightText",
+    "PlaywrightScope",
+    "PlaywrightTable",
+]
```
```diff
@@ -0,0 +1,19 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py
+from __future__ import annotations
+
+from playwright.async_api import Page
+from logging_hj3415 import logger
+
+
+class _PlaywrightBase:
+    def __init__(self, page: Page):
+        self._page = page
+
+    async def _wait_for_network_quiet(self, *, timeout_ms: int = 10_000) -> None:
+        # "networkidle" may never fire on some sites, so wrapping it in try is safer
+        logger.debug("wait for network quiet")
+        try:
+            await self._page.wait_for_load_state("networkidle", timeout=timeout_ms)
+        except Exception:
+            # even if networkidle never arrives, the next step (the anchor wait) matters more
+            return
```
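Swallowing the `networkidle` timeout is deliberate: a later selector-based wait is the real gate before parsing. A usage sketch under that assumption (the coroutine and selector are hypothetical, not package code):

```python
# Hypothetical scrape step: the quiet-network wait is best-effort, while the
# selector wait below is the gate that actually matters before parsing.
async def fetch_table_html(browser) -> str:
    await browser.goto_and_wait_for_stable("https://example.com/report")
    await browser.wait_attached("table.gHead01")        # hypothetical selector
    return await browser.outer_html_nth("table.gHead01", 0)
```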
```diff
@@ -0,0 +1,37 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py
+from __future__ import annotations
+
+from playwright.async_api import TimeoutError as PwTimeoutError
+from ._base import _PlaywrightBase
+
+
+class PlaywrightInteraction(_PlaywrightBase):
+
+    async def click(
+        self,
+        selector: str,
+        *,
+        index: int = 0,
+        timeout_ms: int = 4_000,
+        force: bool = False,
+    ) -> None:
+        loc = self._page.locator(selector).nth(index)
+        await loc.click(timeout=timeout_ms, force=force)
+
+    async def try_click(
+        self,
+        selector: str,
+        *,
+        index: int = 0,
+        timeout_ms: int = 1_500,
+        force: bool = False,
+    ) -> bool:
+        loc = self._page.locator(selector).nth(index)
+        try:
+            await loc.click(timeout=timeout_ms, trial=True, force=force)
+            return True
+        except PwTimeoutError:
+            return False
+
+    async def scroll_into_view(self, selector: str, *, index: int = 0) -> None:
+        await self._page.locator(selector).nth(index).scroll_into_view_if_needed()
```
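Note that Playwright's `trial=True` runs the actionability checks without performing the click, so `try_click` reports clickability rather than actually clicking. A usage sketch with a hypothetical selector:

```python
# try_click() probes with trial=True (actionability checks only, no click),
# so a real click() must follow when the probe succeeds.
async def dismiss_banner_if_present(ui) -> bool:
    if await ui.try_click("#banner .close"):  # hypothetical selector
        await ui.click("#banner .close")
        return True
    return False
```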
```diff
@@ -0,0 +1,24 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py
+from __future__ import annotations
+
+from logging_hj3415 import logger
+from ._base import _PlaywrightBase
+
+
+class PlaywrightNavigation(_PlaywrightBase):
+    async def title(self) -> str:
+        return await self._page.title()
+
+    async def current_url(self) -> str:
+        return self._page.url
+
+    async def goto_and_wait_for_stable(
+        self, url: str, timeout_ms: int = 10_000
+    ) -> None:
+        logger.info(f"goto: {url}")
+        await self._page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
+        await self._wait_for_network_quiet(timeout_ms=timeout_ms // 2)
+
+    async def reload(self, *, timeout_ms: int = 10_000) -> None:
+        logger.info("reload")
+        await self._page.reload(timeout=timeout_ms, wait_until="domcontentloaded")
```
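`goto_and_wait_for_stable` returns after `domcontentloaded` plus a best-effort network-quiet phase that gets half the overall timeout budget. A minimal usage sketch (placeholder URL; session setup elided):

```python
from playwright.async_api import Page  # Page comes from a live session
from scraper2_hj3415.app.adapters.out.playwright.browser import PlaywrightBrowser

async def open_report(page: Page) -> str:
    browser = PlaywrightBrowser(page)
    # domcontentloaded first, then up to timeout_ms // 2 of network-quiet time
    await browser.goto_and_wait_for_stable("https://example.com/report", timeout_ms=10_000)
    return await browser.title()
```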
```diff
@@ -0,0 +1,84 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py
+from __future__ import annotations
+
+from typing import Any
+from ._base import _PlaywrightBase
+
+
+class PlaywrightScope(_PlaywrightBase):
+    async def is_attached(self, selector: str, *, index: int = 0) -> bool:
+        try:
+            loc = self._page.locator(selector).nth(index)
+            return await loc.count() > 0
+        except Exception:
+            return False
+
+    async def computed_style(self, selector: str, *, index: int = 0, prop: str) -> str:
+        loc = self._page.locator(selector).nth(index)
+        return await loc.evaluate(
+            "(el, prop) => getComputedStyle(el)[prop] || ''", prop
+        )
+
+    async def count_in_nth(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+    ) -> int:
+        scope = self._page.locator(scope_selector).nth(scope_index)
+        return await scope.locator(inner_selector).count()
+
+    async def eval_in_nth_first(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+        expression: str,
+    ) -> Any:
+        scope = self._page.locator(scope_selector).nth(scope_index)
+        loc = scope.locator(inner_selector).first
+
+        if await loc.count() == 0:
+            return None
+
+        return await loc.evaluate(expression)
+
+    async def inner_text_in_nth(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+        inner_index: int = 0,
+        timeout_ms: int = 10_000,
+    ) -> str:
+        scope = self._page.locator(scope_selector).nth(scope_index)
+        inner = scope.locator(inner_selector).nth(inner_index)
+
+        await inner.wait_for(state="attached", timeout=timeout_ms)
+
+        try:
+            return (await inner.inner_text()) or ""
+        except Exception:
+            return ""
+
+    async def text_content_in_nth(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+        inner_index: int = 0,
+        timeout_ms: int = 10_000,
+    ) -> str:
+        scope = self._page.locator(scope_selector).nth(scope_index)
+        inner = scope.locator(inner_selector).nth(inner_index)
+
+        await inner.wait_for(state="attached", timeout=timeout_ms)
+
+        try:
+            return (await inner.text_content()) or ""
+        except Exception:
+            return ""
```
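The `*_in_nth` helpers resolve an outer scope locator first and only then search inside it, which disambiguates pages that repeat the same widget several times (e.g. several tables with the same class). A sketch with hypothetical selectors:

```python
# Read the first body cell of the *second* matching table; the inner lookup
# never leaves that table's subtree. Selectors are illustrative only.
async def second_table_first_cell(scope) -> str:
    return await scope.text_content_in_nth(
        "table.data",
        scope_index=1,            # 0-based: the second matching table
        inner_selector="tbody td",
    )
```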
```diff
@@ -0,0 +1,90 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py
+from __future__ import annotations
+
+import re
+from io import StringIO
+from typing import Any
+import pandas as pd
+from ._base import _PlaywrightBase
+
+_PERIOD_MM_RE = re.compile(r"\b(19|20)\d{2}/(03|06|09|12)\b")
+
+
+class PlaywrightTable(_PlaywrightBase):
+    async def table_records(
+        self,
+        table_selector: str,
+        *,
+        header: int | list[int] | None = 0,
+    ) -> list[dict[str, Any]]:
+        await self.wait_attached(table_selector)
+
+        table = self._page.locator(table_selector).first
+        html = await table.evaluate("el => el.outerHTML")
+
+        try:
+            df = pd.read_html(StringIO(html), header=header)[0]
+        except Exception as e:
+            raise RuntimeError(f"pd.read_html failed: {type(e).__name__}: {e}") from e
+
+        # normalize only when the columns are strings
+        if all(isinstance(c, str) for c in df.columns):
+            if "항목" in df.columns:
+                df["항목"] = (
+                    df["항목"].astype(str).str.replace("펼치기", "").str.strip()
+                )
+
+            df.columns = (
+                df.columns.astype(str)
+                .str.replace("연간컨센서스보기", "", regex=False)
+                .str.replace("연간컨센서스닫기", "", regex=False)
+                .str.replace("(IFRS연결)", "", regex=False)
+                .str.replace("(IFRS별도)", "", regex=False)
+                .str.replace("(GAAP개별)", "", regex=False)
+                .str.replace("(YoY)", "", regex=False)
+                .str.replace("(QoQ)", "", regex=False)
+                .str.replace("(E)", "", regex=False)
+                .str.replace(".", "", regex=False)
+                .str.strip()
+            )
+
+        return df.where(pd.notnull(df), None).to_dict(orient="records")
+
+    async def table_header_texts_nth(
+        self, table_selector: str, *, index: int
+    ) -> list[str]:
+        table = self._page.locator(table_selector).nth(index)
+        thead = table.locator("thead")
+        if await thead.count() == 0:
+            return []
+
+        ths = thead.locator("th")
+        try:
+            texts = await ths.all_inner_texts()
+        except Exception:
+            texts = []
+
+        out: list[str] = []
+        for t in texts:
+            t = " ".join((t or "").split())
+            if t:
+                out.append(t)
+        return out
+
+    async def table_header_periods_mm_nth(
+        self, table_selector: str, *, index: int
+    ) -> list[str]:
+        texts = await self.table_header_texts_nth(table_selector, index=index)
+        periods: list[str] = []
+        for t in texts:
+            for m in _PERIOD_MM_RE.finditer(t):
+                periods.append(m.group(0))
+
+        seen: set[str] = set()
+        uniq: list[str] = []
+        for p in periods:
+            if p in seen:
+                continue
+            seen.add(p)
+            uniq.append(p)
+        return uniq
```
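`table_records` round-trips the table's `outerHTML` through `pandas.read_html`, strips viewer artifacts such as "펼치기" (expand) and "(IFRS연결)" from labels and headers, and maps NaN to `None` so the records serialize cleanly. The same normalization in isolation (note that `pd.read_html` needs an HTML parser backend such as lxml installed):

```python
from io import StringIO
import pandas as pd

html = ("<table><thead><tr><th>항목</th><th>2023/12(IFRS연결)</th></tr></thead>"
        "<tbody><tr><td>매출액 펼치기</td><td>100</td></tr></tbody></table>")
df = pd.read_html(StringIO(html), header=0)[0]

# strip the expand-widget text from labels and the accounting-standard
# suffix from headers, exactly as table_records() does
df["항목"] = df["항목"].astype(str).str.replace("펼치기", "").str.strip()
df.columns = (df.columns.astype(str)
              .str.replace("(IFRS연결)", "", regex=False).str.strip())

print(df.where(pd.notnull(df), None).to_dict(orient="records"))
# expected: [{'항목': '매출액', '2023/12': 100}]
```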
```diff
@@ -0,0 +1,25 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py
+from __future__ import annotations
+from ._base import _PlaywrightBase
+
+
+class PlaywrightText(_PlaywrightBase):
+    async def count(self, selector: str) -> int:
+        return await self._page.locator(selector).count()
+
+    async def text_content_first(self, selector: str) -> str:
+        return (await self._page.locator(selector).first.text_content()) or ""
+
+    async def all_texts(self, selector: str) -> list[str]:
+        loc = self._page.locator(selector)
+        return await loc.all_text_contents()
+
+    async def get_text_by_text(self, needle: str) -> str:
+        return (await self._page.get_by_text(needle).first.text_content()) or ""
+
+    async def inner_text(self, selector: str) -> str:
+        return await self._page.locator(selector).first.inner_text()
+
+    async def outer_html_nth(self, selector: str, index: int) -> str:
+        loc = self._page.locator(selector).nth(index)
+        return await loc.evaluate("el => el.outerHTML")
```
```diff
@@ -0,0 +1,96 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py
+from __future__ import annotations
+
+import asyncio
+import time
+from logging_hj3415 import logger
+from ._base import _PlaywrightBase
+
+class PlaywrightWait(_PlaywrightBase):
+    """
+    The wait helpers only need _page, so inheriting _PlaywrightBase is not strictly required;
+    here we assume self._page exists (PlaywrightBrowser supplies the base).
+    """
+
+    async def sleep_ms(self, ms: int) -> None:
+        await asyncio.sleep(ms / 1000)
+
+    async def wait_attached(self, selector: str, *, timeout_ms: int = 10_000) -> None:
+        await self._page.locator(selector).first.wait_for(state="attached", timeout=timeout_ms)
+
+    async def wait_visible(self, selector: str, *, timeout_ms: int = 10_000) -> None:
+        await self._page.locator(selector).first.wait_for(state="visible", timeout=timeout_ms)
+
+    async def wait_table_nth_ready(
+        self,
+        table_selector: str,
+        *,
+        index: int,
+        min_rows: int = 1,
+        timeout_ms: int = 20_000,
+        poll_ms: int = 200,
+    ) -> None:
+        logger.debug("wait for table nth_ready")
+        table = self._page.locator(table_selector).nth(index)
+        await table.wait_for(state="attached", timeout=timeout_ms)
+
+        rows = table.locator("tbody tr")
+        deadline = time.monotonic() + timeout_ms / 1000
+
+        cnt = 0
+        while time.monotonic() < deadline:
+            try:
+                cnt = await rows.count()
+            except Exception:
+                cnt = 0
+
+            if cnt >= min_rows:
+                return
+
+            await asyncio.sleep(poll_ms / 1000)
+
+        logger.warning(f"table rows timeout: last_cnt={cnt}, need>={min_rows}")
+        raise TimeoutError(f"nth table not ready: index={index}, rows<{min_rows}")
+
+    async def wait_table_text_changed(
+        self,
+        table_selector: str,
+        *,
+        index: int,
+        prev_text: str | None,
+        min_rows: int = 1,
+        min_lines: int = 50,
+        timeout_sec: float = 12.0,
+        poll_sec: float = 0.2,
+    ) -> str:
+        # 0) readiness gate based on row count
+        await self.wait_table_nth_ready(
+            table_selector,
+            index=index,
+            min_rows=min_rows,
+            timeout_ms=int(timeout_sec * 1000),
+            poll_ms=int(poll_sec * 1000),
+        )
+
+        start = time.monotonic()
+        last_text = ""
+
+        while True:
+            loc = self._page.locator(table_selector).nth(index)
+            try:
+                text = await loc.inner_text()
+            except Exception:
+                text = ""
+
+            lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+            is_valid = len(lines) >= min_lines
+
+            if is_valid:
+                last_text = text
+                if prev_text is None or text != prev_text:
+                    return text
+
+            if time.monotonic() - start >= timeout_sec:
+                return last_text
+
+            await asyncio.sleep(poll_sec)
```
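`wait_table_text_changed` targets tab toggles where the table node stays attached but its text is swapped in place: it polls until the text is both long enough (`min_lines`) and different from the previous snapshot, and on timeout it returns the last valid text instead of raising. A usage sketch with hypothetical selectors:

```python
# Hypothetical tab toggle: selectors are illustrative, the flow is what matters.
async def read_quarterly(browser) -> str:
    annual = await browser.wait_table_text_changed(
        "table.gHead01", index=0, prev_text=None,       # first valid snapshot
    )
    await browser.click("a#quarterly-tab")              # swaps table contents
    return await browser.wait_table_text_changed(
        "table.gHead01", index=0, prev_text=annual,     # wait for different text
    )
```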
```diff
@@ -0,0 +1,25 @@
+# scraper2_hj3415/app/adapters/out/sinks/memory_sink.py
+from __future__ import annotations
+
+from typing import Iterable
+
+from contracts_hj3415.nfs.nfs_dto import NfsDTO
+from contracts_hj3415.nfs.types import Endpoint
+
+from scraper2_hj3415.app.adapters.out.sinks.store import InMemoryStore
+
+
+class MemorySink:
+    def __init__(self, store: InMemoryStore[NfsDTO]):
+        self._store = store
+
+    async def write(self, dto: NfsDTO, *, endpoint: Endpoint) -> None:
+        await self._store.put(endpoint, dto.code, dto)
+
+    async def write_many(
+        self,
+        dtos: Iterable[NfsDTO],
+        *,
+        endpoint: Endpoint,
+    ) -> None:
+        await self._store.put_many(endpoint, ((d.code, d) for d in dtos))
```
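`MemorySink` adapts the generic `InMemoryStore` to the sink port, keying "latest" entries by endpoint and `dto.code`. A usage sketch, assuming `Endpoint` accepts plain strings like `"c101"` and duck-typing the DTO (only `.code` is touched by the sink):

```python
import asyncio
from dataclasses import dataclass, field

from scraper2_hj3415.app.adapters.out.sinks.store import InMemoryStore
from scraper2_hj3415.app.adapters.out.sinks.memory_sink import MemorySink

@dataclass
class FakeDTO:  # duck-typed stand-in; the sink only reads .code
    code: str
    payload: dict = field(default_factory=dict)

async def demo() -> None:
    sink = MemorySink(InMemoryStore())
    await sink.write(FakeDTO("005930"), endpoint="c101")          # single write
    await sink.write_many([FakeDTO("000660")], endpoint="c103")   # batch write

asyncio.run(demo())
```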
```diff
@@ -0,0 +1,63 @@
+# scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Iterable
+
+from pymongo.asynchronous.database import AsyncDatabase
+
+from contracts_hj3415.nfs.nfs_dto import NfsDTO
+from contracts_hj3415.nfs.types import Endpoint
+
+from db2_hj3415.nfs.repo import (
+    upsert_latest,
+    upsert_latest_many,
+    insert_snapshot,
+    insert_snapshots_many,
+)
+
+
+class MongoSink:
+    def __init__(self, db: AsyncDatabase):
+        self._db = db
+
+    async def write(self, dto: NfsDTO, *, endpoint: Endpoint) -> None:
+        code = str(dto.code).strip()
+        if not code:
+            return
+
+        payload = dict(dto.payload)  # defensive copy of the Mapping
+
+        await upsert_latest(
+            self._db, endpoint=endpoint, code=code, payload=payload, asof=dto.asof
+        )
+        await insert_snapshot(
+            self._db, endpoint=endpoint, code=code, payload=payload, asof=dto.asof
+        )
+
+    async def write_many(
+        self,
+        dtos: Iterable[NfsDTO],
+        *,
+        endpoint: Endpoint,
+    ) -> None:
+        items: dict[str, dict] = {}
+        ts: datetime | None = None
+
+        for dto in dtos:
+            code = str(dto.code).strip()
+            if not code:
+                continue
+            items[code] = dict(dto.payload)
+            if ts is None:
+                ts = dto.asof  # use the first dto's asof as the batch timestamp
+
+        if not items:
+            return
+
+        await upsert_latest_many(
+            self._db, endpoint=endpoint, items=items, asof=ts
+        )
+        await insert_snapshots_many(
+            self._db, endpoint=endpoint, items=items, asof=ts
+        )
```
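Two details of `write_many` are easy to miss: duplicate codes collapse so the last DTO wins, and the first DTO's `asof` timestamps the whole batch. The rule in isolation:

```python
# The batching rule above, outside the sink: later duplicates of a code
# overwrite earlier ones, and the first item's asof stamps the whole batch.
from datetime import datetime

batch = [("005930", {"per": 11.2}, datetime(2025, 1, 2)),
         ("000660", {"per": 9.8},  datetime(2025, 1, 3)),
         ("005930", {"per": 11.5}, datetime(2025, 1, 4))]

items: dict[str, dict] = {}
ts = None
for code, payload, asof in batch:
    items[code] = payload          # last occurrence of 005930 wins
    if ts is None:
        ts = asof                  # batch timestamp = first item's asof

assert items["005930"] == {"per": 11.5} and ts == datetime(2025, 1, 2)
```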
```diff
@@ -1,29 +1,34 @@
-#
+# scraper2_hj3415/app/adapters/out/sinks/store.py
 from __future__ import annotations
 
 import asyncio
 from collections import defaultdict, deque
 from dataclasses import dataclass
-from typing import
+from typing import Deque, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar
 
 T = TypeVar("T")  # DTO type
 
+
 @dataclass(frozen=True)
 class StoreStats:
     endpoint: str
     latest_count: int
     history_count: int
 
+
 class InMemoryStore(Generic[T]):
     """
     Stores DTOs per endpoint.
     - latest: endpoint -> key (usually the code) -> dto
     - history: endpoint -> deque[dto] (the most recent max_history entries)
     """
+
     def __init__(self, *, max_history: int = 2000):
         self._lock = asyncio.Lock()
         self._max_history = max_history
-        self._history: Dict[str, Deque[T]] = defaultdict(
+        self._history: Dict[str, Deque[T]] = defaultdict(
+            lambda: deque(maxlen=max_history)
+        )
        self._latest: Dict[str, Dict[str, T]] = defaultdict(dict)
 
     # ---------- write ----------
@@ -62,7 +67,11 @@ class InMemoryStore(Generic[T]):
         async with self._lock:
             latest_count = len(self._latest.get(endpoint, {}))
             history_count = len(self._history.get(endpoint, []))
-            return StoreStats(
+            return StoreStats(
+                endpoint=endpoint,
+                latest_count=latest_count,
+                history_count=history_count,
+            )
 
     async def clear(self, endpoint: str | None = None) -> None:
         async with self._lock:
@@ -71,4 +80,4 @@ class InMemoryStore(Generic[T]):
                 self._latest.clear()
             else:
                 self._history.pop(endpoint, None)
-                self._latest.pop(endpoint, None)
+                self._latest.pop(endpoint, None)
```
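History is a per-endpoint `deque(maxlen=max_history)`, so old snapshots age out automatically once the cap is hit. The bounded-history behavior in isolation:

```python
# A maxlen deque silently drops the oldest entry once the cap is reached,
# which is exactly how InMemoryStore bounds its per-endpoint history.
from collections import deque

history: deque = deque(maxlen=3)
for n in range(5):
    history.append(n)

print(list(history))  # [2, 3, 4] — entries 0 and 1 aged out
```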