scraper2-hj3415 2.6.0__tar.gz → 2.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/PKG-INFO +1 -1
  2. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/pyproject.toml +1 -1
  3. scraper2_hj3415-2.7.0/src/scraper2_hj3415/.DS_Store +0 -0
  4. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/out/.DS_Store +0 -0
  5. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/out/playwright/browser.py +26 -0
  6. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/adapters/out/playwright/browser_factory.py +2 -2
  7. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py +18 -0
  8. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py +19 -0
  9. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py +37 -0
  10. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py +24 -0
  11. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py +84 -0
  12. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py +90 -0
  13. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py +25 -0
  14. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py +96 -0
  15. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/out/sinks/.DS_Store +0 -0
  16. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +3 -3
  17. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +11 -11
  18. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/adapters/site/wisereport_playwright.py +379 -0
  19. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/domain/constants.py +2 -2
  20. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/_tables/html_table.py +3 -2
  21. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c103_parser.py +4 -1
  22. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c104_parser.py +4 -1
  23. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/ports/browser/browser_port.py +32 -0
  24. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/ports/browser/capabilities/__init__.py +15 -0
  25. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/ports/browser/capabilities/interaction.py +27 -0
  26. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/ports/browser/capabilities/navigation.py +18 -0
  27. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/ports/browser/capabilities/scope.py +66 -0
  28. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/ports/browser/capabilities/table.py +28 -0
  29. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/ports/browser/capabilities/text.py +16 -0
  30. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/ports/browser/capabilities/wait.py +51 -0
  31. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +3 -3
  32. scraper2_hj3415-2.7.0/src/scraper2_hj3415/app/ports/site/wisereport_port.py +30 -0
  33. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/services/fetch/fetch_c103.py +18 -32
  34. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/services/fetch/fetch_c104.py +28 -51
  35. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/services/nfs_doc_builders.py +21 -7
  36. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/usecases/ingest/ingest_c101.py +2 -2
  37. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/usecases/ingest/ingest_c103.py +2 -2
  38. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/usecases/ingest/ingest_c104.py +2 -2
  39. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/usecases/ingest/ingest_c106.py +2 -2
  40. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/usecases/ingest/ingest_c108.py +2 -2
  41. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/cli.py +10 -7
  42. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/playwright/browser.py +0 -373
  43. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/site/wisereport_playwright.py +0 -168
  44. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/browser/browser_port.py +0 -115
  45. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/site/wisereport_port.py +0 -20
  46. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/LICENSE +0 -0
  47. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/README.md +0 -0
  48. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/__init__.py +0 -0
  49. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/__init__.py +0 -0
  50. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/adapters/__init__.py +0 -0
  51. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/adapters/out/__init__.py +0 -0
  52. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/adapters/out/playwright/__init__.py +0 -0
  53. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/adapters/out/playwright/session.py +0 -0
  54. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/adapters/out/sinks/__init__.py +0 -0
  55. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/adapters/out/sinks/store.py +0 -0
  56. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/adapters/site/__init__.py +0 -0
  57. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/composition.py +0 -0
  58. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/domain/__init__.py +0 -0
  59. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/domain/blocks.py +0 -0
  60. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/domain/doc.py +0 -0
  61. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/domain/endpoint.py +0 -0
  62. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/domain/series.py +0 -0
  63. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/domain/types.py +0 -0
  64. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/__init__.py +0 -0
  65. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/_normalize/__init__.py +0 -0
  66. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/_normalize/label.py +0 -0
  67. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/_normalize/table.py +0 -0
  68. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/_normalize/text.py +0 -0
  69. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/_normalize/values.py +0 -0
  70. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/_tables/__init__.py +0 -0
  71. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  72. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +0 -0
  73. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c101/company_overview.py +0 -0
  74. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c101/earning_surprise.py +0 -0
  75. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c101/fundamentals.py +0 -0
  76. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c101/major_shareholders.py +0 -0
  77. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c101/sise.py +0 -0
  78. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c101/summary_cmp.py +0 -0
  79. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c101/yearly_consensus.py +0 -0
  80. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c101_parser.py +0 -0
  81. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c106_parser.py +0 -0
  82. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/parsing/c108_parser.py +0 -0
  83. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/ports/__init__.py +0 -0
  84. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  85. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/ports/browser/browser_factory_port.py +0 -0
  86. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  87. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +0 -0
  88. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  89. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/ports/site/__init__.py +0 -0
  90. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/services/__init__.py +0 -0
  91. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  92. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/services/fetch/fetch_c101.py +0 -0
  93. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/services/fetch/fetch_c106.py +0 -0
  94. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/services/fetch/fetch_c108.py +0 -0
  95. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/usecases/__init__.py +0 -0
  96. {scraper2_hj3415-2.6.0 → scraper2_hj3415-2.7.0}/src/scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scraper2-hj3415
-Version: 2.6.0
+Version: 2.7.0
 Summary: Naver WiseReport scraper
 Keywords: example,demo
 Author-email: Hyungjin Kim <hj3415@gmail.com>
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"
 
 [project]
 name = "scraper2-hj3415" # the PyPI name (hyphens allowed)
-version = "2.6.0"
+version = "2.7.0"
 description = "Naver WiseReport scraper"
 readme = "README.md"
 requires-python = ">=3.11"
src/scraper2_hj3415/app/adapters/out/playwright/browser.py
@@ -0,0 +1,26 @@
+# scraper2_hj3415/app/adapters/out/playwright/browser.py
+from __future__ import annotations
+
+from playwright.async_api import Page
+
+from .capabilities import (
+    _PlaywrightBase,
+    PlaywrightNavigation,
+    PlaywrightWait,
+    PlaywrightInteraction,
+    PlaywrightText,
+    PlaywrightScope,
+    PlaywrightTable,
+)
+
+
+class PlaywrightBrowser(
+    PlaywrightNavigation,
+    PlaywrightWait,
+    PlaywrightInteraction,
+    PlaywrightText,
+    PlaywrightScope,
+    PlaywrightTable,
+):
+    def __init__(self, page: Page):
+        _PlaywrightBase.__init__(self, page)
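The composed class flattens the six capability mixins into one object. A minimal usage sketch, assuming a plain Playwright launch (the package itself constructs pages via PlaywrightPageSession; the URL and launch options here are illustrative):

import asyncio
from playwright.async_api import async_playwright
from scraper2_hj3415.app.adapters.out.playwright.browser import PlaywrightBrowser

async def main() -> None:
    async with async_playwright() as p:
        chromium = await p.chromium.launch(headless=True)  # assumed launch options
        page = await chromium.new_page()
        browser = PlaywrightBrowser(page)  # one object exposing all capabilities
        await browser.goto_and_wait_for_stable("https://example.com")
        print(await browser.title())
        await chromium.close()

asyncio.run(main())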
src/scraper2_hj3415/app/adapters/out/playwright/browser_factory.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 import asyncio
 from contextlib import asynccontextmanager
 from dataclasses import dataclass
-from typing import AsyncIterator
+from typing import AsyncIterator, cast
 
 from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
 from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
@@ -54,7 +54,7 @@ class PlaywrightBrowserFactory(BrowserFactoryPort):
         for _ in range(self.max_concurrency):
             session = PlaywrightPageSession(headless=self.headless, timeout_ms=self.timeout_ms)
             page = await session.start()
-            browser = PlaywrightBrowser(page)
+            browser: BrowserPort = cast(BrowserPort, PlaywrightBrowser(page))
 
             item = _LeaseItem(session=session, browser=browser)
             self._items.append(item)
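The cast (and the new typing import above) is presumably there so the mixin-composed adapter type-checks against the BrowserPort port. A minimal sketch of the pattern, with hypothetical names:

from typing import Protocol, cast

class GreeterPort(Protocol):
    def greet(self) -> str: ...

class ConsoleGreeter:  # structurally satisfies GreeterPort
    def greet(self) -> str:
        return "hello"

# With a structural Protocol the cast is often redundant; it helps when the
# checker cannot infer the match (e.g. for classes assembled from mixins).
greeter: GreeterPort = cast(GreeterPort, ConsoleGreeter())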
src/scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py
@@ -0,0 +1,18 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py
+from ._base import _PlaywrightBase
+from .navigation import PlaywrightNavigation
+from .wait import PlaywrightWait
+from .interaction import PlaywrightInteraction
+from .text import PlaywrightText
+from .scope import PlaywrightScope
+from .table import PlaywrightTable
+
+__all__ = [
+    "_PlaywrightBase",
+    "PlaywrightNavigation",
+    "PlaywrightWait",
+    "PlaywrightInteraction",
+    "PlaywrightText",
+    "PlaywrightScope",
+    "PlaywrightTable",
+]
src/scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py
@@ -0,0 +1,19 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py
+from __future__ import annotations
+
+from playwright.async_api import Page
+from logging_hj3415 import logger
+
+
+class _PlaywrightBase:
+    def __init__(self, page: Page):
+        self._page = page
+
+    async def _wait_for_network_quiet(self, *, timeout_ms: int = 10_000) -> None:
+        # "networkidle" may never fire on some sites, so wrapping it in try is safer
+        logger.debug("wait for network quiet")
+        try:
+            await self._page.wait_for_load_state("networkidle", timeout=timeout_ms)
+        except Exception:
+            # even if networkidle never arrives, the next step (the anchor wait) matters more
+            return
src/scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py
@@ -0,0 +1,37 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py
+from __future__ import annotations
+
+from playwright.async_api import TimeoutError as PwTimeoutError
+from ._base import _PlaywrightBase
+
+
+class PlaywrightInteraction(_PlaywrightBase):
+
+    async def click(
+        self,
+        selector: str,
+        *,
+        index: int = 0,
+        timeout_ms: int = 4_000,
+        force: bool = False,
+    ) -> None:
+        loc = self._page.locator(selector).nth(index)
+        await loc.click(timeout=timeout_ms, force=force)
+
+    async def try_click(
+        self,
+        selector: str,
+        *,
+        index: int = 0,
+        timeout_ms: int = 1_500,
+        force: bool = False,
+    ) -> bool:
+        loc = self._page.locator(selector).nth(index)
+        try:
+            await loc.click(timeout=timeout_ms, trial=True, force=force)
+            return True
+        except PwTimeoutError:
+            return False
+
+    async def scroll_into_view(self, selector: str, *, index: int = 0) -> None:
+        await self._page.locator(selector).nth(index).scroll_into_view_if_needed()
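One caveat worth flagging: in Playwright, click(trial=True) only performs the actionability checks and skips the actual click, so try_click as written reports whether the element could be clicked without clicking it. If the intent is "click when possible", a follow-up real click is needed; a hedged sketch of such a variant (not part of the package):

async def click_if_possible(self, selector: str, *, index: int = 0,
                            timeout_ms: int = 1_500) -> bool:
    # Hypothetical helper: probe actionability first, then perform the real click.
    loc = self._page.locator(selector).nth(index)
    try:
        await loc.click(timeout=timeout_ms, trial=True)  # actionability check only
        await loc.click(timeout=timeout_ms)              # the actual click
        return True
    except PwTimeoutError:
        return False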
src/scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py
@@ -0,0 +1,24 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py
+from __future__ import annotations
+
+from logging_hj3415 import logger
+from ._base import _PlaywrightBase
+
+
+class PlaywrightNavigation(_PlaywrightBase):
+    async def title(self) -> str:
+        return await self._page.title()
+
+    async def current_url(self) -> str:
+        return self._page.url
+
+    async def goto_and_wait_for_stable(
+        self, url: str, timeout_ms: int = 10_000
+    ) -> None:
+        logger.info(f"goto: {url}")
+        await self._page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
+        await self._wait_for_network_quiet(timeout_ms=timeout_ms // 2)
+
+    async def reload(self, *, timeout_ms: int = 10_000) -> None:
+        logger.info("reload")
+        await self._page.reload(timeout=timeout_ms, wait_until="domcontentloaded")
src/scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py
@@ -0,0 +1,84 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py
+from __future__ import annotations
+
+from typing import Any
+from ._base import _PlaywrightBase
+
+
+class PlaywrightScope(_PlaywrightBase):
+    async def is_attached(self, selector: str, *, index: int = 0) -> bool:
+        try:
+            loc = self._page.locator(selector).nth(index)
+            return await loc.count() > 0
+        except Exception:
+            return False
+
+    async def computed_style(self, selector: str, *, index: int = 0, prop: str) -> str:
+        loc = self._page.locator(selector).nth(index)
+        return await loc.evaluate(
+            "(el, prop) => getComputedStyle(el)[prop] || ''", prop
+        )
+
+    async def count_in_nth(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+    ) -> int:
+        scope = self._page.locator(scope_selector).nth(scope_index)
+        return await scope.locator(inner_selector).count()
+
+    async def eval_in_nth_first(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+        expression: str,
+    ) -> Any:
+        scope = self._page.locator(scope_selector).nth(scope_index)
+        loc = scope.locator(inner_selector).first
+
+        if await loc.count() == 0:
+            return None
+
+        return await loc.evaluate(expression)
+
+    async def inner_text_in_nth(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+        inner_index: int = 0,
+        timeout_ms: int = 10_000,
+    ) -> str:
+        scope = self._page.locator(scope_selector).nth(scope_index)
+        inner = scope.locator(inner_selector).nth(inner_index)
+
+        await inner.wait_for(state="attached", timeout=timeout_ms)
+
+        try:
+            return (await inner.inner_text()) or ""
+        except Exception:
+            return ""
+
+    async def text_content_in_nth(
+        self,
+        scope_selector: str,
+        *,
+        scope_index: int,
+        inner_selector: str,
+        inner_index: int = 0,
+        timeout_ms: int = 10_000,
+    ) -> str:
+        scope = self._page.locator(scope_selector).nth(scope_index)
+        inner = scope.locator(inner_selector).nth(inner_index)
+
+        await inner.wait_for(state="attached", timeout=timeout_ms)
+
+        try:
+            return (await inner.text_content()) or ""
+        except Exception:
+            return ""
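These scope helpers let call sites do nested lookups (nth container, then an inner element) without leaking Playwright Locator objects through the port. A hypothetical call site, with illustrative selectors:

# Count data rows in the second table, read its first cell, then pull an attribute.
rows = await browser.count_in_nth("table", scope_index=1, inner_selector="tbody tr")
first_cell = await browser.text_content_in_nth(
    "table", scope_index=1, inner_selector="tbody tr td", inner_index=0
)
href = await browser.eval_in_nth_first(
    "div.section", scope_index=0, inner_selector="a", expression="el => el.href"
)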
src/scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py
@@ -0,0 +1,90 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py
+from __future__ import annotations
+
+import re
+from io import StringIO
+from typing import Any
+import pandas as pd
+from ._base import _PlaywrightBase
+
+_PERIOD_MM_RE = re.compile(r"\b(19|20)\d{2}/(03|06|09|12)\b")
+
+
+class PlaywrightTable(_PlaywrightBase):
+    async def table_records(
+        self,
+        table_selector: str,
+        *,
+        header: int | list[int] | None = 0,
+    ) -> list[dict[str, Any]]:
+        await self.wait_attached(table_selector)
+
+        table = self._page.locator(table_selector).first
+        html = await table.evaluate("el => el.outerHTML")
+
+        try:
+            df = pd.read_html(StringIO(html), header=header)[0]
+        except Exception as e:
+            raise RuntimeError(f"pd.read_html failed: {type(e).__name__}: {e}") from e
+
+        # normalize only when all column labels are strings
+        if all(isinstance(c, str) for c in df.columns):
+            if "항목" in df.columns:
+                df["항목"] = (
+                    df["항목"].astype(str).str.replace("펼치기", "").str.strip()
+                )
+
+            df.columns = (
+                df.columns.astype(str)
+                .str.replace("연간컨센서스보기", "", regex=False)
+                .str.replace("연간컨센서스닫기", "", regex=False)
+                .str.replace("(IFRS연결)", "", regex=False)
+                .str.replace("(IFRS별도)", "", regex=False)
+                .str.replace("(GAAP개별)", "", regex=False)
+                .str.replace("(YoY)", "", regex=False)
+                .str.replace("(QoQ)", "", regex=False)
+                .str.replace("(E)", "", regex=False)
+                .str.replace(".", "", regex=False)
+                .str.strip()
+            )
+
+        return df.where(pd.notnull(df), None).to_dict(orient="records")
+
+    async def table_header_texts_nth(
+        self, table_selector: str, *, index: int
+    ) -> list[str]:
+        table = self._page.locator(table_selector).nth(index)
+        thead = table.locator("thead")
+        if await thead.count() == 0:
+            return []
+
+        ths = thead.locator("th")
+        try:
+            texts = await ths.all_inner_texts()
+        except Exception:
+            texts = []
+
+        out: list[str] = []
+        for t in texts:
+            t = " ".join((t or "").split())
+            if t:
+                out.append(t)
+        return out
+
+    async def table_header_periods_mm_nth(
+        self, table_selector: str, *, index: int
+    ) -> list[str]:
+        texts = await self.table_header_texts_nth(table_selector, index=index)
+        periods: list[str] = []
+        for t in texts:
+            for m in _PERIOD_MM_RE.finditer(t):
+                periods.append(m.group(0))
+
+        seen: set[str] = set()
+        uniq: list[str] = []
+        for p in periods:
+            if p in seen:
+                continue
+            seen.add(p)
+            uniq.append(p)
+        return uniq
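Note that table_records calls self.wait_attached, which lives on PlaywrightWait rather than _PlaywrightBase, so PlaywrightTable only works when composed into PlaywrightBrowser (or another class that mixes both in). The pandas cleanup itself can be checked in isolation; a small self-contained sketch with an invented WiseReport-style header:

from io import StringIO
import pandas as pd

html = """<table>
<thead><tr><th>항목</th><th>2023/12(IFRS연결)</th></tr></thead>
<tbody><tr><td>매출액펼치기</td><td>1,000</td></tr></tbody>
</table>"""

df = pd.read_html(StringIO(html), header=0)[0]
df["항목"] = df["항목"].astype(str).str.replace("펼치기", "").str.strip()
df.columns = df.columns.astype(str).str.replace("(IFRS연결)", "", regex=False).str.strip()
print(df.where(pd.notnull(df), None).to_dict(orient="records"))
# [{'항목': '매출액', '2023/12': 1000}]  (read_html parses '1,000' via thousands=',')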
src/scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py
@@ -0,0 +1,25 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py
+from __future__ import annotations
+from ._base import _PlaywrightBase
+
+
+class PlaywrightText(_PlaywrightBase):
+    async def count(self, selector: str) -> int:
+        return await self._page.locator(selector).count()
+
+    async def text_content_first(self, selector: str) -> str:
+        return (await self._page.locator(selector).first.text_content()) or ""
+
+    async def all_texts(self, selector: str) -> list[str]:
+        loc = self._page.locator(selector)
+        return await loc.all_text_contents()
+
+    async def get_text_by_text(self, needle: str) -> str:
+        return (await self._page.get_by_text(needle).first.text_content()) or ""
+
+    async def inner_text(self, selector: str) -> str:
+        return await self._page.locator(selector).first.inner_text()
+
+    async def outer_html_nth(self, selector: str, index: int) -> str:
+        loc = self._page.locator(selector).nth(index)
+        return await loc.evaluate("el => el.outerHTML")
src/scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py
@@ -0,0 +1,96 @@
+# scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py
+from __future__ import annotations
+
+import asyncio
+import time
+from logging_hj3415 import logger
+from ._base import _PlaywrightBase
+
+class PlaywrightWait(_PlaywrightBase):
+    """
+    The wait helpers only need _page, so inheriting _PlaywrightBase is not strictly
+    required; we assume self._page exists (PlaywrightBrowser supplies the base).
+    """
+
+    async def sleep_ms(self, ms: int) -> None:
+        await asyncio.sleep(ms / 1000)
+
+    async def wait_attached(self, selector: str, *, timeout_ms: int = 10_000) -> None:
+        await self._page.locator(selector).first.wait_for(state="attached", timeout=timeout_ms)
+
+    async def wait_visible(self, selector: str, *, timeout_ms: int = 10_000) -> None:
+        await self._page.locator(selector).first.wait_for(state="visible", timeout=timeout_ms)
+
+    async def wait_table_nth_ready(
+        self,
+        table_selector: str,
+        *,
+        index: int,
+        min_rows: int = 1,
+        timeout_ms: int = 20_000,
+        poll_ms: int = 200,
+    ) -> None:
+        logger.debug("wait for table nth_ready")
+        table = self._page.locator(table_selector).nth(index)
+        await table.wait_for(state="attached", timeout=timeout_ms)
+
+        rows = table.locator("tbody tr")
+        deadline = time.monotonic() + timeout_ms / 1000
+
+        cnt = 0
+        while time.monotonic() < deadline:
+            try:
+                cnt = await rows.count()
+            except Exception:
+                cnt = 0
+
+            if cnt >= min_rows:
+                return
+
+            await asyncio.sleep(poll_ms / 1000)
+
+        logger.warning(f"table rows timeout: last_cnt={cnt}, need>={min_rows}")
+        raise TimeoutError(f"nth table not ready: index={index}, rows<{min_rows}")
+
+    async def wait_table_text_changed(
+        self,
+        table_selector: str,
+        *,
+        index: int,
+        prev_text: str | None,
+        min_rows: int = 1,
+        min_lines: int = 50,
+        timeout_sec: float = 12.0,
+        poll_sec: float = 0.2,
+    ) -> str:
+        # 0) readiness gate based on row count
+        await self.wait_table_nth_ready(
+            table_selector,
+            index=index,
+            min_rows=min_rows,
+            timeout_ms=int(timeout_sec * 1000),
+            poll_ms=int(poll_sec * 1000),
+        )
+
+        start = time.monotonic()
+        last_text = ""
+
+        while True:
+            loc = self._page.locator(table_selector).nth(index)
+            try:
+                text = await loc.inner_text()
+            except Exception:
+                text = ""
+
+            lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+            is_valid = len(lines) >= min_lines
+
+            if is_valid:
+                last_text = text
+                if prev_text is None or text != prev_text:
+                    return text
+
+            if time.monotonic() - start >= timeout_sec:
+                return last_text
+
+            await asyncio.sleep(poll_sec)
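wait_table_text_changed returns the last valid snapshot instead of raising on timeout, which lets callers treat "no change" as a soft failure. A hypothetical tab-switch flow (selectors are illustrative, not taken from this package):

# Capture the table text, switch a tab, then wait for the text to differ.
before = await browser.wait_table_text_changed("table.gHead01", index=0, prev_text=None)
await browser.click("#frqTyp1")
after = await browser.wait_table_text_changed("table.gHead01", index=0, prev_text=before)
if after == before:
    pass  # timed out without a change; content may be identical or still loading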
src/scraper2_hj3415/app/adapters/out/sinks/memory_sink.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 from typing import Iterable
 
 from contracts_hj3415.nfs.nfs_dto import NfsDTO
-from contracts_hj3415.nfs.types import Endpoints
+from contracts_hj3415.nfs.types import Endpoint
 
 from scraper2_hj3415.app.adapters.out.sinks.store import InMemoryStore
 
@@ -13,13 +13,13 @@ class MemorySink:
     def __init__(self, store: InMemoryStore[NfsDTO]):
         self._store = store
 
-    async def write(self, dto: NfsDTO, *, endpoint: Endpoints) -> None:
+    async def write(self, dto: NfsDTO, *, endpoint: Endpoint) -> None:
         await self._store.put(endpoint, dto.code, dto)
 
     async def write_many(
         self,
        dtos: Iterable[NfsDTO],
         *,
-        endpoint: Endpoints,
+        endpoint: Endpoint,
     ) -> None:
         await self._store.put_many(endpoint, ((d.code, d) for d in dtos))
src/scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py
@@ -7,13 +7,13 @@ from typing import Iterable
 from pymongo.asynchronous.database import AsyncDatabase
 
 from contracts_hj3415.nfs.nfs_dto import NfsDTO
-from contracts_hj3415.nfs.types import Endpoints
+from contracts_hj3415.nfs.types import Endpoint
 
 from db2_hj3415.nfs.repo import (
-    upsert_latest_payload,
-    upsert_latest_payload_many,
-    insert_snapshot_payload,
-    insert_snapshots_payload_many,
+    upsert_latest,
+    upsert_latest_many,
+    insert_snapshot,
+    insert_snapshots_many,
 )
 
 
@@ -21,17 +21,17 @@ class MongoSink:
     def __init__(self, db: AsyncDatabase):
         self._db = db
 
-    async def write(self, dto: NfsDTO, *, endpoint: Endpoints) -> None:
+    async def write(self, dto: NfsDTO, *, endpoint: Endpoint) -> None:
         code = str(dto.code).strip()
         if not code:
             return
 
         payload = dict(dto.payload)  # defensive copy of the Mapping
 
-        await upsert_latest_payload(
+        await upsert_latest(
             self._db, endpoint=endpoint, code=code, payload=payload, asof=dto.asof
         )
-        await insert_snapshot_payload(
+        await insert_snapshot(
             self._db, endpoint=endpoint, code=code, payload=payload, asof=dto.asof
         )
 
@@ -39,7 +39,7 @@ class MongoSink:
         self,
         dtos: Iterable[NfsDTO],
         *,
-        endpoint: Endpoints,
+        endpoint: Endpoint,
     ) -> None:
         items: dict[str, dict] = {}
         ts: datetime | None = None
@@ -55,9 +55,9 @@ class MongoSink:
         if not items:
             return
 
-        await upsert_latest_payload_many(
+        await upsert_latest_many(
            self._db, endpoint=endpoint, items=items, asof=ts
        )
-        await insert_snapshots_payload_many(
+        await insert_snapshots_many(
            self._db, endpoint=endpoint, items=items, asof=ts
        )
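For callers of db2_hj3415, this is a mechanical rename that drops the _payload suffix; the keyword arguments appear unchanged. A before/after sketch inferred from the call sites above:

# 2.6.0
await upsert_latest_payload(db, endpoint=endpoint, code=code, payload=payload, asof=asof)
# 2.7.0
await upsert_latest(db, endpoint=endpoint, code=code, payload=payload, asof=asof)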