scraper2-hj3415 2.4.0.tar.gz → 2.6.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. {scraper2_hj3415-2.4.0 → scraper2_hj3415-2.6.0}/PKG-INFO +3 -1
  2. {scraper2_hj3415-2.4.0 → scraper2_hj3415-2.6.0}/pyproject.toml +6 -4
  3. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
  4. {scraper2_hj3415-2.4.0/src/scraper2 → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
  5. {scraper2_hj3415-2.4.0/src/scraper2 → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  6. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  7. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  8. {scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/memory → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  9. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
  10. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/composition.py +225 -0
  11. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/blocks.py +61 -0
  12. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/constants.py +33 -0
  13. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/doc.py +16 -0
  14. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/endpoint.py +11 -0
  15. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/series.py +11 -0
  16. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain/types.py +19 -0
  17. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  18. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  19. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  20. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  21. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
  22. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  23. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  24. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  25. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  26. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  27. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  28. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  29. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  30. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  31. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  32. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c103_parser.py +19 -0
  33. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c104_parser.py +23 -0
  34. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  35. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  36. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/__init__.py +0 -0
  37. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  38. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  39. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
  40. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  41. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  42. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  43. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  44. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/site/__init__.py +0 -0
  45. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
  46. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/__init__.py +0 -0
  47. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  48. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  49. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
  50. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
  51. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  52. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  53. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
  54. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/__init__.py +0 -0
  55. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  56. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  57. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  58. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  59. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  60. scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  61. scraper2_hj3415-2.4.0/src/scraper2/main.py → scraper2_hj3415-2.6.0/src/scraper2_hj3415/cli.py +40 -80
  62. scraper2_hj3415-2.4.0/src/scraper2/.DS_Store +0 -0
  63. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/.DS_Store +0 -0
  64. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/playwright/browser.py +0 -102
  65. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/.DS_Store +0 -0
  66. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  67. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  68. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  69. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  70. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  71. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  72. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  73. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  74. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  75. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  76. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  77. scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  78. scraper2_hj3415-2.4.0/src/scraper2/app/composition.py +0 -204
  79. scraper2_hj3415-2.4.0/src/scraper2/app/parsing/_converters.py +0 -85
  80. scraper2_hj3415-2.4.0/src/scraper2/app/parsing/_normalize.py +0 -134
  81. scraper2_hj3415-2.4.0/src/scraper2/app/parsing/c101_parser.py +0 -143
  82. scraper2_hj3415-2.4.0/src/scraper2/app/parsing/c103_parser.py +0 -128
  83. scraper2_hj3415-2.4.0/src/scraper2/app/parsing/c104_parser.py +0 -143
  84. scraper2_hj3415-2.4.0/src/scraper2/app/parsing/c106_parser.py +0 -153
  85. scraper2_hj3415-2.4.0/src/scraper2/app/parsing/c108_parser.py +0 -65
  86. scraper2_hj3415-2.4.0/src/scraper2/app/ports/browser/browser_factory_port.py +0 -11
  87. scraper2_hj3415-2.4.0/src/scraper2/app/ports/browser/browser_port.py +0 -22
  88. scraper2_hj3415-2.4.0/src/scraper2/app/ports/ingest_port.py +0 -14
  89. scraper2_hj3415-2.4.0/src/scraper2/app/ports/sinks/base_sink_port.py +0 -14
  90. scraper2_hj3415-2.4.0/src/scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  91. scraper2_hj3415-2.4.0/src/scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  92. scraper2_hj3415-2.4.0/src/scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  93. scraper2_hj3415-2.4.0/src/scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  94. scraper2_hj3415-2.4.0/src/scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  95. scraper2_hj3415-2.4.0/src/scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  96. scraper2_hj3415-2.4.0/src/scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  97. scraper2_hj3415-2.4.0/src/scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  98. scraper2_hj3415-2.4.0/src/scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  99. scraper2_hj3415-2.4.0/src/scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  100. scraper2_hj3415-2.4.0/src/scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  101. scraper2_hj3415-2.4.0/src/scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  102. scraper2_hj3415-2.4.0/src/scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  103. scraper2_hj3415-2.4.0/src/scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  104. scraper2_hj3415-2.4.0/src/scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  105. {scraper2_hj3415-2.4.0 → scraper2_hj3415-2.6.0}/LICENSE +0 -0
  106. {scraper2_hj3415-2.4.0 → scraper2_hj3415-2.6.0}/README.md +0 -0
  107. {scraper2_hj3415-2.4.0/src/scraper2 → scraper2_hj3415-2.6.0/src/scraper2_hj3415}/__init__.py +0 -0
  108. {scraper2_hj3415-2.4.0/src/scraper2/adapters/out → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app}/__init__.py +0 -0
  109. {scraper2_hj3415-2.4.0/src/scraper2/adapters/out/playwright → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters}/__init__.py +0 -0
  110. {scraper2_hj3415-2.4.0/src/scraper2/app → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  111. {scraper2_hj3415-2.4.0/src/scraper2/app/parsing → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  112. {scraper2_hj3415-2.4.0/src/scraper2/app/ports → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  113. {scraper2_hj3415-2.4.0/src/scraper2/app/ports/browser → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  114. {scraper2_hj3415-2.4.0/src/scraper2/app/ports/sinks → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/domain}/__init__.py +0 -0
  115. {scraper2_hj3415-2.4.0/src/scraper2/app/usecases → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing}/__init__.py +0 -0
  116. {scraper2_hj3415-2.4.0/src/scraper2/app/usecases/fetch → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  117. {scraper2_hj3415-2.4.0/src/scraper2/app/usecases/ingest → scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
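The practical upshot of this file list: 2.6.0 renames the importable module from scraper2 to scraper2_hj3415 (matching the distribution name), moves everything under a scraper2_hj3415.app package with a ports/adapters/usecases split, and replaces the ten per-endpoint sink classes (c101–c108, memory and mongo variants) with one generic MemorySink/MongoSink pair over NfsDTO. Downstream imports need updating accordingly; a before/after sketch based only on the moves shown above (re-exports, if any, are not visible in this diff):

    # 2.4.0
    from scraper2.adapters.out.playwright.browser import PlaywrightBrowser

    # 2.6.0
    from scraper2_hj3415.app.adapters.out.playwright.browser import PlaywrightBrowser
    from scraper2_hj3415.app.adapters.out.sinks.memory_sink import MemorySink
    from scraper2_hj3415.app.adapters.out.sinks.mongo_sink import MongoSink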
--- scraper2_hj3415-2.4.0/PKG-INFO
+++ scraper2_hj3415-2.6.0/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: scraper2-hj3415
- Version: 2.4.0
+ Version: 2.6.0
  Summary: Naver WiseReport scraper
  Keywords: example,demo
  Author-email: Hyungjin Kim <hj3415@gmail.com>
@@ -17,6 +17,8 @@ Requires-Dist: lxml>=6.0.2
  Requires-Dist: typer>=0.21.0
  Requires-Dist: db2-hj3415
  Requires-Dist: contracts-hj3415
+ Requires-Dist: common-hj3415
+ Requires-Dist: logging-hj3415

  # scraper2

--- scraper2_hj3415-2.4.0/pyproject.toml
+++ scraper2_hj3415-2.6.0/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "flit_core.buildapi"

  [project]
  name = "scraper2-hj3415" # PyPI name (hyphens allowed)
- version = "2.4.0"
+ version = "2.6.0"
  description = "Naver WiseReport scraper"
  readme = "README.md"
  requires-python = ">=3.11"
@@ -25,11 +25,13 @@ dependencies = [
      "typer>=0.21.0",
      "db2-hj3415",
      "contracts-hj3415",
+     "common-hj3415",
+     "logging-hj3415",
  ]

  [tool.flit.module]
- name = "scraper2"
- path = "src/scraper2"
+ name = "scraper2_hj3415"
+ path = "src/scraper2_hj3415"

  [project.scripts]
- scraper2 = "scraper2.main:app"
+ scraper2 = "scraper2_hj3415.cli:app"
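The console-script target moves from scraper2.main:app to scraper2_hj3415.cli:app, so the scraper2 command keeps working after the rename. Under flit, that string just has to resolve to a callable; with Typer (already a declared dependency) the module exposes a Typer instance named app. A minimal sketch of that shape, with a hypothetical ingest command since the real commands in cli.py are outside this hunk:

    # illustrative only; the real commands live in src/scraper2_hj3415/cli.py
    import typer

    app = typer.Typer()

    @app.command()
    def ingest(code: str, endpoint: str = "c101") -> None:
        """Hypothetical command: scrape one endpoint for one ticker code."""
        typer.echo(f"ingest {endpoint} for {code}")

    if __name__ == "__main__":
        app()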
--- /dev/null
+++ scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/playwright/browser.py
@@ -0,0 +1,373 @@
+ # scraper2_hj3415/app/adapters/out/playwright/browser.py
+ from __future__ import annotations
+
+ from typing import Any
+ from io import StringIO
+ import pandas as pd
+ from playwright.async_api import Page, TimeoutError as PwTimeoutError
+ import asyncio
+ import time
+ from logging_hj3415 import logger
+
+
+ class PlaywrightBrowser:
+     def __init__(self, page: Page):
+         self._page = page
+
+     async def _wait_for_network_quiet(self, *, timeout_ms: int = 10_000) -> None:
+         # "networkidle" may never fire on some sites, so a try block is safer.
+         logger.debug("wait for network quiet")
+         try:
+             await self._page.wait_for_load_state("networkidle", timeout=timeout_ms)
+         except Exception:
+             # Even without networkidle, the next step (anchor wait) matters more.
+             return
+
+     async def wait_table_nth_ready(
+         self,
+         table_selector: str,
+         *,
+         index: int,
+         min_rows: int = 1,
+         timeout_ms: int = 20_000,
+         poll_ms: int = 200,
+     ) -> None:
+         logger.debug("wait for table nth_ready")
+         table = self._page.locator(table_selector).nth(index)
+         await table.wait_for(state="attached", timeout=timeout_ms)
+
+         # html = await table.evaluate("el => el.outerHTML")
+         # logger.debug(f"TABLE HTML:\n{html}")
+
+         rows = table.locator("tbody tr")
+         deadline = time.monotonic() + timeout_ms / 1000
+
+         cnt = 0
+         while time.monotonic() < deadline:
+             try:
+                 cnt = await rows.count()
+             except Exception:
+                 cnt = 0
+
+             if cnt >= min_rows:
+                 return
+
+             await asyncio.sleep(poll_ms / 1000)
+
+         logger.warning(f"table rows timeout: last_cnt={cnt}, need>={min_rows}")
+         raise TimeoutError(f"nth table not ready: index={index}, rows<{min_rows}")
+
+     async def title(self) -> str:
+         return await self._page.title()
+
+     async def current_url(self) -> str:
+         return self._page.url
+
+     async def goto_and_wait_for_stable(
+         self, url: str, timeout_ms: int = 10_000
+     ) -> None:
+         logger.info(f"goto: {url}")
+         await self._page.goto(url, timeout=timeout_ms, wait_until="domcontentloaded")
+         await self._wait_for_network_quiet(timeout_ms=timeout_ms // 2)
+
+     async def reload(self, *, timeout_ms: int = 10_000) -> None:
+         logger.info("reload")
+         await self._page.reload(timeout=timeout_ms, wait_until="domcontentloaded")
+
+     async def sleep_ms(self, ms: int) -> None:
+         await asyncio.sleep(ms / 1000)
+
+     async def wait_attached(self, selector: str, *, timeout_ms: int = 10_000) -> None:
+         await self._page.locator(selector).first.wait_for(
+             state="attached", timeout=timeout_ms
+         )
+
+     async def wait_visible(self, selector: str, *, timeout_ms: int = 10_000) -> None:
+         await self._page.locator(selector).first.wait_for(
+             state="visible", timeout=timeout_ms
+         )
+
+     async def click(
+         self,
+         selector: str,
+         *,
+         index: int = 0,
+         timeout_ms: int = 4_000,
+         force: bool = False,
+     ) -> None:
+         loc = self._page.locator(selector).nth(index)
+         await loc.click(timeout=timeout_ms, force=force)
+
+     async def try_click(
+         self,
+         selector: str,
+         *,
+         index: int = 0,
+         timeout_ms: int = 1_500,
+         force: bool = False,
+     ) -> bool:
+         loc = self._page.locator(selector).nth(index)
+         try:
+             await loc.click(timeout=timeout_ms, trial=True, force=force)
+             return True
+         except PwTimeoutError:
+             return False
+
+     async def count(self, selector: str) -> int:
+         return await self._page.locator(selector).count()
+
+     async def scroll_into_view(self, selector: str, *, index: int = 0) -> None:
+         # Scroll automatically so the selected element becomes visible on screen.
+         await self._page.locator(selector).nth(index).scroll_into_view_if_needed()
+
+     async def text_content_first(self, selector: str) -> str:
+         # Return text_content() of the first element matching selector.
+         return (await self._page.locator(selector).first.text_content()) or ""
+
+     async def all_texts(self, selector: str) -> list[str]:
+         # Collect every element matched by selector via all_text_contents().
+         loc = self._page.locator(selector)
+         return await loc.all_text_contents()
+
+     async def get_text_by_text(self, needle: str) -> str:
+         """
+         Return the text_content of the first element on the page
+         whose text contains the given needle.
+
+         - Returns an empty string if no element matches
+         - Matches on substrings
+         """
+         return (await self._page.get_by_text(needle).first.text_content()) or ""
+
+     async def inner_text(self, selector: str) -> str:
+         """
+         Return the innerText of the first element matching selector.
+
+         - Waits until the element is attached to the DOM
+         - Based on the visible text (innerText)
+         """
+         return await self._page.locator(selector).first.inner_text()
+
+     async def outer_html_nth(self, selector: str, index: int) -> str:
+         """
+         Return the outerHTML of the index-th element matched by selector.
+
+         - index is 0-based
+         - Raises a Playwright error if the element does not exist
+         """
+         loc = self._page.locator(selector).nth(index)
+         # Playwright raises when index is out of range;
+         # wrap this in a friendlier error here if needed.
+         return await loc.evaluate("el => el.outerHTML")
+
+     async def wait_table_text_changed(
+         self,
+         table_selector: str,
+         *,
+         index: int,
+         prev_text: str | None,
+         min_rows: int = 1,
+         min_lines: int = 50,
+         timeout_sec: float = 12.0,
+         poll_sec: float = 0.2,
+     ) -> str:
+         """
+         Wait until the innerText of the given table (nth) reaches a
+         'valid' state and differs from prev_text, then return it.
+
+         Steps:
+         1) Ensure the table has at least loaded, based on the tbody row count
+         2) Poll innerText periodically and return when
+            - it has at least min_lines non-empty lines, and
+            - prev_text is None or the text differs from prev_text
+
+         Properties:
+         - Rules out the state where the DOM is attached but data is still empty
+         - Reliably detects actual data changes after a click/toggle
+         - On timeout, returns the last observed text
+
+         Returns:
+         - The changed innerText string
+         """
+
+         # 0) On first load, or while unstable, first secure 'ready' by row count.
+         await self.wait_table_nth_ready(
+             table_selector,
+             index=index,
+             min_rows=min_rows,
+             timeout_ms=int(timeout_sec * 1000),
+             poll_ms=int(poll_sec * 1000),
+         )
+
+         # 1) Then wait for 'valid + changed' based on the text.
+         start = time.monotonic()
+         last_text = ""
+
+         while True:
+             loc = self._page.locator(table_selector).nth(index)
+             try:
+                 text = await loc.inner_text()
+             except Exception:
+                 text = ""
+
+             lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
+             is_valid = len(lines) >= min_lines
+
+             if is_valid:
+                 last_text = text
+                 if prev_text is None or text != prev_text:
+                     return text
+
+             if time.monotonic() - start >= timeout_sec:
+                 return last_text
+
+             await asyncio.sleep(poll_sec)
+
+     async def is_attached(self, selector: str, *, index: int = 0) -> bool:
+         """
+         Return whether the nth(index) element for selector exists in the DOM (attached).
+         Returns False if the element is missing or access raises an exception.
+         """
+         try:
+             loc = self._page.locator(selector).nth(index)
+             return await loc.count() > 0
+         except Exception:
+             return False
+
+     async def computed_style(self, selector: str, *, index: int = 0, prop: str) -> str:
+         """
+         For the nth(index) element of selector, return the given property (prop)
+         of its computed CSS values (getComputedStyle) as a string.
+         (e.g. display, visibility, opacity)
+         """
+         loc = self._page.locator(selector).nth(index)
+         # Add wait_for(state="attached") here if attachment must be guaranteed.
+         return await loc.evaluate(
+             "(el, prop) => getComputedStyle(el)[prop] || ''", prop
+         )
+
+     async def count_in_nth(
+         self,
+         scope_selector: str,
+         *,
+         scope_index: int,
+         inner_selector: str,
+     ) -> int:
+         """
+         Return the number of elements matching inner_selector
+         within the nth(scope_index) element of scope_selector.
+         """
+         scope = self._page.locator(scope_selector).nth(scope_index)
+         return await scope.locator(inner_selector).count()
+
+     async def eval_in_nth_first(
+         self,
+         scope_selector: str,
+         *,
+         scope_index: int,
+         inner_selector: str,
+         expression: str,
+     ) -> Any:
+         """
+         Grab the inner_selector.first element inside the scope (nth) and run a JS expression on it.
+
+         expression examples:
+         - "el => window.getComputedStyle(el).display"
+         - "el => el.getAttribute('data-content') || ''"
+         - "el => el.innerText"
+         """
+         scope = self._page.locator(scope_selector).nth(scope_index)
+         loc = scope.locator(inner_selector).first
+
+         # None when nothing matches
+         if await loc.count() == 0:
+             return None
+
+         return await loc.evaluate(expression)
+
+     async def inner_text_in_nth(
+         self,
+         scope_selector: str,
+         *,
+         scope_index: int,
+         inner_selector: str,
+         inner_index: int = 0,
+         timeout_ms: int = 10_000,
+     ) -> str:
+         """
+         Return the innerText of inner_selector (nth) inside the scope (nth).
+         - innerText: rendering-based (reflects line breaks/hidden elements)
+         """
+         scope = self._page.locator(scope_selector).nth(scope_index)
+         inner = scope.locator(inner_selector).nth(inner_index)
+
+         # Handle elements that show up late
+         await inner.wait_for(state="attached", timeout=timeout_ms)
+
+         try:
+             return (await inner.inner_text()) or ""
+         except Exception:
+             # inner_text itself can fail mid-call (element removed/re-rendered), so stay safe
+             return ""
+
+     async def text_content_in_nth(
+         self,
+         scope_selector: str,
+         *,
+         scope_index: int,
+         inner_selector: str,
+         inner_index: int = 0,
+         timeout_ms: int = 10_000,
+     ) -> str:
+         """
+         Return the textContent of inner_selector (nth) inside the scope (nth).
+         - textContent: DOM-based (may include hidden text)
+         """
+         scope = self._page.locator(scope_selector).nth(scope_index)
+         inner = scope.locator(inner_selector).nth(inner_index)
+
+         await inner.wait_for(state="attached", timeout=timeout_ms)
+
+         try:
+             return (await inner.text_content()) or ""
+         except Exception:
+             return ""
+
+     async def table_records(
+         self,
+         table_selector: str,
+         *,
+         header: int | list[int] | None = 0,
+     ) -> list[dict[str, Any]]:
+         await self.wait_attached(table_selector)
+
+         table = self._page.locator(table_selector).first
+         html = await table.evaluate("el => el.outerHTML")
+
+         try:
+             df = pd.read_html(StringIO(html), header=header)[0]
+         except Exception as e:
+             raise RuntimeError(f"pd.read_html failed: {type(e).__name__}: {e}") from e
+
+         # Normalize only when the columns are strings
+         if all(isinstance(c, str) for c in df.columns):
+             if "항목" in df.columns:
+                 df["항목"] = (
+                     df["항목"].astype(str).str.replace("펼치기", "").str.strip()
+                 )
+
+             df.columns = (
+                 df.columns.astype(str)
+                 .str.replace("연간컨센서스보기", "", regex=False)
+                 .str.replace("연간컨센서스닫기", "", regex=False)
+                 .str.replace("(IFRS연결)", "", regex=False)
+                 .str.replace("(IFRS별도)", "", regex=False)
+                 .str.replace("(GAAP개별)", "", regex=False)
+                 .str.replace("(YoY)", "", regex=False)
+                 .str.replace("(QoQ)", "", regex=False)
+                 .str.replace("(E)", "", regex=False)
+                 .str.replace(".", "", regex=False)
+                 .str.strip()
+             )
+
+         return df.where(pd.notnull(df), None).to_dict(orient="records")
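PlaywrightBrowser wraps an already-created Page, so it can be exercised without the factory for quick checks. A minimal usage sketch, assuming Playwright's Chromium is installed; the URL and table selector are placeholders:

    import asyncio
    from playwright.async_api import async_playwright
    from scraper2_hj3415.app.adapters.out.playwright.browser import PlaywrightBrowser

    async def main() -> None:
        async with async_playwright() as pw:
            chromium = await pw.chromium.launch(headless=True)
            page = await chromium.new_page()
            browser = PlaywrightBrowser(page)

            await browser.goto_and_wait_for_stable("https://example.com/table-page")
            await browser.wait_table_nth_ready("table", index=0, min_rows=1)
            records = await browser.table_records("table")  # list[dict] via pd.read_html
            print(records[:2])

            await chromium.close()

    asyncio.run(main())

In production the class is constructed through BrowserFactoryPort and PlaywrightPageSession (next diff), which own the page lifecycle.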
--- scraper2_hj3415-2.4.0/src/scraper2/adapters/out/playwright/browser_factory.py
+++ scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/playwright/browser_factory.py
@@ -1,4 +1,4 @@
- # scraper2/adapters/out/playwright/browser_factory.py
+ # scraper2_hj3415/app/adapters/out/playwright/browser_factory.py
  from __future__ import annotations

  import asyncio
@@ -6,10 +6,10 @@ from contextlib import asynccontextmanager
  from dataclasses import dataclass
  from typing import AsyncIterator

- from scraper2.app.ports.browser.browser_factory_port import BrowserFactoryPort
- from scraper2.app.ports.browser.browser_port import BrowserPort
- from scraper2.adapters.out.playwright.session import PlaywrightPageSession
- from scraper2.adapters.out.playwright.browser import PlaywrightBrowser
+ from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+ from scraper2_hj3415.app.ports.browser.browser_port import BrowserPort
+ from scraper2_hj3415.app.adapters.out.playwright.session import PlaywrightPageSession
+ from scraper2_hj3415.app.adapters.out.playwright.browser import PlaywrightBrowser


  @dataclass
--- scraper2_hj3415-2.4.0/src/scraper2/adapters/out/playwright/session.py
+++ scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/playwright/session.py
@@ -1,4 +1,4 @@
- # src/scraper2/adapters/out/playwright/session.py
+ # scraper2_hj3415/app/adapters/out/playwright/session.py
  from __future__ import annotations

  import os
--- /dev/null
+++ scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/sinks/memory_sink.py
@@ -0,0 +1,25 @@
+ # scraper2_hj3415/app/adapters/out/sinks/memory_sink.py
+ from __future__ import annotations
+
+ from typing import Iterable
+
+ from contracts_hj3415.nfs.nfs_dto import NfsDTO
+ from contracts_hj3415.nfs.types import Endpoints
+
+ from scraper2_hj3415.app.adapters.out.sinks.store import InMemoryStore
+
+
+ class MemorySink:
+     def __init__(self, store: InMemoryStore[NfsDTO]):
+         self._store = store
+
+     async def write(self, dto: NfsDTO, *, endpoint: Endpoints) -> None:
+         await self._store.put(endpoint, dto.code, dto)
+
+     async def write_many(
+         self,
+         dtos: Iterable[NfsDTO],
+         *,
+         endpoint: Endpoints,
+     ) -> None:
+         await self._store.put_many(endpoint, ((d.code, d) for d in dtos))
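MemorySink and MongoSink expose the same write/write_many surface, so they are interchangeable behind the sink port. A sketch of the in-memory path; the real DTO type is contracts_hj3415's NfsDTO, whose constructor is outside this diff, so a duck-typed stand-in with the two attributes the sinks actually touch (code, payload) is used here:

    import asyncio
    from dataclasses import dataclass, field
    from typing import Any

    from scraper2_hj3415.app.adapters.out.sinks.store import InMemoryStore
    from scraper2_hj3415.app.adapters.out.sinks.memory_sink import MemorySink

    @dataclass
    class FakeDTO:
        # Stand-in for NfsDTO; MemorySink only reads .code on the write path.
        code: str
        payload: dict[str, Any] = field(default_factory=dict)

    async def main() -> None:
        store = InMemoryStore(max_history=100)
        sink = MemorySink(store)

        await sink.write(FakeDTO("005930"), endpoint="c101")
        await sink.write_many([FakeDTO("000660"), FakeDTO("035420")], endpoint="c101")
        await store.clear("c101")  # clear() is part of the store API shown below

    asyncio.run(main())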
--- /dev/null
+++ scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py
@@ -0,0 +1,63 @@
+ # scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py
+ from __future__ import annotations
+
+ from datetime import datetime
+ from typing import Iterable
+
+ from pymongo.asynchronous.database import AsyncDatabase
+
+ from contracts_hj3415.nfs.nfs_dto import NfsDTO
+ from contracts_hj3415.nfs.types import Endpoints
+
+ from db2_hj3415.nfs.repo import (
+     upsert_latest_payload,
+     upsert_latest_payload_many,
+     insert_snapshot_payload,
+     insert_snapshots_payload_many,
+ )
+
+
+ class MongoSink:
+     def __init__(self, db: AsyncDatabase):
+         self._db = db
+
+     async def write(self, dto: NfsDTO, *, endpoint: Endpoints) -> None:
+         code = str(dto.code).strip()
+         if not code:
+             return
+
+         payload = dict(dto.payload)  # defensive copy of the Mapping
+
+         await upsert_latest_payload(
+             self._db, endpoint=endpoint, code=code, payload=payload, asof=dto.asof
+         )
+         await insert_snapshot_payload(
+             self._db, endpoint=endpoint, code=code, payload=payload, asof=dto.asof
+         )
+
+     async def write_many(
+         self,
+         dtos: Iterable[NfsDTO],
+         *,
+         endpoint: Endpoints,
+     ) -> None:
+         items: dict[str, dict] = {}
+         ts: datetime | None = None
+
+         for dto in dtos:
+             code = str(dto.code).strip()
+             if not code:
+                 continue
+             items[code] = dict(dto.payload)
+             if ts is None:
+                 ts = dto.asof  # use the first dto's asof as the batch reference
+
+         if not items:
+             return
+
+         await upsert_latest_payload_many(
+             self._db, endpoint=endpoint, items=items, asof=ts
+         )
+         await insert_snapshots_payload_many(
+             self._db, endpoint=endpoint, items=items, asof=ts
+         )
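Each MongoSink.write is a dual write: an upsert into a "latest" view plus an append-only snapshot, both keyed by endpoint and code and stamped with dto.asof (write_many reuses the first DTO's asof for the whole batch, so one batch is assumed to share a single capture time). Wiring it up only needs an async database handle; a sketch assuming a local MongoDB and pymongo's asynchronous client, with a placeholder database name:

    import asyncio
    from pymongo import AsyncMongoClient
    from scraper2_hj3415.app.adapters.out.sinks.mongo_sink import MongoSink

    async def main() -> None:
        client = AsyncMongoClient("mongodb://localhost:27017")
        sink = MongoSink(client["nfs"])  # "nfs" is a placeholder database name

        # NfsDTO construction is omitted: its constructor lives in
        # contracts_hj3415 and is not shown in this diff.
        # await sink.write(dto, endpoint="c101")

        await client.close()

    asyncio.run(main())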
--- scraper2_hj3415-2.4.0/src/scraper2/adapters/out/sinks/memory/store.py
+++ scraper2_hj3415-2.6.0/src/scraper2_hj3415/app/adapters/out/sinks/store.py
@@ -1,29 +1,34 @@
- # scraper2/adapters/out/sinks/memory/store.py
+ # scraper2_hj3415/app/adapters/out/sinks/store.py
  from __future__ import annotations

  import asyncio
  from collections import defaultdict, deque
  from dataclasses import dataclass
- from typing import Any, Deque, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar
+ from typing import Deque, Dict, Generic, Iterable, List, Optional, Tuple, TypeVar

  T = TypeVar("T")  # DTO type

+
  @dataclass(frozen=True)
  class StoreStats:
      endpoint: str
      latest_count: int
      history_count: int

+
  class InMemoryStore(Generic[T]):
      """
      Stores DTOs per endpoint.
      - latest: endpoint -> key (usually the ticker code) -> dto
      - history: endpoint -> deque[dto] (the most recent max_history items)
      """
+
      def __init__(self, *, max_history: int = 2000):
          self._lock = asyncio.Lock()
          self._max_history = max_history
-         self._history: Dict[str, Deque[T]] = defaultdict(lambda: deque(maxlen=max_history))
+         self._history: Dict[str, Deque[T]] = defaultdict(
+             lambda: deque(maxlen=max_history)
+         )
          self._latest: Dict[str, Dict[str, T]] = defaultdict(dict)

      # ---------- write ----------
@@ -62,7 +67,11 @@ class InMemoryStore(Generic[T]):
          async with self._lock:
              latest_count = len(self._latest.get(endpoint, {}))
              history_count = len(self._history.get(endpoint, []))
-             return StoreStats(endpoint=endpoint, latest_count=latest_count, history_count=history_count)
+             return StoreStats(
+                 endpoint=endpoint,
+                 latest_count=latest_count,
+                 history_count=history_count,
+             )

      async def clear(self, endpoint: str | None = None) -> None:
          async with self._lock:
@@ -71,4 +80,4 @@
              self._latest.clear()
          else:
              self._history.pop(endpoint, None)
-             self._latest.pop(endpoint, None)
+             self._latest.pop(endpoint, None)
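A detail worth noting in InMemoryStore: per-endpoint history is a deque(maxlen=max_history), so the oldest snapshots are evicted silently once the cap is reached, while latest keeps exactly one DTO per key. The eviction behavior in isolation:

    from collections import defaultdict, deque

    max_history = 3
    history = defaultdict(lambda: deque(maxlen=max_history))

    for i in range(5):
        history["c101"].append(f"dto-{i}")

    # Only the 3 most recent entries survive; dto-0 and dto-1 were dropped.
    print(list(history["c101"]))  # ['dto-2', 'dto-3', 'dto-4']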