scraper2-hj3415 2.4.0__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (120)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
  3. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  4. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  5. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  6. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  7. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
  8. scraper2_hj3415/app/composition.py +225 -0
  9. scraper2_hj3415/app/domain/blocks.py +61 -0
  10. scraper2_hj3415/app/domain/constants.py +33 -0
  11. scraper2_hj3415/app/domain/doc.py +16 -0
  12. scraper2_hj3415/app/domain/endpoint.py +11 -0
  13. scraper2_hj3415/app/domain/series.py +11 -0
  14. scraper2_hj3415/app/domain/types.py +19 -0
  15. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  16. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  17. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  18. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  19. scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
  20. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  21. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  22. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  23. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  24. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  25. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  26. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  27. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  28. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  29. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  30. scraper2_hj3415/app/parsing/c103_parser.py +19 -0
  31. scraper2_hj3415/app/parsing/c104_parser.py +23 -0
  32. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  33. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  34. scraper2_hj3415/app/ports/__init__.py +0 -0
  35. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  36. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  37. scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
  38. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  39. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  40. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  41. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  42. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
  44. scraper2_hj3415/app/services/__init__.py +0 -0
  45. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  46. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  47. scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
  48. scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
  49. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  50. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  51. scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
  52. scraper2_hj3415/app/usecases/__init__.py +0 -0
  53. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  55. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  56. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  57. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  58. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  59. scraper2/main.py → scraper2_hj3415/cli.py +40 -80
  60. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
  61. scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
  62. scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
  63. scraper2/.DS_Store +0 -0
  64. scraper2/adapters/out/.DS_Store +0 -0
  65. scraper2/adapters/out/playwright/browser.py +0 -102
  66. scraper2/adapters/out/sinks/.DS_Store +0 -0
  67. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  68. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  69. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  70. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  71. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  72. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  73. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  74. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  75. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  76. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  77. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  78. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  79. scraper2/app/composition.py +0 -204
  80. scraper2/app/parsing/_converters.py +0 -85
  81. scraper2/app/parsing/_normalize.py +0 -134
  82. scraper2/app/parsing/c101_parser.py +0 -143
  83. scraper2/app/parsing/c103_parser.py +0 -128
  84. scraper2/app/parsing/c104_parser.py +0 -143
  85. scraper2/app/parsing/c106_parser.py +0 -153
  86. scraper2/app/parsing/c108_parser.py +0 -65
  87. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  88. scraper2/app/ports/browser/browser_port.py +0 -22
  89. scraper2/app/ports/ingest_port.py +0 -14
  90. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  91. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  92. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  93. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  94. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  95. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  96. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  97. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  98. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  99. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  100. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  101. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  102. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  103. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  104. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  105. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  106. scraper2_hj3415-2.4.0.dist-info/RECORD +0 -63
  107. scraper2_hj3415-2.4.0.dist-info/entry_points.txt +0 -3
  108. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  109. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  110. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  111. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  112. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  113. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  114. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  115. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  116. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  117. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  118. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  119. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
  120. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,183 @@
+ # scraper2_hj3415/app/usecases/fetch/fetch_c104.py
+ from __future__ import annotations
+
+ import asyncio
+ import random
+ from typing import Iterable, Any
+
+ from logging_hj3415 import logger
+ from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+ from scraper2_hj3415.app.ports.site.wisereport_port import WiseReportPort
+ from scraper2_hj3415.app.adapters.site.wisereport_playwright import WiseReportPlaywright
+
+ from scraper2_hj3415.app.parsing.c104_parser import (
+     parse_c104_current_table,
+     TABLE_XPATH,
+ )
+ from scraper2_hj3415.app.services.nfs_doc_builders import build_metrics_doc_from_parsed
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.doc import NfsDoc
+ from scraper2_hj3415.app.domain.blocks import BLOCK_KEYS_BY_ENDPOINT
+
+
+ BTN_SETS: dict[str, list[tuple[str, str]]] = {
+     "수익성y": [
+         ("수익성", 'xpath=//*[@id="val_tab1"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "성장성y": [
+         ("성장성", 'xpath=//*[@id="val_tab2"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "안정성y": [
+         ("안정성", 'xpath=//*[@id="val_tab3"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "활동성y": [
+         ("활동성", 'xpath=//*[@id="val_tab4"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "가치분석y": [
+         ("가치분석연간", 'xpath=//*[@id="frqTyp0_2"]'),
+         ("가치분석검색", 'xpath=//*[@id="hfinGubun2"]'),
+     ],
+     "수익성q": [
+         ("수익성", 'xpath=//*[@id="val_tab1"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "성장성q": [
+         ("성장성", 'xpath=//*[@id="val_tab2"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "안정성q": [
+         ("안정성", 'xpath=//*[@id="val_tab3"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "활동성q": [
+         ("활동성", 'xpath=//*[@id="val_tab4"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "가치분석q": [
+         ("가치분석분기", 'xpath=//*[@id="frqTyp1_2"]'),
+         ("가치분석검색", 'xpath=//*[@id="hfinGubun2"]'),
+     ],
+ }
+
+
+ def _is_value_analysis(key: str) -> bool:
+     return key.startswith("가치분석")
+
+
+ def _table_index_for_key(key: str) -> int:
+     # ✅ Only the 가치분석 (value analysis) views sit in table index 1; all others use 0.
+     return 1 if _is_value_analysis(key) else 0
+
+
+ class FetchC104:
+     def __init__(self, factory: BrowserFactoryPort):
+         self.factory = factory
+
+     async def _fetch_one(self, code: str, *, sleep_sec: float) -> NfsDoc | None:
+         async with self.factory.lease() as browser:
+             wr: WiseReportPort = WiseReportPlaywright(browser)
+
+             url = (
+                 "https://navercomp.wisereport.co.kr/v2/company/c1040001.aspx"
+                 f"?cn=&cmp_cd={code}"
+             )
+             await browser.goto_and_wait_for_stable(url, timeout_ms=10_000)
+
+             if sleep_sec > 0:
+                 await asyncio.sleep(sleep_sec + random.uniform(0, 1.0))
+
+             parsed: dict[str, list[dict[str, Any]]] = {}
+
+             # ✅ Track prev_text separately per table index; change detection stays stable that way.
+             prev_text_by_idx: dict[int, str | None] = {0: None, 1: None}
+
+             # ✅ Capture the initial baseline (try both indexes).
+             for idx in (0, 1):
+                 try:
+                     prev_text_by_idx[idx] = await browser.wait_table_text_changed(
+                         TABLE_XPATH,
+                         index=idx,
+                         prev_text=None,
+                         min_rows=5,
+                         min_lines=50,
+                         timeout_sec=10.0,
+                     )
+                 except Exception:
+                     prev_text_by_idx[idx] = None
+
+             for key, steps in BTN_SETS.items():
+                 idx = _table_index_for_key(key)
+
+                 # ✅ State transition (action)
+                 await wr.click_steps(steps, jitter_sec=0.6)
+                 await wr.ensure_yearly_consensus_open_in_table_nth(
+                     table_selector=TABLE_XPATH,
+                     table_index=idx,
+                 )
+
+                 # ✅ Wait for the data to change (action), tracked per idx
+                 prev_text_by_idx[idx] = await browser.wait_table_text_changed(
+                     TABLE_XPATH,
+                     index=idx,
+                     prev_text=prev_text_by_idx[idx],
+                     min_rows=5,
+                     min_lines=50,
+                     timeout_sec=12.0,
+                 )
+
+                 # ✅ Parse only the single idx table on the current screen
+                 try:
+                     parsed[key] = await parse_c104_current_table(
+                         browser,
+                         table_index=idx,
+                     )
+                 except Exception:
+                     parsed[key] = []
+
+             logger.debug(f"parsed:\n{parsed}")
+
+             block_keys = BLOCK_KEYS_BY_ENDPOINT[EndpointKind.C104]
+             if not parsed or all(not (parsed.get(str(bk)) or []) for bk in block_keys):
+                 logger.warning(
+                     f"c104 fetch: parsed result empty; return None | code={code}"
+                 )
+                 return None
+
+             doc = build_metrics_doc_from_parsed(
+                 code=code,
+                 endpoint_kind=EndpointKind.C104,
+                 parsed=parsed,
+                 block_keys=block_keys,
+                 item_key="항목",
+                 raw_label_key="항목_raw",
+                 keep_empty_blocks=True,
+             )
+             logger.debug(f"c104 doc: {doc}")
+             return doc
+
+     async def execute(self, code: str, *, sleep_sec: float = 2.0) -> NfsDoc | None:
+         return await self._fetch_one(code, sleep_sec=sleep_sec)
+
+     async def execute_many(
+         self,
+         codes: Iterable[str],
+         *,
+         sleep_sec: float = 2.0,
+     ) -> list[NfsDoc]:
+         results = await asyncio.gather(
+             *(self._fetch_one(c, sleep_sec=sleep_sec) for c in codes),
+             return_exceptions=False,
+         )
+         return [r for r in results if r is not None]
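The c104 flow above drives one page through ten tab/frequency combinations and re-parses the same two tables, so it keys its change detection on a per-table-index prev_text snapshot: click, wait until the table text differs from the last snapshot, then parse. The actual wait_table_text_changed lives in the new browser.py elsewhere in this diff; as a rough sketch of the contract the fetcher relies on (the read_text callable, poll_sec, and the helper name are invented for illustration, not the package's API):

import asyncio
from typing import Awaitable, Callable

async def wait_text_changed(
    read_text: Callable[[], Awaitable[str]],
    prev_text: str | None,
    *,
    min_lines: int = 50,
    timeout_sec: float = 10.0,
    poll_sec: float = 0.25,
) -> str:
    # Poll until the snapshot is non-empty, large enough, and different
    # from the previous snapshot; return it as the new baseline.
    deadline = asyncio.get_running_loop().time() + timeout_sec
    while True:
        text = await read_text()
        if text and text != prev_text and text.count("\n") + 1 >= min_lines:
            return text
        if asyncio.get_running_loop().time() >= deadline:
            raise TimeoutError("table text did not change in time")
        await asyncio.sleep(poll_sec)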
@@ -0,0 +1,90 @@
+ # scraper2_hj3415/app/usecases/fetch/fetch_c106.py
+ from __future__ import annotations
+
+ import asyncio
+ import random
+ from typing import Iterable, Any
+
+ from logging_hj3415 import logger
+ from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+ from scraper2_hj3415.app.parsing.c106_parser import (
+     parse_c106_header_codes,
+     parse_c106_current_table,
+ )
+ from scraper2_hj3415.app.services.nfs_doc_builders import build_metrics_doc_from_parsed
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.doc import NfsDoc
+ from scraper2_hj3415.app.domain.blocks import BLOCK_KEYS_BY_ENDPOINT
+
+
+ class FetchC106:
+     def __init__(self, factory: BrowserFactoryPort):
+         self.factory = factory
+
+     async def _fetch_one(self, code: str, *, sleep_sec: float) -> NfsDoc | None:
+         async with self.factory.lease() as browser:
+             # 1) Base page, used only to extract the header codes
+             url0 = (
+                 "https://navercomp.wisereport.co.kr/v2/company/c1060001.aspx"
+                 f"?cn=&cmp_cd={code}"
+             )
+             await browser.goto_and_wait_for_stable(url0, timeout_ms=10_000)
+
+             if sleep_sec > 0:
+                 await asyncio.sleep(sleep_sec + random.uniform(0, 1.0))
+
+             header_codes = await parse_c106_header_codes(browser)
+             if not header_codes:
+                 logger.warning(f"c106 fetch: header codes empty; code={code}")
+                 return None
+
+             base_url = (
+                 "https://navercomp.wisereport.co.kr/v2/company/cF6002.aspx"
+                 f"?cmp_cd={code}&finGubun=MAIN&sec_cd=FG000&frq="
+             )
+
+             parsed: dict[str, list[dict[str, Any]]] = {}
+
+             for frq in ("q", "y"):
+                 url = base_url + frq
+                 await browser.goto_and_wait_for_stable(url, timeout_ms=10_000)
+
+                 # Keep the existing jitter (promote to a policy if needed)
+                 await asyncio.sleep(0.5 + random.uniform(0, 0.3))
+
+                 parsed[frq] = await parse_c106_current_table(
+                     browser,
+                     columns=header_codes,
+                     table_selector="#cTB611",
+                     table_index=0,
+                     timeout_ms=10_000,
+                 )
+
+             logger.debug(f"parsed:\n{parsed}")
+
+             block_keys = BLOCK_KEYS_BY_ENDPOINT[EndpointKind.C106]
+             if not parsed or all(not (parsed.get(str(bk)) or []) for bk in block_keys):
+                 logger.warning(f"c106 fetch: parsed result empty; return None | code={code}")
+                 return None
+
+             doc = build_metrics_doc_from_parsed(
+                 code=code,
+                 endpoint_kind=EndpointKind.C106,
+                 parsed=parsed,
+                 block_keys=block_keys,
+                 item_key="항목",
+                 raw_label_key="항목_raw",
+                 keep_empty_blocks=True,
+             )
+             logger.debug(f"c106 doc: {doc}")
+             return doc
+
+     async def execute(self, code: str, *, sleep_sec: float = 2.0) -> NfsDoc | None:
+         return await self._fetch_one(code, sleep_sec=sleep_sec)
+
+     async def execute_many(self, codes: Iterable[str], *, sleep_sec: float = 2.0) -> list[NfsDoc]:
+         results = await asyncio.gather(
+             *(self._fetch_one(c, sleep_sec=sleep_sec) for c in codes),
+             return_exceptions=False,
+         )
+         return [r for r in results if r is not None]
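FetchC106 is a two-step fetch: the c1060001 base page is visited only to extract header codes, and the tables themselves are then loaded from the cF6002 endpoint once per frequency. The URL shapes below are taken verbatim from the code above; the helper wrapping them is illustrative only, not part of the package:

def c106_data_urls(code: str) -> dict[str, str]:
    # Quarterly ("q") and yearly ("y") cF6002 data URLs, as built in _fetch_one.
    base = (
        "https://navercomp.wisereport.co.kr/v2/company/cF6002.aspx"
        f"?cmp_cd={code}&finGubun=MAIN&sec_cd=FG000&frq="
    )
    return {frq: base + frq for frq in ("q", "y")}

# e.g. c106_data_urls("005930")["y"] ends with "...&frq=y"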
@@ -0,0 +1,59 @@
+ # scraper2_hj3415/app/usecases/fetch/fetch_c108.py
+ from __future__ import annotations
+
+ import asyncio
+ import random
+ from typing import Iterable
+
+ from logging_hj3415 import logger
+ from scraper2_hj3415.app.ports.browser.browser_factory_port import BrowserFactoryPort
+ from scraper2_hj3415.app.parsing.c108_parser import parse_c108_to_dict
+ from scraper2_hj3415.app.services.nfs_doc_builders import build_c108_doc_from_parsed
+
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.doc import NfsDoc
+ from scraper2_hj3415.app.domain.blocks import BLOCK_KEYS_BY_ENDPOINT
+
+
+ class FetchC108:
+     def __init__(self, factory: BrowserFactoryPort):
+         self.factory = factory
+
+     async def _fetch_one(self, code: str, *, sleep_sec: float) -> NfsDoc | None:
+         async with self.factory.lease() as browser:
+             url = f"https://navercomp.wisereport.co.kr/v2/company/c1080001.aspx?cn=&cmp_cd={code}"
+             await browser.goto_and_wait_for_stable(url, timeout_ms=10_000)
+
+             if sleep_sec > 0:
+                 await asyncio.sleep(sleep_sec + random.uniform(0, 1.0))
+
+             parsed = await parse_c108_to_dict(browser=browser)
+
+             logger.debug(f"parsed:\n{parsed}")
+             block_keys = BLOCK_KEYS_BY_ENDPOINT[EndpointKind.C108]
+             if not parsed or all(not (parsed.get(str(bk)) or []) for bk in block_keys):
+                 logger.warning(
+                     f"c108 fetch: parsed result empty; return None | code={code}"
+                 )
+                 return None
+
+             doc = build_c108_doc_from_parsed(
+                 code=code, parsed=parsed, keep_empty_blocks=True
+             )
+             logger.debug(f"c108 doc: {doc}")
+             return doc
+
+     async def execute(self, code: str, *, sleep_sec: float = 2.0) -> NfsDoc | None:
+         return await self._fetch_one(code, sleep_sec=sleep_sec)
+
+     async def execute_many(
+         self,
+         codes: Iterable[str],
+         *,
+         sleep_sec: float = 2.0,
+     ) -> list[NfsDoc]:
+         results = await asyncio.gather(
+             *(self._fetch_one(c, sleep_sec=sleep_sec) for c in codes),
+             return_exceptions=False,
+         )
+         return [r for r in results if r is not None]
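All three fetchers share the same execute_many shape: fan the per-code coroutines into asyncio.gather and drop None results. Note that return_exceptions=False means the first raised exception propagates to the caller rather than being filtered. A self-contained sketch of just that idiom (the helper name and toy coroutine are invented):

import asyncio
from typing import Awaitable, Iterable, TypeVar

T = TypeVar("T")

async def gather_present(coros: Iterable[Awaitable[T | None]]) -> list[T]:
    # Run concurrently; None results (e.g. "parsed empty") are dropped,
    # while any raised exception propagates immediately.
    results = await asyncio.gather(*coros, return_exceptions=False)
    return [r for r in results if r is not None]

async def main() -> None:
    async def maybe(n: int) -> int | None:
        await asyncio.sleep(0)
        return n if n % 2 else None  # even codes simulate an empty parse

    print(await gather_present(maybe(n) for n in range(5)))  # [1, 3]

asyncio.run(main())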
@@ -0,0 +1,290 @@
+ # scraper2_hj3415/app/services/nfs_doc_builders.py
+ from __future__ import annotations
+
+ from collections import defaultdict
+ from typing import Mapping, Iterable, Any
+
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.constants import BLOCK_KEYS_BY_ENDPOINT
+ from scraper2_hj3415.app.domain.doc import NfsDoc
+ from scraper2_hj3415.app.domain.blocks import MetricsBlock, RecordsBlock, KvBlock, BlockData
+ from scraper2_hj3415.app.domain.series import MetricSeries
+ from scraper2_hj3415.app.domain.types import LabelsMap, MetricKey, Period, Num, BlockKey, Records, Record
+
+ from common_hj3415.utils import nan_to_none
+
+
+ def is_all_none(row: dict[str, Any]) -> bool:
+     return all(v is None for v in row.values())
+
+
+ ParsedBlocks = Mapping[str, Any]  # parser output: "block_key(str) -> rows(list[dict])"
+
+
+ def build_metrics_block_and_labels_from_rows(
+     *,
+     endpoint_kind: EndpointKind,
+     block_key: BlockKey,
+     rows: Records,
+     item_key: str = "항목",
+     raw_label_key: str = "항목_raw",
+ ) -> tuple[MetricsBlock, LabelsMap]:
+     """
+     rows(list[dict]) -> (MetricsBlock, LabelsMap)
+     Shared builder for c103/c104/c106.
+
+     - The metric key is derived from item_key (usually '항목'),
+       while the period columns are kept as {Period: Num}.
+     - LabelsMap maps dto_key -> raw_label (the cleaned original label).
+     """
+     grouped: dict[str, list[tuple[dict[Period, Num], str]]] = defaultdict(list)
+
+     for r in rows:
+         item = r.get(item_key)
+         if not item:
+             continue
+
+         raw_label = r.get(raw_label_key)
+         if raw_label is None:
+             raw_label = item
+
+         per_map: dict[Period, Num] = {
+             str(k): nan_to_none(v)
+             for k, v in r.items()
+             if k not in (item_key, raw_label_key)
+         }
+
+         grouped[item].append((per_map, raw_label))
+
+     series_map: dict[MetricKey, MetricSeries] = {}
+     labels_map: LabelsMap = {}
+
+     for item, pairs in grouped.items():
+         if len(pairs) == 1:
+             per_map, raw_label = pairs[0]
+             series_map[item] = MetricSeries(key=item, values=per_map)
+             labels_map[item] = raw_label
+             continue
+
+         kept = [(per_map, raw) for (per_map, raw) in pairs if not is_all_none(per_map)]
+         if not kept:
+             continue
+
+         for idx, (per_map, raw_label) in enumerate(kept, start=1):
+             mk = item if idx == 1 else f"{item}_{idx}"
+             series_map[mk] = MetricSeries(key=mk, values=per_map)
+             labels_map[mk] = raw_label
+
+     block = MetricsBlock(
+         endpoint_kind=endpoint_kind, block_key=block_key, metrics=series_map
+     )
+     return block, labels_map
+
+
+ def build_metrics_doc_from_parsed(
+     *,
+     code: str,
+     endpoint_kind: EndpointKind,
+     parsed: ParsedBlocks,
+     block_keys: Iterable[BlockKey] | None = None,
+     item_key: str = "항목",
+     raw_label_key: str = "항목_raw",
+     keep_empty_blocks: bool = True,
+ ) -> NfsDoc:
+     """
+     Assemble the parser output (block key -> rows) into an NfsDoc of MetricsBlocks.
+     - Shared by c103/c104/c106.
+
+     keep_empty_blocks:
+     - True: the block is always created (it exists even with empty metrics)
+     - False: blocks with no rows or empty metrics are dropped
+     """
+     if block_keys is None:
+         block_keys = BLOCK_KEYS_BY_ENDPOINT[endpoint_kind]
+
+     blocks: dict[BlockKey, MetricsBlock] = {}
+     labels: dict[BlockKey, LabelsMap] = {}
+
+     for bk in block_keys:
+         rows = parsed.get(str(bk), []) or []
+         block, lm = build_metrics_block_and_labels_from_rows(
+             endpoint_kind=endpoint_kind,
+             block_key=bk,
+             rows=rows,
+             item_key=item_key,
+             raw_label_key=raw_label_key,
+         )
+
+         if not keep_empty_blocks and not block.metrics:
+             continue
+
+         blocks[bk] = block
+         labels[bk] = lm  # kept even when empty ({})
+
+     return NfsDoc(code=code, endpoint_kind=endpoint_kind, blocks=blocks, labels=labels)
+
+
+ def _as_records(x: Any) -> Records:
+     """
+     Safely coerce rows into Records (= Sequence[Record]).
+     - None or malformed values become an empty list
+     - Only list[dict] entries pass through; everything else is filtered out
+     """
+     if not x:
+         return []
+     if not isinstance(x, list):
+         return []
+
+     out: list[Record] = []
+     for it in x:
+         if isinstance(it, dict):
+             out.append(it)
+     return out
+
+
+ def build_records_block_from_rows(
+     *,
+     endpoint_kind: EndpointKind,
+     block_key: BlockKey,
+     rows: Records,
+ ) -> RecordsBlock:
+     """
+     rows(list[dict]) -> RecordsBlock
+     - Used for record-style blocks such as c108 (report lists, etc.)
+     """
+     # Assumes RecordsBlock also validates block_key in its __post_init__ (domain contract)
+     return RecordsBlock(endpoint_kind=endpoint_kind, block_key=block_key, rows=list(rows))
+
+
+ def build_c108_doc_from_parsed(
+     *,
+     code: str,
+     parsed: ParsedBlocks,
+     block_keys: Iterable[BlockKey] | None = None,
+     keep_empty_blocks: bool = True,
+ ) -> NfsDoc:
+     """
+     Assemble the c108 parser output (dict) into an NfsDoc of RecordsBlocks.
+
+     Rules:
+     - labels always exists (even as an empty dict)
+     - for c108, empty labels are considered normal
+
+     keep_empty_blocks:
+     - True: the block is always created (it exists even with empty rows)
+     - False: blocks with empty rows are dropped
+     """
+     endpoint_kind = EndpointKind.C108
+
+     if block_keys is None:
+         # usually a tuple such as ("리포트",)
+         block_keys = BLOCK_KEYS_BY_ENDPOINT[endpoint_kind]
+
+     blocks: dict[BlockKey, RecordsBlock] = {}
+     labels: dict[BlockKey, LabelsMap] = {}
+
+     for bk in block_keys:
+         rows = _as_records(parsed.get(str(bk)))
+         block = build_records_block_from_rows(
+             endpoint_kind=endpoint_kind,
+             block_key=bk,
+             rows=rows,
+         )
+
+         if not keep_empty_blocks and not block.rows:
+             continue
+
+         blocks[bk] = block
+         labels[bk] = {}  # empty labels are normal for c108
+
+     return NfsDoc(
+         code=code,
+         endpoint_kind=endpoint_kind,
+         blocks=blocks,
+         labels=labels,
+     )
+
+
+ def build_kv_block_from_mapping(
+     *,
+     endpoint_kind: EndpointKind,
+     block_key: BlockKey,
+     data: Mapping[str, Any] | None,
+     keep_empty: bool = True,
+ ) -> KvBlock | None:
+     """
+     Wrap a dict-shaped block in a KvBlock.
+     - Used for "structured dict" blocks such as the c101 summary, quote,
+       company overview, fundamentals, and earnings surprise.
+     """
+     if not data:
+         if not keep_empty:
+             return None
+         data = {}
+
+     return KvBlock(endpoint_kind=endpoint_kind, block_key=block_key, values=data)
+
+
+ ParsedC101 = Mapping[str, Any]  # c101 mixes dicts, lists, and nested dicts, so Any is practical
+
+
+ def build_c101_doc_from_parsed(
+     *,
+     code: str,
+     parsed: ParsedC101,
+     block_keys: Iterable[BlockKey] | None = None,
+     keep_empty_blocks: bool = True,
+ ) -> NfsDoc:
+     """
+     Assemble the c101 parser output (blocks of varying types) into an NfsDoc.
+     c101 follows the "empty labels are normal" rule, so labels always stays {}.
+     """
+     endpoint_kind = EndpointKind.C101
+
+     if block_keys is None:
+         block_keys = BLOCK_KEYS_BY_ENDPOINT[endpoint_kind]
+
+     blocks: dict[BlockKey, BlockData] = {}
+     labels: dict[BlockKey, LabelsMap] = {}
+
+     for bk in block_keys:
+         v = parsed.get(str(bk))
+
+         # c101 rule: labels may be empty; default to empty rather than populating
+         labels[bk] = {}
+
+         # list -> RecordsBlock
+         if isinstance(v, list):
+             # v is assumed to be list[dict[str, Any]] (the parser produces that shape)
+             rb = build_records_block_from_rows(
+                 endpoint_kind=endpoint_kind,
+                 block_key=bk,
+                 rows=v,  # type: ignore[arg-type] (removable once rows typing is aligned)
+             )
+             if rb is not None:
+                 blocks[bk] = rb
+             continue
+
+         # dict (possibly nested) -> KvBlock
+         if isinstance(v, dict):
+             kb = build_kv_block_from_mapping(
+                 endpoint_kind=endpoint_kind,
+                 block_key=bk,
+                 data=v,
+                 keep_empty=keep_empty_blocks,
+             )
+             if kb is not None:
+                 blocks[bk] = kb
+             continue
+
+         # None/other -> empty-block policy
+         if keep_empty_blocks:
+             kb = build_kv_block_from_mapping(
+                 endpoint_kind=endpoint_kind,
+                 block_key=bk,
+                 data={},
+                 keep_empty=True,
+             )
+             blocks[bk] = kb
+
+     return NfsDoc(code=code, endpoint_kind=endpoint_kind, blocks=blocks, labels=labels)
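The one subtle rule in build_metrics_block_and_labels_from_rows is how duplicate 항목 labels are keyed: a lone row keeps its item name even when every value is None, but when several rows share a name, the all-None duplicates are dropped first and the survivors are keyed item, item_2, item_3, ... A standalone sketch of just that rule (the helper name and sample rows are invented for illustration):

from collections import defaultdict

def dedup_metric_keys(rows, item_key="항목"):
    # Mirror the suffixing rule: group rows by item, drop all-None
    # duplicates when there is more than one, then suffix _2, _3, ...
    grouped = defaultdict(list)
    for r in rows:
        values = {k: v for k, v in r.items() if k != item_key}
        grouped[r[item_key]].append(values)

    out = {}
    for item, maps in grouped.items():
        kept = maps if len(maps) == 1 else [
            m for m in maps if any(v is not None for v in m.values())
        ]
        for i, m in enumerate(kept, start=1):
            out[item if i == 1 else f"{item}_{i}"] = m
    return out

rows = [
    {"항목": "ROE", "2023": 4.1},
    {"항목": "ROE", "2023": None},  # all-None duplicate -> dropped
    {"항목": "ROE", "2023": 9.9},   # becomes "ROE_2"
]
assert set(dedup_metric_keys(rows)) == {"ROE", "ROE_2"}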