scraper2-hj3415 2.4.0-py3-none-any.whl → 2.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
  3. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  4. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  5. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  6. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  7. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
  8. scraper2_hj3415/app/composition.py +225 -0
  9. scraper2_hj3415/app/domain/blocks.py +61 -0
  10. scraper2_hj3415/app/domain/constants.py +33 -0
  11. scraper2_hj3415/app/domain/doc.py +16 -0
  12. scraper2_hj3415/app/domain/endpoint.py +11 -0
  13. scraper2_hj3415/app/domain/series.py +11 -0
  14. scraper2_hj3415/app/domain/types.py +19 -0
  15. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  16. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  17. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  18. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  19. scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
  20. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  21. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  22. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  23. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  24. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  25. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  26. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  27. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  28. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  29. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  30. scraper2_hj3415/app/parsing/c103_parser.py +19 -0
  31. scraper2_hj3415/app/parsing/c104_parser.py +23 -0
  32. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  33. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  34. scraper2_hj3415/app/ports/__init__.py +0 -0
  35. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  36. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  37. scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
  38. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  39. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  40. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  41. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  42. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
  44. scraper2_hj3415/app/services/__init__.py +0 -0
  45. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  46. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  47. scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
  48. scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
  49. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  50. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  51. scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
  52. scraper2_hj3415/app/usecases/__init__.py +0 -0
  53. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  55. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  56. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  57. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  58. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  59. scraper2/main.py → scraper2_hj3415/cli.py +40 -80
  60. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
  61. scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
  62. scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
  63. scraper2/.DS_Store +0 -0
  64. scraper2/adapters/out/.DS_Store +0 -0
  65. scraper2/adapters/out/playwright/browser.py +0 -102
  66. scraper2/adapters/out/sinks/.DS_Store +0 -0
  67. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  68. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  69. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  70. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  71. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  72. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  73. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  74. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  75. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  76. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  77. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  78. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  79. scraper2/app/composition.py +0 -204
  80. scraper2/app/parsing/_converters.py +0 -85
  81. scraper2/app/parsing/_normalize.py +0 -134
  82. scraper2/app/parsing/c101_parser.py +0 -143
  83. scraper2/app/parsing/c103_parser.py +0 -128
  84. scraper2/app/parsing/c104_parser.py +0 -143
  85. scraper2/app/parsing/c106_parser.py +0 -153
  86. scraper2/app/parsing/c108_parser.py +0 -65
  87. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  88. scraper2/app/ports/browser/browser_port.py +0 -22
  89. scraper2/app/ports/ingest_port.py +0 -14
  90. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  91. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  92. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  93. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  94. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  95. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  96. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  97. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  98. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  99. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  100. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  101. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  102. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  103. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  104. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  105. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  106. scraper2_hj3415-2.4.0.dist-info/RECORD +0 -63
  107. scraper2_hj3415-2.4.0.dist-info/entry_points.txt +0 -3
  108. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  109. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  110. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  111. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  112. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  113. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  114. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  115. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  116. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  117. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  118. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  119. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
  120. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,122 @@
+ # scraper2_hj3415/app/usecases/ingest/ingest_c108.py
+ from __future__ import annotations
+
+ from datetime import datetime
+ from typing import Iterable, Optional, cast
+
+ from scraper2_hj3415.app.services.fetch.fetch_c108 import FetchC108
+ from scraper2_hj3415.app.ports.sinks.nfs_sink_port import NfsSinkPort
+ from common_hj3415.utils.time import utcnow
+
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.constants import get_block_keys
+ from scraper2_hj3415.app.domain.doc import NfsDoc
+ from scraper2_hj3415.app.domain.blocks import RecordsBlock
+
+ from contracts_hj3415.nfs.types import Endpoints
+
+ from contracts_hj3415.nfs.c108_dto import C108DTO, C108Payload, C108Blocks
+
+ from logging_hj3415 import logger
+
+ endpoint_kind = EndpointKind.C108
+ endpoint: Endpoints = cast(Endpoints, endpoint_kind.value)
+
+
+ def _to_list_of_dict(rows: object) -> list[dict]:
+     """
+     RecordsBlock.rows (Sequence[Mapping]) -> list[dict]
+     - coerce to dict so the sink/serialization layer stays safe
+     """
+     if not rows:
+         return []
+     out: list[dict] = []
+     if isinstance(rows, list):
+         for r in rows:
+             if isinstance(r, dict):
+                 out.append(r)
+             else:
+                 out.append(dict(r))  # a Mapping can be converted with dict()
+         return out
+
+     # general Sequence[Mapping] case
+     try:
+         for r in rows:  # type: ignore[assignment]
+             out.append(dict(r))  # assume Mapping
+     except Exception:
+         return []
+     return out
+
+
+ def c108_doc_to_dto(*, doc: NfsDoc, asof: datetime) -> C108DTO:
+     """
+     NfsDoc (domain) -> C108DTO (contracts envelope)
+
+     Rules:
+     - labels always exist (even as an empty dict)
+     - for c108, an empty labels is the normal case
+     - payload.blocks['리포트'] = list[dict]
+     """
+     if doc.endpoint_kind != EndpointKind.C108:
+         raise ValueError(f"c108_doc_to_dto expects C108 doc, got: {doc.endpoint_kind}")
+
+     # match the contracts payload structure: always build blocks/labels
+     blocks: C108Blocks = {"리포트": []}
+
+     # follow block_keys, although in practice only '리포트' matters here
+     for bk in get_block_keys(EndpointKind.C108):
+         if bk != "리포트":
+             continue
+
+         block = doc.blocks.get(bk)
+         if isinstance(block, RecordsBlock):
+             blocks["리포트"] = _to_list_of_dict(block.rows)
+         else:
+             # if the structure is mixed up, fall back safely to an empty value
+             blocks["리포트"] = []
+
+     payload: C108Payload = {"blocks": blocks}
+
+     return C108DTO(
+         code=doc.code,
+         asof=asof,
+         endpoint=endpoint,
+         payload=payload,
+     )
+
+
+ class IngestC108:
+     def __init__(self, fetch: FetchC108, sink: NfsSinkPort[C108DTO]):
+         self.fetch = fetch
+         self.sink = sink
+
+     async def execute(
+         self, code: str, *, sleep_sec: float = 2.0, asof: datetime | None = None
+     ) -> C108DTO:
+         asof = asof or utcnow()
+
+         doc = await self.fetch.execute(code, sleep_sec=sleep_sec)
+         logger.debug(f"doc:\n{doc}")
+         if doc is None:
+             raise RuntimeError(f"c108 fetch returned None: code={code}")
+
+         dto = c108_doc_to_dto(doc=doc, asof=asof)
+         logger.debug(f"dto:\n{dto}")
+
+         await self.sink.write(dto, endpoint=endpoint)
+         return dto
+
+     async def execute_many(
+         self,
+         codes: Iterable[str],
+         *,
+         sleep_sec: float = 2.0,
+         asof: Optional[datetime] = None,
+     ) -> list[C108DTO]:
+         batch_asof = asof or utcnow()
+
+         docs = await self.fetch.execute_many(codes, sleep_sec=sleep_sec)
+         dtos = [c108_doc_to_dto(doc=d, asof=batch_asof) for d in docs]
+         logger.debug(f"dtos:\n{dtos}")
+         await self.sink.write_many(dtos, endpoint=endpoint)
+         return dtos
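For orientation, the new use case can be driven directly once a fetcher and a sink are available. The sketch below is illustrative only: FakeSink is a hypothetical stand-in for an NfsSinkPort[C108DTO] implementation, and a real FetchC108 would normally be wired up through build_usecases in composition.py; only the execute(code, sleep_sec=...) and write(dto, endpoint=...) signatures are taken from the code above.

# illustrative sketch, not part of the package
class FakeSink:
    # stands in for NfsSinkPort[C108DTO]; the keyword-only 'endpoint' mirrors the calls above
    async def write(self, dto, *, endpoint):
        print("write", endpoint, dto.code)

    async def write_many(self, dtos, *, endpoint):
        for d in dtos:
            await self.write(d, endpoint=endpoint)


async def run_once(fetch):
    # 'fetch' would be a real FetchC108 instance obtained from the composition root
    ingest = IngestC108(fetch=fetch, sink=FakeSink())
    dto = await ingest.execute("005930", sleep_sec=2.0)
    return dto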
@@ -1,21 +1,32 @@
- # scraper2/main.py
+ # scraper2_hj3415/cli.py
  from __future__ import annotations

  import asyncio
- from typing import Any, Literal, cast
+ from typing import Any, cast, get_args

+
+ import time
  import typer
  from datetime import datetime, timezone

- from scraper2.app.composition import build_usecases
- from scraper2.app.ports.ingest_port import IngestPort
+ from db2_hj3415.nfs.repo import ensure_indexes
+ from db2_hj3415.settings import get_settings
+ from db2_hj3415.universe.repo import list_universe_codes
+
+ from scraper2_hj3415.app.composition import build_usecases
+ from scraper2_hj3415.app.ports.ingest.nfs_ingest_port import NfsIngestPort
+ from scraper2_hj3415.app.domain.types import Sink

- from logging_hj3415 import setup_logging
+ from contracts_hj3415.nfs.types import Endpoints
+ from contracts_hj3415.universe.types import UniverseNames
+
+ from logging_hj3415 import setup_logging, current_log_level, reset_logging, to_pretty_json

  setup_logging()
+ # In production, comment out the lines below and control the log level via environment variables
+ reset_logging("DEBUG")
+ print(f"Current log level - {current_log_level()}")

- Endpoint = Literal["c101", "c103", "c104", "c106", "c108", "all"]
- Sink = Literal["memory", "mongo"]

  app = typer.Typer(no_args_is_help=True)

@@ -30,64 +41,19 @@ app.add_typer(mi_app, name="mi")
  # small helpers
  # -------------------------

- def _endpoint_list(ep: Endpoint) -> list[str]:
-     return ["c101", "c103", "c104", "c106", "c108"] if ep == "all" else [ep]
-
-
- async def _maybe_await_close(obj: Any) -> None:
-     close = getattr(obj, "close", None)
-     if close is None:
-         return
-     out = close()
-     if asyncio.iscoroutine(out):
-         await out
-
+ def _endpoint_list(endpoint: str) -> list[str]:
+     if endpoint == "all":
+         return list(get_args(Endpoints))  # -> ["c101", "c103", "c104", "c106", "c108"]
+     return [endpoint]

  async def _mongo_bootstrap(db) -> None:
-     from db2.nfs import ensure_indexes
-     from db2.settings import get_settings
-
      s = get_settings()
      await ensure_indexes(db, snapshot_ttl_days=s.SNAPSHOT_TTL_DAYS)

-
- async def _load_codes_from_universe(db, *, universe: str) -> list[str]:
-     """
-     Load codes from the universe_latest stored in db2.universe.
-     (adjust to match whatever the db2 API is actually called)
-     """
-     from db2.universe import get_universe_latest  # the API you already have
-
-     doc = await get_universe_latest(db, universe=universe)
-     if not doc:
-         return []
-
-     # defensive: doc may look like {"items":[{code,name,...}, ...]} or {"payload":{"items":[...]}}
-     data = doc
-     if isinstance(data, dict) and "payload" in data and isinstance(data["payload"], dict):
-         data = data["payload"]
-     if isinstance(data, dict) and "items" in data:
-         data = data["items"]
-
-     if not isinstance(data, list):
-         return []
-
-     codes: list[str] = []
-     for row in data:
-         if not isinstance(row, dict):
-             continue
-         code = str(row.get("code") or "").strip()
-         if code:
-             codes.append(code)
-     return codes
-
-
- import time
-
  async def _run_ingest_with_progress(
      *,
      ucs: Any,
-     endpoint: Endpoint,
+     endpoint: str,
      codes: list[str],
      sleep_sec: float,
      show: bool,
@@ -109,7 +75,7 @@ async def _run_ingest_with_progress(
              yield xs[i:i + n]

      async def _run_one_endpoint(ep: str) -> None:
-         ingest_uc = cast(IngestPort, getattr(ucs.ingest, ep))
+         ingest_uc = cast(NfsIngestPort, getattr(ucs.ingest, ep))
          ok = 0
          fail = 0

@@ -138,18 +104,6 @@ async def _run_ingest_with_progress(
      elapsed = time.perf_counter() - t0  # ✅ end time
      typer.echo(f"\n⏱ elapsed time: {_format_elapsed(elapsed)}")

- def _dto_to_pretty(obj: Any) -> str:
-     # prefer pydantic v2
-     if hasattr(obj, "model_dump_json"):
-         return obj.model_dump_json(indent=2, by_alias=False)
-     if hasattr(obj, "model_dump"):
-         import json
-         return json.dumps(obj.model_dump(), ensure_ascii=False, indent=2)
-     # dict fallback
-     if isinstance(obj, dict):
-         import json
-         return json.dumps(obj, ensure_ascii=False, indent=2, default=str)
-     return str(obj)

  def _format_elapsed(sec: float) -> str:
      if sec < 60:
@@ -199,11 +153,11 @@ def _parse_asof(asof: str | None) -> datetime:

  @nfs_app.command("one")
  def nfs_one(
-     endpoint: Endpoint = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
+     endpoint: str = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
      code: str = typer.Argument(..., help="종목코드 (예: 005930)"),
      sleep_sec: float = typer.Option(2.0, "--sleep"),
      sink: Sink = typer.Option("memory", "--sink"),
-     show: bool = typer.Option(True, "--show/--no-show", help="결과 DTO 출력"),
+     show: bool = typer.Option(False, "--show/--no-show", help="결과 DTO 출력"),
      asof: str | None = typer.Option(None, "--asof", help="배치 기준시각(ISO8601, UTC 권장). 예: 2026-01-09T05:00:00Z"),
  ):
      code = code.strip()
@@ -211,7 +165,7 @@ def nfs_one(
          raise typer.BadParameter("code는 비어있을 수 없습니다.")

      async def _run():
-         ucs = build_usecases(sink_kind=sink)
+         ucs = build_usecases(sink=sink)

          if sink == "mongo":
              if ucs.db is None:
@@ -221,17 +175,23 @@ def nfs_one(
          try:
              run_asof = _parse_asof(asof)
              for ep in _endpoint_list(endpoint):
-                 ingest_uc = cast(IngestPort, getattr(ucs.ingest, ep))
+                 ingest_uc = cast(NfsIngestPort, getattr(ucs.ingest, ep))
                  results = await ingest_uc.execute_many([code], sleep_sec=sleep_sec, asof=run_asof)
                  dto = results[0] if results else None

                  typer.echo(f"\n=== ONE DONE: {ep} {code} ===")
-                 if not show:
+                 is_memory_sink = sink == "memory"
+                 should_show = show or is_memory_sink
+
+                 if not should_show:
                      continue
+
                  if dto is None:
                      typer.echo("(no result)")
                  else:
-                     typer.echo(_dto_to_pretty(dto))
+                     if is_memory_sink:
+                         typer.echo("memory result:")
+                     typer.echo(to_pretty_json(dto))
          finally:
              await ucs.aclose()

@@ -240,7 +200,7 @@ def nfs_one(

  @nfs_app.command("all")
  def nfs_all(
-     endpoint: Endpoint = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
+     endpoint: str = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
      universe: str = typer.Option("krx300", "--universe"),
      limit: int = typer.Option(0, "--limit", help="0=전체"),
      sleep_sec: float = typer.Option(2.0, "--sleep"),
@@ -251,12 +211,12 @@ def nfs_all(
      asof: str | None = typer.Option(None, "--asof", help="배치 기준시각(ISO8601). 예: 2026-01-09T05:00:00Z"),
  ):
      async def _run():
-         ucs = build_usecases(sink_kind=sink)
+         ucs = build_usecases(sink=sink)
          if ucs.db is None:
              raise RuntimeError("all 모드는 DB가 필요합니다. mongo sink로 ucs.db를 노출하세요.")
          await _mongo_bootstrap(ucs.db)

-         codes = await _load_codes_from_universe(ucs.db, universe=universe)
+         codes = await list_universe_codes(ucs.db, universe=cast(UniverseNames, universe))
          if not codes:
              raise RuntimeError(f"universe='{universe}' codes가 비었습니다. 먼저 krx sync로 universe를 채우세요.")

@@ -291,4 +251,4 @@ def mi():


  if __name__ == "__main__":
-     app()
+     app()
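A side note on the _endpoint_list rewrite above: instead of a hard-coded list, the CLI now expands the Endpoints alias with typing.get_args, so endpoints added to the contracts package are picked up automatically. A minimal sketch of the mechanism, assuming Endpoints is the Literal alias suggested by the inline comment in the diff (the real alias lives in contracts_hj3415.nfs.types):

# illustrative sketch only; the assumed Literal mirrors the comment shown in the diff
from typing import Literal, get_args

Endpoints = Literal["c101", "c103", "c104", "c106", "c108"]

def endpoint_list(endpoint: str) -> list[str]:
    # "all" expands to every member of the Literal; anything else passes through unchanged
    return list(get_args(Endpoints)) if endpoint == "all" else [endpoint]

assert endpoint_list("all") == ["c101", "c103", "c104", "c106", "c108"]
assert endpoint_list("c103") == ["c103"]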
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: scraper2-hj3415
- Version: 2.4.0
+ Version: 2.6.0
  Summary: Naver WiseReport scraper
  Keywords: example,demo
  Author-email: Hyungjin Kim <hj3415@gmail.com>
@@ -17,6 +17,8 @@ Requires-Dist: lxml>=6.0.2
  Requires-Dist: typer>=0.21.0
  Requires-Dist: db2-hj3415
  Requires-Dist: contracts-hj3415
+ Requires-Dist: common-hj3415
+ Requires-Dist: logging-hj3415

  # scraper2

@@ -0,0 +1,75 @@
+ scraper2_hj3415/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/cli.py,sha256=idwTPbCBcqof2YVRsBzyBMnl92o2b5JRyMbzOxOXKZA,8068
+ scraper2_hj3415/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/composition.py,sha256=t9NnNDLL6-VT28dT_kBtc_4Sd6qdR_cGKcY2_wmQHMI,6573
+ scraper2_hj3415/app/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/adapters/out/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/adapters/out/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/adapters/out/playwright/browser.py,sha256=mbPNLynkTHkNHLqOG7UxVo7UclfNkHbNyN4M9xrDGOk,13152
+ scraper2_hj3415/app/adapters/out/playwright/browser_factory.py,sha256=Tp30xdE4Z7cWHBCCgvaqD3tqY4qyg2ij_YmuLmj2WUg,3744
+ scraper2_hj3415/app/adapters/out/playwright/session.py,sha256=GLlpO0rLQwXnxX1GyaRxy7P2UsYrceNczfhUvx3pepE,3734
+ scraper2_hj3415/app/adapters/out/sinks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/adapters/out/sinks/memory_sink.py,sha256=VlywXRmWjfh33yuE7AuwKU4aPi_UJAZQMSWELlHJ-l8,722
+ scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py,sha256=GS61lV1gaxdxC9o7Z9APAPs9WnWL6-qa_V5Ls245yNk,1749
+ scraper2_hj3415/app/adapters/out/sinks/store.py,sha256=yerl6NvaacVHDGnQ1Obc31aQFxxXUCWfGq2DKKV8aTc,2826
+ scraper2_hj3415/app/adapters/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/adapters/site/wisereport_playwright.py,sha256=lpvhy6Rdy9j762KE0udhxgIVXi0n2pvDbmbs2y0WPHs,6970
+ scraper2_hj3415/app/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/domain/blocks.py,sha256=ddrGYo12hRI3h3mLsGzrM9RUZuEabqdfBWsx5l8jVGs,1871
+ scraper2_hj3415/app/domain/constants.py,sha256=-BxAKH6smxE5jEHJsXmf32erk0UlisRz8BkYHXYTSgA,1181
+ scraper2_hj3415/app/domain/doc.py,sha256=G9Ik6f-HiNZjiT26Obtrx6HTyYsTbMg1vKaBQ536hOI,483
+ scraper2_hj3415/app/domain/endpoint.py,sha256=8nwV0ybBYDKS8ULngNvc_dh-Pl4BqpMJrdKHDuThSTA,222
+ scraper2_hj3415/app/domain/series.py,sha256=KZqHqavPkL8p14F58wuUrOS-N9k0pKfKBdhDL6kTYU4,300
+ scraper2_hj3415/app/domain/types.py,sha256=_fPII4xFc_zTWuVm-V_SdaB092XR2OeS0sNdJVwE5t8,374
+ scraper2_hj3415/app/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/parsing/c101_parser.py,sha256=fA7-EUlG3GhLlX8l7m9p-Bn5O2WQY0H9h7IAIDvlmGQ,2049
+ scraper2_hj3415/app/parsing/c103_parser.py,sha256=Tn5tfpUh4lf6IMsKW9JAp2b12W32vKcF8FPfzEJYvtY,770
+ scraper2_hj3415/app/parsing/c104_parser.py,sha256=NfwFcgNZb6EqCcDYRoWiMq281CzATELJsDO_Np64Clk,814
+ scraper2_hj3415/app/parsing/c106_parser.py,sha256=DvXv_OndsWed4LOyv8D9bsCxbj8_6rYrfR6ICR-VBnM,4346
+ scraper2_hj3415/app/parsing/c108_parser.py,sha256=Kopf3CAV4W66YR6at7isoNV-C8A7-eCOQcqPs85FgEE,7572
+ scraper2_hj3415/app/parsing/_normalize/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/parsing/_normalize/label.py,sha256=yUtUalOlXuckzsQ7RXqdQ6F4Q2UmVBGr-zoEpZ6ryX0,2752
+ scraper2_hj3415/app/parsing/_normalize/table.py,sha256=V6I79gOHeRsyFiuIXyEy0Whg-pxZeDdTaGh0cdR5mrE,1464
+ scraper2_hj3415/app/parsing/_normalize/text.py,sha256=BnBZyaQiuydsQVUSDgIEn8JYYbxrM-3BZTmvNqiFK3g,683
+ scraper2_hj3415/app/parsing/_normalize/values.py,sha256=X5H7xprg5y8pkXilXCg_br7UIPjErcLHGDkOrxjctbk,1824
+ scraper2_hj3415/app/parsing/_tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/parsing/_tables/html_table.py,sha256=m44eA0rVhhk1QL74fgR2dbldKuF1_K5mJr2xiyb-55U,2393
+ scraper2_hj3415/app/parsing/c101/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/parsing/c101/_sise_normalizer.py,sha256=0wG9AZ6MYvWMf_UA9QK3_TbyDctdi3ldyKNKwmLJJic,3150
+ scraper2_hj3415/app/parsing/c101/company_overview.py,sha256=R6K44Rlw9Iv740SR2XZDxBkZxsLi_SNB6n3xruO3xqk,1391
+ scraper2_hj3415/app/parsing/c101/earning_surprise.py,sha256=QqiVVrdJuQXm25mvm0AsQB7gi461IiUbAoD8iCamUjg,7028
+ scraper2_hj3415/app/parsing/c101/fundamentals.py,sha256=3Fy6ya53NF-VoXa16GDpqUTdoFo2PIEjt5rjlXNa8sI,2875
+ scraper2_hj3415/app/parsing/c101/major_shareholders.py,sha256=sF1j1VNZSoIkQYHmuhMWSx52l00WDf6El2NkiRoXW0o,2047
+ scraper2_hj3415/app/parsing/c101/sise.py,sha256=Mky6pLWZ_LZkeUMHIPcZfrv0icTNxWEc7uTYKU2uJ0M,1314
+ scraper2_hj3415/app/parsing/c101/summary_cmp.py,sha256=hhBCtH7hgAKFUh4gr7J-mz-6c9NLT9KZODFY8LTG-Fc,2776
+ scraper2_hj3415/app/parsing/c101/yearly_consensus.py,sha256=FCLA-pYCMbQffNYOV6YbZ8GnPJjyZHmCSIKdw9-EPuI,5572
+ scraper2_hj3415/app/ports/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/ports/browser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/ports/browser/browser_factory_port.py,sha256=exG5XM3yen84lqsY0ggpZ58kcyCyoaMusDMooVxCGG0,353
+ scraper2_hj3415/app/ports/browser/browser_port.py,sha256=VUxMDsrKWBAi1TVD8b-PbsHkCSjZ9ZMgsr3eVmhb_1I,3628
+ scraper2_hj3415/app/ports/ingest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py,sha256=Ia8GByRLV9SPEU89I8A4V6tmUwsO8f7xM7-yVxnsA0o,658
+ scraper2_hj3415/app/ports/sinks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/ports/sinks/nfs_sink_port.py,sha256=8EOdFH5-703yc8XP47PZ0mmdizg4d_kAzuR1-G5b4MY,522
+ scraper2_hj3415/app/ports/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/ports/site/wisereport_port.py,sha256=ufbYJ1jyNkSmlIbV1CqI8BekxjgGgWvxj8yb73ZRUU0,663
+ scraper2_hj3415/app/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/services/nfs_doc_builders.py,sha256=bz2Is3xXlM98gtd1QQUqgeoqWk2EwbuWNCJw5oJXkg8,8874
+ scraper2_hj3415/app/services/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/services/fetch/fetch_c101.py,sha256=nl96RKvahNS9YtusE4e9AFMb0Wf7UHaW2X6qAAzYuCY,2228
+ scraper2_hj3415/app/services/fetch/fetch_c103.py,sha256=PKJwtZLJDfVNjj5l8nzx9EJOtqH1zR8OF-pOiUlMrRc,5148
+ scraper2_hj3415/app/services/fetch/fetch_c104.py,sha256=djl7Z_CW58gpl8naMezumc_dbjc3l-EX6OQJeWU_ZAw,6549
+ scraper2_hj3415/app/services/fetch/fetch_c106.py,sha256=UIAMaQB-FHXsvQ0ONI3fTO22M3npXrHILxO_4ayY2Lk,3462
+ scraper2_hj3415/app/services/fetch/fetch_c108.py,sha256=o9GesH66jqCygZIEbrHlVwHGGK1_2Ilc2_1b24fSC54,2236
+ scraper2_hj3415/app/usecases/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/usecases/ingest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/usecases/ingest/ingest_c101.py,sha256=nLhdG27NsuZqxqQI6bWRcFI2T7BZV-QXmUldql6qCo8,3760
+ scraper2_hj3415/app/usecases/ingest/ingest_c103.py,sha256=g36Kc1DK8v0AsyN_FCJq_A1Lv1EwrZ1vaS7qMCZTSEQ,5405
+ scraper2_hj3415/app/usecases/ingest/ingest_c104.py,sha256=CfS1HsnnHt4N-N3YhtiZjahCE-SryHpWXoTT4-AyCac,5959
+ scraper2_hj3415/app/usecases/ingest/ingest_c106.py,sha256=pmGTO-Obp5Lw-a27BVII0eyxyBAIiFVok7xiBqLEJXk,4301
+ scraper2_hj3415/app/usecases/ingest/ingest_c108.py,sha256=6iDhJAjzLnqHiNBKDvNLsQxrnJcG5I68SdE-dojUMJY,3817
+ scraper2_hj3415-2.6.0.dist-info/entry_points.txt,sha256=jNGmOvBmptIUr9_XUMQOH4s6jNKCU51jOhKd31gOe8c,52
+ scraper2_hj3415-2.6.0.dist-info/licenses/LICENSE,sha256=QBiVGQuKAESeCfQE344Ik2ex6g2zfYdu9WqrRWydxIs,1068
+ scraper2_hj3415-2.6.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+ scraper2_hj3415-2.6.0.dist-info/METADATA,sha256=Xw3KsE6SxvOs_CJvsXF2-x4ebhkAssA7qVD0uApNeBw,3516
+ scraper2_hj3415-2.6.0.dist-info/RECORD,,
@@ -0,0 +1,3 @@
+ [console_scripts]
+ scraper2=scraper2_hj3415.cli:app
+
scraper2/.DS_Store DELETED
Binary file
Binary file
@@ -1,102 +0,0 @@
- # src/scraper2/adapters/out/playwright/session.py
- from __future__ import annotations
- from typing import Any
- from io import StringIO
- import pandas as pd
- from playwright.async_api import Page
-
- class PlaywrightBrowser:
-     def __init__(self, page: Page):
-         self.page = page
-
-     async def goto(self, url: str, timeout_ms: int = 10_000) -> None:
-         await self.page.goto(url, timeout=timeout_ms)
-
-     async def title(self) -> str:
-         return await self.page.title()
-
-     async def current_url(self) -> str:
-         return self.page.url
-
-     async def wait(self, selector: str, timeout_ms: int = 10_000) -> None:
-         await self.page.wait_for_selector(selector, timeout=timeout_ms, state="attached")
-
-     async def text(self, selector: str) -> str:
-         await self.wait(selector)
-         return (await self.page.locator(selector).first.text_content()) or ""
-
-     async def texts(self, selector: str) -> list[str]:
-         await self.wait(selector)
-         loc = self.page.locator(selector)
-         items = await loc.all()
-         out: list[str] = []
-         for it in items:
-             out.append((await it.text_content()) or "")
-         return out
-
-     async def text_first_by_text(self, needle: str) -> str:
-         return (await self.page.get_by_text(needle).first.text_content()) or ""
-
-     async def inner_text(self, selector: str) -> str:
-         await self.wait(selector)
-         return await self.page.locator(selector).first.inner_text()
-
-     async def click(self, selector: str) -> None:
-         await self.wait(selector)
-         await self.page.locator(selector).click()
-
-     async def table_records(
-         self,
-         table_selector: str,
-         *,
-         header: int | list[int] = 0
-     ) -> list[dict[str, Any]]:
-         await self.wait(table_selector)
-
-         table = self.page.locator(table_selector).first
-         html = await table.evaluate("el => el.outerHTML")  # includes the <table> element
-         # print(html)
-
-         try:
-             df = pd.read_html(StringIO(html), header=header)[0]
-             # print(df.head(3))
-         except Exception as e:
-             # catch ImportError (missing lxml), ValueError, etc. here and surface the cause
-             raise RuntimeError(f"pd.read_html failed: {type(e).__name__}: {e}") from e
-
-         if header == 0:
-             if "항목" in df.columns:
-                 df["항목"] = df["항목"].astype(str).str.replace("펼치기", "").str.strip()
-
-             df.columns = (
-                 df.columns.astype(str)
-                 .str.replace("연간컨센서스보기", "", regex=False)
-                 .str.replace("연간컨센서스닫기", "", regex=False)
-                 .str.replace("(IFRS연결)", "", regex=False)
-                 .str.replace("(IFRS별도)", "", regex=False)
-                 .str.replace("(GAAP개별)", "", regex=False)
-                 .str.replace("(YoY)", "", regex=False)
-                 .str.replace("(QoQ)", "", regex=False)
-                 .str.replace("(E)", "", regex=False)
-                 .str.replace(".", "", regex=False)
-                 .str.strip()
-             )
-
-         # NaN -> None handling
-         records: list[dict[str, Any]] = df.where(pd.notnull(df), None).to_dict(orient="records")
-         return records
-
-     async def outer_html(self, selector: str) -> str:
-         loc = self.page.locator(selector).first
-         return await loc.evaluate("el => el.outerHTML")
-
-     async def all_texts(self, selector: str) -> list[str]:
-         # the selector can be css or "xpath=..."
-         loc = self.page.locator(selector)
-         return await loc.all_text_contents()
-
-     async def outer_html_nth(self, selector: str, index: int) -> str:
-         loc = self.page.locator(selector).nth(index)
-         # if index is out of range playwright raises an error;
-         # wrap it in a friendlier error here if needed.
-         return await loc.evaluate("el => el.outerHTML")
Binary file
@@ -1,15 +0,0 @@
- # scraper2/adapters/out/sinks/memory/__init__.py
- from .c101_memory_sink import MemoryC101Sink
- from .c103_memory_sink import MemoryC103Sink
- from .c104_memory_sink import MemoryC104Sink
- from .c106_memory_sink import MemoryC106Sink
- from .c108_memory_sink import MemoryC108Sink
-
- __all__ = [
-     "MemoryC101Sink",
-     "MemoryC103Sink",
-     "MemoryC104Sink",
-     "MemoryC106Sink",
-     "MemoryC108Sink",
- ]
-
@@ -1,26 +0,0 @@
- #scraper2/adapters/out/sinks/memory/c101_memory_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from contracts.nfs.c101 import C101DTO
- from scraper2.adapters.out.sinks.memory.store import InMemoryStore
- from scraper2.app.ports.sinks.c101_sink_port import C101SinkPort
-
- _ENDPOINT = "c101"
-
- class MemoryC101Sink(C101SinkPort):
-     def __init__(self, store: InMemoryStore[C101DTO]):
-         self._store = store
-
-     async def write(self, dto: C101DTO, *, asof: Optional[datetime] = None) -> None:
-         await self._store.put(_ENDPOINT, dto.코드, dto)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C101DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         await self._store.put_many(_ENDPOINT, ((d.코드, d) for d in dtos))
@@ -1,26 +0,0 @@
- # scraper2/adapters/out/sinks/memory/c103_memory_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from contracts.nfs.c103 import C103DTO
- from scraper2.adapters.out.sinks.memory.store import InMemoryStore
- from scraper2.app.ports.sinks.c103_sink_port import C103SinkPort
-
- _ENDPOINT = "c103"
-
- class MemoryC103Sink(C103SinkPort):
-     def __init__(self, store: InMemoryStore[C103DTO]):
-         self._store = store
-
-     async def write(self, dto: C103DTO, *, asof: Optional[datetime] = None) -> None:
-         await self._store.put(_ENDPOINT, dto.코드, dto)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C103DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         await self._store.put_many(_ENDPOINT, ((d.코드, d) for d in dtos))
@@ -1,26 +0,0 @@
- # scraper2/adapters/out/sinks/memory/c104_memory_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from contracts.nfs.c104 import C104DTO
- from scraper2.adapters.out.sinks.memory.store import InMemoryStore
- from scraper2.app.ports.sinks.c104_sink_port import C104SinkPort
-
- _ENDPOINT = "c104"
-
- class MemoryC104Sink(C104SinkPort):
-     def __init__(self, store: InMemoryStore[C104DTO]):
-         self._store = store
-
-     async def write(self, dto: C104DTO, *, asof: Optional[datetime] = None) -> None:
-         await self._store.put(_ENDPOINT, dto.코드, dto)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C104DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         await self._store.put_many(_ENDPOINT, ((d.코드, d) for d in dtos))
@@ -1,26 +0,0 @@
- #scraper2/adapters/out/sinks/memory/c106_memory_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from contracts.nfs.c106 import C106DTO
- from scraper2.adapters.out.sinks.memory.store import InMemoryStore
- from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
-
- _ENDPOINT = "c106"
-
- class MemoryC106Sink(C106SinkPort):
-     def __init__(self, store: InMemoryStore[C106DTO]):
-         self._store = store
-
-     async def write(self, dto: C106DTO, *, asof: Optional[datetime] = None) -> None:
-         await self._store.put(_ENDPOINT, dto.코드, dto)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C106DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         await self._store.put_many(_ENDPOINT, ((d.코드, d) for d in dtos))