scraper2-hj3415 2.4.1__py3-none-any.whl → 2.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
  3. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  4. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  5. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  6. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  7. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
  8. scraper2_hj3415/app/composition.py +225 -0
  9. scraper2_hj3415/app/domain/blocks.py +61 -0
  10. scraper2_hj3415/app/domain/constants.py +33 -0
  11. scraper2_hj3415/app/domain/doc.py +16 -0
  12. scraper2_hj3415/app/domain/endpoint.py +11 -0
  13. scraper2_hj3415/app/domain/series.py +11 -0
  14. scraper2_hj3415/app/domain/types.py +19 -0
  15. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  16. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  17. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  18. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  19. scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
  20. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  21. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  22. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  23. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  24. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  25. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  26. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  27. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  28. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  29. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  30. scraper2_hj3415/app/parsing/c103_parser.py +19 -0
  31. scraper2_hj3415/app/parsing/c104_parser.py +23 -0
  32. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  33. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  34. scraper2_hj3415/app/ports/__init__.py +0 -0
  35. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  36. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  37. scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
  38. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  39. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  40. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  41. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  42. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
  44. scraper2_hj3415/app/services/__init__.py +0 -0
  45. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  46. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  47. scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
  48. scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
  49. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  50. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  51. scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
  52. scraper2_hj3415/app/usecases/__init__.py +0 -0
  53. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  55. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  56. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  57. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  58. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  59. scraper2/main.py → scraper2_hj3415/cli.py +40 -70
  60. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
  61. scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
  62. scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
  63. scraper2/.DS_Store +0 -0
  64. scraper2/adapters/out/.DS_Store +0 -0
  65. scraper2/adapters/out/playwright/browser.py +0 -102
  66. scraper2/adapters/out/sinks/.DS_Store +0 -0
  67. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  68. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  69. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  70. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  71. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  72. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  73. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  74. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  75. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  76. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  77. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  78. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  79. scraper2/app/composition.py +0 -204
  80. scraper2/app/parsing/_converters.py +0 -85
  81. scraper2/app/parsing/_normalize.py +0 -134
  82. scraper2/app/parsing/c101_parser.py +0 -143
  83. scraper2/app/parsing/c103_parser.py +0 -128
  84. scraper2/app/parsing/c104_parser.py +0 -143
  85. scraper2/app/parsing/c106_parser.py +0 -153
  86. scraper2/app/parsing/c108_parser.py +0 -65
  87. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  88. scraper2/app/ports/browser/browser_port.py +0 -22
  89. scraper2/app/ports/ingest_port.py +0 -14
  90. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  91. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  92. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  93. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  94. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  95. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  96. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  97. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  98. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  99. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  100. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  101. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  102. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  103. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  104. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  105. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  106. scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
  107. scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
  108. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  109. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  110. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  111. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  112. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  113. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  114. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  115. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  116. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  117. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  118. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  119. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
  120. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,122 @@
+ # scraper2_hj3415/app/usecases/ingest/ingest_c108.py
+ from __future__ import annotations
+
+ from datetime import datetime
+ from typing import Iterable, Optional, cast
+
+ from scraper2_hj3415.app.services.fetch.fetch_c108 import FetchC108
+ from scraper2_hj3415.app.ports.sinks.nfs_sink_port import NfsSinkPort
+ from common_hj3415.utils.time import utcnow
+
+ from scraper2_hj3415.app.domain.endpoint import EndpointKind
+ from scraper2_hj3415.app.domain.constants import get_block_keys
+ from scraper2_hj3415.app.domain.doc import NfsDoc
+ from scraper2_hj3415.app.domain.blocks import RecordsBlock
+
+ from contracts_hj3415.nfs.types import Endpoints
+
+ from contracts_hj3415.nfs.c108_dto import C108DTO, C108Payload, C108Blocks
+
+ from logging_hj3415 import logger
+
+ endpoint_kind = EndpointKind.C108
+ endpoint: Endpoints = cast(Endpoints, endpoint_kind.value)
+
+
+ def _to_list_of_dict(rows: object) -> list[dict]:
+     """
+     RecordsBlock.rows (Sequence[Mapping]) -> list[dict]
+     - Coerce rows to dict so they are safe for sinks/serialization.
+     """
+     if not rows:
+         return []
+     out: list[dict] = []
+     if isinstance(rows, list):
+         for r in rows:
+             if isinstance(r, dict):
+                 out.append(r)
+             else:
+                 out.append(dict(r))  # a Mapping converts cleanly with dict()
+         return out
+
+     # general Sequence[Mapping] case
+     try:
+         for r in rows:  # type: ignore[assignment]
+             out.append(dict(r))  # assumed to be a Mapping
+     except Exception:
+         return []
+     return out
+
+
+ def c108_doc_to_dto(*, doc: NfsDoc, asof: datetime) -> C108DTO:
+     """
+     NfsDoc (domain) -> C108DTO (contracts envelope)
+
+     Rules:
+     - labels always exists (even as an empty dict)
+     - for c108 an empty labels is the normal case
+     - payload.blocks['리포트'] = list[dict]
+     """
+     if doc.endpoint_kind != EndpointKind.C108:
+         raise ValueError(f"c108_doc_to_dto expects C108 doc, got: {doc.endpoint_kind}")
+
+     # Build blocks/labels to match the contracts payload structure
+     blocks: C108Blocks = {"리포트": []}
+
+     # Follow block_keys, though in practice only '리포트' matters here
+     for bk in get_block_keys(EndpointKind.C108):
+         if bk != "리포트":
+             continue
+
+         block = doc.blocks.get(bk)
+         if isinstance(block, RecordsBlock):
+             blocks["리포트"] = _to_list_of_dict(block.rows)
+         else:
+             # If the structure is unexpected, fall back safely to an empty value
+             blocks["리포트"] = []
+
+     payload: C108Payload = {"blocks": blocks}
+
+     return C108DTO(
+         code=doc.code,
+         asof=asof,
+         endpoint=endpoint,
+         payload=payload,
+     )
+
+
+ class IngestC108:
+     def __init__(self, fetch: FetchC108, sink: NfsSinkPort[C108DTO]):
+         self.fetch = fetch
+         self.sink = sink
+
+     async def execute(
+         self, code: str, *, sleep_sec: float = 2.0, asof: datetime | None = None
+     ) -> C108DTO:
+         asof = asof or utcnow()
+
+         doc = await self.fetch.execute(code, sleep_sec=sleep_sec)
+         logger.debug(f"doc:\n{doc}")
+         if doc is None:
+             raise RuntimeError(f"c108 fetch returned None: code={code}")
+
+         dto = c108_doc_to_dto(doc=doc, asof=asof)
+         logger.debug(f"dto:\n{dto}")
+
+         await self.sink.write(dto, endpoint=endpoint)
+         return dto
+
+     async def execute_many(
+         self,
+         codes: Iterable[str],
+         *,
+         sleep_sec: float = 2.0,
+         asof: Optional[datetime] = None,
+     ) -> list[C108DTO]:
+         batch_asof = asof or utcnow()
+
+         docs = await self.fetch.execute_many(codes, sleep_sec=sleep_sec)
+         dtos = [c108_doc_to_dto(doc=d, asof=batch_asof) for d in docs]
+         logger.debug(f"dtos:\n{dtos}")
+         await self.sink.write_many(dtos, endpoint=endpoint)
+         return dtos
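
For orientation, here is a minimal sketch (not part of the package) of how the new ingest wiring can be driven programmatically, mirroring what the rewritten cli.py below does. It assumes build_usecases(sink="memory") wires FetchC108 together with the in-memory sink, and that C108DTO exposes its fields as attributes:

# Sketch only: drive the c108 ingest usecase the same way cli.py does.
import asyncio

from scraper2_hj3415.app.composition import build_usecases


async def main() -> None:
    ucs = build_usecases(sink="memory")
    try:
        # ucs.ingest.c108 is an IngestC108; execute_many returns a list of C108DTO
        dtos = await ucs.ingest.c108.execute_many(["005930"], sleep_sec=2.0)
        for dto in dtos:
            print(dto.code, dto.endpoint)  # assumes pydantic-style attribute access
    finally:
        await ucs.aclose()


if __name__ == "__main__":
    asyncio.run(main())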
@@ -1,21 +1,32 @@
- # scraper2/main.py
+ # scraper2_hj3415/cli.py
  from __future__ import annotations

  import asyncio
- from typing import Any, Literal, cast
+ from typing import Any, cast, get_args

+
+ import time
  import typer
  from datetime import datetime, timezone

- from scraper2.app.composition import build_usecases
- from scraper2.app.ports.ingest_port import IngestPort
+ from db2_hj3415.nfs.repo import ensure_indexes
+ from db2_hj3415.settings import get_settings
+ from db2_hj3415.universe.repo import list_universe_codes
+
+ from scraper2_hj3415.app.composition import build_usecases
+ from scraper2_hj3415.app.ports.ingest.nfs_ingest_port import NfsIngestPort
+ from scraper2_hj3415.app.domain.types import Sink

- from logging_hj3415 import setup_logging
+ from contracts_hj3415.nfs.types import Endpoints
+ from contracts_hj3415.universe.types import UniverseNames
+
+ from logging_hj3415 import setup_logging, current_log_level, reset_logging, to_pretty_json

  setup_logging()
+ # In production, comment out the lines below and control the log level via environment variables
+ reset_logging("DEBUG")
+ print(f"Current log level - {current_log_level()}")

- Endpoint = Literal["c101", "c103", "c104", "c106", "c108", "all"]
- Sink = Literal["memory", "mongo"]

  app = typer.Typer(no_args_is_help=True)

@@ -30,54 +41,19 @@ app.add_typer(mi_app, name="mi")
  # small helpers
  # -------------------------

- def _endpoint_list(ep: Endpoint) -> list[str]:
-     return ["c101", "c103", "c104", "c106", "c108"] if ep == "all" else [ep]
+ def _endpoint_list(endpoint: str) -> list[str]:
+     if endpoint == "all":
+         return list(get_args(Endpoints))  # -> ["c101", "c103", "c104", "c106", "c108"]
+     return [endpoint]

  async def _mongo_bootstrap(db) -> None:
-     from db2.nfs import ensure_indexes
-     from db2.settings import get_settings
-
      s = get_settings()
      await ensure_indexes(db, snapshot_ttl_days=s.SNAPSHOT_TTL_DAYS)

-
- async def _load_codes_from_universe(db, *, universe: str) -> list[str]:
-     """
-     Load codes from the universe_latest stored in db2.universe.
-     (Adjust to match the actual db2 API names.)
-     """
-     from db2.universe import get_universe_latest  # the API already available
-
-     doc = await get_universe_latest(db, universe=universe)
-     if not doc:
-         return []
-
-     # the doc may look like {"items":[{code,name,...}, ...]} or {"payload":{"items":[...]}}, so be defensive
-     data = doc
-     if isinstance(data, dict) and "payload" in data and isinstance(data["payload"], dict):
-         data = data["payload"]
-     if isinstance(data, dict) and "items" in data:
-         data = data["items"]
-
-     if not isinstance(data, list):
-         return []
-
-     codes: list[str] = []
-     for row in data:
-         if not isinstance(row, dict):
-             continue
-         code = str(row.get("code") or "").strip()
-         if code:
-             codes.append(code)
-     return codes
-
-
- import time
-
  async def _run_ingest_with_progress(
      *,
      ucs: Any,
-     endpoint: Endpoint,
+     endpoint: str,
      codes: list[str],
      sleep_sec: float,
      show: bool,
@@ -99,7 +75,7 @@ async def _run_ingest_with_progress(
              yield xs[i:i + n]

      async def _run_one_endpoint(ep: str) -> None:
-         ingest_uc = cast(IngestPort, getattr(ucs.ingest, ep))
+         ingest_uc = cast(NfsIngestPort, getattr(ucs.ingest, ep))

          ok = 0
          fail = 0
@@ -128,18 +104,6 @@ async def _run_ingest_with_progress(
      elapsed = time.perf_counter() - t0  # ✅ end time
      typer.echo(f"\n⏱ elapsed time: {_format_elapsed(elapsed)}")

- def _dto_to_pretty(obj: Any) -> str:
-     # prefer pydantic v2
-     if hasattr(obj, "model_dump_json"):
-         return obj.model_dump_json(indent=2, by_alias=False)
-     if hasattr(obj, "model_dump"):
-         import json
-         return json.dumps(obj.model_dump(), ensure_ascii=False, indent=2)
-     # dict fallback
-     if isinstance(obj, dict):
-         import json
-         return json.dumps(obj, ensure_ascii=False, indent=2, default=str)
-     return str(obj)

  def _format_elapsed(sec: float) -> str:
      if sec < 60:
@@ -189,11 +153,11 @@ def _parse_asof(asof: str | None) -> datetime:

  @nfs_app.command("one")
  def nfs_one(
-     endpoint: Endpoint = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
+     endpoint: str = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
      code: str = typer.Argument(..., help="종목코드 (예: 005930)"),
      sleep_sec: float = typer.Option(2.0, "--sleep"),
      sink: Sink = typer.Option("memory", "--sink"),
-     show: bool = typer.Option(True, "--show/--no-show", help="결과 DTO 출력"),
+     show: bool = typer.Option(False, "--show/--no-show", help="결과 DTO 출력"),
      asof: str | None = typer.Option(None, "--asof", help="배치 기준시각(ISO8601, UTC 권장). 예: 2026-01-09T05:00:00Z"),
  ):
      code = code.strip()
@@ -201,7 +165,7 @@ def nfs_one(
          raise typer.BadParameter("code는 비어있을 수 없습니다.")

      async def _run():
-         ucs = build_usecases(sink_kind=sink)
+         ucs = build_usecases(sink=sink)

          if sink == "mongo":
              if ucs.db is None:
@@ -211,17 +175,23 @@ def nfs_one(
          try:
              run_asof = _parse_asof(asof)
              for ep in _endpoint_list(endpoint):
-                 ingest_uc = cast(IngestPort, getattr(ucs.ingest, ep))
+                 ingest_uc = cast(NfsIngestPort, getattr(ucs.ingest, ep))
                  results = await ingest_uc.execute_many([code], sleep_sec=sleep_sec, asof=run_asof)
                  dto = results[0] if results else None

                  typer.echo(f"\n=== ONE DONE: {ep} {code} ===")
-                 if not show:
+                 is_memory_sink = sink == "memory"
+                 should_show = show or is_memory_sink
+
+                 if not should_show:
                      continue
+
                  if dto is None:
                      typer.echo("(no result)")
                  else:
-                     typer.echo(_dto_to_pretty(dto))
+                     if is_memory_sink:
+                         typer.echo("memory result:")
+                     typer.echo(to_pretty_json(dto))
          finally:
              await ucs.aclose()

@@ -230,7 +200,7 @@ def nfs_one(

  @nfs_app.command("all")
  def nfs_all(
-     endpoint: Endpoint = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
+     endpoint: str = typer.Argument(..., help="c101|c103|c104|c106|c108|all"),
      universe: str = typer.Option("krx300", "--universe"),
      limit: int = typer.Option(0, "--limit", help="0=전체"),
      sleep_sec: float = typer.Option(2.0, "--sleep"),
@@ -241,12 +211,12 @@ def nfs_all(
      asof: str | None = typer.Option(None, "--asof", help="배치 기준시각(ISO8601). 예: 2026-01-09T05:00:00Z"),
  ):
      async def _run():
-         ucs = build_usecases(sink_kind=sink)
+         ucs = build_usecases(sink=sink)
          if ucs.db is None:
              raise RuntimeError("all 모드는 DB가 필요합니다. mongo sink로 ucs.db를 노출하세요.")
          await _mongo_bootstrap(ucs.db)

-         codes = await _load_codes_from_universe(ucs.db, universe=universe)
+         codes = await list_universe_codes(ucs.db, universe=cast(UniverseNames, universe))
          if not codes:
              raise RuntimeError(f"universe='{universe}' codes가 비었습니다. 먼저 krx sync로 universe를 채우세요.")

@@ -281,4 +251,4 @@ def mi():


  if __name__ == "__main__":
-     app()
+     app()
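
The rewritten _endpoint_list leans on typing.get_args() to enumerate the Endpoints alias instead of a hard-coded list. A small self-contained illustration of that mechanism, assuming Endpoints in contracts_hj3415 is a typing.Literal alias (names here are illustrative):

# Illustration only: get_args() on a Literal returns its member values.
from typing import Literal, get_args

Endpoints = Literal["c101", "c103", "c104", "c106", "c108"]  # assumed shape of the contracts alias


def endpoint_list(endpoint: str) -> list[str]:
    # "all" expands to every literal member; anything else passes through as a single item
    return list(get_args(Endpoints)) if endpoint == "all" else [endpoint]


print(endpoint_list("all"))   # ['c101', 'c103', 'c104', 'c106', 'c108']
print(endpoint_list("c108"))  # ['c108']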
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: scraper2-hj3415
- Version: 2.4.1
+ Version: 2.6.0
  Summary: Naver WiseReport scraper
  Keywords: example,demo
  Author-email: Hyungjin Kim <hj3415@gmail.com>
@@ -17,6 +17,8 @@ Requires-Dist: lxml>=6.0.2
  Requires-Dist: typer>=0.21.0
  Requires-Dist: db2-hj3415
  Requires-Dist: contracts-hj3415
+ Requires-Dist: common-hj3415
+ Requires-Dist: logging-hj3415

  # scraper2

@@ -0,0 +1,75 @@
+ scraper2_hj3415/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/cli.py,sha256=idwTPbCBcqof2YVRsBzyBMnl92o2b5JRyMbzOxOXKZA,8068
+ scraper2_hj3415/app/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/composition.py,sha256=t9NnNDLL6-VT28dT_kBtc_4Sd6qdR_cGKcY2_wmQHMI,6573
+ scraper2_hj3415/app/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/adapters/out/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/adapters/out/playwright/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/adapters/out/playwright/browser.py,sha256=mbPNLynkTHkNHLqOG7UxVo7UclfNkHbNyN4M9xrDGOk,13152
+ scraper2_hj3415/app/adapters/out/playwright/browser_factory.py,sha256=Tp30xdE4Z7cWHBCCgvaqD3tqY4qyg2ij_YmuLmj2WUg,3744
+ scraper2_hj3415/app/adapters/out/playwright/session.py,sha256=GLlpO0rLQwXnxX1GyaRxy7P2UsYrceNczfhUvx3pepE,3734
+ scraper2_hj3415/app/adapters/out/sinks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/adapters/out/sinks/memory_sink.py,sha256=VlywXRmWjfh33yuE7AuwKU4aPi_UJAZQMSWELlHJ-l8,722
+ scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py,sha256=GS61lV1gaxdxC9o7Z9APAPs9WnWL6-qa_V5Ls245yNk,1749
+ scraper2_hj3415/app/adapters/out/sinks/store.py,sha256=yerl6NvaacVHDGnQ1Obc31aQFxxXUCWfGq2DKKV8aTc,2826
+ scraper2_hj3415/app/adapters/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/adapters/site/wisereport_playwright.py,sha256=lpvhy6Rdy9j762KE0udhxgIVXi0n2pvDbmbs2y0WPHs,6970
+ scraper2_hj3415/app/domain/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/domain/blocks.py,sha256=ddrGYo12hRI3h3mLsGzrM9RUZuEabqdfBWsx5l8jVGs,1871
+ scraper2_hj3415/app/domain/constants.py,sha256=-BxAKH6smxE5jEHJsXmf32erk0UlisRz8BkYHXYTSgA,1181
+ scraper2_hj3415/app/domain/doc.py,sha256=G9Ik6f-HiNZjiT26Obtrx6HTyYsTbMg1vKaBQ536hOI,483
+ scraper2_hj3415/app/domain/endpoint.py,sha256=8nwV0ybBYDKS8ULngNvc_dh-Pl4BqpMJrdKHDuThSTA,222
+ scraper2_hj3415/app/domain/series.py,sha256=KZqHqavPkL8p14F58wuUrOS-N9k0pKfKBdhDL6kTYU4,300
+ scraper2_hj3415/app/domain/types.py,sha256=_fPII4xFc_zTWuVm-V_SdaB092XR2OeS0sNdJVwE5t8,374
+ scraper2_hj3415/app/parsing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/parsing/c101_parser.py,sha256=fA7-EUlG3GhLlX8l7m9p-Bn5O2WQY0H9h7IAIDvlmGQ,2049
+ scraper2_hj3415/app/parsing/c103_parser.py,sha256=Tn5tfpUh4lf6IMsKW9JAp2b12W32vKcF8FPfzEJYvtY,770
+ scraper2_hj3415/app/parsing/c104_parser.py,sha256=NfwFcgNZb6EqCcDYRoWiMq281CzATELJsDO_Np64Clk,814
+ scraper2_hj3415/app/parsing/c106_parser.py,sha256=DvXv_OndsWed4LOyv8D9bsCxbj8_6rYrfR6ICR-VBnM,4346
+ scraper2_hj3415/app/parsing/c108_parser.py,sha256=Kopf3CAV4W66YR6at7isoNV-C8A7-eCOQcqPs85FgEE,7572
+ scraper2_hj3415/app/parsing/_normalize/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/parsing/_normalize/label.py,sha256=yUtUalOlXuckzsQ7RXqdQ6F4Q2UmVBGr-zoEpZ6ryX0,2752
+ scraper2_hj3415/app/parsing/_normalize/table.py,sha256=V6I79gOHeRsyFiuIXyEy0Whg-pxZeDdTaGh0cdR5mrE,1464
+ scraper2_hj3415/app/parsing/_normalize/text.py,sha256=BnBZyaQiuydsQVUSDgIEn8JYYbxrM-3BZTmvNqiFK3g,683
+ scraper2_hj3415/app/parsing/_normalize/values.py,sha256=X5H7xprg5y8pkXilXCg_br7UIPjErcLHGDkOrxjctbk,1824
+ scraper2_hj3415/app/parsing/_tables/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/parsing/_tables/html_table.py,sha256=m44eA0rVhhk1QL74fgR2dbldKuF1_K5mJr2xiyb-55U,2393
+ scraper2_hj3415/app/parsing/c101/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/parsing/c101/_sise_normalizer.py,sha256=0wG9AZ6MYvWMf_UA9QK3_TbyDctdi3ldyKNKwmLJJic,3150
+ scraper2_hj3415/app/parsing/c101/company_overview.py,sha256=R6K44Rlw9Iv740SR2XZDxBkZxsLi_SNB6n3xruO3xqk,1391
+ scraper2_hj3415/app/parsing/c101/earning_surprise.py,sha256=QqiVVrdJuQXm25mvm0AsQB7gi461IiUbAoD8iCamUjg,7028
+ scraper2_hj3415/app/parsing/c101/fundamentals.py,sha256=3Fy6ya53NF-VoXa16GDpqUTdoFo2PIEjt5rjlXNa8sI,2875
+ scraper2_hj3415/app/parsing/c101/major_shareholders.py,sha256=sF1j1VNZSoIkQYHmuhMWSx52l00WDf6El2NkiRoXW0o,2047
+ scraper2_hj3415/app/parsing/c101/sise.py,sha256=Mky6pLWZ_LZkeUMHIPcZfrv0icTNxWEc7uTYKU2uJ0M,1314
+ scraper2_hj3415/app/parsing/c101/summary_cmp.py,sha256=hhBCtH7hgAKFUh4gr7J-mz-6c9NLT9KZODFY8LTG-Fc,2776
+ scraper2_hj3415/app/parsing/c101/yearly_consensus.py,sha256=FCLA-pYCMbQffNYOV6YbZ8GnPJjyZHmCSIKdw9-EPuI,5572
+ scraper2_hj3415/app/ports/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/ports/browser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/ports/browser/browser_factory_port.py,sha256=exG5XM3yen84lqsY0ggpZ58kcyCyoaMusDMooVxCGG0,353
+ scraper2_hj3415/app/ports/browser/browser_port.py,sha256=VUxMDsrKWBAi1TVD8b-PbsHkCSjZ9ZMgsr3eVmhb_1I,3628
+ scraper2_hj3415/app/ports/ingest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py,sha256=Ia8GByRLV9SPEU89I8A4V6tmUwsO8f7xM7-yVxnsA0o,658
+ scraper2_hj3415/app/ports/sinks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/ports/sinks/nfs_sink_port.py,sha256=8EOdFH5-703yc8XP47PZ0mmdizg4d_kAzuR1-G5b4MY,522
+ scraper2_hj3415/app/ports/site/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/ports/site/wisereport_port.py,sha256=ufbYJ1jyNkSmlIbV1CqI8BekxjgGgWvxj8yb73ZRUU0,663
+ scraper2_hj3415/app/services/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/services/nfs_doc_builders.py,sha256=bz2Is3xXlM98gtd1QQUqgeoqWk2EwbuWNCJw5oJXkg8,8874
+ scraper2_hj3415/app/services/fetch/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/services/fetch/fetch_c101.py,sha256=nl96RKvahNS9YtusE4e9AFMb0Wf7UHaW2X6qAAzYuCY,2228
+ scraper2_hj3415/app/services/fetch/fetch_c103.py,sha256=PKJwtZLJDfVNjj5l8nzx9EJOtqH1zR8OF-pOiUlMrRc,5148
+ scraper2_hj3415/app/services/fetch/fetch_c104.py,sha256=djl7Z_CW58gpl8naMezumc_dbjc3l-EX6OQJeWU_ZAw,6549
+ scraper2_hj3415/app/services/fetch/fetch_c106.py,sha256=UIAMaQB-FHXsvQ0ONI3fTO22M3npXrHILxO_4ayY2Lk,3462
+ scraper2_hj3415/app/services/fetch/fetch_c108.py,sha256=o9GesH66jqCygZIEbrHlVwHGGK1_2Ilc2_1b24fSC54,2236
+ scraper2_hj3415/app/usecases/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/usecases/ingest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scraper2_hj3415/app/usecases/ingest/ingest_c101.py,sha256=nLhdG27NsuZqxqQI6bWRcFI2T7BZV-QXmUldql6qCo8,3760
+ scraper2_hj3415/app/usecases/ingest/ingest_c103.py,sha256=g36Kc1DK8v0AsyN_FCJq_A1Lv1EwrZ1vaS7qMCZTSEQ,5405
+ scraper2_hj3415/app/usecases/ingest/ingest_c104.py,sha256=CfS1HsnnHt4N-N3YhtiZjahCE-SryHpWXoTT4-AyCac,5959
+ scraper2_hj3415/app/usecases/ingest/ingest_c106.py,sha256=pmGTO-Obp5Lw-a27BVII0eyxyBAIiFVok7xiBqLEJXk,4301
+ scraper2_hj3415/app/usecases/ingest/ingest_c108.py,sha256=6iDhJAjzLnqHiNBKDvNLsQxrnJcG5I68SdE-dojUMJY,3817
+ scraper2_hj3415-2.6.0.dist-info/entry_points.txt,sha256=jNGmOvBmptIUr9_XUMQOH4s6jNKCU51jOhKd31gOe8c,52
+ scraper2_hj3415-2.6.0.dist-info/licenses/LICENSE,sha256=QBiVGQuKAESeCfQE344Ik2ex6g2zfYdu9WqrRWydxIs,1068
+ scraper2_hj3415-2.6.0.dist-info/WHEEL,sha256=G2gURzTEtmeR8nrdXUJfNiB3VYVxigPQ-bEQujpNiNs,82
+ scraper2_hj3415-2.6.0.dist-info/METADATA,sha256=Xw3KsE6SxvOs_CJvsXF2-x4ebhkAssA7qVD0uApNeBw,3516
+ scraper2_hj3415-2.6.0.dist-info/RECORD,,
@@ -0,0 +1,3 @@
+ [console_scripts]
+ scraper2=scraper2_hj3415.cli:app
+
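
The console script maps the scraper2 command to the Typer app in scraper2_hj3415.cli. Invoking it is equivalent to this small stub (the nfs sub-command name is assumed from the nfs_app registration shown in cli.py):

# Equivalent of the scraper2 console script declared above (illustrative stub)
from scraper2_hj3415.cli import app

if __name__ == "__main__":
    app()  # e.g. `scraper2 nfs one c108 005930 --sink memory`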
scraper2/.DS_Store DELETED
Binary file
scraper2/adapters/out/.DS_Store DELETED
Binary file
@@ -1,102 +0,0 @@
- # src/scraper2/adapters/out/playwright/session.py
- from __future__ import annotations
- from typing import Any
- from io import StringIO
- import pandas as pd
- from playwright.async_api import Page
-
- class PlaywrightBrowser:
-     def __init__(self, page: Page):
-         self.page = page
-
-     async def goto(self, url: str, timeout_ms: int = 10_000) -> None:
-         await self.page.goto(url, timeout=timeout_ms)
-
-     async def title(self) -> str:
-         return await self.page.title()
-
-     async def current_url(self) -> str:
-         return self.page.url
-
-     async def wait(self, selector: str, timeout_ms: int = 10_000) -> None:
-         await self.page.wait_for_selector(selector, timeout=timeout_ms, state="attached")
-
-     async def text(self, selector: str) -> str:
-         await self.wait(selector)
-         return (await self.page.locator(selector).first.text_content()) or ""
-
-     async def texts(self, selector: str) -> list[str]:
-         await self.wait(selector)
-         loc = self.page.locator(selector)
-         items = await loc.all()
-         out: list[str] = []
-         for it in items:
-             out.append((await it.text_content()) or "")
-         return out
-
-     async def text_first_by_text(self, needle: str) -> str:
-         return (await self.page.get_by_text(needle).first.text_content()) or ""
-
-     async def inner_text(self, selector: str) -> str:
-         await self.wait(selector)
-         return await self.page.locator(selector).first.inner_text()
-
-     async def click(self, selector: str) -> None:
-         await self.wait(selector)
-         await self.page.locator(selector).click()
-
-     async def table_records(
-         self,
-         table_selector: str,
-         *,
-         header: int | list[int] = 0
-     ) -> list[dict[str, Any]]:
-         await self.wait(table_selector)
-
-         table = self.page.locator(table_selector).first
-         html = await table.evaluate("el => el.outerHTML")  # includes the <table> element itself
-         #print(html)
-
-         try:
-             df = pd.read_html(StringIO(html), header=header)[0]
-             #print(df.head(3))
-         except Exception as e:
-             # catch ImportError (missing lxml), ValueError, etc. here and expose the cause
-             raise RuntimeError(f"pd.read_html failed: {type(e).__name__}: {e}") from e
-
-         if header == 0:
-             if "항목" in df.columns:
-                 df["항목"] = df["항목"].astype(str).str.replace("펼치기", "").str.strip()
-
-             df.columns = (
-                 df.columns.astype(str)
-                 .str.replace("연간컨센서스보기", "", regex=False)
-                 .str.replace("연간컨센서스닫기", "", regex=False)
-                 .str.replace("(IFRS연결)", "", regex=False)
-                 .str.replace("(IFRS별도)", "", regex=False)
-                 .str.replace("(GAAP개별)", "", regex=False)
-                 .str.replace("(YoY)", "", regex=False)
-                 .str.replace("(QoQ)", "", regex=False)
-                 .str.replace("(E)", "", regex=False)
-                 .str.replace(".", "", regex=False)
-                 .str.strip()
-             )
-
-         # convert NaN -> None
-         records: list[dict[str, Any]] = df.where(pd.notnull(df), None).to_dict(orient="records")
-         return records
-
-     async def outer_html(self, selector: str) -> str:
-         loc = self.page.locator(selector).first
-         return await loc.evaluate("el => el.outerHTML")
-
-     async def all_texts(self, selector: str) -> list[str]:
-         # selector can be CSS or "xpath=..."
-         loc = self.page.locator(selector)
-         return await loc.all_text_contents()
-
-     async def outer_html_nth(self, selector: str, index: int) -> str:
-         loc = self.page.locator(selector).nth(index)
-         # Playwright raises if index is out of range;
-         # wrap it in a friendlier error here if needed.
-         return await loc.evaluate("el => el.outerHTML")
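
The deleted table_records() above is the heart of the old table-scraping path: it hands an element's outerHTML to pandas.read_html and then maps NaN to None before emitting records. A standalone sketch of that technique, using a toy table instead of a live Playwright page (requires lxml, which the package already depends on):

# Sketch: parse a table's outer HTML with pandas and normalize NaN to None
from io import StringIO

import pandas as pd

html = """
<table>
  <tr><th>항목</th><th>2023</th><th>2024</th></tr>
  <tr><td>매출액</td><td>100</td><td></td></tr>
</table>
"""

df = pd.read_html(StringIO(html), header=0)[0]
records = df.where(pd.notnull(df), None).to_dict(orient="records")
print(records)  # roughly: [{'항목': '매출액', '2023': 100, '2024': None}]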
scraper2/adapters/out/sinks/.DS_Store DELETED
Binary file
@@ -1,15 +0,0 @@
- # scraper2/adapters/out/sinks/memory/__init__.py
- from .c101_memory_sink import MemoryC101Sink
- from .c103_memory_sink import MemoryC103Sink
- from .c104_memory_sink import MemoryC104Sink
- from .c106_memory_sink import MemoryC106Sink
- from .c108_memory_sink import MemoryC108Sink
-
- __all__ = [
-     "MemoryC101Sink",
-     "MemoryC103Sink",
-     "MemoryC104Sink",
-     "MemoryC106Sink",
-     "MemoryC108Sink",
- ]
-
@@ -1,26 +0,0 @@
- #scraper2/adapters/out/sinks/memory/c101_memory_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from contracts.nfs.c101 import C101DTO
- from scraper2.adapters.out.sinks.memory.store import InMemoryStore
- from scraper2.app.ports.sinks.c101_sink_port import C101SinkPort
-
- _ENDPOINT = "c101"
-
- class MemoryC101Sink(C101SinkPort):
-     def __init__(self, store: InMemoryStore[C101DTO]):
-         self._store = store
-
-     async def write(self, dto: C101DTO, *, asof: Optional[datetime] = None) -> None:
-         await self._store.put(_ENDPOINT, dto.코드, dto)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C101DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         await self._store.put_many(_ENDPOINT, ((d.코드, d) for d in dtos))
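
This per-endpoint memory sink, and the near-identical c103/c104/c106 versions below, are replaced in 2.6.0 by the single sinks/memory_sink.py behind NfsSinkPort. A minimal sketch of that endpoint-keyed consolidation pattern, with illustrative names only (not the actual file contents):

# Sketch only: one generic in-memory sink keyed by (endpoint, code)
from __future__ import annotations

from typing import Generic, Iterable, TypeVar

DTO = TypeVar("DTO")


class InMemoryNfsSink(Generic[DTO]):
    """Keeps the latest DTO per (endpoint, code) in a plain dict."""

    def __init__(self) -> None:
        self._data: dict[tuple[str, str], DTO] = {}

    async def write(self, dto: DTO, *, endpoint: str) -> None:
        # assumes the DTO carries its stock code as a `code` attribute
        self._data[(endpoint, getattr(dto, "code"))] = dto

    async def write_many(self, dtos: Iterable[DTO], *, endpoint: str) -> None:
        for dto in dtos:
            await self.write(dto, endpoint=endpoint)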
@@ -1,26 +0,0 @@
- # scraper2/adapters/out/sinks/memory/c103_memory_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from contracts.nfs.c103 import C103DTO
- from scraper2.adapters.out.sinks.memory.store import InMemoryStore
- from scraper2.app.ports.sinks.c103_sink_port import C103SinkPort
-
- _ENDPOINT = "c103"
-
- class MemoryC103Sink(C103SinkPort):
-     def __init__(self, store: InMemoryStore[C103DTO]):
-         self._store = store
-
-     async def write(self, dto: C103DTO, *, asof: Optional[datetime] = None) -> None:
-         await self._store.put(_ENDPOINT, dto.코드, dto)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C103DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         await self._store.put_many(_ENDPOINT, ((d.코드, d) for d in dtos))
@@ -1,26 +0,0 @@
- # scraper2/adapters/out/sinks/memory/c104_memory_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from contracts.nfs.c104 import C104DTO
- from scraper2.adapters.out.sinks.memory.store import InMemoryStore
- from scraper2.app.ports.sinks.c104_sink_port import C104SinkPort
-
- _ENDPOINT = "c104"
-
- class MemoryC104Sink(C104SinkPort):
-     def __init__(self, store: InMemoryStore[C104DTO]):
-         self._store = store
-
-     async def write(self, dto: C104DTO, *, asof: Optional[datetime] = None) -> None:
-         await self._store.put(_ENDPOINT, dto.코드, dto)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C104DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         await self._store.put_many(_ENDPOINT, ((d.코드, d) for d in dtos))
@@ -1,26 +0,0 @@
- #scraper2/adapters/out/sinks/memory/c106_memory_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from contracts.nfs.c106 import C106DTO
- from scraper2.adapters.out.sinks.memory.store import InMemoryStore
- from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
-
- _ENDPOINT = "c106"
-
- class MemoryC106Sink(C106SinkPort):
-     def __init__(self, store: InMemoryStore[C106DTO]):
-         self._store = store
-
-     async def write(self, dto: C106DTO, *, asof: Optional[datetime] = None) -> None:
-         await self._store.put(_ENDPOINT, dto.코드, dto)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C106DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         await self._store.put_many(_ENDPOINT, ((d.코드, d) for d in dtos))