scraper2-hj3415 2.4.1-py3-none-any.whl → 2.7.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +26 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +7 -7
  3. scraper2_hj3415/app/adapters/out/playwright/capabilities/__init__.py +18 -0
  4. scraper2_hj3415/app/adapters/out/playwright/capabilities/_base.py +19 -0
  5. scraper2_hj3415/app/adapters/out/playwright/capabilities/interaction.py +37 -0
  6. scraper2_hj3415/app/adapters/out/playwright/capabilities/navigation.py +24 -0
  7. scraper2_hj3415/app/adapters/out/playwright/capabilities/scope.py +84 -0
  8. scraper2_hj3415/app/adapters/out/playwright/capabilities/table.py +90 -0
  9. scraper2_hj3415/app/adapters/out/playwright/capabilities/text.py +25 -0
  10. scraper2_hj3415/app/adapters/out/playwright/capabilities/wait.py +96 -0
  11. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  12. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  13. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  14. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  15. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +379 -0
  16. scraper2_hj3415/app/composition.py +225 -0
  17. scraper2_hj3415/app/domain/blocks.py +61 -0
  18. scraper2_hj3415/app/domain/constants.py +33 -0
  19. scraper2_hj3415/app/domain/doc.py +16 -0
  20. scraper2_hj3415/app/domain/endpoint.py +11 -0
  21. scraper2_hj3415/app/domain/series.py +11 -0
  22. scraper2_hj3415/app/domain/types.py +19 -0
  23. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  24. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  25. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  26. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  27. scraper2_hj3415/app/parsing/_tables/html_table.py +89 -0
  28. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  29. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  30. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  31. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  32. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  33. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  34. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  35. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  36. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  37. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  38. scraper2_hj3415/app/parsing/c103_parser.py +22 -0
  39. scraper2_hj3415/app/parsing/c104_parser.py +26 -0
  40. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  41. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  42. scraper2_hj3415/app/ports/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  44. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  45. scraper2_hj3415/app/ports/browser/browser_port.py +32 -0
  46. scraper2_hj3415/app/ports/browser/capabilities/__init__.py +15 -0
  47. scraper2_hj3415/app/ports/browser/capabilities/interaction.py +27 -0
  48. scraper2_hj3415/app/ports/browser/capabilities/navigation.py +18 -0
  49. scraper2_hj3415/app/ports/browser/capabilities/scope.py +66 -0
  50. scraper2_hj3415/app/ports/browser/capabilities/table.py +28 -0
  51. scraper2_hj3415/app/ports/browser/capabilities/text.py +16 -0
  52. scraper2_hj3415/app/ports/browser/capabilities/wait.py +51 -0
  53. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  55. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  56. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  57. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  58. scraper2_hj3415/app/ports/site/wisereport_port.py +30 -0
  59. scraper2_hj3415/app/services/__init__.py +0 -0
  60. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  61. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  62. scraper2_hj3415/app/services/fetch/fetch_c103.py +121 -0
  63. scraper2_hj3415/app/services/fetch/fetch_c104.py +160 -0
  64. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  65. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  66. scraper2_hj3415/app/services/nfs_doc_builders.py +304 -0
  67. scraper2_hj3415/app/usecases/__init__.py +0 -0
  68. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  69. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  70. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  71. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  72. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  73. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  74. scraper2/main.py → scraper2_hj3415/cli.py +45 -72
  75. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/METADATA +3 -1
  76. scraper2_hj3415-2.7.0.dist-info/RECORD +93 -0
  77. scraper2_hj3415-2.7.0.dist-info/entry_points.txt +3 -0
  78. scraper2/adapters/out/playwright/browser.py +0 -102
  79. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  80. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  81. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  82. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  83. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  84. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  85. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  86. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  87. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  88. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  89. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  90. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  91. scraper2/app/composition.py +0 -204
  92. scraper2/app/parsing/_converters.py +0 -85
  93. scraper2/app/parsing/_normalize.py +0 -134
  94. scraper2/app/parsing/c101_parser.py +0 -143
  95. scraper2/app/parsing/c103_parser.py +0 -128
  96. scraper2/app/parsing/c104_parser.py +0 -143
  97. scraper2/app/parsing/c106_parser.py +0 -153
  98. scraper2/app/parsing/c108_parser.py +0 -65
  99. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  100. scraper2/app/ports/browser/browser_port.py +0 -22
  101. scraper2/app/ports/ingest_port.py +0 -14
  102. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  103. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  104. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  105. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  106. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  107. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  108. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  109. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  110. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  111. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  112. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  113. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  114. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  115. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  116. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  117. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  118. scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
  119. scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
  120. {scraper2 → scraper2_hj3415}/.DS_Store +0 -0
  121. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  122. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  123. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  124. {scraper2 → scraper2_hj3415/app}/adapters/out/.DS_Store +0 -0
  125. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  126. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  127. {scraper2 → scraper2_hj3415/app}/adapters/out/sinks/.DS_Store +0 -0
  128. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  129. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  130. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  131. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  132. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  133. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  134. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/WHEEL +0 -0
  135. {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.7.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,204 +0,0 @@
- # scraper2/app/composition.py
- from __future__ import annotations
-
- import os
- from dataclasses import dataclass
- from typing import Literal, Optional
-
- from pymongo.asynchronous.database import AsyncDatabase
-
- from scraper2.app.ports.browser.browser_factory_port import BrowserFactoryPort
- from scraper2.adapters.out.playwright.browser_factory import PlaywrightBrowserFactory
-
- from scraper2.app.usecases.fetch.fetch_c101 import FetchC101
- from scraper2.app.usecases.fetch.fetch_c103 import FetchC103
- from scraper2.app.usecases.fetch.fetch_c104 import FetchC104
- from scraper2.app.usecases.fetch.fetch_c106 import FetchC106
- from scraper2.app.usecases.fetch.fetch_c108 import FetchC108
-
- from scraper2.app.ports.ingest_port import IngestPort
- from scraper2.app.usecases.ingest.ingest_c101 import IngestC101
- from scraper2.app.usecases.ingest.ingest_c103 import IngestC103
- from scraper2.app.usecases.ingest.ingest_c104 import IngestC104
- from scraper2.app.usecases.ingest.ingest_c106 import IngestC106
- from scraper2.app.usecases.ingest.ingest_c108 import IngestC108
-
- from scraper2.adapters.out.sinks.memory.store import InMemoryStore
- from scraper2.adapters.out.sinks.memory.c101_memory_sink import MemoryC101Sink
- from scraper2.adapters.out.sinks.memory.c103_memory_sink import MemoryC103Sink
- from scraper2.adapters.out.sinks.memory.c104_memory_sink import MemoryC104Sink
- from scraper2.adapters.out.sinks.memory.c106_memory_sink import MemoryC106Sink
- from scraper2.adapters.out.sinks.memory.c108_memory_sink import MemoryC108Sink
-
- from scraper2.adapters.out.sinks.mongo.c101_mongo_sink import MongoC101Sink
- from scraper2.adapters.out.sinks.mongo.c103_mongo_sink import MongoC103Sink
- from scraper2.adapters.out.sinks.mongo.c104_mongo_sink import MongoC104Sink
- from scraper2.adapters.out.sinks.mongo.c106_mongo_sink import MongoC106Sink
- from scraper2.adapters.out.sinks.mongo.c108_mongo_sink import MongoC108Sink
-
- from scraper2.app.ports.sinks.c101_sink_port import C101SinkPort
- from scraper2.app.ports.sinks.c103_sink_port import C103SinkPort
- from scraper2.app.ports.sinks.c104_sink_port import C104SinkPort
- from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
- from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
-
- from db2.mongo import Mongo
-
- SinkKind = Literal["memory", "mongo"]
-
-
- def _env_bool(key: str, default: bool) -> bool:
-     v = os.getenv(key)
-     return default if v is None else v.strip().lower() in {"1", "true", "yes", "y", "on"}
-
-
- def _env_int(key: str, default: int) -> int:
-     v = os.getenv(key)
-     if v is None:
-         return default
-     try:
-         return int(v)
-     except ValueError:
-         return default
-
-
- def build_browser_factory() -> BrowserFactoryPort:
-     return PlaywrightBrowserFactory(
-         headless=_env_bool("SCRAPER_HEADLESS", True),
-         timeout_ms=_env_int("SCRAPER_TIMEOUT_MS", 20_000),
-         max_concurrency=_env_int("SCRAPER_MAX_CONCURRENCY", 2),
-     )
-
-
- # -------------------------
- # Bundles
- # -------------------------
-
- @dataclass(frozen=True)
- class FetchUsecases:
-     c101: FetchC101
-     c103: FetchC103
-     c104: FetchC104
-     c106: FetchC106
-     c108: FetchC108
-
-
- @dataclass(frozen=True)
- class Sinks:
-     c101: C101SinkPort
-     c103: C103SinkPort
-     c104: C104SinkPort
-     c106: C106SinkPort
-     c108: C108SinkPort
-
-
- @dataclass(frozen=True)
- class IngestUsecases:
-     c101: IngestPort
-     c103: IngestPort
-     c104: IngestPort
-     c106: IngestPort
-     c108: IngestPort
-
-
- @dataclass(frozen=True)
- class Usecases:
-     fetch: FetchUsecases
-     ingest: IngestUsecases
-     sinks: Sinks
-     store: InMemoryStore | None = None  # ✅ set only for the memory backend
-     mongo: Mongo | None = None  # ✅ set only for the mongo backend
-     db: AsyncDatabase | None = None  # ✅ set only for the mongo backend
-     browser_factory: Optional[BrowserFactoryPort] = None
-
-     async def aclose(self) -> None:
-         if self.browser_factory is not None:
-             await self.browser_factory.aclose()
-
-         if self.mongo is not None:
-             await self.mongo.close()
-
- # -------------------------
- # builders
- # -------------------------
-
- def build_fetch_usecases(*, factory: BrowserFactoryPort) -> FetchUsecases:
-     return FetchUsecases(
-         c101=FetchC101(factory=factory),
-         c103=FetchC103(factory=factory),
-         c104=FetchC104(factory=factory),
-         c106=FetchC106(factory=factory),
-         c108=FetchC108(factory=factory),
-     )
-
-
- @dataclass(frozen=True)
- class MemoryBundle:
-     store: InMemoryStore
-     sinks: Sinks
-
-
- def build_memory_bundle() -> MemoryBundle:
-     store = InMemoryStore()
-     sinks = Sinks(
-         c101=MemoryC101Sink(store),
-         c103=MemoryC103Sink(store),
-         c104=MemoryC104Sink(store),
-         c106=MemoryC106Sink(store),
-         c108=MemoryC108Sink(store),
-     )
-     return MemoryBundle(store=store, sinks=sinks)
-
- # ---- mongo bundle ----
-
- @dataclass(frozen=True)
- class MongoBundle:
-     mongo: Mongo
-     db: AsyncDatabase
-     sinks: Sinks
-
-
- def build_mongo_bundle() -> MongoBundle:
-     mongo = Mongo()  # db2 reads its settings from env (DB2_MONGO_URI, etc.)
-     db = mongo.get_db()
-     sinks = Sinks(
-         c101=MongoC101Sink(db),
-         c103=MongoC103Sink(db),
-         c104=MongoC104Sink(db),
-         c106=MongoC106Sink(db),
-         c108=MongoC108Sink(db),
-     )
-     return MongoBundle(mongo=mongo, db=db, sinks=sinks)
-
-
- def build_ingest_usecases(*, fetch: FetchUsecases, sinks: Sinks) -> IngestUsecases:
-     return IngestUsecases(
-         c101=IngestC101(fetch=fetch.c101, sink=sinks.c101),
-         c103=IngestC103(fetch=fetch.c103, sink=sinks.c103),
-         c104=IngestC104(fetch=fetch.c104, sink=sinks.c104),
-         c106=IngestC106(fetch=fetch.c106, sink=sinks.c106),
-         c108=IngestC108(fetch=fetch.c108, sink=sinks.c108),
-     )
-
-
- def build_usecases(
-     *,
-     factory: BrowserFactoryPort | None = None,
-     sink_kind: SinkKind = "memory",
- ) -> Usecases:
-     factory = factory or build_browser_factory()
-     fetch = build_fetch_usecases(factory=factory)
-
-     if sink_kind == "memory":
-         bundle = build_memory_bundle()
-         ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
-         return Usecases(fetch=fetch, ingest=ingest, sinks=bundle.sinks, store=bundle.store,
-                         browser_factory=factory)
-
-     if sink_kind == "mongo":
-         bundle = build_mongo_bundle()
-         ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
-         return Usecases(fetch=fetch, ingest=ingest, sinks=bundle.sinks, mongo=bundle.mongo, db=bundle.db,
-                         browser_factory=factory)
-
-     raise ValueError(f"Unknown sink_kind: {sink_kind}")
@@ -1,85 +0,0 @@
- from __future__ import annotations
- import re
- from typing import Iterable
-
-
- _EMPTY_VALUES = {"", "-", "N/A", "NA", "null", "None"}
-
-
- def normalize(s: str | None) -> str:
-     if s is None:
-         return ""
-     return s.strip()
-
-
- def _is_empty(s: str) -> bool:
-     return s in _EMPTY_VALUES
-
-
- def to_int(s: str | None) -> int | None:
-     """
-     Integer converter for the C101 parser.
-
-     Rules:
-     - None / '' / '-' / 'N/A' → None
-     - strips ',', '주', '원', '주식'
-     - '1,234' → 1234
-     """
-     s = normalize(s)
-     if _is_empty(s):
-         return None
-
-     for ch in (",", "원", "주", "주식"):
-         s = s.replace(ch, "")
-
-     try:
-         return int(s)
-     except ValueError:
-         return None
-
-
- def to_float(s: str | None) -> float | None:
-     """
-     Float converter for the C101 parser.
-
-     Rules:
-     - None / '' / '-' / 'N/A' → None
-     - strips ',', '%', '원'
-     - '12.34%' → 12.34
-     """
-     s = normalize(s)
-     if _is_empty(s):
-         return None
-
-     for ch in (",", "%", "원"):
-         s = s.replace(ch, "")
-
-     try:
-         return float(s)
-     except ValueError:
-         return None
-
-
- def parse_won(text: str) -> int:
-     """
-     Convert a Korean currency string to an integer (handles 조원, 억원, 만원, 원, 억, etc.).
-     """
-     units = {
-         "조원": 1_000_000_000_000,
-         "억원": 100_000_000,
-         "억": 100_000_000,
-         "만원": 10_000,
-         "원": 1,
-     }
-
-     text = text.replace(",", "").strip()
-     match = re.match(r"([-+]?[0-9]*\.?[0-9]+)([가-힣]+)", text)
-
-     if not match:
-         raise ValueError(f"형식이 잘못된 금액 문자열: {text}")
-
-     number, unit = match.groups()
-     if unit not in units:
-         raise ValueError(f"알 수 없는 단위: {unit}")
-
-     return int(float(number) * units[unit])
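The docstrings above pin down concrete conversions; a quick doctest-style sketch of the stated rules (example inputs are made up):

    from scraper2.app.parsing._converters import to_int, to_float, parse_won

    assert to_int("1,234주") == 1234                   # commas and '주' stripped
    assert to_int("N/A") is None                       # empty markers map to None
    assert to_float("12.34%") == 12.34                 # '%' stripped
    assert parse_won("1.5조원") == 1_500_000_000_000   # 조원 = 10^12 won
    assert parse_won("3,000억원") == 300_000_000_000   # 억원 = 10^8 won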
@@ -1,134 +0,0 @@
- # scraper2/app/parsing/_normalize.py
- from __future__ import annotations
-
- import re
- from collections import Counter
- from typing import Any
-
- import numpy as np
- import pandas as pd
-
-
- # -----------------------------
- # 1) Normalize item names (row "항목" values)
- # -----------------------------
- _IFRS_PATTERN = re.compile(r"\(IFRS[^)]*\)")
- _ETC_PAREN_PATTERN = re.compile(r"\((E|YoY|QoQ)[^)]*\)")
- _BRACKET_PATTERN = re.compile(r"\[[^\]]*\]")  # e.g. [구K-IFRS]
- _EXTRA_WORDS_PATTERN = re.compile(r"(펼치기|연간컨센서스보기|연간컨센서스닫기)")
- _ALL_PAREN_PATTERN = re.compile(r"\([^)]*\)")  # ★ strips every parenthesized span
-
- def normalize_c1034_item(text: str | None) -> str:
-     """
-     Normalize C103 item names (row values):
-     - drop expand/consensus toggle words
-     - drop every parenthesized span (발표기준, 연결, 개별, etc.)
-     - drop [구K-IFRS]
-     - drop '*'
-     - collapse whitespace
-     """
-     if not text:
-         return ""
-
-     s = str(text)
-
-     # 1) noise keywords
-     s = _EXTRA_WORDS_PATTERN.sub("", s)
-
-     # 2) drop bracketed spans
-     s = _BRACKET_PATTERN.sub("", s)
-
-     # 3) drop all parentheses (important)
-     s = _ALL_PAREN_PATTERN.sub("", s)
-
-     # 4) drop asterisks
-     s = s.replace("*", "")
-
-     # 5) collapse whitespace
-     s = re.sub(r"\s+", " ", s).strip()
-
-     return s
-
-
- # -----------------------------
- # 2) Normalize column names
- # -----------------------------
- _COL_PAREN_PATTERN = re.compile(r"\((IFRS[^)]*|E|YoY|QoQ)[^)]*\)")
- _COL_EXTRA_WORDS = re.compile(r"(연간컨센서스보기|연간컨센서스닫기)")
- _COL_DOTNUM = re.compile(r"\.\d+$")  # strips pandas duplicate-column suffixes (.1, .2, ...)
- _COL_MULTI_SPACE = re.compile(r"\s+")
-
- def normalize_c1034_col(col: str | None) -> str:
-     """
-     Normalize C103 column names, e.g.
-     "2024/12 (IFRS연결) 연간컨센서스보기" -> "2024/12"
-     "2025/12(E) (IFRS연결) 연간컨센서스닫기" -> "2025/12"
-     "전년대비 (YoY)" -> "전년대비"
-     "전년대비 (YoY).1" -> "전년대비" (duplicates are auto-split into _2/_3 later)
-     """
-     if col is None:
-         return ""
-
-     s = str(col)
-
-     # 1) strip pandas-appended suffixes like .1 (name collisions are handled downstream)
-     s = _COL_DOTNUM.sub("", s)
-
-     # 2) drop consensus toggle words
-     s = _COL_EXTRA_WORDS.sub("", s)
-
-     # 3) drop parenthesized annotations: (IFRS...), (E), (YoY), (QoQ)
-     s = _COL_PAREN_PATTERN.sub("", s)
-
-     # 4) collapse whitespace
-     s = _COL_MULTI_SPACE.sub(" ", s).strip()
-
-     return s
-
-
- def _dedupe_columns(cols: list[str]) -> list[str]:
-     """
-     If normalization yields duplicate column names, append _2, _3, ... to keep them unique.
-     e.g. ["전년대비", "전년대비"] -> ["전년대비", "전년대비_2"]
-     """
-     seen: Counter[str] = Counter()
-     out: list[str] = []
-     for c in cols:
-         c = c or ""
-         seen[c] += 1
-         if seen[c] == 1:
-             out.append(c)
-         else:
-             out.append(f"{c}_{seen[c]}")
-     return out
-
-
- # -----------------------------
- # 3) Whole-DataFrame normalization + records conversion
- # -----------------------------
- def normalize_c1034_df(df: pd.DataFrame) -> pd.DataFrame:
-     """
-     - normalize every column name
-     - normalize the "항목" values
-     - NaN -> None
-     - auto-split duplicate column names (_2/_3)
-     """
-     if df is None or df.empty:
-         return df
-
-     df = df.copy()
-
-     # normalize column names + keep them unique
-     norm_cols = [normalize_c1034_col(c) for c in df.columns.astype(str).tolist()]
-     df.columns = _dedupe_columns(norm_cols)
-
-     # normalize item values
-     if "항목" in df.columns:
-         df["항목"] = df["항목"].map(normalize_c1034_item)
-
-     # NaN -> None
-     df = df.replace({np.nan: None})
-     return df
-
-
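The examples in the docstrings above translate directly into checks (a small sketch using the module's own functions):

    from scraper2.app.parsing._normalize import normalize_c1034_col, _dedupe_columns

    assert normalize_c1034_col("2024/12 (IFRS연결) 연간컨센서스보기") == "2024/12"
    assert normalize_c1034_col("전년대비 (YoY).1") == "전년대비"  # pandas' .1 suffix is dropped first
    assert _dedupe_columns(["전년대비", "전년대비"]) == ["전년대비", "전년대비_2"]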
@@ -1,143 +0,0 @@
- # scraper2/app/parsing/c101_parser.py
- from __future__ import annotations
-
- from scraper2.app.parsing._converters import to_int, to_float, normalize, parse_won
- from scraper2.app.ports.browser.browser_port import BrowserPort
- from typing import Any
-
-
- class C101ParseError(RuntimeError):
-     pass
-
-
- def _after_colon(s: str) -> str:
-     # handles cases like "업종: XXX"
-     parts = s.split(":")
-     return parts[1].strip() if len(parts) > 1 else s.strip()
-
-
- async def parse_c101_to_dict(browser: BrowserPort) -> dict[str, Any] | None:
-     """
-     - uses only BrowserPort
-     - returns a plain dict
-     - returns None on failure (preserves the previous behavior)
-     """
-
-     # date parsing, via text search
-     raw_date_str = await browser.text_first_by_text("[기준:")
-     if not raw_date_str:
-         return None
-     날짜 = raw_date_str.replace("[기준:", "").replace("]", "").strip()
-
-     # 1) financial info (1st table)
-     # here, waiting for the tbody to exist is enough
-     await browser.wait("#pArea > div.wrapper-table > div > table > tbody")
-
-     종목명 = normalize(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(1) > td > dl > dt:nth-child(1) > span"
-     ))
-     코드 = normalize(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(1) > td > dl > dt:nth-child(1) > b"
-     ))
-     업종_raw = await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(1) > td > dl > dt:nth-child(4)"
-     )
-     업종 = _after_colon(업종_raw)
-
-     eps = to_int(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(1) > b"
-     ))
-     bps = to_int(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(2) > b"
-     ))
-     per = to_float(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(3) > b"
-     ))
-     업종per = to_float(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(4) > b"
-     ))
-     pbr = to_float(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(5) > b"
-     ))
-     배당수익률 = to_float(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(6) > b"
-     ))
-
-     # 2) price info (2nd table)
-     await browser.wait("#cTB11 > tbody")
-
-     주가 = to_int(await browser.text("#cTB11 > tbody tr:nth-child(1) > td > strong"))
-
-     전일대비_raw = await browser.text("#cTB11 > tbody tr:nth-child(1) > td > span:nth-child(2)")
-     전일대비 = to_int(전일대비_raw.replace("원", ""))
-
-     수익률_raw = await browser.text("#cTB11 > tbody tr:nth-child(1) > td > span:nth-child(3)")
-     수익률 = to_float(수익률_raw.replace("%", ""))
-
-     최고최저52 = await browser.text("#cTB11 > tbody tr:nth-child(2) > td")
-     최고52, 최저52 = (to_int(x.strip().replace("원", "")) for x in 최고최저52.split("/"))
-
-     거래량거래대금 = await browser.text("#cTB11 > tbody tr:nth-child(4) > td")
-     거래량_str, 거래대금_str = (x.strip() for x in 거래량거래대금.split("/"))
-     거래량 = to_int(거래량_str.replace("주", ""))
-     거래대금 = parse_won(거래대금_str)
-
-     시가총액 = parse_won(await browser.text("#cTB11 > tbody tr:nth-child(5) > td"))
-     베타52주 = to_float(await browser.text("#cTB11 > tbody tr:nth-child(6) > td"))
-
-     발행주식유동비율 = await browser.text("#cTB11 > tbody tr:nth-child(7) > td")
-     발행주식_str, 유동비율_str = (x.strip() for x in 발행주식유동비율.split("/"))
-     발행주식 = to_int(발행주식_str.replace("주", ""))
-     유동비율 = to_float(유동비율_str.replace("%", ""))
-
-     외국인지분율 = to_float((await browser.text("#cTB11 > tbody tr:nth-child(8) > td")).replace("%", ""))
-
-     수익률1M3M6M1Y = await browser.text("#cTB11 > tbody tr:nth-child(9) > td")
-     수익률1M, 수익률3M, 수익률6M, 수익률1Y = (
-         to_float(x.strip().replace("%", "")) for x in 수익률1M3M6M1Y.split("/")
-     )
-
-     # 3) company overview
-     # read every li under the ul and join them
-     await browser.wait("#wrapper > div:nth-child(6) > div.cmp_comment > ul")
-     li_texts = await browser.texts("#wrapper > div:nth-child(6) > div.cmp_comment > ul li")
-     개요 = "".join(t.strip() for t in li_texts if t and t.strip())
-
-     return {
-         "종목명": 종목명,
-         "코드": 코드,
-         "날짜": 날짜,
-         "업종": 업종,
-         "eps": eps,
-         "bps": bps,
-         "per": per,
-         "업종per": 업종per,
-         "pbr": pbr,
-         "배당수익률": 배당수익률,
-         "주가": 주가,
-         "전일대비": 전일대비,
-         "수익률": 수익률,
-         "최고52": 최고52,
-         "최저52": 최저52,
-         "거래량": 거래량,
-         "거래대금": 거래대금,
-         "시가총액": 시가총액,
-         "베타52주": 베타52주,
-         "발행주식": 발행주식,
-         "유동비율": 유동비율,
-         "외국인지분율": 외국인지분율,
-         "수익률1M": 수익률1M,
-         "수익률3M": 수익률3M,
-         "수익률6M": 수익률6M,
-         "수익률1Y": 수익률1Y,
-         "개요": 개요,
-     }
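Several of the cells above pack two values into one '/'-separated string; the split-and-convert pattern in isolation (the cell text below is hypothetical, the helper is the real one from this package):

    from scraper2.app.parsing._converters import to_int

    cell = "85,000원 / 49,900원"  # hypothetical 52-week high/low cell text
    high, low = (to_int(x.strip().replace("원", "")) for x in cell.split("/"))
    assert (high, low) == (85000, 49900)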
@@ -1,128 +0,0 @@
- # scraper2/app/parsing/c103_parser.py
- from __future__ import annotations
-
- import asyncio
- from io import StringIO
- from typing import Any
-
- import pandas as pd
-
- from scraper2.app.ports.browser.browser_port import BrowserPort
- from scraper2.app.parsing._normalize import normalize_c1034_df
-
- # ---- constants ----
-
- TABLE_XPATH = "xpath=//div[@id='wrapper']//div//table"  # keeps the intent of the previous selector
- TABLE_INDEX = 2  # the nth(2) from the previous code
-
-
- BTN_SETS: dict[str, list[tuple[str, str]]] = {
-     "손익계산서y": [
-         ("손익계산서", 'xpath=//*[@id="rpt_tab1"]'),
-         ("연간", 'xpath=//*[@id="frqTyp0"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "재무상태표y": [
-         ("재무상태표", 'xpath=//*[@id="rpt_tab2"]'),
-         ("연간", 'xpath=//*[@id="frqTyp0"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "현금흐름표y": [
-         ("현금흐름표", 'xpath=//*[@id="rpt_tab3"]'),
-         ("연간", 'xpath=//*[@id="frqTyp0"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "손익계산서q": [
-         ("손익계산서", 'xpath=//*[@id="rpt_tab1"]'),
-         ("분기", 'xpath=//*[@id="frqTyp1"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "재무상태표q": [
-         ("재무상태표", 'xpath=//*[@id="rpt_tab2"]'),
-         ("분기", 'xpath=//*[@id="frqTyp1"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "현금흐름표q": [
-         ("현금흐름표", 'xpath=//*[@id="rpt_tab3"]'),
-         ("분기", 'xpath=//*[@id="frqTyp1"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
- }
-
-
- # ---- small helpers ----
-
- async def _click_steps(
-     browser: BrowserPort,
-     steps: list[tuple[str, str]],
-     *,
-     jitter_sec: float = 0.6,
- ) -> None:
-     """
-     (no goto) Only clicks the tab/radio/search buttons on the current page.
-     """
-     for _name, selector in steps:
-         await browser.wait(selector)
-         await browser.click(selector)
-         # go easy on server and client: a small jitter
-         await asyncio.sleep(0.2 + (jitter_sec * 0.5))
-
-
- def _html_table_to_df(html: str) -> pd.DataFrame:
-     """
-     table outerHTML -> DataFrame
-     """
-     dfs = pd.read_html(StringIO(html), header=0)
-     if not dfs:
-         raise ValueError("pd.read_html() 테이블 파싱 실패")
-     return dfs[0]
-
-
- async def _nth_table_outer_html(browser: BrowserPort, nth: int) -> str:
-     return await browser.outer_html_nth(TABLE_XPATH, nth)
-
- def df_to_c103_records(df: pd.DataFrame) -> list[dict[str, Any]]:
-     """
-     C103 table DataFrame -> normalized records (list[dict])
-     - rows with an empty 항목 are dropped
-     """
-     df = normalize_c1034_df(df)
-     if df is None or df.empty:
-         return []
-
-     records: list[dict[str, Any]] = []
-     for r in df.to_dict(orient="records"):
-         item = r.get("항목")
-         if not item:
-             continue
-         records.append(r)
-     return records
-
- # ---- public parser ----
-
- async def parse_c103_to_dict(browser: BrowserPort) -> dict[str, list[dict[str, Any]]]:
-     """
-     C103 parser: returns a plain dict
-     {
-         "손익계산서y": [ {...}, ... ],
-         "손익계산서q": [ {...}, ... ],
-         ...
-     }
-     """
-     out: dict[str, list[dict[str, Any]]] = {}
-
-     for key, steps in BTN_SETS.items():
-         # click → wait for the table → nth table outerHTML → df → records
-         await _click_steps(browser, steps)
-         await browser.wait(TABLE_XPATH)
-
-         try:
-             html = await _nth_table_outer_html(browser, TABLE_INDEX)
-             df = _html_table_to_df(html)
-             out[key] = df_to_c103_records(df)
-         except Exception:
-             out[key] = []  # failures become an empty list
-
-     return out
-
-
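_html_table_to_df above leans on pandas' stock HTML parser; the same call in isolation (a standalone sketch with a made-up one-row table; pandas needs lxml or html5lib installed for read_html):

    from io import StringIO
    import pandas as pd

    html = "<table><tr><th>항목</th><th>2024/12</th></tr><tr><td>매출액</td><td>100</td></tr></table>"
    df = pd.read_html(StringIO(html), header=0)[0]  # same call as _html_table_to_df
    print(df.to_dict(orient="records"))  # [{'항목': '매출액', '2024/12': 100}]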