scraper2-hj3415 2.4.0-py3-none-any.whl → 2.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
  3. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  4. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  5. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  6. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  7. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
  8. scraper2_hj3415/app/composition.py +225 -0
  9. scraper2_hj3415/app/domain/blocks.py +61 -0
  10. scraper2_hj3415/app/domain/constants.py +33 -0
  11. scraper2_hj3415/app/domain/doc.py +16 -0
  12. scraper2_hj3415/app/domain/endpoint.py +11 -0
  13. scraper2_hj3415/app/domain/series.py +11 -0
  14. scraper2_hj3415/app/domain/types.py +19 -0
  15. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  16. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  17. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  18. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  19. scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
  20. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  21. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  22. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  23. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  24. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  25. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  26. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  27. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  28. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  29. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  30. scraper2_hj3415/app/parsing/c103_parser.py +19 -0
  31. scraper2_hj3415/app/parsing/c104_parser.py +23 -0
  32. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  33. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  34. scraper2_hj3415/app/ports/__init__.py +0 -0
  35. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  36. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  37. scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
  38. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  39. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  40. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  41. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  42. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
  44. scraper2_hj3415/app/services/__init__.py +0 -0
  45. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  46. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  47. scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
  48. scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
  49. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  50. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  51. scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
  52. scraper2_hj3415/app/usecases/__init__.py +0 -0
  53. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  55. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  56. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  57. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  58. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  59. scraper2/main.py → scraper2_hj3415/cli.py +40 -80
  60. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
  61. scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
  62. scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
  63. scraper2/.DS_Store +0 -0
  64. scraper2/adapters/out/.DS_Store +0 -0
  65. scraper2/adapters/out/playwright/browser.py +0 -102
  66. scraper2/adapters/out/sinks/.DS_Store +0 -0
  67. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  68. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  69. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  70. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  71. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  72. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  73. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  74. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  75. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  76. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  77. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  78. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  79. scraper2/app/composition.py +0 -204
  80. scraper2/app/parsing/_converters.py +0 -85
  81. scraper2/app/parsing/_normalize.py +0 -134
  82. scraper2/app/parsing/c101_parser.py +0 -143
  83. scraper2/app/parsing/c103_parser.py +0 -128
  84. scraper2/app/parsing/c104_parser.py +0 -143
  85. scraper2/app/parsing/c106_parser.py +0 -153
  86. scraper2/app/parsing/c108_parser.py +0 -65
  87. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  88. scraper2/app/ports/browser/browser_port.py +0 -22
  89. scraper2/app/ports/ingest_port.py +0 -14
  90. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  91. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  92. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  93. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  94. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  95. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  96. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  97. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  98. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  99. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  100. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  101. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  102. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  103. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  104. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  105. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  106. scraper2_hj3415-2.4.0.dist-info/RECORD +0 -63
  107. scraper2_hj3415-2.4.0.dist-info/entry_points.txt +0 -3
  108. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  109. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  110. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  111. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  112. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  113. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  114. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  115. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  116. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  117. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  118. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  119. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
  120. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
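The dominant change in this release is a package rename: the 2.4.0 `scraper2` tree moves under `scraper2_hj3415/app`, picking up new `domain`, `parsing/_normalize`, `parsing/_tables`, and `services` layers along the way. Downstream imports must follow the rename; a minimal before/after sketch, using the path from rename entry 2 above:

```python
# 2.4.0
from scraper2.adapters.out.playwright.browser_factory import PlaywrightBrowserFactory

# 2.6.0
from scraper2_hj3415.app.adapters.out.playwright.browser_factory import PlaywrightBrowserFactory
```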
@@ -1,26 +0,0 @@
- # scraper2/adapters/out/sinks/memory/c108_memory_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from contracts.nfs.c108 import C108DTO
- from scraper2.adapters.out.sinks.memory.store import InMemoryStore
- from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
-
- _ENDPOINT = "c108"
-
- class MemoryC108Sink(C108SinkPort):
-     def __init__(self, store: InMemoryStore[C108DTO]):
-         self._store = store
-
-     async def write(self, dto: C108DTO, *, asof: Optional[datetime] = None) -> None:
-         await self._store.put(_ENDPOINT, dto.코드, dto)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C108DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         await self._store.put_many(_ENDPOINT, ((d.코드, d) for d in dtos))
@@ -1,14 +0,0 @@
- # scraper2/adapters/out/sinks/mongo/__init__.py
- from .c101_mongo_sink import MongoC101Sink
- from .c103_mongo_sink import MongoC103Sink
- from .c104_mongo_sink import MongoC104Sink
- from .c106_mongo_sink import MongoC106Sink
- from .c108_mongo_sink import MongoC108Sink
-
- __all__ = [
-     "MongoC101Sink",
-     "MongoC103Sink",
-     "MongoC104Sink",
-     "MongoC106Sink",
-     "MongoC108Sink",
- ]
@@ -1,43 +0,0 @@
- # scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from pymongo.asynchronous.database import AsyncDatabase
-
- from contracts.nfs.c101 import C101DTO
- from scraper2.app.ports.sinks.c101_sink_port import C101SinkPort
-
- from db2.nfs import (
-     upsert_latest,
-     upsert_latest_many,
-     insert_snapshot,
-     insert_snapshots_many,
- )
-
- _ENDPOINT = "c101"
-
-
- class MongoC101Sink(C101SinkPort):
-     def __init__(self, db: AsyncDatabase):
-         self._db = db
-
-     async def write(self, dto: C101DTO, *, asof: Optional[datetime] = None) -> None:
-         # the latest state is upserted
-         await upsert_latest(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
-         # the history is appended via insert
-         await insert_snapshot(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C101DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         dtos_list = list(dtos)
-         if not dtos_list:
-             return
-
-         await upsert_latest_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
-         await insert_snapshots_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
@@ -1,41 +0,0 @@
- # scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from pymongo.asynchronous.database import AsyncDatabase
-
- from contracts.nfs.c103 import C103DTO
- from scraper2.app.ports.sinks.c103_sink_port import C103SinkPort
-
- from db2.nfs import (
-     upsert_latest,
-     upsert_latest_many,
-     insert_snapshot,
-     insert_snapshots_many,
- )
-
- _ENDPOINT = "c103"
-
-
- class MongoC103Sink(C103SinkPort):
-     def __init__(self, db: AsyncDatabase):
-         self._db = db
-
-     async def write(self, dto: C103DTO, *, asof: Optional[datetime] = None) -> None:
-         await upsert_latest(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
-         await insert_snapshot(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C103DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         dtos_list = list(dtos)
-         if not dtos_list:
-             return
-
-         await upsert_latest_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
-         await insert_snapshots_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
@@ -1,41 +0,0 @@
- # scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from pymongo.asynchronous.database import AsyncDatabase
-
- from contracts.nfs.c104 import C104DTO
- from scraper2.app.ports.sinks.c104_sink_port import C104SinkPort
-
- from db2.nfs import (
-     upsert_latest,
-     upsert_latest_many,
-     insert_snapshot,
-     insert_snapshots_many,
- )
-
- _ENDPOINT = "c104"
-
-
- class MongoC104Sink(C104SinkPort):
-     def __init__(self, db: AsyncDatabase):
-         self._db = db
-
-     async def write(self, dto: C104DTO, *, asof: Optional[datetime] = None) -> None:
-         await upsert_latest(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
-         await insert_snapshot(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C104DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         dtos_list = list(dtos)
-         if not dtos_list:
-             return
-
-         await upsert_latest_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
-         await insert_snapshots_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
@@ -1,41 +0,0 @@
- # scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from pymongo.asynchronous.database import AsyncDatabase
-
- from contracts.nfs.c106 import C106DTO
- from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
-
- from db2.nfs import (
-     upsert_latest,
-     upsert_latest_many,
-     insert_snapshot,
-     insert_snapshots_many,
- )
-
- _ENDPOINT = "c106"
-
-
- class MongoC106Sink(C106SinkPort):
-     def __init__(self, db: AsyncDatabase):
-         self._db = db
-
-     async def write(self, dto: C106DTO, *, asof: Optional[datetime] = None) -> None:
-         await upsert_latest(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
-         await insert_snapshot(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C106DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         dtos_list = list(dtos)
-         if not dtos_list:
-             return
-
-         await upsert_latest_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
-         await insert_snapshots_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
@@ -1,41 +0,0 @@
- # scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py
- from __future__ import annotations
-
- from datetime import datetime
- from typing import Iterable, Optional
-
- from pymongo.asynchronous.database import AsyncDatabase
-
- from contracts.nfs.c108 import C108DTO
- from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
-
- from db2.nfs import (
-     upsert_latest,
-     upsert_latest_many,
-     insert_snapshot,
-     insert_snapshots_many,
- )
-
- _ENDPOINT = "c108"
-
-
- class MongoC108Sink(C108SinkPort):
-     def __init__(self, db: AsyncDatabase):
-         self._db = db
-
-     async def write(self, dto: C108DTO, *, asof: Optional[datetime] = None) -> None:
-         await upsert_latest(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
-         await insert_snapshot(self._db, endpoint=_ENDPOINT, dto=dto, asof=asof)
-
-     async def write_many(
-         self,
-         dtos: Iterable[C108DTO],
-         *,
-         asof: Optional[datetime] = None,
-     ) -> None:
-         dtos_list = list(dtos)
-         if not dtos_list:
-             return
-
-         await upsert_latest_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
-         await insert_snapshots_many(self._db, endpoint=_ENDPOINT, dtos=dtos_list, asof=asof)
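All five Mongo sinks above share one write path: upsert the "latest" document for the endpoint, then append an immutable snapshot. A minimal sketch of driving a sink directly, assuming db2's `Mongo()` reads its connection settings from the environment (DB2_MONGO_URI etc., as noted in composition.py below); the DTO is taken as a parameter because its field set is not part of this diff:

```python
from datetime import datetime, timezone

from db2.mongo import Mongo
from scraper2.adapters.out.sinks.mongo.c101_mongo_sink import MongoC101Sink


async def store_c101(dto) -> None:
    """Write one C101 DTO using the latest + snapshot pattern (sketch)."""
    mongo = Mongo()  # connection settings come from the environment
    sink = MongoC101Sink(mongo.get_db())
    try:
        await sink.write(dto, asof=datetime.now(timezone.utc))
    finally:
        await mongo.close()
```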
@@ -1,204 +0,0 @@
- # scraper2/app/composition.py
- from __future__ import annotations
-
- import os
- from dataclasses import dataclass
- from typing import Literal, Optional
-
- from pymongo.asynchronous.database import AsyncDatabase
-
- from scraper2.app.ports.browser.browser_factory_port import BrowserFactoryPort
- from scraper2.adapters.out.playwright.browser_factory import PlaywrightBrowserFactory
-
- from scraper2.app.usecases.fetch.fetch_c101 import FetchC101
- from scraper2.app.usecases.fetch.fetch_c103 import FetchC103
- from scraper2.app.usecases.fetch.fetch_c104 import FetchC104
- from scraper2.app.usecases.fetch.fetch_c106 import FetchC106
- from scraper2.app.usecases.fetch.fetch_c108 import FetchC108
-
- from scraper2.app.ports.ingest_port import IngestPort
- from scraper2.app.usecases.ingest.ingest_c101 import IngestC101
- from scraper2.app.usecases.ingest.ingest_c103 import IngestC103
- from scraper2.app.usecases.ingest.ingest_c104 import IngestC104
- from scraper2.app.usecases.ingest.ingest_c106 import IngestC106
- from scraper2.app.usecases.ingest.ingest_c108 import IngestC108
-
- from scraper2.adapters.out.sinks.memory.store import InMemoryStore
- from scraper2.adapters.out.sinks.memory.c101_memory_sink import MemoryC101Sink
- from scraper2.adapters.out.sinks.memory.c103_memory_sink import MemoryC103Sink
- from scraper2.adapters.out.sinks.memory.c104_memory_sink import MemoryC104Sink
- from scraper2.adapters.out.sinks.memory.c106_memory_sink import MemoryC106Sink
- from scraper2.adapters.out.sinks.memory.c108_memory_sink import MemoryC108Sink
-
- from scraper2.adapters.out.sinks.mongo.c101_mongo_sink import MongoC101Sink
- from scraper2.adapters.out.sinks.mongo.c103_mongo_sink import MongoC103Sink
- from scraper2.adapters.out.sinks.mongo.c104_mongo_sink import MongoC104Sink
- from scraper2.adapters.out.sinks.mongo.c106_mongo_sink import MongoC106Sink
- from scraper2.adapters.out.sinks.mongo.c108_mongo_sink import MongoC108Sink
-
- from scraper2.app.ports.sinks.c101_sink_port import C101SinkPort
- from scraper2.app.ports.sinks.c103_sink_port import C103SinkPort
- from scraper2.app.ports.sinks.c104_sink_port import C104SinkPort
- from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
- from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
-
- from db2.mongo import Mongo
-
- SinkKind = Literal["memory", "mongo"]
-
-
- def _env_bool(key: str, default: bool) -> bool:
-     v = os.getenv(key)
-     return default if v is None else v.strip().lower() in {"1", "true", "yes", "y", "on"}
-
-
- def _env_int(key: str, default: int) -> int:
-     v = os.getenv(key)
-     if v is None:
-         return default
-     try:
-         return int(v)
-     except ValueError:
-         return default
-
-
- def build_browser_factory() -> BrowserFactoryPort:
-     return PlaywrightBrowserFactory(
-         headless=_env_bool("SCRAPER_HEADLESS", True),
-         timeout_ms=_env_int("SCRAPER_TIMEOUT_MS", 20_000),
-         max_concurrency=_env_int("SCRAPER_MAX_CONCURRENCY", 2),
-     )
-
-
- # -------------------------
- # Bundles
- # -------------------------
-
- @dataclass(frozen=True)
- class FetchUsecases:
-     c101: FetchC101
-     c103: FetchC103
-     c104: FetchC104
-     c106: FetchC106
-     c108: FetchC108
-
-
- @dataclass(frozen=True)
- class Sinks:
-     c101: C101SinkPort
-     c103: C103SinkPort
-     c104: C104SinkPort
-     c106: C106SinkPort
-     c108: C108SinkPort
-
-
- @dataclass(frozen=True)
- class IngestUsecases:
-     c101: IngestPort
-     c103: IngestPort
-     c104: IngestPort
-     c106: IngestPort
-     c108: IngestPort
-
-
- @dataclass(frozen=True)
- class Usecases:
-     fetch: FetchUsecases
-     ingest: IngestUsecases
-     sinks: Sinks
-     store: InMemoryStore | None = None  # set only for the "memory" sink kind
-     mongo: Mongo | None = None  # set only for the "mongo" sink kind
-     db: AsyncDatabase | None = None  # set only for the "mongo" sink kind
-     browser_factory: Optional[BrowserFactoryPort] = None
-
-     async def aclose(self) -> None:
-         if self.browser_factory is not None:
-             await self.browser_factory.aclose()
-
-         if self.mongo is not None:
-             await self.mongo.close()
-
- # -------------------------
- # builders
- # -------------------------
-
- def build_fetch_usecases(*, factory: BrowserFactoryPort) -> FetchUsecases:
-     return FetchUsecases(
-         c101=FetchC101(factory=factory),
-         c103=FetchC103(factory=factory),
-         c104=FetchC104(factory=factory),
-         c106=FetchC106(factory=factory),
-         c108=FetchC108(factory=factory),
-     )
-
-
- @dataclass(frozen=True)
- class MemoryBundle:
-     store: InMemoryStore
-     sinks: Sinks
-
-
- def build_memory_bundle() -> MemoryBundle:
-     store = InMemoryStore()
-     sinks = Sinks(
-         c101=MemoryC101Sink(store),
-         c103=MemoryC103Sink(store),
-         c104=MemoryC104Sink(store),
-         c106=MemoryC106Sink(store),
-         c108=MemoryC108Sink(store),
-     )
-     return MemoryBundle(store=store, sinks=sinks)
-
- # ---- mongo bundle ----
-
- @dataclass(frozen=True)
- class MongoBundle:
-     mongo: Mongo
-     db: AsyncDatabase
-     sinks: Sinks
-
-
- def build_mongo_bundle() -> MongoBundle:
-     mongo = Mongo()  # db2 reads its settings from the environment (DB2_MONGO_URI etc.)
-     db = mongo.get_db()
-     sinks = Sinks(
-         c101=MongoC101Sink(db),
-         c103=MongoC103Sink(db),
-         c104=MongoC104Sink(db),
-         c106=MongoC106Sink(db),
-         c108=MongoC108Sink(db),
-     )
-     return MongoBundle(mongo=mongo, db=db, sinks=sinks)
-
-
- def build_ingest_usecases(*, fetch: FetchUsecases, sinks: Sinks) -> IngestUsecases:
-     return IngestUsecases(
-         c101=IngestC101(fetch=fetch.c101, sink=sinks.c101),
-         c103=IngestC103(fetch=fetch.c103, sink=sinks.c103),
-         c104=IngestC104(fetch=fetch.c104, sink=sinks.c104),
-         c106=IngestC106(fetch=fetch.c106, sink=sinks.c106),
-         c108=IngestC108(fetch=fetch.c108, sink=sinks.c108),
-     )
-
-
- def build_usecases(
-     *,
-     factory: BrowserFactoryPort | None = None,
-     sink_kind: SinkKind = "memory",
- ) -> Usecases:
-     factory = factory or build_browser_factory()
-     fetch = build_fetch_usecases(factory=factory)
-
-     if sink_kind == "memory":
-         bundle = build_memory_bundle()
-         ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
-         return Usecases(fetch=fetch, ingest=ingest, sinks=bundle.sinks, store=bundle.store,
-                         browser_factory=factory)
-
-     if sink_kind == "mongo":
-         bundle = build_mongo_bundle()
-         ingest = build_ingest_usecases(fetch=fetch, sinks=bundle.sinks)
-         return Usecases(fetch=fetch, ingest=ingest, sinks=bundle.sinks, mongo=bundle.mongo, db=bundle.db,
-                         browser_factory=factory)
-
-     raise ValueError(f"Unknown sink_kind: {sink_kind}")
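The composition root above is the only place adapters are wired together, so a caller just picks a sink kind and releases resources when done. A minimal lifecycle sketch; the method exposed by IngestPort is not shown in this diff, so no usecase call is spelled out:

```python
import asyncio

from scraper2.app.composition import build_usecases


async def main() -> None:
    uc = build_usecases(sink_kind="memory")  # or sink_kind="mongo"
    try:
        ingest_c101 = uc.ingest.c101  # an IngestPort; invoke it per ingest_port.py
        ...
    finally:
        await uc.aclose()  # closes the browser factory and, for mongo, the client


asyncio.run(main())
```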
@@ -1,85 +0,0 @@
- from __future__ import annotations
- import re
- from typing import Iterable
-
-
- _EMPTY_VALUES = {"", "-", "N/A", "NA", "null", "None"}
-
-
- def normalize(s: str | None) -> str:
-     if s is None:
-         return ""
-     return s.strip()
-
-
- def _is_empty(s: str) -> bool:
-     return s in _EMPTY_VALUES
-
-
- def to_int(s: str | None) -> int | None:
-     """
-     Integer converter for the C101 parser.
-
-     Rules:
-     - None / '' / '-' / 'N/A' → None
-     - strips ',', '주', '원', '주식' (longer tokens first)
-     - '1,234' → 1234
-     """
-     s = normalize(s)
-     if _is_empty(s):
-         return None
-
-     for ch in (",", "주식", "원", "주"):
-         s = s.replace(ch, "")
-
-     try:
-         return int(s)
-     except ValueError:
-         return None
-
-
- def to_float(s: str | None) -> float | None:
-     """
-     Float converter for the C101 parser.
-
-     Rules:
-     - None / '' / '-' / 'N/A' → None
-     - strips ',', '%', '원'
-     - '12.34%' → 12.34
-     """
-     s = normalize(s)
-     if _is_empty(s):
-         return None
-
-     for ch in (",", "%", "원"):
-         s = s.replace(ch, "")
-
-     try:
-         return float(s)
-     except ValueError:
-         return None
-
-
- def parse_won(text: str) -> int:
-     """
-     Convert a Korean currency string to an integer (handles 조원, 억원, 만원, 원, 억, etc.).
-     """
-     units = {
-         "조원": 1_000_000_000_000,
-         "억원": 100_000_000,
-         "억": 100_000_000,
-         "만원": 10_000,
-         "원": 1,
-     }
-
-     text = text.replace(",", "").strip()
-     match = re.match(r"([-+]?[0-9]*\.?[0-9]+)([가-힣]+)", text)
-
-     if not match:
-         raise ValueError(f"형식이 잘못된 금액 문자열: {text}")
-
-     number, unit = match.groups()
-     if unit not in units:
-         raise ValueError(f"알 수 없는 단위: {unit}")
-
-     return int(float(number) * units[unit])
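The converters read most easily as input/output pairs; each expected value below follows directly from the docstrings and the unit table above (module path as of 2.4.0):

```python
from scraper2.app.parsing._converters import to_int, to_float, parse_won

assert to_int("1,234주") == 1234
assert to_int("N/A") is None
assert to_float("12.34%") == 12.34
assert to_float("-") is None
assert parse_won("1.5조원") == 1_500_000_000_000
assert parse_won("3,500억원") == 350_000_000_000
```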
@@ -1,134 +0,0 @@
- # scraper2/app/parsing/_normalize.py
- from __future__ import annotations
-
- import re
- from collections import Counter
- from typing import Any
-
- import numpy as np
- import pandas as pd
-
-
- # -----------------------------
- # 1) Item normalization (the row's "항목" value)
- # -----------------------------
- _IFRS_PATTERN = re.compile(r"\(IFRS[^)]*\)")
- _ETC_PAREN_PATTERN = re.compile(r"\((E|YoY|QoQ)[^)]*\)")
- _BRACKET_PATTERN = re.compile(r"\[[^\]]*\]")  # e.g. [구K-IFRS]
- _EXTRA_WORDS_PATTERN = re.compile(r"(펼치기|연간컨센서스보기|연간컨센서스닫기)")
- _ALL_PAREN_PATTERN = re.compile(r"\([^)]*\)")  # strips every parenthesized run
-
- def normalize_c1034_item(text: str | None) -> str:
-     """
-     Normalize a C103 item name (row value):
-     - drop expand/consensus widget text
-     - drop every parenthesized annotation (reporting basis, consolidated, separate, ...)
-     - drop [구K-IFRS]
-     - drop '*'
-     - collapse whitespace
-     """
-     if not text:
-         return ""
-
-     s = str(text)
-
-     # 1) unwanted keywords
-     s = _EXTRA_WORDS_PATTERN.sub("", s)
-
-     # 2) strip square brackets
-     s = _BRACKET_PATTERN.sub("", s)
-
-     # 3) strip all parentheses (important)
-     s = _ALL_PAREN_PATTERN.sub("", s)
-
-     # 4) strip asterisks
-     s = s.replace("*", "")
-
-     # 5) collapse whitespace
-     s = re.sub(r"\s+", " ", s).strip()
-
-     return s
-
-
- # -----------------------------
- # 2) Column-name normalization
- # -----------------------------
- _COL_PAREN_PATTERN = re.compile(r"\((IFRS[^)]*|E|YoY|QoQ)[^)]*\)")
- _COL_EXTRA_WORDS = re.compile(r"(연간컨센서스보기|연간컨센서스닫기)")
- _COL_DOTNUM = re.compile(r"\.\d+$")  # strips pandas duplicate-column suffixes (.1, .2, ...)
- _COL_MULTI_SPACE = re.compile(r"\s+")
-
- def normalize_c1034_col(col: str | None) -> str:
-     """
-     Normalize a C103 column name, e.g.:
-     "2024/12 (IFRS연결) 연간컨센서스보기" -> "2024/12"
-     "2025/12(E) (IFRS연결) 연간컨센서스닫기" -> "2025/12"
-     "전년대비 (YoY)" -> "전년대비"
-     "전년대비 (YoY).1" -> "전년대비" (duplicates are split into _2/_3 in post-processing)
-     """
-     if col is None:
-         return ""
-
-     s = str(col)
-
-     # 1) strip pandas-appended suffixes like .1 (normalization collisions are handled downstream)
-     s = _COL_DOTNUM.sub("", s)
-
-     # 2) drop the consensus widget text
-     s = _COL_EXTRA_WORDS.sub("", s)
-
-     # 3) drop parenthesized annotations: (IFRS...), (E), (YoY), (QoQ)
-     s = _COL_PAREN_PATTERN.sub("", s)
-
-     # 4) collapse whitespace
-     s = _COL_MULTI_SPACE.sub(" ", s).strip()
-
-     return s
-
-
- def _dedupe_columns(cols: list[str]) -> list[str]:
-     """
-     If normalization produces duplicate column names, append _2, _3, ... to keep them unique.
-     e.g. ["전년대비", "전년대비"] -> ["전년대비", "전년대비_2"]
-     """
-     seen: Counter[str] = Counter()
-     out: list[str] = []
-     for c in cols:
-         c = c or ""
-         seen[c] += 1
-         if seen[c] == 1:
-             out.append(c)
-         else:
-             out.append(f"{c}_{seen[c]}")
-     return out
-
-
- # -----------------------------
- # 3) Whole-DataFrame normalization + records conversion
- # -----------------------------
- def normalize_c1034_df(df: pd.DataFrame) -> pd.DataFrame:
-     """
-     - normalize every column name
-     - normalize the '항목' values
-     - NaN -> None
-     - auto-split duplicate column names (_2/_3)
-     """
-     if df is None or df.empty:
-         return df
-
-     df = df.copy()
-
-     # normalize column names and keep them unique
-     norm_cols = [normalize_c1034_col(c) for c in df.columns.astype(str).tolist()]
-     df.columns = _dedupe_columns(norm_cols)
-
-     # normalize the 항목 values
-     if "항목" in df.columns:
-         df["항목"] = df["항목"].map(normalize_c1034_item)
-
-     # NaN -> None
-     df = df.replace({np.nan: None})
-     return df
-
-
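Taken together, the normalizers behave as follows; the column cases restate the docstring examples, and the DataFrame case shows the `_2` dedupe suffix (module path as of 2.4.0):

```python
import pandas as pd

from scraper2.app.parsing._normalize import (
    normalize_c1034_col,
    normalize_c1034_df,
    normalize_c1034_item,
)

assert normalize_c1034_col("2024/12 (IFRS연결) 연간컨센서스보기") == "2024/12"
assert normalize_c1034_col("전년대비 (YoY).1") == "전년대비"
assert normalize_c1034_item("매출액(IFRS연결) 펼치기") == "매출액"

df = pd.DataFrame(
    {"항목": ["매출액(IFRS연결)"], "전년대비 (YoY)": [1.2], "전년대비 (YoY).1": [3.4]}
)
out = normalize_c1034_df(df)
assert list(out.columns) == ["항목", "전년대비", "전년대비_2"]
assert out["항목"].iloc[0] == "매출액"
```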