scraper2-hj3415 2.4.1__py3-none-any.whl → 2.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
- {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
- scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
- scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
- {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
- scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
- scraper2_hj3415/app/composition.py +225 -0
- scraper2_hj3415/app/domain/blocks.py +61 -0
- scraper2_hj3415/app/domain/constants.py +33 -0
- scraper2_hj3415/app/domain/doc.py +16 -0
- scraper2_hj3415/app/domain/endpoint.py +11 -0
- scraper2_hj3415/app/domain/series.py +11 -0
- scraper2_hj3415/app/domain/types.py +19 -0
- scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
- scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
- scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
- scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
- scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
- scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
- scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
- scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
- scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
- scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
- scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
- scraper2_hj3415/app/parsing/c101/sise.py +47 -0
- scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
- scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
- scraper2_hj3415/app/parsing/c101_parser.py +45 -0
- scraper2_hj3415/app/parsing/c103_parser.py +19 -0
- scraper2_hj3415/app/parsing/c104_parser.py +23 -0
- scraper2_hj3415/app/parsing/c106_parser.py +137 -0
- scraper2_hj3415/app/parsing/c108_parser.py +254 -0
- scraper2_hj3415/app/ports/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/__init__.py +0 -0
- scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
- scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
- scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
- scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
- scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
- scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
- scraper2_hj3415/app/ports/site/__init__.py +0 -0
- scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
- scraper2_hj3415/app/services/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/__init__.py +0 -0
- scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
- scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
- scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
- scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
- scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
- scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
- scraper2_hj3415/app/usecases/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
- scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
- scraper2/main.py → scraper2_hj3415/cli.py +40 -70
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
- scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
- scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
- scraper2/.DS_Store +0 -0
- scraper2/adapters/out/.DS_Store +0 -0
- scraper2/adapters/out/playwright/browser.py +0 -102
- scraper2/adapters/out/sinks/.DS_Store +0 -0
- scraper2/adapters/out/sinks/memory/__init__.py +0 -15
- scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
- scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
- scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
- scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
- scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
- scraper2/app/composition.py +0 -204
- scraper2/app/parsing/_converters.py +0 -85
- scraper2/app/parsing/_normalize.py +0 -134
- scraper2/app/parsing/c101_parser.py +0 -143
- scraper2/app/parsing/c103_parser.py +0 -128
- scraper2/app/parsing/c104_parser.py +0 -143
- scraper2/app/parsing/c106_parser.py +0 -153
- scraper2/app/parsing/c108_parser.py +0 -65
- scraper2/app/ports/browser/browser_factory_port.py +0 -11
- scraper2/app/ports/browser/browser_port.py +0 -22
- scraper2/app/ports/ingest_port.py +0 -14
- scraper2/app/ports/sinks/base_sink_port.py +0 -14
- scraper2/app/ports/sinks/c101_sink_port.py +0 -9
- scraper2/app/ports/sinks/c103_sink_port.py +0 -9
- scraper2/app/ports/sinks/c104_sink_port.py +0 -9
- scraper2/app/ports/sinks/c106_sink_port.py +0 -9
- scraper2/app/ports/sinks/c108_sink_port.py +0 -9
- scraper2/app/usecases/fetch/fetch_c101.py +0 -43
- scraper2/app/usecases/fetch/fetch_c103.py +0 -103
- scraper2/app/usecases/fetch/fetch_c104.py +0 -76
- scraper2/app/usecases/fetch/fetch_c106.py +0 -90
- scraper2/app/usecases/fetch/fetch_c108.py +0 -49
- scraper2/app/usecases/ingest/ingest_c101.py +0 -36
- scraper2/app/usecases/ingest/ingest_c103.py +0 -37
- scraper2/app/usecases/ingest/ingest_c104.py +0 -37
- scraper2/app/usecases/ingest/ingest_c106.py +0 -38
- scraper2/app/usecases/ingest/ingest_c108.py +0 -39
- scraper2_hj3415-2.4.1.dist-info/RECORD +0 -63
- scraper2_hj3415-2.4.1.dist-info/entry_points.txt +0 -3
- {scraper2 → scraper2_hj3415}/__init__.py +0 -0
- {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
- {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
- {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
- {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
- {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
- {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
- {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
- {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
- {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
- {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
- {scraper2_hj3415-2.4.1.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
#scraper2/adapters/out/sinks/memory/c108_memory_sink.py
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from typing import Iterable, Optional
|
|
6
|
-
|
|
7
|
-
from contracts.nfs.c108 import C108DTO
|
|
8
|
-
from scraper2.adapters.out.sinks.memory.store import InMemoryStore
|
|
9
|
-
from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
|
|
10
|
-
|
|
11
|
-
_ENDPOINT = "c108"
|
|
12
|
-
|
|
13
|
-
class MemoryC108Sink(C108SinkPort):
    """C108 sink that writes DTOs into an ``InMemoryStore`` under the "c108" endpoint."""

    def __init__(self, store: InMemoryStore[C108DTO]):
        self._store = store

    async def write(self, dto: C108DTO, *, asof: Optional[datetime] = None) -> None:
        """Store a single DTO, keyed by its ``코드`` attribute.

        ``asof`` is accepted for interface compatibility; this sink does not use it.
        """
        key = dto.코드
        await self._store.put(_ENDPOINT, key, dto)

    async def write_many(
        self,
        dtos: Iterable[C108DTO],
        *,
        asof: Optional[datetime] = None,
    ) -> None:
        """Store several DTOs in one ``put_many`` call, each keyed by its ``코드``.

        ``asof`` is accepted for interface compatibility; this sink does not use it.
        """
        # Lazily pair each DTO with its key; the store consumes the generator.
        keyed_items = ((item.코드, item) for item in dtos)
        await self._store.put_many(_ENDPOINT, keyed_items)
|
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
# scraper2/adapters/out/sinks/mongo/__init__.py
|
|
2
|
-
from .c101_mongo_sink import MongoC101Sink
|
|
3
|
-
from .c103_mongo_sink import MongoC103Sink
|
|
4
|
-
from .c104_mongo_sink import MongoC104Sink
|
|
5
|
-
from .c106_mongo_sink import MongoC106Sink
|
|
6
|
-
from .c108_mongo_sink import MongoC108Sink
|
|
7
|
-
|
|
8
|
-
__all__ = [
|
|
9
|
-
"MongoC101Sink",
|
|
10
|
-
"MongoC103Sink",
|
|
11
|
-
"MongoC104Sink",
|
|
12
|
-
"MongoC106Sink",
|
|
13
|
-
"MongoC108Sink",
|
|
14
|
-
]
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
# scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from typing import Iterable, Optional
|
|
6
|
-
|
|
7
|
-
from pymongo.asynchronous.database import AsyncDatabase
|
|
8
|
-
|
|
9
|
-
from contracts.nfs.c101 import C101DTO
|
|
10
|
-
from scraper2.app.ports.sinks.c101_sink_port import C101SinkPort
|
|
11
|
-
|
|
12
|
-
from db2.nfs import (
|
|
13
|
-
upsert_latest,
|
|
14
|
-
upsert_latest_many,
|
|
15
|
-
insert_snapshot,
|
|
16
|
-
insert_snapshots_many,
|
|
17
|
-
)
|
|
18
|
-
|
|
19
|
-
_ENDPOINT = "c101"
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class MongoC101Sink(C101SinkPort):
    """C101 sink that persists DTOs through the ``db2.nfs`` helpers.

    Each write performs two operations against the database handle: an upsert
    of the latest state followed by an insert of a history snapshot.
    """

    def __init__(self, db: AsyncDatabase):
        self._db = db

    async def write(self, dto: C101DTO, *, asof: Optional[datetime] = None) -> None:
        """Upsert ``dto`` as the latest state, then insert it as a snapshot."""
        shared = {"endpoint": _ENDPOINT, "dto": dto, "asof": asof}
        # The latest state is upserted ...
        await upsert_latest(self._db, **shared)
        # ... while the history is append-only (insert).
        await insert_snapshot(self._db, **shared)

    async def write_many(
        self,
        dtos: Iterable[C101DTO],
        *,
        asof: Optional[datetime] = None,
    ) -> None:
        """Bulk variant of :meth:`write`; does nothing when ``dtos`` is empty."""
        # Materialise once: the iterable is consumed by two separate bulk calls.
        batch = list(dtos)
        if batch:
            shared = {"endpoint": _ENDPOINT, "dtos": batch, "asof": asof}
            await upsert_latest_many(self._db, **shared)
            await insert_snapshots_many(self._db, **shared)
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
# scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from typing import Iterable, Optional
|
|
6
|
-
|
|
7
|
-
from pymongo.asynchronous.database import AsyncDatabase
|
|
8
|
-
|
|
9
|
-
from contracts.nfs.c103 import C103DTO
|
|
10
|
-
from scraper2.app.ports.sinks.c103_sink_port import C103SinkPort
|
|
11
|
-
|
|
12
|
-
from db2.nfs import (
|
|
13
|
-
upsert_latest,
|
|
14
|
-
upsert_latest_many,
|
|
15
|
-
insert_snapshot,
|
|
16
|
-
insert_snapshots_many,
|
|
17
|
-
)
|
|
18
|
-
|
|
19
|
-
_ENDPOINT = "c103"
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class MongoC103Sink(C103SinkPort):
    """C103 sink that persists DTOs through the ``db2.nfs`` helpers.

    Each write calls the "latest" upsert helper and then the snapshot insert
    helper with the same endpoint, payload and ``asof`` value.
    """

    def __init__(self, db: AsyncDatabase):
        self._db = db

    async def write(self, dto: C103DTO, *, asof: Optional[datetime] = None) -> None:
        """Upsert ``dto`` as the latest state, then insert it as a snapshot."""
        shared = {"endpoint": _ENDPOINT, "dto": dto, "asof": asof}
        await upsert_latest(self._db, **shared)
        await insert_snapshot(self._db, **shared)

    async def write_many(
        self,
        dtos: Iterable[C103DTO],
        *,
        asof: Optional[datetime] = None,
    ) -> None:
        """Bulk variant of :meth:`write`; does nothing when ``dtos`` is empty."""
        # Materialise once: the iterable is consumed by two separate bulk calls.
        batch = list(dtos)
        if batch:
            shared = {"endpoint": _ENDPOINT, "dtos": batch, "asof": asof}
            await upsert_latest_many(self._db, **shared)
            await insert_snapshots_many(self._db, **shared)
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
# scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from typing import Iterable, Optional
|
|
6
|
-
|
|
7
|
-
from pymongo.asynchronous.database import AsyncDatabase
|
|
8
|
-
|
|
9
|
-
from contracts.nfs.c104 import C104DTO
|
|
10
|
-
from scraper2.app.ports.sinks.c104_sink_port import C104SinkPort
|
|
11
|
-
|
|
12
|
-
from db2.nfs import (
|
|
13
|
-
upsert_latest,
|
|
14
|
-
upsert_latest_many,
|
|
15
|
-
insert_snapshot,
|
|
16
|
-
insert_snapshots_many,
|
|
17
|
-
)
|
|
18
|
-
|
|
19
|
-
_ENDPOINT = "c104"
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class MongoC104Sink(C104SinkPort):
    """C104 sink that persists DTOs through the ``db2.nfs`` helpers.

    Each write calls the "latest" upsert helper and then the snapshot insert
    helper with the same endpoint, payload and ``asof`` value.
    """

    def __init__(self, db: AsyncDatabase):
        self._db = db

    async def write(self, dto: C104DTO, *, asof: Optional[datetime] = None) -> None:
        """Upsert ``dto`` as the latest state, then insert it as a snapshot."""
        shared = {"endpoint": _ENDPOINT, "dto": dto, "asof": asof}
        await upsert_latest(self._db, **shared)
        await insert_snapshot(self._db, **shared)

    async def write_many(
        self,
        dtos: Iterable[C104DTO],
        *,
        asof: Optional[datetime] = None,
    ) -> None:
        """Bulk variant of :meth:`write`; does nothing when ``dtos`` is empty."""
        # Materialise once: the iterable is consumed by two separate bulk calls.
        batch = list(dtos)
        if batch:
            shared = {"endpoint": _ENDPOINT, "dtos": batch, "asof": asof}
            await upsert_latest_many(self._db, **shared)
            await insert_snapshots_many(self._db, **shared)
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
# scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from typing import Iterable, Optional
|
|
6
|
-
|
|
7
|
-
from pymongo.asynchronous.database import AsyncDatabase
|
|
8
|
-
|
|
9
|
-
from contracts.nfs.c106 import C106DTO
|
|
10
|
-
from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
|
|
11
|
-
|
|
12
|
-
from db2.nfs import (
|
|
13
|
-
upsert_latest,
|
|
14
|
-
upsert_latest_many,
|
|
15
|
-
insert_snapshot,
|
|
16
|
-
insert_snapshots_many,
|
|
17
|
-
)
|
|
18
|
-
|
|
19
|
-
_ENDPOINT = "c106"
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class MongoC106Sink(C106SinkPort):
    """C106 sink that persists DTOs through the ``db2.nfs`` helpers.

    Each write calls the "latest" upsert helper and then the snapshot insert
    helper with the same endpoint, payload and ``asof`` value.
    """

    def __init__(self, db: AsyncDatabase):
        self._db = db

    async def write(self, dto: C106DTO, *, asof: Optional[datetime] = None) -> None:
        """Upsert ``dto`` as the latest state, then insert it as a snapshot."""
        shared = {"endpoint": _ENDPOINT, "dto": dto, "asof": asof}
        await upsert_latest(self._db, **shared)
        await insert_snapshot(self._db, **shared)

    async def write_many(
        self,
        dtos: Iterable[C106DTO],
        *,
        asof: Optional[datetime] = None,
    ) -> None:
        """Bulk variant of :meth:`write`; does nothing when ``dtos`` is empty."""
        # Materialise once: the iterable is consumed by two separate bulk calls.
        batch = list(dtos)
        if batch:
            shared = {"endpoint": _ENDPOINT, "dtos": batch, "asof": asof}
            await upsert_latest_many(self._db, **shared)
            await insert_snapshots_many(self._db, **shared)
|
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
# scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
from datetime import datetime
|
|
5
|
-
from typing import Iterable, Optional
|
|
6
|
-
|
|
7
|
-
from pymongo.asynchronous.database import AsyncDatabase
|
|
8
|
-
|
|
9
|
-
from contracts.nfs.c108 import C108DTO
|
|
10
|
-
from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
|
|
11
|
-
|
|
12
|
-
from db2.nfs import (
|
|
13
|
-
upsert_latest,
|
|
14
|
-
upsert_latest_many,
|
|
15
|
-
insert_snapshot,
|
|
16
|
-
insert_snapshots_many,
|
|
17
|
-
)
|
|
18
|
-
|
|
19
|
-
_ENDPOINT = "c108"
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class MongoC108Sink(C108SinkPort):
    """C108 sink that persists DTOs through the ``db2.nfs`` helpers.

    Each write calls the "latest" upsert helper and then the snapshot insert
    helper with the same endpoint, payload and ``asof`` value.
    """

    def __init__(self, db: AsyncDatabase):
        self._db = db

    async def write(self, dto: C108DTO, *, asof: Optional[datetime] = None) -> None:
        """Upsert ``dto`` as the latest state, then insert it as a snapshot."""
        shared = {"endpoint": _ENDPOINT, "dto": dto, "asof": asof}
        await upsert_latest(self._db, **shared)
        await insert_snapshot(self._db, **shared)

    async def write_many(
        self,
        dtos: Iterable[C108DTO],
        *,
        asof: Optional[datetime] = None,
    ) -> None:
        """Bulk variant of :meth:`write`; does nothing when ``dtos`` is empty."""
        # Materialise once: the iterable is consumed by two separate bulk calls.
        batch = list(dtos)
        if batch:
            shared = {"endpoint": _ENDPOINT, "dtos": batch, "asof": asof}
            await upsert_latest_many(self._db, **shared)
            await insert_snapshots_many(self._db, **shared)
|
scraper2/app/composition.py
DELETED
|
@@ -1,204 +0,0 @@
|
|
|
1
|
-
# scraper2/app/composition.py
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
import os
|
|
5
|
-
from dataclasses import dataclass
|
|
6
|
-
from typing import Literal, Optional
|
|
7
|
-
|
|
8
|
-
from pymongo.asynchronous.database import AsyncDatabase
|
|
9
|
-
|
|
10
|
-
from scraper2.app.ports.browser.browser_factory_port import BrowserFactoryPort
|
|
11
|
-
from scraper2.adapters.out.playwright.browser_factory import PlaywrightBrowserFactory
|
|
12
|
-
|
|
13
|
-
from scraper2.app.usecases.fetch.fetch_c101 import FetchC101
|
|
14
|
-
from scraper2.app.usecases.fetch.fetch_c103 import FetchC103
|
|
15
|
-
from scraper2.app.usecases.fetch.fetch_c104 import FetchC104
|
|
16
|
-
from scraper2.app.usecases.fetch.fetch_c106 import FetchC106
|
|
17
|
-
from scraper2.app.usecases.fetch.fetch_c108 import FetchC108
|
|
18
|
-
|
|
19
|
-
from scraper2.app.ports.ingest_port import IngestPort
|
|
20
|
-
from scraper2.app.usecases.ingest.ingest_c101 import IngestC101
|
|
21
|
-
from scraper2.app.usecases.ingest.ingest_c103 import IngestC103
|
|
22
|
-
from scraper2.app.usecases.ingest.ingest_c104 import IngestC104
|
|
23
|
-
from scraper2.app.usecases.ingest.ingest_c106 import IngestC106
|
|
24
|
-
from scraper2.app.usecases.ingest.ingest_c108 import IngestC108
|
|
25
|
-
|
|
26
|
-
from scraper2.adapters.out.sinks.memory.store import InMemoryStore
|
|
27
|
-
from scraper2.adapters.out.sinks.memory.c101_memory_sink import MemoryC101Sink
|
|
28
|
-
from scraper2.adapters.out.sinks.memory.c103_memory_sink import MemoryC103Sink
|
|
29
|
-
from scraper2.adapters.out.sinks.memory.c104_memory_sink import MemoryC104Sink
|
|
30
|
-
from scraper2.adapters.out.sinks.memory.c106_memory_sink import MemoryC106Sink
|
|
31
|
-
from scraper2.adapters.out.sinks.memory.c108_memory_sink import MemoryC108Sink
|
|
32
|
-
|
|
33
|
-
from scraper2.adapters.out.sinks.mongo.c101_mongo_sink import MongoC101Sink
|
|
34
|
-
from scraper2.adapters.out.sinks.mongo.c103_mongo_sink import MongoC103Sink
|
|
35
|
-
from scraper2.adapters.out.sinks.mongo.c104_mongo_sink import MongoC104Sink
|
|
36
|
-
from scraper2.adapters.out.sinks.mongo.c106_mongo_sink import MongoC106Sink
|
|
37
|
-
from scraper2.adapters.out.sinks.mongo.c108_mongo_sink import MongoC108Sink
|
|
38
|
-
|
|
39
|
-
from scraper2.app.ports.sinks.c101_sink_port import C101SinkPort
|
|
40
|
-
from scraper2.app.ports.sinks.c103_sink_port import C103SinkPort
|
|
41
|
-
from scraper2.app.ports.sinks.c104_sink_port import C104SinkPort
|
|
42
|
-
from scraper2.app.ports.sinks.c106_sink_port import C106SinkPort
|
|
43
|
-
from scraper2.app.ports.sinks.c108_sink_port import C108SinkPort
|
|
44
|
-
|
|
45
|
-
from db2.mongo import Mongo
|
|
46
|
-
|
|
47
|
-
SinkKind = Literal["memory", "mongo"]
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
def _env_bool(key: str, default: bool) -> bool:
    """Read environment variable ``key`` as a boolean flag.

    Returns ``default`` when the variable is not set. Otherwise the value is
    stripped and lower-cased, and the result is True only for "1", "true",
    "yes", "y" or "on"; any other text yields False.
    """
    raw = os.getenv(key)
    if raw is None:
        return default
    truthy = {"1", "true", "yes", "y", "on"}
    return raw.strip().lower() in truthy
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def _env_int(key: str, default: int) -> int:
    """Read environment variable ``key`` as an integer.

    Returns ``default`` when the variable is unset or when its value cannot be
    parsed by ``int()``.
    """
    raw = os.getenv(key)
    if raw is None:
        return default
    try:
        parsed = int(raw)
    except ValueError:
        # Unparseable values deliberately fall back to the default.
        parsed = default
    return parsed
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
def build_browser_factory() -> BrowserFactoryPort:
    """Create a ``PlaywrightBrowserFactory`` configured from environment variables.

    Settings read (with their defaults):
      - ``SCRAPER_HEADLESS``: boolean, True
      - ``SCRAPER_TIMEOUT_MS``: integer, 20_000
      - ``SCRAPER_MAX_CONCURRENCY``: integer, 2
    """
    options = {
        "headless": _env_bool("SCRAPER_HEADLESS", True),
        "timeout_ms": _env_int("SCRAPER_TIMEOUT_MS", 20_000),
        "max_concurrency": _env_int("SCRAPER_MAX_CONCURRENCY", 2),
    }
    return PlaywrightBrowserFactory(**options)
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
# -------------------------
|
|
74
|
-
# Bundles
|
|
75
|
-
# -------------------------
|
|
76
|
-
|
|
77
|
-
@dataclass(frozen=True)
class FetchUsecases:
    """Immutable bundle holding one fetch use case per endpoint."""

    # One attribute per endpoint; the field order is part of the constructor signature.
    c101: FetchC101
    c103: FetchC103
    c104: FetchC104
    c106: FetchC106
    c108: FetchC108
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
@dataclass(frozen=True)
class Sinks:
    """Immutable bundle holding one sink port per endpoint."""

    # One attribute per endpoint; the field order is part of the constructor signature.
    c101: C101SinkPort
    c103: C103SinkPort
    c104: C104SinkPort
    c106: C106SinkPort
    c108: C108SinkPort
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
@dataclass(frozen=True)
class IngestUsecases:
    """Immutable bundle holding one ingest use case per endpoint.

    Every field is typed as the shared ``IngestPort`` rather than a concrete
    ingest class.
    """

    # One attribute per endpoint; the field order is part of the constructor signature.
    c101: IngestPort
    c103: IngestPort
    c104: IngestPort
    c106: IngestPort
    c108: IngestPort
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
@dataclass(frozen=True)
class Usecases:
    """Top-level bundle of fetch/ingest use cases, sinks and their resources.

    The optional fields record which backing resources were created, so that
    :meth:`aclose` can release them.
    """

    fetch: FetchUsecases
    ingest: IngestUsecases
    sinks: Sinks
    store: InMemoryStore | None = None  # set only for the memory sink
    mongo: Mongo | None = None  # set only for the mongo sink
    db: AsyncDatabase | None = None  # set only for the mongo sink
    browser_factory: Optional[BrowserFactoryPort] = None

    async def aclose(self) -> None:
        """Release the browser factory and the Mongo client, if present.

        The Mongo client is closed even when closing the browser factory
        raises; in that case the browser-factory exception still propagates
        to the caller after the Mongo client has been closed.
        """
        try:
            if self.browser_factory is not None:
                await self.browser_factory.aclose()
        finally:
            # Runs on both the success and the failure path so the Mongo
            # client is not leaked when the browser shutdown fails.
            if self.mongo is not None:
                await self.mongo.close()
|
|
120
|
-
|
|
121
|
-
# -------------------------
|
|
122
|
-
# builders
|
|
123
|
-
# -------------------------
|
|
124
|
-
|
|
125
|
-
def build_fetch_usecases(*, factory: BrowserFactoryPort) -> FetchUsecases:
    """Build every fetch use case, all sharing the same browser ``factory``."""
    # Insertion order matches the original construction order (c101 ... c108).
    fetch_classes = {
        "c101": FetchC101,
        "c103": FetchC103,
        "c104": FetchC104,
        "c106": FetchC106,
        "c108": FetchC108,
    }
    built = {name: cls(factory=factory) for name, cls in fetch_classes.items()}
    return FetchUsecases(**built)
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
@dataclass(frozen=True)
class MemoryBundle:
    """Immutable pairing of an in-memory store with the sinks built on top of it."""

    store: InMemoryStore  # the store handed to every memory sink
    sinks: Sinks  # memory-backed sinks, one per endpoint
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
def build_memory_bundle() -> MemoryBundle:
    """Create a fresh ``InMemoryStore`` and one memory sink per endpoint on top of it."""
    shared_store = InMemoryStore()
    # Insertion order matches the original construction order (c101 ... c108).
    sink_classes = {
        "c101": MemoryC101Sink,
        "c103": MemoryC103Sink,
        "c104": MemoryC104Sink,
        "c106": MemoryC106Sink,
        "c108": MemoryC108Sink,
    }
    memory_sinks = Sinks(**{name: cls(shared_store) for name, cls in sink_classes.items()})
    return MemoryBundle(store=shared_store, sinks=memory_sinks)
|
|
151
|
-
|
|
152
|
-
# ---- mongo bundle ----
|
|
153
|
-
|
|
154
|
-
@dataclass(frozen=True)
class MongoBundle:
    """Immutable grouping of a Mongo client, its database handle and the Mongo sinks."""

    mongo: Mongo  # client wrapper; kept so the caller can close it later
    db: AsyncDatabase  # database handle obtained from ``mongo``
    sinks: Sinks  # Mongo-backed sinks, one per endpoint
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
def build_mongo_bundle() -> MongoBundle:
    """Create a ``Mongo`` client, fetch its database and build one Mongo sink per endpoint."""
    # Per the original note: db2 reads its settings from the environment
    # (DB2_MONGO_URI etc.), so no arguments are passed here.
    client = Mongo()
    database = client.get_db()
    # Insertion order matches the original construction order (c101 ... c108).
    sink_classes = {
        "c101": MongoC101Sink,
        "c103": MongoC103Sink,
        "c104": MongoC104Sink,
        "c106": MongoC106Sink,
        "c108": MongoC108Sink,
    }
    mongo_sinks = Sinks(**{name: cls(database) for name, cls in sink_classes.items()})
    return MongoBundle(mongo=client, db=database, sinks=mongo_sinks)
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
def build_ingest_usecases(*, fetch: FetchUsecases, sinks: Sinks) -> IngestUsecases:
    """Wire each endpoint's fetch use case to the sink for the same endpoint."""
    ingest_c101 = IngestC101(fetch=fetch.c101, sink=sinks.c101)
    ingest_c103 = IngestC103(fetch=fetch.c103, sink=sinks.c103)
    ingest_c104 = IngestC104(fetch=fetch.c104, sink=sinks.c104)
    ingest_c106 = IngestC106(fetch=fetch.c106, sink=sinks.c106)
    ingest_c108 = IngestC108(fetch=fetch.c108, sink=sinks.c108)
    return IngestUsecases(
        c101=ingest_c101,
        c103=ingest_c103,
        c104=ingest_c104,
        c106=ingest_c106,
        c108=ingest_c108,
    )
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
def build_usecases(
    *,
    factory: BrowserFactoryPort | None = None,
    sink_kind: SinkKind = "memory",
) -> Usecases:
    """Assemble the complete ``Usecases`` bundle for the chosen sink backend.

    Args:
        factory: Browser factory to share between all fetch use cases. When
            None, one is created with ``build_browser_factory()``.
        sink_kind: ``"memory"`` or ``"mongo"``.

    Returns:
        A ``Usecases`` carrying the fetch and ingest use cases, the sinks, the
        browser factory, and either the in-memory store (memory) or the Mongo
        client and database handle (mongo).

    Raises:
        ValueError: If ``sink_kind`` is neither ``"memory"`` nor ``"mongo"``.
    """
    # Fail fast: reject a bad sink_kind before any browser factory or
    # database client is constructed.
    if sink_kind not in ("memory", "mongo"):
        raise ValueError(f"Unknown sink_kind: {sink_kind}")

    # Only substitute the default when no factory was supplied at all.
    if factory is None:
        factory = build_browser_factory()
    fetch = build_fetch_usecases(factory=factory)

    if sink_kind == "memory":
        memory = build_memory_bundle()
        return Usecases(
            fetch=fetch,
            ingest=build_ingest_usecases(fetch=fetch, sinks=memory.sinks),
            sinks=memory.sinks,
            store=memory.store,
            browser_factory=factory,
        )

    mongo = build_mongo_bundle()
    return Usecases(
        fetch=fetch,
        ingest=build_ingest_usecases(fetch=fetch, sinks=mongo.sinks),
        sinks=mongo.sinks,
        mongo=mongo.mongo,
        db=mongo.db,
        browser_factory=factory,
    )
|
|
@@ -1,85 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
import re
|
|
3
|
-
from typing import Iterable
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
# Placeholder strings that mean "no value" in scraped cells.
_EMPTY_VALUES = {"", "-", "N/A", "NA", "null", "None"}


def normalize(s: str | None) -> str:
    """Return *s* without surrounding whitespace, or "" when *s* is None."""
    if s is None:
        return ""
    return s.strip()


def _is_empty(s: str) -> bool:
    """Return True when *s* is one of the ``_EMPTY_VALUES`` placeholders."""
    return s in _EMPTY_VALUES


def to_int(s: str | None) -> int | None:
    """Integer converter for the C101 parser.

    Rules:
      - None / '' / '-' / 'N/A' (any ``_EMPTY_VALUES`` entry) -> None
      - ',', '원', '주식' and '주' are removed before parsing
      - '1,234' -> 1234
      - anything ``int()`` still cannot parse -> None
    """
    s = normalize(s)
    if _is_empty(s):
        return None

    # "주식" has to be removed before "주": stripping "주" first would turn
    # "주식" into "식", which then makes int() fail.
    for token in (",", "원", "주식", "주"):
        s = s.replace(token, "")

    try:
        return int(s)
    except ValueError:
        return None
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def to_float(s: str | None) -> float | None:
    """Float converter for the C101 parser.

    Rules:
      - None / '' / '-' / 'N/A' (any value ``_is_empty`` accepts) -> None
      - ',', '%' and '원' are removed before parsing
      - '12.34%' -> 12.34
      - anything ``float()`` still cannot parse -> None
    """
    cleaned = normalize(s)
    if _is_empty(cleaned):
        return None

    cleaned = cleaned.replace(",", "").replace("%", "").replace("원", "")

    try:
        value = float(cleaned)
    except ValueError:
        return None
    return value
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def parse_won(text: str) -> int:
    """Convert a Korean currency expression into an integer amount of won.

    Supported unit suffixes: 조원, 억원, 억, 만원, 원. Thousands separators
    (",") and surrounding whitespace are ignored, and an optional sign and
    decimal fraction are accepted (e.g. "1.5억원" -> 150000000).

    Args:
        text: The amount string, a number immediately followed by a Hangul unit.

    Returns:
        The amount in won, truncated toward zero when the result is fractional.

    Raises:
        ValueError: If *text* does not start with "<number><hangul unit>" or
            the unit is not one of the supported suffixes.
    """
    # Function-scope import keeps this block self-contained.
    from decimal import Decimal

    units = {
        "조원": 1_000_000_000_000,
        "억원": 100_000_000,
        "억": 100_000_000,
        "만원": 10_000,
        "원": 1,
    }

    text = text.replace(",", "").strip()
    match = re.match(r"([-+]?[0-9]*\.?[0-9]+)([가-힣]+)", text)

    if not match:
        raise ValueError(f"형식이 잘못된 금액 문자열: {text}")

    number, unit = match.groups()
    if unit not in units:
        raise ValueError(f"알 수 없는 단위: {unit}")

    # Decimal keeps the multiplication exact; binary floats can land just
    # below the true value, and int() would then truncate it one won short.
    return int(Decimal(number) * units[unit])
|
|
@@ -1,134 +0,0 @@
|
|
|
1
|
-
# scraper2/app/parsing/_normalize.py
|
|
2
|
-
from __future__ import annotations
|
|
3
|
-
|
|
4
|
-
import re
|
|
5
|
-
from collections import Counter
|
|
6
|
-
from typing import Any
|
|
7
|
-
|
|
8
|
-
import numpy as np
|
|
9
|
-
import pandas as pd
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
# -----------------------------
|
|
13
|
-
# 1) 항목(행의 "항목" 값) 정규화
|
|
14
|
-
# -----------------------------
|
|
15
|
-
_IFRS_PATTERN = re.compile(r"\(IFRS[^)]*\)")
_ETC_PAREN_PATTERN = re.compile(r"\((E|YoY|QoQ)[^)]*\)")
_BRACKET_PATTERN = re.compile(r"\[[^\]]*\]")  # e.g. [구K-IFRS]
_EXTRA_WORDS_PATTERN = re.compile(r"(펼치기|연간컨센서스보기|연간컨센서스닫기)")
_ALL_PAREN_PATTERN = re.compile(r"\([^)]*\)")  # removes every parenthesised group


def normalize_c1034_item(text: str | None) -> str:
    """Normalize a C103 item name (the value in a row's item column).

    Steps, applied in order:
      1. drop the UI words matched by ``_EXTRA_WORDS_PATTERN``
      2. drop square-bracket groups such as "[구K-IFRS]"
      3. drop every parenthesised group
      4. drop '*' characters
      5. collapse whitespace runs to a single space and strip both ends

    Falsy input (None or "") yields "".
    """
    if not text:
        return ""

    cleaned = str(text)

    # Steps 1-3: the three removal patterns, in the required order.
    for pattern in (_EXTRA_WORDS_PATTERN, _BRACKET_PATTERN, _ALL_PAREN_PATTERN):
        cleaned = pattern.sub("", cleaned)

    # Step 4: asterisk markers.
    cleaned = cleaned.replace("*", "")

    # Step 5: whitespace clean-up.
    return re.sub(r"\s+", " ", cleaned).strip()
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
# -----------------------------
|
|
54
|
-
# 2) 컬럼명 정규화
|
|
55
|
-
# -----------------------------
|
|
56
|
-
_COL_PAREN_PATTERN = re.compile(r"\((IFRS[^)]*|E|YoY|QoQ)[^)]*\)")
_COL_EXTRA_WORDS = re.compile(r"(연간컨센서스보기|연간컨센서스닫기)")
_COL_DOTNUM = re.compile(r"\.\d+$")  # strips the duplicate-column suffix pandas appends (.1, .2 ...)
_COL_MULTI_SPACE = re.compile(r"\s+")


def normalize_c1034_col(col: str | None) -> str:
    """Normalize a C103 column name.

    Examples:
        "2024/12 (IFRS연결) 연간컨센서스보기"     -> "2024/12"
        "2025/12(E) (IFRS연결) 연간컨센서스닫기"  -> "2025/12"
        "전년대비 (YoY)"                          -> "전년대비"
        "전년대비 (YoY).1"                        -> "전년대비"

    Names that become identical after normalization are not separated here;
    the original notes leave that to a later step that adds _2/_3 suffixes.
    None yields "".
    """
    if col is None:
        return ""

    name = str(col)

    # Order matters: the trailing ".N" suffix first, then the consensus
    # toggle words, then the (IFRS...)/(E)/(YoY)/(QoQ) annotations.
    for pattern in (_COL_DOTNUM, _COL_EXTRA_WORDS, _COL_PAREN_PATTERN):
        name = pattern.sub("", name)

    # Finally collapse whitespace runs and strip both ends.
    return _COL_MULTI_SPACE.sub(" ", name).strip()
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
def _dedupe_columns(cols: list[str]) -> list[str]:
    """Make column names unique by appending _2, _3 ... to repeats.

    Example: ["전년대비", "전년대비"] -> ["전년대비", "전년대비_2"]

    The first occurrence of a name is kept as is. A generated name is never
    allowed to collide with a name already emitted: for ["a", "a", "a_2"]
    the result is ["a", "a_2", "a_2_2"], not two "a_2" entries. Falsy
    entries (None, "") are treated as "".

    Args:
        cols: Column names, possibly containing duplicates.

    Returns:
        A new list of the same length whose entries are pairwise distinct.
    """
    seen: Counter[str] = Counter()
    used: set[str] = set()
    out: list[str] = []
    for c in cols:
        base = c or ""
        seen[base] += 1
        name = base if seen[base] == 1 else f"{base}_{seen[base]}"
        # Keep bumping the suffix while the candidate is already taken,
        # e.g. by a literal "a_2" column in the input.
        while name in used:
            seen[base] += 1
            name = f"{base}_{seen[base]}"
        used.add(name)
        out.append(name)
    return out
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
# -----------------------------
|
|
108
|
-
# 3) DataFrame 전체 정규화 + records 변환
|
|
109
|
-
# -----------------------------
|
|
110
|
-
def normalize_c1034_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize a whole C103/C104 table.

    - normalizes every column name
    - separates column names that became duplicates (_2/_3 suffixes)
    - normalizes the values of the '항목' column, when present
    - replaces NaN with None

    A None or empty frame is returned unchanged; otherwise the work is done
    on a copy and the input frame is left untouched.
    """
    if df is None or df.empty:
        return df

    result = df.copy()

    # Column names: normalize each one, then make the set unique again.
    raw_names = result.columns.astype(str).tolist()
    result.columns = _dedupe_columns([normalize_c1034_col(name) for name in raw_names])

    # Item labels live in the '항목' column.
    if "항목" in result.columns:
        result["항목"] = result["항목"].map(normalize_c1034_item)

    # NaN -> None
    return result.replace({np.nan: None})
|
|
133
|
-
|
|
134
|
-
|