scraper2-hj3415 1.0.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. scraper2/.DS_Store +0 -0
  2. scraper2/adapters/out/.DS_Store +0 -0
  3. scraper2/adapters/out/playwright/browser.py +103 -0
  4. scraper2/adapters/out/playwright/browser_factory.py +112 -0
  5. scraper2/adapters/out/playwright/session.py +121 -0
  6. scraper2/adapters/out/sinks/.DS_Store +0 -0
  7. scraper2/adapters/out/sinks/memory/__init__.py +15 -0
  8. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +20 -0
  9. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +20 -0
  10. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +20 -0
  11. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +20 -0
  12. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +20 -0
  13. scraper2/adapters/out/sinks/memory/store.py +74 -0
  14. scraper2/adapters/out/sinks/mongo/__init__.py +14 -0
  15. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +43 -0
  16. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +41 -0
  17. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +41 -0
  18. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +41 -0
  19. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +41 -0
  20. scraper2/app/composition.py +195 -0
  21. scraper2/app/parsing/_converters.py +85 -0
  22. scraper2/app/parsing/_normalize.py +134 -0
  23. scraper2/app/parsing/c101_parser.py +143 -0
  24. scraper2/app/parsing/c103_parser.py +128 -0
  25. scraper2/app/parsing/c104_parser.py +143 -0
  26. scraper2/app/parsing/c106_parser.py +153 -0
  27. scraper2/app/parsing/c108_parser.py +65 -0
  28. scraper2/app/ports/browser/browser_factory_port.py +11 -0
  29. scraper2/app/ports/browser/browser_port.py +22 -0
  30. scraper2/app/ports/ingest_port.py +13 -0
  31. scraper2/app/ports/sinks/base_sink_port.py +14 -0
  32. scraper2/app/ports/sinks/c101_sink_port.py +9 -0
  33. scraper2/app/ports/sinks/c103_sink_port.py +9 -0
  34. scraper2/app/ports/sinks/c104_sink_port.py +9 -0
  35. scraper2/app/ports/sinks/c106_sink_port.py +9 -0
  36. scraper2/app/ports/sinks/c108_sink_port.py +9 -0
  37. scraper2/app/usecases/fetch/fetch_c101.py +43 -0
  38. scraper2/app/usecases/fetch/fetch_c103.py +103 -0
  39. scraper2/app/usecases/fetch/fetch_c104.py +76 -0
  40. scraper2/app/usecases/fetch/fetch_c106.py +90 -0
  41. scraper2/app/usecases/fetch/fetch_c108.py +49 -0
  42. scraper2/app/usecases/ingest/ingest_c101.py +36 -0
  43. scraper2/app/usecases/ingest/ingest_c103.py +37 -0
  44. scraper2/app/usecases/ingest/ingest_c104.py +37 -0
  45. scraper2/app/usecases/ingest/ingest_c106.py +38 -0
  46. scraper2/app/usecases/ingest/ingest_c108.py +39 -0
  47. scraper2/main.py +257 -0
  48. scraper2_hj3415-2.0.0.dist-info/METADATA +164 -0
  49. scraper2_hj3415-2.0.0.dist-info/RECORD +63 -0
  50. scraper2_hj3415-2.0.0.dist-info/entry_points.txt +3 -0
  51. scraper2_hj3415/__main__.py +0 -6
  52. scraper2_hj3415/adapters/_shared/utils.py +0 -29
  53. scraper2_hj3415/adapters/clients/browser.py +0 -124
  54. scraper2_hj3415/adapters/clients/http.py +0 -51
  55. scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py +0 -55
  56. scraper2_hj3415/adapters/nfs/pipelines/normalize_c1034.py +0 -109
  57. scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py +0 -51
  58. scraper2_hj3415/adapters/nfs/sinks/df_to_dto_mappers.py +0 -106
  59. scraper2_hj3415/adapters/nfs/sources/bundle_source.py +0 -24
  60. scraper2_hj3415/adapters/nfs/sources/c1034_fetch.py +0 -117
  61. scraper2_hj3415/adapters/nfs/sources/c1034_session.py +0 -90
  62. scraper2_hj3415/core/constants.py +0 -47
  63. scraper2_hj3415/core/ports/sink_port.py +0 -16
  64. scraper2_hj3415/core/ports/source_port.py +0 -13
  65. scraper2_hj3415/core/types.py +0 -11
  66. scraper2_hj3415/core/usecases/c1034_ingest.py +0 -139
  67. scraper2_hj3415/di.py +0 -103
  68. scraper2_hj3415/entrypoints/cli.py +0 -226
  69. scraper2_hj3415/entrypoints/main.py +0 -20
  70. scraper2_hj3415-1.0.1.dist-info/METADATA +0 -66
  71. scraper2_hj3415-1.0.1.dist-info/RECORD +0 -35
  72. scraper2_hj3415-1.0.1.dist-info/entry_points.txt +0 -3
  73. {scraper2_hj3415 → scraper2}/__init__.py +0 -0
  74. {scraper2_hj3415/adapters → scraper2/adapters/out}/__init__.py +0 -0
  75. {scraper2_hj3415/adapters/_shared → scraper2/adapters/out/playwright}/__init__.py +0 -0
  76. {scraper2_hj3415/adapters/clients → scraper2/app}/__init__.py +0 -0
  77. {scraper2_hj3415/adapters/nfs/pipelines → scraper2/app/parsing}/__init__.py +0 -0
  78. {scraper2_hj3415/adapters/nfs/sinks → scraper2/app/ports}/__init__.py +0 -0
  79. {scraper2_hj3415/adapters/nfs/sources → scraper2/app/ports/browser}/__init__.py +0 -0
  80. {scraper2_hj3415/core → scraper2/app/ports/sinks}/__init__.py +0 -0
  81. {scraper2_hj3415/core/ports → scraper2/app/usecases}/__init__.py +0 -0
  82. {scraper2_hj3415/core/usecases → scraper2/app/usecases/fetch}/__init__.py +0 -0
  83. {scraper2_hj3415/entrypoints → scraper2/app/usecases/ingest}/__init__.py +0 -0
  84. {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.0.0.dist-info}/WHEEL +0 -0
  85. {scraper2_hj3415-1.0.1.dist-info → scraper2_hj3415-2.0.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,128 @@
+ # scraper2/app/parsing/c103_parser.py
+ from __future__ import annotations
+
+ import asyncio
+ from io import StringIO
+ from typing import Any
+
+ import pandas as pd
+
+ from scraper2.app.ports.browser.browser_port import BrowserPort
+ from scraper2.app.parsing._normalize import normalize_c1034_df
+
+ # ---- constants ----
+
+ TABLE_XPATH = "xpath=//div[@id='wrapper']//div//table"  # keeps the intent of the original selector
+ TABLE_INDEX = 2  # the nth(2) from the original code
+
+
+ BTN_SETS: dict[str, list[tuple[str, str]]] = {
+     "손익계산서y": [
+         ("손익계산서", 'xpath=//*[@id="rpt_tab1"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "재무상태표y": [
+         ("재무상태표", 'xpath=//*[@id="rpt_tab2"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "현금흐름표y": [
+         ("현금흐름표", 'xpath=//*[@id="rpt_tab3"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "손익계산서q": [
+         ("손익계산서", 'xpath=//*[@id="rpt_tab1"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "재무상태표q": [
+         ("재무상태표", 'xpath=//*[@id="rpt_tab2"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "현금흐름표q": [
+         ("현금흐름표", 'xpath=//*[@id="rpt_tab3"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+ }
+
+
+ # ---- small helpers ----
+
+ async def _click_steps(
+     browser: BrowserPort,
+     steps: list[tuple[str, str]],
+     *,
+     jitter_sec: float = 0.6,
+ ) -> None:
+     """
+     Click only the tab/radio/search buttons on the current page (no goto).
+     """
+     for _name, selector in steps:
+         await browser.wait(selector)
+         await browser.click(selector)
+         # small delay to ease the load on the server/client
+         await asyncio.sleep(0.2 + (jitter_sec * 0.5))
+
+
+ def _html_table_to_df(html: str) -> pd.DataFrame:
+     """
+     table outerHTML -> DataFrame
+     """
+     dfs = pd.read_html(StringIO(html), header=0)
+     if not dfs:
+         raise ValueError("pd.read_html() 테이블 파싱 실패")
+     return dfs[0]
+
+
+ async def _nth_table_outer_html(browser: BrowserPort, nth: int) -> str:
+     return await browser.outer_html_nth(TABLE_XPATH, nth)
+
+ def df_to_c103_records(df: pd.DataFrame) -> list[dict[str, Any]]:
+     """
+     C103 table DataFrame -> normalized records (list[dict])
+     - rows with an empty '항목' are dropped
+     """
+     df = normalize_c1034_df(df)
+     if df is None or df.empty:
+         return []
+
+     records: list[dict[str, Any]] = []
+     for r in df.to_dict(orient="records"):
+         item = r.get("항목")
+         if not item:
+             continue
+         records.append(r)
+     return records
+
+ # ---- public parser ----
+
+ async def parse_c103_to_dict(browser: BrowserPort) -> dict[str, list[dict[str, Any]]]:
+     """
+     C103 parser: returns only a dict
+     {
+         "손익계산서y": [ {...}, ... ],
+         "손익계산서q": [ {...}, ... ],
+         ...
+     }
+     """
+     out: dict[str, list[dict[str, Any]]] = {}
+
+     for key, steps in BTN_SETS.items():
+         # click → wait for table → nth table outerHTML → df → records
+         await _click_steps(browser, steps)
+         await browser.wait(TABLE_XPATH)
+
+         try:
+             html = await _nth_table_outer_html(browser, TABLE_INDEX)
+             df = _html_table_to_df(html)
+             out[key] = df_to_c103_records(df)
+         except Exception:
+             out[key] = []  # failures become an empty list
+
+     return out
+
+
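A minimal driver sketch for the parser above, assuming a BrowserFactoryPort and a caller-supplied page URL; the parser itself never calls goto(), so navigation is the caller's job (in this release that presumably sits with the fetch_c103 use case). This is illustration only, not code from the wheel:

    # Illustrative driver for parse_c103_to_dict (not part of this package).
    from scraper2.app.parsing.c103_parser import parse_c103_to_dict
    from scraper2.app.ports.browser.browser_factory_port import BrowserFactoryPort


    async def fetch_c103_tables(factory: BrowserFactoryPort, c103_url: str) -> dict[str, list[dict]]:
        async with factory.lease() as browser:
            await browser.goto(c103_url, timeout_ms=10_000)  # page holding the c103 tables
            return await parse_c103_to_dict(browser)         # {"손익계산서y": [...], ...}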
@@ -0,0 +1,143 @@
+ # scraper2/app/parsing/c104_parser.py
+ from __future__ import annotations
+
+ import asyncio
+ from io import StringIO
+ from typing import Any
+
+ import pandas as pd
+
+ from scraper2.app.ports.browser.browser_port import BrowserPort
+ from scraper2.app.parsing._normalize import normalize_c1034_df
+
+
+ # ---- constants ----
+
+ TABLE_SELECTOR = 'xpath=//table[@class="gHead01 all-width data-list"]'
+
+ BTN_SETS: dict[str, list[tuple[str, str]]] = {
+     "수익성y": [
+         ("수익성", 'xpath=//*[ @id="val_tab1"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "성장성y": [
+         ("성장성", 'xpath=//*[ @id="val_tab2"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "안정성y": [
+         ("안정성", 'xpath=//*[ @id="val_tab3"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "활동성y": [
+         ("활동성", 'xpath=//*[ @id="val_tab4"]'),
+         ("연간", 'xpath=//*[@id="frqTyp0"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "가치분석y": [
+         ("가치분석연간", 'xpath=//*[@id="frqTyp0_2"]'),
+         ("가치분석검색", 'xpath=//*[@id="hfinGubun2"]'),
+     ],
+     "수익성q": [
+         ("수익성", 'xpath=//*[ @id="val_tab1"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "성장성q": [
+         ("성장성", 'xpath=//*[ @id="val_tab2"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "안정성q": [
+         ("안정성", 'xpath=//*[ @id="val_tab3"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "활동성q": [
+         ("활동성", 'xpath=//*[ @id="val_tab4"]'),
+         ("분기", 'xpath=//*[@id="frqTyp1"]'),
+         ("검색", 'xpath=//*[@id="hfinGubun"]'),
+     ],
+     "가치분석q": [
+         ("가치분석분기", 'xpath=//*[@id="frqTyp1_2"]'),
+         ("가치분석검색", 'xpath=//*[@id="hfinGubun2"]'),
+     ],
+ }
+
+
+ # ---- small helpers ----
+
+ async def _click_steps(
+     browser: BrowserPort,
+     steps: list[tuple[str, str]],
+     *,
+     jitter_sec: float = 0.6,
+ ) -> None:
+     """
+     Click only the tab/radio/search buttons on the current page (no goto).
+     """
+     for _name, selector in steps:
+         await browser.wait(selector)
+         await browser.click(selector)
+         await asyncio.sleep(0.2 + (jitter_sec * 0.5))
+
+
+ def _html_table_to_df(html: str) -> pd.DataFrame:
+     dfs = pd.read_html(StringIO(html), header=0)
+     if not dfs:
+         raise ValueError("pd.read_html() 테이블 파싱 실패")
+     return dfs[0]
+
+
+ async def _table_outer_html(browser: BrowserPort, *, is_value_analysis: bool) -> str:
+     """
+     Keeps the original c104 logic:
+     - 가치분석* keys: second table (nth=1)
+     - everything else: first table (nth=0)
+     """
+     idx = 1 if is_value_analysis else 0
+     return await browser.outer_html_nth(TABLE_SELECTOR, idx)
+
+
+ def df_to_c104_records(df: pd.DataFrame) -> list[dict[str, Any]]:
+     """
+     C104 table DataFrame -> records (list[dict])
+     - NaN -> None
+     - (add a dedicated normalize_c104_df(df) here if needed)
+     """
+     df = normalize_c1034_df(df)
+     if df is None or df.empty:
+         return []
+
+     df = df.where(pd.notnull(df), None)
+
+     return df.to_dict(orient="records")
+
+
+ # ---- public parser ----
+
+ async def parse_c104_to_dict(browser: BrowserPort) -> dict[str, list[dict[str, Any]]]:
+     """
+     C104 parser: returns only a dict
+     {
+         "수익성y": [ {...}, ... ],
+         "가치분석q": [ {...}, ... ],
+         ...
+     }
+     """
+     out: dict[str, list[dict[str, Any]]] = {}
+
+     for key, steps in BTN_SETS.items():
+         await _click_steps(browser, steps)
+         await browser.wait(TABLE_SELECTOR)
+
+         try:
+             html = await _table_outer_html(browser, is_value_analysis=key.startswith("가치분석"))
+             df = _html_table_to_df(html)
+             out[key] = df_to_c104_records(df)
+         except Exception:
+             out[key] = []
+
+     return out
@@ -0,0 +1,153 @@
+ # scraper2/app/parsing/c106_parser.py
+ from __future__ import annotations
+
+ from io import StringIO
+ from typing import Dict
+
+ import numpy as np
+ import pandas as pd
+
+ from scraper2.app.ports.browser.browser_port import BrowserPort
+
+
+ # ---------- pure helpers (sync) ----------
+
+ def df_to_metrics(df: pd.DataFrame) -> dict[str, dict[str, float | None]]:
+     """
+     DataFrame ->
+     { "<항목2>": { "<company name>": float|None, ... }, ... }
+     """
+     out: dict[str, dict[str, float | None]] = {}
+     if df is None or df.empty:
+         return out
+
+     rows = df.replace({np.nan: None}).to_dict(orient="records")
+     for r in rows:
+         metric = r.get("항목2")
+         if not metric:
+             continue
+
+         bucket = out.setdefault(str(metric).strip(), {})
+         for k, v in r.items():
+             if k in ("항목", "항목2"):
+                 continue
+             bucket[str(k)] = None if v is None else float(v)
+
+     return out
+
+
+ def df_from_comparison_html(html: str, company_names: list[str]) -> pd.DataFrame:
+     """
+     #cTB611 table HTML -> cleaned DataFrame
+     """
+     df = pd.read_html(StringIO(html), header=None)[0]
+
+     # first column is '항목', second is '항목2', the rest are company names
+     df.columns = ["항목", "항목2"] + company_names
+
+     # forward-fill NaN in the top-level category ('항목') column
+     df["항목"] = df["항목"].ffill()
+
+     # insert '주가데이터' into the first two rows only (keeps the existing logic)
+     for i in range(min(2, len(df))):
+         row = df.loc[i].tolist()
+         new_row = ["주가데이터"] + row
+         df.loc[i] = new_row[: len(df.columns)]
+
+     # drop rows without '항목2'
+     df = df[df["항목2"].notna()].reset_index(drop=True)
+
+     # manually assign the category for specific metrics
+     df.loc[df["항목2"].isin(["투자의견", "목표주가(원)"]), "항목"] = "기타지표"
+
+     # drop rows that are not needed
+     df = df[df["항목2"] != "재무연월"].reset_index(drop=True)
+
+     # numeric cleanup (company columns only)
+     for col in df.columns[2:]:
+         df[col] = df[col].replace("-", "0")
+         df[col] = pd.to_numeric(df[col], errors="coerce")
+
+     # string cleanup
+     df["항목"] = df["항목"].astype(str).str.replace("펼치기", "").str.strip()
+     df["항목2"] = df["항목2"].astype(str).str.strip()
+
+     return df
+
+
+ # ---------- parsers (async, BrowserPort only) ----------
+
+ async def parse_c106_header(browser: BrowserPort) -> list[str]:
+     """
+     Extract only the '기업간비교자료' header (the company names) from the current page.
+     (no goto/sleep)
+     """
+     await browser.wait("#cTB611_h")
+
+     selector = (
+         'xpath=//caption[contains(text(), "기업간비교자료")]/following-sibling::thead//th[not(@colspan)]'
+     )
+     th_texts = await browser.all_texts(selector)
+
+     names: list[str] = []
+     for t in th_texts:
+         name = t.strip().split("\n")[0]
+         if name:
+             names.append(name)
+     return names
+
+
+ async def parse_c106_table_to_metrics(
+     browser: BrowserPort,
+     company_names: list[str],
+ ) -> dict[str, dict[str, float | None]]:
+     """
+     Read the #cTB611 table on the current page and convert it into a metrics dict.
+     """
+     await browser.wait("#cTB611")
+     html = await browser.outer_html("#cTB611")
+
+     df = df_from_comparison_html(html, company_names)
+     if not isinstance(df, pd.DataFrame):
+         raise TypeError(f"c106 df invalid: type={type(df)}")
+     return df_to_metrics(df)
+
+
+ METRIC_NAME_MAP = {
+     "전일종가(원)": "전일종가",
+     "시가총액(억원)": "시가총액",
+
+     "자산총계(억원)": "자산총계",
+     "부채총계(억원)": "부채총계",
+
+     "매출액(억원)": "매출액",
+     "영업이익(억원)": "영업이익",
+     "당기순이익(억원)": "당기순이익",
+     "당기순이익(지배)(억원)": "당기순이익_지배",
+
+     "영업이익률(%)": "영업이익률",
+     "순이익률(%)": "순이익률",
+     "ROE(%)": "ROE",
+     "부채비율(%)": "부채비율",
+
+     "PER": "PER",
+     "PBR": "PBR",
+
+     "투자의견": "투자의견",
+     "목표주가(원)": "목표주가",
+ }
+
+ def normalize_c106_metrics(
+     raw: Dict[str, Dict[str, float | None]],
+ ) -> Dict[str, Dict[str, float | None]]:
+     out: Dict[str, Dict[str, float | None]] = {}
+
+     for raw_name, company_map in raw.items():
+         field = METRIC_NAME_MAP.get(raw_name)
+         if not field:
+             # unknown metrics are dropped (or only logged)
+             continue
+
+         out[field] = company_map
+
+     return out
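A sketch of how the three c106 helpers above might compose on a page that is already loaded; in this release that wiring presumably lives in the fetch_c106 use case, so treat this as illustration only:

    # Illustrative composition of the c106 helpers (not the packaged fetch_c106).
    from scraper2.app.parsing.c106_parser import (
        normalize_c106_metrics,
        parse_c106_header,
        parse_c106_table_to_metrics,
    )
    from scraper2.app.ports.browser.browser_port import BrowserPort


    async def read_c106_metrics(browser: BrowserPort) -> dict[str, dict[str, float | None]]:
        companies = await parse_c106_header(browser)                  # header cells = company names
        raw = await parse_c106_table_to_metrics(browser, companies)   # {"ROE(%)": {...}, ...}
        return normalize_c106_metrics(raw)                            # keys renamed via METRIC_NAME_MAP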
@@ -0,0 +1,65 @@
+ # scraper2/app/parsing/c108_parser.py
+ from __future__ import annotations
+
+ from typing import Any
+
+ import pandas as pd
+ from scraper2.app.ports.browser.browser_port import BrowserPort
+
+
+ def extract_bullets(text: str | None) -> list[str]:
+     if not text:
+         return []
+     return [
+         line.replace("▶", "").strip()
+         for line in text.splitlines()
+         if line.strip().startswith("▶")
+     ]
+
+
+ async def parse_c108_to_dicts(browser: BrowserPort) -> list[dict[str, Any]]:
+     title = await browser.title()
+     url = await browser.current_url()
+
+     if "접속장애" in title:
+         print(
+             "[C108][ACCESS_ERROR] "
+             f"title={title!r} url={url}"
+         )
+         return []
+
+     # table → records (handled by the adapter)
+     try:
+         records = await browser.table_records("#tableCmpDetail", header=0)
+     except Exception as e:
+         print("table_records failed:", type(e).__name__, e)
+         return []
+
+     # date conversion (handled in the parser)
+     # the key may come in as '일자' instead of '날짜', so unify it
+     for r in records:
+         if "일자" in r and "날짜" not in r:
+             r["날짜"] = r.pop("일자")
+
+         # "%y/%m/%d" -> "%Y.%m.%d"
+         if r.get("날짜"):
+             dt = pd.to_datetime(r["날짜"], format="%y/%m/%d", errors="coerce")
+             r["날짜"] = None if pd.isna(dt) else dt.strftime("%Y.%m.%d")
+
+     # extract the report contents
+     contents: list[list[str]] = []
+     row_count = len(records)
+
+     for i in range(row_count):
+         try:
+             await browser.click(f"#a{i}")
+             await browser.wait(f"#c{i} > div > div.comment-body")
+             text = await browser.inner_text(f"#c{i} > div > div.comment-body")
+             contents.append(extract_bullets(text))
+         except Exception:
+             contents.append([])
+
+     # inject the extracted contents into the records
+     for r, content in zip(records, contents, strict=False):
+         r["내용"] = content
+     return records
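A small worked example for extract_bullets, with an illustrative comment body:

    # Only lines starting with '▶' survive, with the marker stripped.
    from scraper2.app.parsing.c108_parser import extract_bullets

    sample = "▶ 실적 개선 전망\n기타 문단\n▶ 목표주가 상향"
    assert extract_bullets(sample) == ["실적 개선 전망", "목표주가 상향"]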
@@ -0,0 +1,11 @@
+ # scraper2/app/ports/browser/browser_factory_port.py
+ from __future__ import annotations
+ from typing import AsyncIterator, Protocol
+ from contextlib import asynccontextmanager
+
+ from scraper2.app.ports.browser.browser_port import BrowserPort
+
+ class BrowserFactoryPort(Protocol):
+     @asynccontextmanager
+     async def lease(self) -> AsyncIterator[BrowserPort]: ...
+     async def aclose(self) -> None: ...
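Since lease() is declared as an async context manager, a conforming factory can be as small as the hedged sketch below; the factory actually shipped in this release lives in scraper2/adapters/out/playwright/browser_factory.py and is not reproduced here:

    # Hypothetical factory satisfying BrowserFactoryPort (illustration only).
    from contextlib import asynccontextmanager
    from typing import AsyncIterator

    from scraper2.app.ports.browser.browser_port import BrowserPort


    class SingleBrowserFactory:
        def __init__(self, browser: BrowserPort) -> None:
            self._browser = browser

        @asynccontextmanager
        async def lease(self) -> AsyncIterator[BrowserPort]:
            yield self._browser   # a real factory might create or pool pages here

        async def aclose(self) -> None:
            pass                  # nothing to release in this sketch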
@@ -0,0 +1,22 @@
+ # scraper2/app/ports/browser/browser_port.py
+ from __future__ import annotations
+ from typing import Protocol, Any
+
+ class BrowserPort(Protocol):
+     async def goto(self, url: str, timeout_ms: int = 10_000) -> None: ...
+     async def title(self) -> str: ...
+     async def current_url(self) -> str: ...
+     async def text(self, selector: str) -> str: ...
+     async def texts(self, selector: str) -> list[str]: ...
+     async def text_first_by_text(self, needle: str) -> str: ...
+     async def inner_text(self, selector: str) -> str: ...
+     async def click(self, selector: str) -> None: ...
+     async def wait(self, selector: str, timeout_ms: int = 10_000) -> None: ...
+     # key addition: table → records
+     async def table_records(self, table_selector: str, *, header: int | list[int] = 0) -> list[dict[str, Any]]: ...
+     # used by c106
+     async def outer_html(self, selector: str) -> str: ...
+     async def all_texts(self, selector: str) -> list[str]: ...
+
+     # added for c103/c104: return the outerHTML of the nth (0-based) element matched by the selector
+     async def outer_html_nth(self, selector: str, index: int) -> str: ...
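Because BrowserPort is a structural Protocol, a test double needs no inheritance; the c103/c104 parsers only call wait(), click() and outer_html_nth(), so a duck-typed stub covering just those coroutines is enough at runtime. A hypothetical test helper (not shipped in this wheel):

    # Hypothetical stub for exercising the c103/c104 parsers against canned HTML.
    class StubBrowser:
        def __init__(self, table_html: str) -> None:
            self._table_html = table_html

        async def wait(self, selector: str, timeout_ms: int = 10_000) -> None:
            pass                            # pretend the element is already present

        async def click(self, selector: str) -> None:
            pass                            # no-op: the canned HTML never changes

        async def outer_html_nth(self, selector: str, index: int) -> str:
            return self._table_html         # same table for every (selector, index)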
@@ -0,0 +1,13 @@
+ # scraper2/app/ports/ingest_port.py
+ from typing import Protocol, Iterable
+ from datetime import datetime
+
+ class IngestPort(Protocol):
+     async def execute_many(
+         self,
+         codes: list[str],
+         *,
+         sleep_sec: float = ...,
+         asof: datetime | None = None,
+     ) -> list[object]:
+         ...
@@ -0,0 +1,14 @@
+ # scraper2/app/ports/sinks/base_sink_port.py
+ from __future__ import annotations
+
+ from datetime import datetime
+ from typing import Protocol, TypeVar, Iterable, Optional
+
+ T = TypeVar("T")
+
+ class SinkPort(Protocol[T]):
+     async def write(self, dto: T, *, asof: Optional[datetime] = None) -> None:
+         ...
+
+     async def write_many(self, dtos: Iterable[T], *, asof: Optional[datetime] = None) -> None:
+         ...
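For reference, a generic in-memory sink satisfying this port could be as small as the sketch below; this is a simplified stand-in, not the memory adapters added under scraper2/adapters/out/sinks/memory/ in this release:

    # Simplified in-memory SinkPort[T] implementation (illustration only).
    from datetime import datetime
    from typing import Generic, Iterable, Optional, TypeVar

    T = TypeVar("T")


    class MemorySink(Generic[T]):
        def __init__(self) -> None:
            self.items: list[tuple[T, Optional[datetime]]] = []

        async def write(self, dto: T, *, asof: Optional[datetime] = None) -> None:
            self.items.append((dto, asof))

        async def write_many(self, dtos: Iterable[T], *, asof: Optional[datetime] = None) -> None:
            for dto in dtos:
                await self.write(dto, asof=asof)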
@@ -0,0 +1,9 @@
+ # scraper2/app/ports/sinks/c101_sink_port.py
+ from __future__ import annotations
+ from typing import Protocol
+ from scraper2.app.ports.sinks.base_sink_port import SinkPort
+ from contracts.nfs.c101 import C101DTO
+
+ class C101SinkPort(SinkPort[C101DTO], Protocol):
+     """Sink that stores C101DTO documents."""
+     ...
@@ -0,0 +1,9 @@
+ # scraper2/app/ports/sinks/c103_sink_port.py
+ from __future__ import annotations
+ from typing import Protocol
+ from scraper2.app.ports.sinks.base_sink_port import SinkPort
+ from contracts.nfs.c103 import C103DTO
+
+ class C103SinkPort(SinkPort[C103DTO], Protocol):
+     """Sink that stores C103DTO documents."""
+     ...
@@ -0,0 +1,9 @@
+ # scraper2/app/ports/sinks/c104_sink_port.py
+ from __future__ import annotations
+ from typing import Protocol
+ from scraper2.app.ports.sinks.base_sink_port import SinkPort
+ from contracts.nfs.c104 import C104DTO
+
+ class C104SinkPort(SinkPort[C104DTO], Protocol):
+     """Sink that stores C104DTO documents."""
+     ...
@@ -0,0 +1,9 @@
+ # scraper2/app/ports/sinks/c106_sink_port.py
+ from __future__ import annotations
+ from typing import Protocol
+ from scraper2.app.ports.sinks.base_sink_port import SinkPort
+ from contracts.nfs.c106 import C106DTO
+
+ class C106SinkPort(SinkPort[C106DTO], Protocol):
+     """Sink that stores C106DTO documents."""
+     ...
@@ -0,0 +1,9 @@
+ # scraper2/app/ports/sinks/c108_sink_port.py
+ from __future__ import annotations
+ from typing import Protocol
+ from scraper2.app.ports.sinks.base_sink_port import SinkPort
+ from contracts.nfs.c108 import C108DTO
+
+ class C108SinkPort(SinkPort[C108DTO], Protocol):
+     """Sink that stores C108DTO documents."""
+     ...
@@ -0,0 +1,43 @@
+ # scraper2/app/usecases/fetch/fetch_c101.py
+ from __future__ import annotations
+
+ import asyncio
+ import random
+ from typing import Iterable
+ from contracts.nfs.c101 import C101DTO
+ from scraper2.app.ports.browser.browser_factory_port import BrowserFactoryPort
+ from scraper2.app.parsing.c101_parser import parse_c101_to_dict
+
+
+ class FetchC101:
+     def __init__(self, factory: BrowserFactoryPort):
+         self.factory = factory
+
+     async def _fetch_one(self, code: str, *, sleep_sec: float) -> C101DTO | None:
+         async with self.factory.lease() as browser:
+             url = f"https://navercomp.wisereport.co.kr/v2/company/c1010001.aspx?cmp_cd={code}"
+             await browser.goto(url, timeout_ms=10_000)
+
+             jitter_sec = 1.0
+             if sleep_sec > 0:
+                 delay = sleep_sec + random.uniform(0, jitter_sec)
+                 await asyncio.sleep(delay)
+
+             d = await parse_c101_to_dict(browser)
+             return None if not d else C101DTO(**d)
+
+     async def execute(self, code: str, *, sleep_sec: float = 2.0) -> C101DTO | None:
+         return await self._fetch_one(code, sleep_sec=sleep_sec)
+
+     async def execute_many(
+         self,
+         codes: Iterable[str],
+         *,
+         sleep_sec: float = 2.0,
+     ) -> list[C101DTO]:
+         results = await asyncio.gather(
+             *(self._fetch_one(c, sleep_sec=sleep_sec) for c in codes),
+             return_exceptions=False,
+         )
+         return [r for r in results if r is not None]
+
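An illustrative call site for the use case above; constructing the concrete factory is the job of scraper2/app/composition.py and is not shown here:

    # Illustrative only. Note that execute_many() gathers all codes concurrently,
    # each call leasing its own browser from the factory.
    from scraper2.app.ports.browser.browser_factory_port import BrowserFactoryPort
    from scraper2.app.usecases.fetch.fetch_c101 import FetchC101


    async def run(factory: BrowserFactoryPort) -> None:
        fetch = FetchC101(factory)
        dtos = await fetch.execute_many(["005930", "000660"], sleep_sec=2.0)
        print(f"fetched {len(dtos)} C101 documents")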