scraper2_hj3415-2.4.0-py3-none-any.whl → scraper2_hj3415-2.6.0-py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (120)
  1. scraper2_hj3415/app/adapters/out/playwright/browser.py +373 -0
  2. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/browser_factory.py +5 -5
  3. {scraper2 → scraper2_hj3415/app}/adapters/out/playwright/session.py +1 -1
  4. scraper2_hj3415/app/adapters/out/sinks/memory_sink.py +25 -0
  5. scraper2_hj3415/app/adapters/out/sinks/mongo_sink.py +63 -0
  6. {scraper2/adapters/out/sinks/memory → scraper2_hj3415/app/adapters/out/sinks}/store.py +14 -5
  7. scraper2_hj3415/app/adapters/site/wisereport_playwright.py +168 -0
  8. scraper2_hj3415/app/composition.py +225 -0
  9. scraper2_hj3415/app/domain/blocks.py +61 -0
  10. scraper2_hj3415/app/domain/constants.py +33 -0
  11. scraper2_hj3415/app/domain/doc.py +16 -0
  12. scraper2_hj3415/app/domain/endpoint.py +11 -0
  13. scraper2_hj3415/app/domain/series.py +11 -0
  14. scraper2_hj3415/app/domain/types.py +19 -0
  15. scraper2_hj3415/app/parsing/_normalize/label.py +92 -0
  16. scraper2_hj3415/app/parsing/_normalize/table.py +53 -0
  17. scraper2_hj3415/app/parsing/_normalize/text.py +31 -0
  18. scraper2_hj3415/app/parsing/_normalize/values.py +70 -0
  19. scraper2_hj3415/app/parsing/_tables/html_table.py +88 -0
  20. scraper2_hj3415/app/parsing/c101/__init__.py +0 -0
  21. scraper2_hj3415/app/parsing/c101/_sise_normalizer.py +103 -0
  22. scraper2_hj3415/app/parsing/c101/company_overview.py +47 -0
  23. scraper2_hj3415/app/parsing/c101/earning_surprise.py +217 -0
  24. scraper2_hj3415/app/parsing/c101/fundamentals.py +95 -0
  25. scraper2_hj3415/app/parsing/c101/major_shareholders.py +57 -0
  26. scraper2_hj3415/app/parsing/c101/sise.py +47 -0
  27. scraper2_hj3415/app/parsing/c101/summary_cmp.py +87 -0
  28. scraper2_hj3415/app/parsing/c101/yearly_consensus.py +197 -0
  29. scraper2_hj3415/app/parsing/c101_parser.py +45 -0
  30. scraper2_hj3415/app/parsing/c103_parser.py +19 -0
  31. scraper2_hj3415/app/parsing/c104_parser.py +23 -0
  32. scraper2_hj3415/app/parsing/c106_parser.py +137 -0
  33. scraper2_hj3415/app/parsing/c108_parser.py +254 -0
  34. scraper2_hj3415/app/ports/__init__.py +0 -0
  35. scraper2_hj3415/app/ports/browser/__init__.py +0 -0
  36. scraper2_hj3415/app/ports/browser/browser_factory_port.py +9 -0
  37. scraper2_hj3415/app/ports/browser/browser_port.py +115 -0
  38. scraper2_hj3415/app/ports/ingest/__init__.py +0 -0
  39. scraper2_hj3415/app/ports/ingest/nfs_ingest_port.py +28 -0
  40. scraper2_hj3415/app/ports/sinks/__init__.py +0 -0
  41. scraper2_hj3415/app/ports/sinks/nfs_sink_port.py +20 -0
  42. scraper2_hj3415/app/ports/site/__init__.py +0 -0
  43. scraper2_hj3415/app/ports/site/wisereport_port.py +20 -0
  44. scraper2_hj3415/app/services/__init__.py +0 -0
  45. scraper2_hj3415/app/services/fetch/__init__.py +0 -0
  46. scraper2_hj3415/app/services/fetch/fetch_c101.py +59 -0
  47. scraper2_hj3415/app/services/fetch/fetch_c103.py +135 -0
  48. scraper2_hj3415/app/services/fetch/fetch_c104.py +183 -0
  49. scraper2_hj3415/app/services/fetch/fetch_c106.py +90 -0
  50. scraper2_hj3415/app/services/fetch/fetch_c108.py +59 -0
  51. scraper2_hj3415/app/services/nfs_doc_builders.py +290 -0
  52. scraper2_hj3415/app/usecases/__init__.py +0 -0
  53. scraper2_hj3415/app/usecases/ingest/__init__.py +0 -0
  54. scraper2_hj3415/app/usecases/ingest/ingest_c101.py +111 -0
  55. scraper2_hj3415/app/usecases/ingest/ingest_c103.py +162 -0
  56. scraper2_hj3415/app/usecases/ingest/ingest_c104.py +182 -0
  57. scraper2_hj3415/app/usecases/ingest/ingest_c106.py +136 -0
  58. scraper2_hj3415/app/usecases/ingest/ingest_c108.py +122 -0
  59. scraper2/main.py → scraper2_hj3415/cli.py +40 -80
  60. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/METADATA +3 -1
  61. scraper2_hj3415-2.6.0.dist-info/RECORD +75 -0
  62. scraper2_hj3415-2.6.0.dist-info/entry_points.txt +3 -0
  63. scraper2/.DS_Store +0 -0
  64. scraper2/adapters/out/.DS_Store +0 -0
  65. scraper2/adapters/out/playwright/browser.py +0 -102
  66. scraper2/adapters/out/sinks/.DS_Store +0 -0
  67. scraper2/adapters/out/sinks/memory/__init__.py +0 -15
  68. scraper2/adapters/out/sinks/memory/c101_memory_sink.py +0 -26
  69. scraper2/adapters/out/sinks/memory/c103_memory_sink.py +0 -26
  70. scraper2/adapters/out/sinks/memory/c104_memory_sink.py +0 -26
  71. scraper2/adapters/out/sinks/memory/c106_memory_sink.py +0 -26
  72. scraper2/adapters/out/sinks/memory/c108_memory_sink.py +0 -26
  73. scraper2/adapters/out/sinks/mongo/__init__.py +0 -14
  74. scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +0 -43
  75. scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +0 -41
  76. scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +0 -41
  77. scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +0 -41
  78. scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +0 -41
  79. scraper2/app/composition.py +0 -204
  80. scraper2/app/parsing/_converters.py +0 -85
  81. scraper2/app/parsing/_normalize.py +0 -134
  82. scraper2/app/parsing/c101_parser.py +0 -143
  83. scraper2/app/parsing/c103_parser.py +0 -128
  84. scraper2/app/parsing/c104_parser.py +0 -143
  85. scraper2/app/parsing/c106_parser.py +0 -153
  86. scraper2/app/parsing/c108_parser.py +0 -65
  87. scraper2/app/ports/browser/browser_factory_port.py +0 -11
  88. scraper2/app/ports/browser/browser_port.py +0 -22
  89. scraper2/app/ports/ingest_port.py +0 -14
  90. scraper2/app/ports/sinks/base_sink_port.py +0 -14
  91. scraper2/app/ports/sinks/c101_sink_port.py +0 -9
  92. scraper2/app/ports/sinks/c103_sink_port.py +0 -9
  93. scraper2/app/ports/sinks/c104_sink_port.py +0 -9
  94. scraper2/app/ports/sinks/c106_sink_port.py +0 -9
  95. scraper2/app/ports/sinks/c108_sink_port.py +0 -9
  96. scraper2/app/usecases/fetch/fetch_c101.py +0 -43
  97. scraper2/app/usecases/fetch/fetch_c103.py +0 -103
  98. scraper2/app/usecases/fetch/fetch_c104.py +0 -76
  99. scraper2/app/usecases/fetch/fetch_c106.py +0 -90
  100. scraper2/app/usecases/fetch/fetch_c108.py +0 -49
  101. scraper2/app/usecases/ingest/ingest_c101.py +0 -36
  102. scraper2/app/usecases/ingest/ingest_c103.py +0 -37
  103. scraper2/app/usecases/ingest/ingest_c104.py +0 -37
  104. scraper2/app/usecases/ingest/ingest_c106.py +0 -38
  105. scraper2/app/usecases/ingest/ingest_c108.py +0 -39
  106. scraper2_hj3415-2.4.0.dist-info/RECORD +0 -63
  107. scraper2_hj3415-2.4.0.dist-info/entry_points.txt +0 -3
  108. {scraper2 → scraper2_hj3415}/__init__.py +0 -0
  109. {scraper2/adapters/out → scraper2_hj3415/app}/__init__.py +0 -0
  110. {scraper2/adapters/out/playwright → scraper2_hj3415/app/adapters}/__init__.py +0 -0
  111. {scraper2/app → scraper2_hj3415/app/adapters/out}/__init__.py +0 -0
  112. {scraper2/app/parsing → scraper2_hj3415/app/adapters/out/playwright}/__init__.py +0 -0
  113. {scraper2/app/ports → scraper2_hj3415/app/adapters/out/sinks}/__init__.py +0 -0
  114. {scraper2/app/ports/browser → scraper2_hj3415/app/adapters/site}/__init__.py +0 -0
  115. {scraper2/app/ports/sinks → scraper2_hj3415/app/domain}/__init__.py +0 -0
  116. {scraper2/app/usecases → scraper2_hj3415/app/parsing}/__init__.py +0 -0
  117. {scraper2/app/usecases/fetch → scraper2_hj3415/app/parsing/_normalize}/__init__.py +0 -0
  118. {scraper2/app/usecases/ingest → scraper2_hj3415/app/parsing/_tables}/__init__.py +0 -0
  119. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/WHEEL +0 -0
  120. {scraper2_hj3415-2.4.0.dist-info → scraper2_hj3415-2.6.0.dist-info}/licenses/LICENSE +0 -0
scraper2/app/parsing/c101_parser.py (deleted)
@@ -1,143 +0,0 @@
- # scraper2/app/parsing/c101_parser.py
- from __future__ import annotations
-
- from scraper2.app.parsing._converters import to_int, to_float, normalize, parse_won
- from scraper2.app.ports.browser.browser_port import BrowserPort
- from typing import Any
-
-
- class C101ParseError(RuntimeError):
-     pass
-
-
- def _after_colon(s: str) -> str:
-     # Handles cases like "업종: XXX"
-     parts = s.split(":")
-     return parts[1].strip() if len(parts) > 1 else s.strip()
-
-
- async def parse_c101_to_dict(browser: BrowserPort) -> dict[str, Any] | None:
-     """
-     - Uses BrowserPort only
-     - Returns a plain dict
-     - Returns None on failure (keeps the existing behavior)
-     """
-
-     # Date parsing: based on a text search
-     raw_date_str = await browser.text_first_by_text("[기준:")
-     if not raw_date_str:
-         return None
-     날짜 = raw_date_str.replace("[기준:", "").replace("]", "").strip()
-
-     # 1) Financial info (1st table)
-     # Here we only need to wait for the tbody to exist
-     await browser.wait("#pArea > div.wrapper-table > div > table > tbody")
-
-     종목명 = normalize(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(1) > td > dl > dt:nth-child(1) > span"
-     ))
-     코드 = normalize(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(1) > td > dl > dt:nth-child(1) > b"
-     ))
-     업종_raw = await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(1) > td > dl > dt:nth-child(4)"
-     )
-     업종 = _after_colon(업종_raw)
-
-     eps = to_int(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(1) > b"
-     ))
-     bps = to_int(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(2) > b"
-     ))
-     per = to_float(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(3) > b"
-     ))
-     업종per = to_float(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(4) > b"
-     ))
-     pbr = to_float(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(5) > b"
-     ))
-     배당수익률 = to_float(await browser.text(
-         "#pArea > div.wrapper-table > div > table > tbody "
-         "tr:nth-child(3) > td > dl > dt:nth-child(6) > b"
-     ))
-
-     # 2) Price info (2nd table)
-     await browser.wait("#cTB11 > tbody")
-
-     주가 = to_int(await browser.text("#cTB11 > tbody tr:nth-child(1) > td > strong"))
-
-     전일대비_raw = await browser.text("#cTB11 > tbody tr:nth-child(1) > td > span:nth-child(2)")
-     전일대비 = to_int(전일대비_raw.replace("원", ""))
-
-     수익률_raw = await browser.text("#cTB11 > tbody tr:nth-child(1) > td > span:nth-child(3)")
-     수익률 = to_float(수익률_raw.replace("%", ""))
-
-     최고최저52 = await browser.text("#cTB11 > tbody tr:nth-child(2) > td")
-     최고52, 최저52 = (to_int(x.strip().replace("원", "")) for x in 최고최저52.split("/"))
-
-     거래량거래대금 = await browser.text("#cTB11 > tbody tr:nth-child(4) > td")
-     거래량_str, 거래대금_str = (x.strip() for x in 거래량거래대금.split("/"))
-     거래량 = to_int(거래량_str.replace("주", ""))
-     거래대금 = parse_won(거래대금_str)
-
-     시가총액 = parse_won(await browser.text("#cTB11 > tbody tr:nth-child(5) > td"))
-     베타52주 = to_float(await browser.text("#cTB11 > tbody tr:nth-child(6) > td"))
-
-     발행주식유동비율 = await browser.text("#cTB11 > tbody tr:nth-child(7) > td")
-     발행주식_str, 유동비율_str = (x.strip() for x in 발행주식유동비율.split("/"))
-     발행주식 = to_int(발행주식_str.replace("주", ""))
-     유동비율 = to_float(유동비율_str.replace("%", ""))
-
-     외국인지분율 = to_float((await browser.text("#cTB11 > tbody tr:nth-child(8) > td")).replace("%", ""))
-
-     수익률1M3M6M1Y = await browser.text("#cTB11 > tbody tr:nth-child(9) > td")
-     수익률1M, 수익률3M, 수익률6M, 수익률1Y = (
-         to_float(x.strip().replace("%", "")) for x in 수익률1M3M6M1Y.split("/")
-     )
-
-     # 3) Company overview
-     # Read every li under the ul and join them
-     await browser.wait("#wrapper > div:nth-child(6) > div.cmp_comment > ul")
-     li_texts = await browser.texts("#wrapper > div:nth-child(6) > div.cmp_comment > ul li")
-     개요 = "".join(t.strip() for t in li_texts if t and t.strip())
-
-     return {
-         "종목명": 종목명,
-         "코드": 코드,
-         "날짜": 날짜,
-         "업종": 업종,
-         "eps": eps,
-         "bps": bps,
-         "per": per,
-         "업종per": 업종per,
-         "pbr": pbr,
-         "배당수익률": 배당수익률,
-         "주가": 주가,
-         "전일대비": 전일대비,
-         "수익률": 수익률,
-         "최고52": 최고52,
-         "최저52": 최저52,
-         "거래량": 거래량,
-         "거래대금": 거래대금,
-         "시가총액": 시가총액,
-         "베타52주": 베타52주,
-         "발행주식": 발행주식,
-         "유동비율": 유동비율,
-         "외국인지분율": 외국인지분율,
-         "수익률1M": 수익률1M,
-         "수익률3M": 수익률3M,
-         "수익률6M": 수익률6M,
-         "수익률1Y": 수익률1Y,
-         "개요": 개요,
-     }
scraper2/app/parsing/c103_parser.py (deleted)
@@ -1,128 +0,0 @@
- # scraper2/app/parsing/c103_parser.py
- from __future__ import annotations
-
- import asyncio
- from io import StringIO
- from typing import Any
-
- import pandas as pd
-
- from scraper2.app.ports.browser.browser_port import BrowserPort
- from scraper2.app.parsing._normalize import normalize_c1034_df
-
- # ---- constants ----
-
- TABLE_XPATH = "xpath=//div[@id='wrapper']//div//table"  # keeps the intent of the original selector
- TABLE_INDEX = 2  # the nth(2) used in the original code
-
-
- BTN_SETS: dict[str, list[tuple[str, str]]] = {
-     "손익계산서y": [
-         ("손익계산서", 'xpath=//*[@id="rpt_tab1"]'),
-         ("연간", 'xpath=//*[@id="frqTyp0"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "재무상태표y": [
-         ("재무상태표", 'xpath=//*[@id="rpt_tab2"]'),
-         ("연간", 'xpath=//*[@id="frqTyp0"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "현금흐름표y": [
-         ("현금흐름표", 'xpath=//*[@id="rpt_tab3"]'),
-         ("연간", 'xpath=//*[@id="frqTyp0"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "손익계산서q": [
-         ("손익계산서", 'xpath=//*[@id="rpt_tab1"]'),
-         ("분기", 'xpath=//*[@id="frqTyp1"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "재무상태표q": [
-         ("재무상태표", 'xpath=//*[@id="rpt_tab2"]'),
-         ("분기", 'xpath=//*[@id="frqTyp1"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "현금흐름표q": [
-         ("현금흐름표", 'xpath=//*[@id="rpt_tab3"]'),
-         ("분기", 'xpath=//*[@id="frqTyp1"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
- }
-
-
- # ---- small helpers ----
-
- async def _click_steps(
-     browser: BrowserPort,
-     steps: list[tuple[str, str]],
-     *,
-     jitter_sec: float = 0.6,
- ) -> None:
-     """
-     (No goto.) Only clicks the tab/radio/search buttons on the current page.
-     """
-     for _name, selector in steps:
-         await browser.wait(selector)
-         await browser.click(selector)
-         # Small jitter to reduce server/client load
-         await asyncio.sleep(0.2 + (jitter_sec * 0.5))
-
-
- def _html_table_to_df(html: str) -> pd.DataFrame:
-     """
-     table outerHTML -> DataFrame
-     """
-     dfs = pd.read_html(StringIO(html), header=0)
-     if not dfs:
-         raise ValueError("pd.read_html() 테이블 파싱 실패")
-     return dfs[0]
-
-
- async def _nth_table_outer_html(browser: BrowserPort, nth: int) -> str:
-     return await browser.outer_html_nth(TABLE_XPATH, nth)
-
- def df_to_c103_records(df: pd.DataFrame) -> list[dict[str, Any]]:
-     """
-     C103 table DataFrame -> normalized records (list[dict])
-     - Rows with an empty 항목 are dropped
-     """
-     df = normalize_c1034_df(df)
-     if df is None or df.empty:
-         return []
-
-     records: list[dict[str, Any]] = []
-     for r in df.to_dict(orient="records"):
-         item = r.get("항목")
-         if not item:
-             continue
-         records.append(r)
-     return records
-
- # ---- public parser ----
-
- async def parse_c103_to_dict(browser: BrowserPort) -> dict[str, list[dict[str, Any]]]:
-     """
-     C103 parser: returns a dict only
-     {
-         "손익계산서y": [ {...}, ... ],
-         "손익계산서q": [ {...}, ... ],
-         ...
-     }
-     """
-     out: dict[str, list[dict[str, Any]]] = {}
-
-     for key, steps in BTN_SETS.items():
-         # click → wait for table → nth table outerHTML → df → records
-         await _click_steps(browser, steps)
-         await browser.wait(TABLE_XPATH)
-
-         try:
-             html = await _nth_table_outer_html(browser, TABLE_INDEX)
-             df = _html_table_to_df(html)
-             out[key] = df_to_c103_records(df)
-         except Exception:
-             out[key] = []  # a failure yields an empty list
-
-     return out
-
-
scraper2/app/parsing/c104_parser.py (deleted)
@@ -1,143 +0,0 @@
- # scraper2/app/parsing/c104_parser.py
- from __future__ import annotations
-
- import asyncio
- from io import StringIO
- from typing import Any
-
- import pandas as pd
-
- from scraper2.app.ports.browser.browser_port import BrowserPort
- from scraper2.app.parsing._normalize import normalize_c1034_df
-
-
- # ---- constants ----
-
- TABLE_SELECTOR = 'xpath=//table[@class="gHead01 all-width data-list"]'
-
- BTN_SETS: dict[str, list[tuple[str, str]]] = {
-     "수익성y": [
-         ("수익성", 'xpath=//*[ @id="val_tab1"]'),
-         ("연간", 'xpath=//*[@id="frqTyp0"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "성장성y": [
-         ("성장성", 'xpath=//*[ @id="val_tab2"]'),
-         ("연간", 'xpath=//*[@id="frqTyp0"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "안정성y": [
-         ("안정성", 'xpath=//*[ @id="val_tab3"]'),
-         ("연간", 'xpath=//*[@id="frqTyp0"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "활동성y": [
-         ("활동성", 'xpath=//*[ @id="val_tab4"]'),
-         ("연간", 'xpath=//*[@id="frqTyp0"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "가치분석y": [
-         ("가치분석연간", 'xpath=//*[@id="frqTyp0_2"]'),
-         ("가치분석검색", 'xpath=//*[@id="hfinGubun2"]'),
-     ],
-     "수익성q": [
-         ("수익성", 'xpath=//*[ @id="val_tab1"]'),
-         ("분기", 'xpath=//*[@id="frqTyp1"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "성장성q": [
-         ("성장성", 'xpath=//*[ @id="val_tab2"]'),
-         ("분기", 'xpath=//*[@id="frqTyp1"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "안정성q": [
-         ("안정성", 'xpath=//*[ @id="val_tab3"]'),
-         ("분기", 'xpath=//*[@id="frqTyp1"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "활동성q": [
-         ("활동성", 'xpath=//*[ @id="val_tab4"]'),
-         ("분기", 'xpath=//*[@id="frqTyp1"]'),
-         ("검색", 'xpath=//*[@id="hfinGubun"]'),
-     ],
-     "가치분석q": [
-         ("가치분석분기", 'xpath=//*[@id="frqTyp1_2"]'),
-         ("가치분석검색", 'xpath=//*[@id="hfinGubun2"]'),
-     ],
- }
-
-
- # ---- small helpers ----
-
- async def _click_steps(
-     browser: BrowserPort,
-     steps: list[tuple[str, str]],
-     *,
-     jitter_sec: float = 0.6,
- ) -> None:
-     """
-     (No goto.) Only clicks the tab/radio/search buttons on the current page.
-     """
-     for _name, selector in steps:
-         await browser.wait(selector)
-         await browser.click(selector)
-         await asyncio.sleep(0.2 + (jitter_sec * 0.5))
-
-
- def _html_table_to_df(html: str) -> pd.DataFrame:
-     dfs = pd.read_html(StringIO(html), header=0)
-     if not dfs:
-         raise ValueError("pd.read_html() 테이블 파싱 실패")
-     return dfs[0]
-
-
- async def _table_outer_html(browser: BrowserPort, *, is_value_analysis: bool) -> str:
-     """
-     Keeps the original c104 logic:
-     - 가치분석*: second table (nth=1)
-     - otherwise: first table (nth=0)
-     """
-     idx = 1 if is_value_analysis else 0
-     return await browser.outer_html_nth(TABLE_SELECTOR, idx)
-
-
- def df_to_c104_records(df: pd.DataFrame) -> list[dict[str, Any]]:
-     """
-     C104 table DataFrame -> records (list[dict])
-     - NaN -> None
-     - (add normalize_c104_df(df) here if desired)
-     """
-     df = normalize_c1034_df(df)
-     if df is None or df.empty:
-         return []
-
-     df = df.where(pd.notnull(df), None)
-
-     return df.to_dict(orient="records")
-
-
- # ---- public parser ----
-
- async def parse_c104_to_dict(browser: BrowserPort) -> dict[str, list[dict[str, Any]]]:
-     """
-     C104 parser: returns a dict only
-     {
-         "수익성y": [ {...}, ... ],
-         "가치분석q": [ {...}, ... ],
-         ...
-     }
-     """
-     out: dict[str, list[dict[str, Any]]] = {}
-
-     for key, steps in BTN_SETS.items():
-         await _click_steps(browser, steps)
-         await browser.wait(TABLE_SELECTOR)
-
-         try:
-             html = await _table_outer_html(browser, is_value_analysis=key.startswith("가치분석"))
-             df = _html_table_to_df(html)
-             out[key] = df_to_c104_records(df)
-         except Exception:
-             out[key] = []
-
-     return out
scraper2/app/parsing/c106_parser.py (deleted)
@@ -1,153 +0,0 @@
- # scraper2/app/parsing/c106_parser.py
- from __future__ import annotations
-
- from io import StringIO
- from typing import Dict
-
- import numpy as np
- import pandas as pd
-
- from scraper2.app.ports.browser.browser_port import BrowserPort
-
-
- # ---------- pure helpers (sync) ----------
-
- def df_to_metrics(df: pd.DataFrame) -> dict[str, dict[str, float | None]]:
-     """
-     DataFrame ->
-     { "<항목2>": { "<회사명>": float|None, ... }, ... }
-     """
-     out: dict[str, dict[str, float | None]] = {}
-     if df is None or df.empty:
-         return out
-
-     rows = df.replace({np.nan: None}).to_dict(orient="records")
-     for r in rows:
-         metric = r.get("항목2")
-         if not metric:
-             continue
-
-         bucket = out.setdefault(str(metric).strip(), {})
-         for k, v in r.items():
-             if k in ("항목", "항목2"):
-                 continue
-             bucket[str(k)] = None if v is None else float(v)
-
-     return out
-
-
- def df_from_comparison_html(html: str, company_names: list[str]) -> pd.DataFrame:
-     """
-     #cTB611 table HTML -> cleaned DataFrame
-     """
-     df = pd.read_html(StringIO(html), header=None)[0]
-
-     # The first column is '항목', the second is '항목2', and the rest are company names
-     df.columns = ["항목", "항목2"] + company_names
-
-     # Fill NaN in the category column (항목) with the previous value
-     df["항목"] = df["항목"].ffill()
-
-     # Insert '주가데이터' for the first two rows only (keeps the original logic)
-     for i in range(min(2, len(df))):
-         row = df.loc[i].tolist()
-         new_row = ["주가데이터"] + row
-         df.loc[i] = new_row[: len(df.columns)]
-
-     # Drop rows without 항목2
-     df = df[df["항목2"].notna()].reset_index(drop=True)
-
-     # Manually assign the category for specific items
-     df.loc[df["항목2"].isin(["투자의견", "목표주가(원)"]), "항목"] = "기타지표"
-
-     # Drop rows that are not needed
-     df = df[df["항목2"] != "재무연월"].reset_index(drop=True)
-
-     # Numeric cleanup (company columns only)
-     for col in df.columns[2:]:
-         df[col] = df[col].replace("-", "0")
-         df[col] = pd.to_numeric(df[col], errors="coerce")
-
-     # String cleanup
-     df["항목"] = df["항목"].astype(str).str.replace("펼치기", "").str.strip()
-     df["항목2"] = df["항목2"].astype(str).str.strip()
-
-     return df
-
-
- # ---------- parsers (async, BrowserPort only) ----------
-
- async def parse_c106_header(browser: BrowserPort) -> list[str]:
-     """
-     Extracts only the '기업간비교자료' header (the company names) from the current page.
-     (No goto/sleep.)
-     """
-     await browser.wait("#cTB611_h")
-
-     selector = (
-         'xpath=//caption[contains(text(), "기업간비교자료")]/following-sibling::thead//th[not(@colspan)]'
-     )
-     th_texts = await browser.all_texts(selector)
-
-     names: list[str] = []
-     for t in th_texts:
-         name = t.strip().split("\n")[0]
-         if name:
-             names.append(name)
-     return names
-
-
- async def parse_c106_table_to_metrics(
-     browser: BrowserPort,
-     company_names: list[str],
- ) -> dict[str, dict[str, float | None]]:
-     """
-     Reads the #cTB611 table on the current page and converts it into a metrics dict.
-     """
-     await browser.wait("#cTB611")
-     html = await browser.outer_html("#cTB611")
-
-     df = df_from_comparison_html(html, company_names)
-     if not isinstance(df, pd.DataFrame):
-         raise TypeError(f"c106 df invalid: type={type(df)}")
-     return df_to_metrics(df)
-
-
- METRIC_NAME_MAP = {
-     "전일종가(원)": "전일종가",
-     "시가총액(억원)": "시가총액",
-
-     "자산총계(억원)": "자산총계",
-     "부채총계(억원)": "부채총계",
-
-     "매출액(억원)": "매출액",
-     "영업이익(억원)": "영업이익",
-     "당기순이익(억원)": "당기순이익",
-     "당기순이익(지배)(억원)": "당기순이익_지배",
-
-     "영업이익률(%)": "영업이익률",
-     "순이익률(%)": "순이익률",
-     "ROE(%)": "ROE",
-     "부채비율(%)": "부채비율",
-
-     "PER": "PER",
-     "PBR": "PBR",
-
-     "투자의견": "투자의견",
-     "목표주가(원)": "목표주가",
- }
-
- def normalize_c106_metrics(
-     raw: Dict[str, Dict[str, float | None]],
- ) -> Dict[str, Dict[str, float | None]]:
-     out: Dict[str, Dict[str, float | None]] = {}
-
-     for raw_name, company_map in raw.items():
-         field = METRIC_NAME_MAP.get(raw_name)
-         if not field:
-             # Unknown items are dropped (or only logged)
-             continue
-
-         out[field] = company_map
-
-     return out
scraper2/app/parsing/c108_parser.py (deleted)
@@ -1,65 +0,0 @@
- # scraper2/app/parsing/c108_parser.py
- from __future__ import annotations
-
- from typing import Any
-
- import pandas as pd
- from scraper2.app.ports.browser.browser_port import BrowserPort
-
-
- def extract_bullets(text: str | None) -> list[str]:
-     if not text:
-         return []
-     return [
-         line.replace("▶", "").strip()
-         for line in text.splitlines()
-         if line.strip().startswith("▶")
-     ]
-
-
- async def parse_c108_to_dicts(browser: BrowserPort) -> list[dict[str, Any]]:
-     title = await browser.title()
-     url = await browser.current_url()
-
-     if "접속장애" in title:
-         print(
-             "[C108][ACCESS_ERROR] "
-             f"title={title!r} url={url}"
-         )
-         return []
-
-     # table → records (handled by the adapter)
-     try:
-         records = await browser.table_records("#tableCmpDetail", header=0)
-     except Exception as e:
-         print("table_records failed:", type(e).__name__, e)
-         return []
-
-     # Date conversion (handled in the parser)
-     # The key may be '일자', so normalize it to '날짜'
-     for r in records:
-         if "일자" in r and "날짜" not in r:
-             r["날짜"] = r.pop("일자")
-
-         # "%y/%m/%d" -> "%Y.%m.%d"
-         if r.get("날짜"):
-             dt = pd.to_datetime(r["날짜"], format="%y/%m/%d", errors="coerce")
-             r["날짜"] = None if pd.isna(dt) else dt.strftime("%Y.%m.%d")
-
-     # Extract the report contents
-     contents: list[list[str]] = []
-     row_count = len(records)
-
-     for i in range(row_count):
-         try:
-             await browser.click(f"#a{i}")
-             await browser.wait(f"#c{i} > div > div.comment-body")
-             text = await browser.inner_text(f"#c{i} > div > div.comment-body")
-             contents.append(extract_bullets(text))
-         except Exception:
-             contents.append([])
-
-     # Inject the content/code into the records
-     for r, content in zip(records, contents, strict=False):
-         r["내용"] = content
-     return records
scraper2/app/ports/browser/browser_factory_port.py (deleted)
@@ -1,11 +0,0 @@
- # scraper2/app/ports/browser/browser_factory_port.py
- from __future__ import annotations
- from typing import AsyncIterator, Protocol
- from contextlib import asynccontextmanager
-
- from scraper2.app.ports.browser.browser_port import BrowserPort
-
- class BrowserFactoryPort(Protocol):
-     @asynccontextmanager
-     async def lease(self) -> AsyncIterator[BrowserPort]: ...
-     async def aclose(self) -> None: ...
scraper2/app/ports/browser/browser_port.py (deleted)
@@ -1,22 +0,0 @@
- # scraper2/app/ports/browser/browser_port.py
- from __future__ import annotations
- from typing import Protocol, Any
-
- class BrowserPort(Protocol):
-     async def goto(self, url: str, timeout_ms: int = 10_000) -> None: ...
-     async def title(self) -> str: ...
-     async def current_url(self) -> str: ...
-     async def text(self, selector: str) -> str: ...
-     async def texts(self, selector: str) -> list[str]: ...
-     async def text_first_by_text(self, needle: str) -> str: ...
-     async def inner_text(self, selector: str) -> str: ...
-     async def click(self, selector: str) -> None: ...
-     async def wait(self, selector: str, timeout_ms: int = 10_000) -> None: ...
-     # Key addition: table → records
-     async def table_records(self, table_selector: str, *, header: int | list[int] = 0) -> list[dict[str, Any]]: ...
-     # Used by c106
-     async def outer_html(self, selector: str) -> str: ...
-     async def all_texts(self, selector: str) -> list[str]: ...
-
-     # Added for c103/c104: returns the outerHTML of the nth (0-based) element matched by the selector
-     async def outer_html_nth(self, selector: str, index: int) -> str: ...
scraper2/app/ports/ingest_port.py (deleted)
@@ -1,14 +0,0 @@
- # scraper2/app/ports/ingest_port.py
- from typing import Protocol, Iterable, Optional, Sequence, Any
- from datetime import datetime
-
- class IngestPort(Protocol):
-     async def execute_many(
-         self,
-         codes: Iterable[str],
-         *,
-         sleep_sec: float = ...,
-         asof: Optional[datetime] = None,
-     ) -> Sequence[Any]:
-         ...
-