scraper2-hj3415 1.0.1__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. scraper2_hj3415-2.1.0/PKG-INFO +164 -0
  2. scraper2_hj3415-2.1.0/README.md +144 -0
  3. {scraper2_hj3415-1.0.1 → scraper2_hj3415-2.1.0}/pyproject.toml +10 -18
  4. scraper2_hj3415-2.1.0/src/scraper2/.DS_Store +0 -0
  5. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/.DS_Store +0 -0
  6. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/playwright/browser.py +103 -0
  7. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/playwright/browser_factory.py +112 -0
  8. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/playwright/session.py +121 -0
  9. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/.DS_Store +0 -0
  10. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/memory/__init__.py +15 -0
  11. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/memory/c101_memory_sink.py +20 -0
  12. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/memory/c103_memory_sink.py +20 -0
  13. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/memory/c104_memory_sink.py +20 -0
  14. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/memory/c106_memory_sink.py +20 -0
  15. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/memory/c108_memory_sink.py +20 -0
  16. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/memory/store.py +74 -0
  17. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/mongo/__init__.py +14 -0
  18. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/mongo/c101_mongo_sink.py +43 -0
  19. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/mongo/c103_mongo_sink.py +41 -0
  20. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/mongo/c104_mongo_sink.py +41 -0
  21. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/mongo/c106_mongo_sink.py +41 -0
  22. scraper2_hj3415-2.1.0/src/scraper2/adapters/out/sinks/mongo/c108_mongo_sink.py +41 -0
  23. scraper2_hj3415-2.1.0/src/scraper2/app/composition.py +195 -0
  24. scraper2_hj3415-2.1.0/src/scraper2/app/parsing/_converters.py +85 -0
  25. scraper2_hj3415-2.1.0/src/scraper2/app/parsing/_normalize.py +134 -0
  26. scraper2_hj3415-2.1.0/src/scraper2/app/parsing/c101_parser.py +143 -0
  27. scraper2_hj3415-2.1.0/src/scraper2/app/parsing/c103_parser.py +128 -0
  28. scraper2_hj3415-2.1.0/src/scraper2/app/parsing/c104_parser.py +143 -0
  29. scraper2_hj3415-2.1.0/src/scraper2/app/parsing/c106_parser.py +153 -0
  30. scraper2_hj3415-2.1.0/src/scraper2/app/parsing/c108_parser.py +65 -0
  31. scraper2_hj3415-2.1.0/src/scraper2/app/ports/browser/browser_factory_port.py +11 -0
  32. scraper2_hj3415-2.1.0/src/scraper2/app/ports/browser/browser_port.py +22 -0
  33. scraper2_hj3415-2.1.0/src/scraper2/app/ports/ingest_port.py +13 -0
  34. scraper2_hj3415-2.1.0/src/scraper2/app/ports/sinks/base_sink_port.py +14 -0
  35. scraper2_hj3415-2.1.0/src/scraper2/app/ports/sinks/c101_sink_port.py +9 -0
  36. scraper2_hj3415-2.1.0/src/scraper2/app/ports/sinks/c103_sink_port.py +9 -0
  37. scraper2_hj3415-2.1.0/src/scraper2/app/ports/sinks/c104_sink_port.py +9 -0
  38. scraper2_hj3415-2.1.0/src/scraper2/app/ports/sinks/c106_sink_port.py +9 -0
  39. scraper2_hj3415-2.1.0/src/scraper2/app/ports/sinks/c108_sink_port.py +9 -0
  40. scraper2_hj3415-2.1.0/src/scraper2/app/usecases/fetch/fetch_c101.py +43 -0
  41. scraper2_hj3415-2.1.0/src/scraper2/app/usecases/fetch/fetch_c103.py +103 -0
  42. scraper2_hj3415-2.1.0/src/scraper2/app/usecases/fetch/fetch_c104.py +76 -0
  43. scraper2_hj3415-2.1.0/src/scraper2/app/usecases/fetch/fetch_c106.py +90 -0
  44. scraper2_hj3415-2.1.0/src/scraper2/app/usecases/fetch/fetch_c108.py +49 -0
  45. scraper2_hj3415-2.1.0/src/scraper2/app/usecases/ingest/ingest_c101.py +36 -0
  46. scraper2_hj3415-2.1.0/src/scraper2/app/usecases/ingest/ingest_c103.py +37 -0
  47. scraper2_hj3415-2.1.0/src/scraper2/app/usecases/ingest/ingest_c104.py +37 -0
  48. scraper2_hj3415-2.1.0/src/scraper2/app/usecases/ingest/ingest_c106.py +38 -0
  49. scraper2_hj3415-2.1.0/src/scraper2/app/usecases/ingest/ingest_c108.py +39 -0
  50. scraper2_hj3415-2.1.0/src/scraper2/main.py +257 -0
  51. scraper2_hj3415-1.0.1/.coverage +0 -0
  52. scraper2_hj3415-1.0.1/PKG-INFO +0 -66
  53. scraper2_hj3415-1.0.1/README.md +0 -38
  54. scraper2_hj3415-1.0.1/pytest.ini +0 -10
  55. scraper2_hj3415-1.0.1/requirements.txt +0 -8
  56. scraper2_hj3415-1.0.1/src/scraper2_hj3415/__main__.py +0 -6
  57. scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/_shared/utils.py +0 -29
  58. scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/clients/browser.py +0 -124
  59. scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/clients/http.py +0 -51
  60. scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/nfs/pipelines/c1034_pipeline.py +0 -55
  61. scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/nfs/pipelines/normalize_c1034.py +0 -109
  62. scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/nfs/sinks/c1034_sink.py +0 -51
  63. scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/nfs/sinks/df_to_dto_mappers.py +0 -106
  64. scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/nfs/sources/bundle_source.py +0 -24
  65. scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/nfs/sources/c1034_fetch.py +0 -117
  66. scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/nfs/sources/c1034_session.py +0 -90
  67. scraper2_hj3415-1.0.1/src/scraper2_hj3415/core/constants.py +0 -47
  68. scraper2_hj3415-1.0.1/src/scraper2_hj3415/core/ports/sink_port.py +0 -16
  69. scraper2_hj3415-1.0.1/src/scraper2_hj3415/core/ports/source_port.py +0 -13
  70. scraper2_hj3415-1.0.1/src/scraper2_hj3415/core/types.py +0 -11
  71. scraper2_hj3415-1.0.1/src/scraper2_hj3415/core/usecases/c1034_ingest.py +0 -139
  72. scraper2_hj3415-1.0.1/src/scraper2_hj3415/di.py +0 -103
  73. scraper2_hj3415-1.0.1/src/scraper2_hj3415/entrypoints/cli.py +0 -226
  74. scraper2_hj3415-1.0.1/src/scraper2_hj3415/entrypoints/main.py +0 -20
  75. scraper2_hj3415-1.0.1/tests/adapters/_shared/test_utils.py +0 -109
  76. scraper2_hj3415-1.0.1/tests/adapters/clients/test_browser.py +0 -17
  77. scraper2_hj3415-1.0.1/tests/adapters/clients/test_http.py +0 -76
  78. scraper2_hj3415-1.0.1/tests/adapters/nfs/pipelines/test_c1034_pipeline.py +0 -107
  79. scraper2_hj3415-1.0.1/tests/adapters/nfs/pipelines/test_normalize_c1034.py +0 -223
  80. scraper2_hj3415-1.0.1/tests/adapters/nfs/sinks/test_c1034_sink.py +0 -140
  81. scraper2_hj3415-1.0.1/tests/adapters/nfs/sinks/test_mappers.py +0 -201
  82. scraper2_hj3415-1.0.1/tests/adapters/nfs/sources/sample.py +0 -393
  83. scraper2_hj3415-1.0.1/tests/adapters/nfs/sources/test_bundle_source.py +0 -90
  84. scraper2_hj3415-1.0.1/tests/adapters/nfs/sources/test_c1034_fetch.py +0 -182
  85. scraper2_hj3415-1.0.1/tests/adapters/nfs/sources/test_c1034_session.py +0 -208
  86. scraper2_hj3415-1.0.1/tests/conftest.py +0 -8
  87. scraper2_hj3415-1.0.1/tests/core/ports/test_sink_port.py +0 -48
  88. scraper2_hj3415-1.0.1/tests/core/ports/test_source_port.py +0 -59
  89. scraper2_hj3415-1.0.1/tests/core/usecases/test_c1034_ingest.py +0 -188
  90. scraper2_hj3415-1.0.1/tests/e2e/test_ingest_c1034_mongo_e2e.py +0 -107
  91. scraper2_hj3415-1.0.1/tests/test_di.py +0 -168
  92. {scraper2_hj3415-1.0.1 → scraper2_hj3415-2.1.0}/LICENSE +0 -0
  93. {scraper2_hj3415-1.0.1/src/scraper2_hj3415 → scraper2_hj3415-2.1.0/src/scraper2}/__init__.py +0 -0
  94. {scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters → scraper2_hj3415-2.1.0/src/scraper2/adapters/out}/__init__.py +0 -0
  95. {scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/_shared → scraper2_hj3415-2.1.0/src/scraper2/adapters/out/playwright}/__init__.py +0 -0
  96. {scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/clients → scraper2_hj3415-2.1.0/src/scraper2/app}/__init__.py +0 -0
  97. {scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/nfs/pipelines → scraper2_hj3415-2.1.0/src/scraper2/app/parsing}/__init__.py +0 -0
  98. {scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/nfs/sinks → scraper2_hj3415-2.1.0/src/scraper2/app/ports}/__init__.py +0 -0
  99. {scraper2_hj3415-1.0.1/src/scraper2_hj3415/adapters/nfs/sources → scraper2_hj3415-2.1.0/src/scraper2/app/ports/browser}/__init__.py +0 -0
  100. {scraper2_hj3415-1.0.1/src/scraper2_hj3415/core → scraper2_hj3415-2.1.0/src/scraper2/app/ports/sinks}/__init__.py +0 -0
  101. {scraper2_hj3415-1.0.1/src/scraper2_hj3415/core/ports → scraper2_hj3415-2.1.0/src/scraper2/app/usecases}/__init__.py +0 -0
  102. {scraper2_hj3415-1.0.1/src/scraper2_hj3415/core/usecases → scraper2_hj3415-2.1.0/src/scraper2/app/usecases/fetch}/__init__.py +0 -0
  103. {scraper2_hj3415-1.0.1/src/scraper2_hj3415/entrypoints → scraper2_hj3415-2.1.0/src/scraper2/app/usecases/ingest}/__init__.py +0 -0
@@ -0,0 +1,164 @@
+ Metadata-Version: 2.4
+ Name: scraper2-hj3415
+ Version: 2.1.0
+ Summary: Naver WiseReport scraper
+ Keywords: example,demo
+ Author-email: Hyungjin Kim <hj3415@gmail.com>
+ Requires-Python: >=3.11
+ Description-Content-Type: text/markdown
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Typing :: Typed
+ License-File: LICENSE
+ Requires-Dist: playwright>=1.55
+ Requires-Dist: pandas>=2.3.3
+ Requires-Dist: pandas-stubs>=2.3.3
+ Requires-Dist: lxml>=6.0.2
+ Requires-Dist: typer>=0.21.0
+ Requires-Dist: db2-hj3415
+
+ # scraper2
+
+ A CLI tool that scrapes financial statement (NFS) data for KRX-listed Korean stocks.
+ Single-ticker collection (one) and full-universe collection (all) are kept strictly separate.
+
+ Installation
+
+ pip install -e .
+
+ Verify the CLI after installation:
+
+ scraper2 --help
+
+ CLI structure
+
+ scraper2
+ ├─ nfs
+ │   ├─ one   # collect a single ticker
+ │   └─ all   # collect the whole universe
+ └─ mi        # (reserved)
+
+ 1️⃣ Single-ticker collection (one)
+
+ Usage
+
+ scraper2 nfs one <endpoint> <code> [options]
+
+ Examples
+
+ # Collect c101 for Samsung Electronics (in-memory sink)
+ scraper2 nfs one c101 005930
+
+ # Collect every endpoint at once
+ scraper2 nfs one all 005930
+
+ # Store the result in MongoDB
+ scraper2 nfs one c101 005930 --sink mongo
+
+ # Suppress printing of the result DTO
+ scraper2 nfs one c101 005930 --no-show
+
+ Characteristics
+ • Always processes exactly one ticker
+ • Prints the result DTO immediately (for debugging and verification)
+ • Intended for manual testing and verification rather than automated operation
+
+ 2️⃣ Full-universe collection (all)
+
+ Prerequisite
+
+ # Load the universe into the DB first
+ krx sync
+
+ Usage
+
+ scraper2 nfs all <endpoint> [options]
+
+ Examples
+
+ # Collect c101 for every ticker in krx300
+ scraper2 nfs all c101
+
+ # Collect every endpoint
+ scraper2 nfs all all
+
+ # Select a specific universe
+ scraper2 nfs all c103 --universe krx300
+
+ # Limit the number of tickers processed (for testing)
+ scraper2 nfs all c101 --limit 10
+
+ # Adjust the progress batch size
+ scraper2 nfs all c101 --chunk 5
+
+ Sample output during a run
+
+ === NFS ALL === universe=krx300, endpoint=c101, codes=298, sink=mongo
+
+ === START: c101 === total=298, chunk_size=5
+ progress: 25/298 (ok=25, fail=0)
+ progress: 50/298 (ok=50, fail=0)
+ ...
+ === DONE: c101 === ok=297, fail=1, total=298
+
+ ⏱ elapsed time: 6m 42s
+
+ Characteristics
+ • Iterates over the universe_latest collection stored in the DB
+ • Prints progress and ok/fail counts in real time
+ • Reports the total elapsed time when the run finishes
+ • The main mode for operations and schedulers
+
+ 3️⃣ Scheduler examples (cron)
+
+ Full collection every day at 02:00
+
+ 0 2 * * * /path/to/venv/bin/scraper2 nfs all all >> /var/log/scraper2_nfs.log 2>&1
+
+ One endpoint per job (recommended)
+
+ 0 2 * * * /path/to/venv/bin/scraper2 nfs all c101 >> /var/log/nfs_c101.log 2>&1
+ 10 2 * * * /path/to/venv/bin/scraper2 nfs all c103 >> /var/log/nfs_c103.log 2>&1
+
+ Limited run for testing
+
+ 0 1 * * * /path/to/venv/bin/scraper2 nfs all c101 --limit 20 >> /var/log/nfs_test.log 2>&1
+
+ Operational recommendations
+ • Always use the mongo sink with all mode
+ • Manage the universe with krx sync
+ • Logs are designed to record progress, ok/fail counts, and total elapsed time
+ • After a failure, the logs alone show how far the run had progressed
+
+ Summary
+
+ Purpose              Command
+ Single-ticker check  scraper2 nfs one c101 005930
+ Full collection      scraper2 nfs all all
+ Scheduler            scraper2 nfs all <endpoint>
@@ -0,0 +1,144 @@
+ # scraper2
+
+ A CLI tool that scrapes financial statement (NFS) data for KRX-listed Korean stocks.
+ Single-ticker collection (one) and full-universe collection (all) are kept strictly separate.
+
+ Installation
+
+ pip install -e .
+
+ Verify the CLI after installation:
+
+ scraper2 --help
+
+ CLI structure
+
+ scraper2
+ ├─ nfs
+ │   ├─ one   # collect a single ticker
+ │   └─ all   # collect the whole universe
+ └─ mi        # (reserved)
+
+ 1️⃣ Single-ticker collection (one)
+
+ Usage
+
+ scraper2 nfs one <endpoint> <code> [options]
+
+ Examples
+
+ # Collect c101 for Samsung Electronics (in-memory sink)
+ scraper2 nfs one c101 005930
+
+ # Collect every endpoint at once
+ scraper2 nfs one all 005930
+
+ # Store the result in MongoDB
+ scraper2 nfs one c101 005930 --sink mongo
+
+ # Suppress printing of the result DTO
+ scraper2 nfs one c101 005930 --no-show
+
+ Characteristics
+ • Always processes exactly one ticker
+ • Prints the result DTO immediately (for debugging and verification)
+ • Intended for manual testing and verification rather than automated operation
+
+ 2️⃣ Full-universe collection (all)
+
+ Prerequisite
+
+ # Load the universe into the DB first
+ krx sync
+
+ Usage
+
+ scraper2 nfs all <endpoint> [options]
+
+ Examples
+
+ # Collect c101 for every ticker in krx300
+ scraper2 nfs all c101
+
+ # Collect every endpoint
+ scraper2 nfs all all
+
+ # Select a specific universe
+ scraper2 nfs all c103 --universe krx300
+
+ # Limit the number of tickers processed (for testing)
+ scraper2 nfs all c101 --limit 10
+
+ # Adjust the progress batch size
+ scraper2 nfs all c101 --chunk 5
+
+ Sample output during a run
+
+ === NFS ALL === universe=krx300, endpoint=c101, codes=298, sink=mongo
+
+ === START: c101 === total=298, chunk_size=5
+ progress: 25/298 (ok=25, fail=0)
+ progress: 50/298 (ok=50, fail=0)
+ ...
+ === DONE: c101 === ok=297, fail=1, total=298
+
+ ⏱ elapsed time: 6m 42s
+
+ Characteristics
+ • Iterates over the universe_latest collection stored in the DB
+ • Prints progress and ok/fail counts in real time
+ • Reports the total elapsed time when the run finishes
+ • The main mode for operations and schedulers
+
+ 3️⃣ Scheduler examples (cron)
+
+ Full collection every day at 02:00
+
+ 0 2 * * * /path/to/venv/bin/scraper2 nfs all all >> /var/log/scraper2_nfs.log 2>&1
+
+ One endpoint per job (recommended)
+
+ 0 2 * * * /path/to/venv/bin/scraper2 nfs all c101 >> /var/log/nfs_c101.log 2>&1
+ 10 2 * * * /path/to/venv/bin/scraper2 nfs all c103 >> /var/log/nfs_c103.log 2>&1
+
+ Limited run for testing
+
+ 0 1 * * * /path/to/venv/bin/scraper2 nfs all c101 --limit 20 >> /var/log/nfs_test.log 2>&1
+
+ Operational recommendations
+ • Always use the mongo sink with all mode
+ • Manage the universe with krx sync
+ • Logs are designed to record progress, ok/fail counts, and total elapsed time
+ • After a failure, the logs alone show how far the run had progressed
+
+ Summary
+
+ Purpose              Command
+ Single-ticker check  scraper2 nfs one c101 005930
+ Full collection      scraper2 nfs all all
+ Scheduler            scraper2 nfs all <endpoint>
@@ -4,8 +4,8 @@ build-backend = "flit_core.buildapi"
 
  [project]
  name = "scraper2-hj3415" # PyPI name (hyphens allowed)
- version = "1.0.1"
- description = "Naver WiseReport C103/C104 scraper + ingestion orchestrator"
+ version = "2.1.0"
+ description = "Naver WiseReport scraper"
  readme = "README.md"
  requires-python = ">=3.11"
  license = { file = "LICENSE" }
@@ -18,25 +18,17 @@ classifiers = [
  ]
 
  dependencies = [
-     "logging-hj3415>=0.1",
-     "httpx>=0.28",
-     "tenacity>=9.1",
      "playwright>=1.55",
-     "pandas>=2.3",
-     "tabulate>=0.9",
-     "contracts-hj3415>=0.1",
-     "db2-hj3415>=0.1",
+     "pandas>=2.3.3",
+     "pandas-stubs>=2.3.3",
+     "lxml>=6.0.2",
+     "typer>=0.21.0",
+     "db2-hj3415",
  ]
 
- # Optional dependencies (installed as: pip install your-pkg[dev])
- [project.optional-dependencies]
- dev = ["pytest>=8", "pytest-cov>=7.0", "pytest-asyncio>=1.2", "ruff>=0.5", "mypy>=1.10"]
-
-
  [tool.flit.module]
- name = "scraper2_hj3415"
- path = "src/scraper2_hj3415"
-
+ name = "scraper2"
+ path = "src/scraper2"
 
  [project.scripts]
- scraper2 = "scraper2_hj3415.entrypoints.cli:app"
+ scraper2 = "scraper2.main:app"
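
The new console-script entry maps scraper2 to scraper2.main:app, and typer is now a direct dependency. As orientation only, here is a minimal sketch of the kind of Typer layout such an entry point implies; the command and option names below follow the README examples, but the actual main.py (+257 lines) is not shown in this diff, so the whole sketch is an illustrative assumption.

import typer

app = typer.Typer(help="scraper2 CLI")
nfs = typer.Typer(help="NFS (financial statement) collection")
app.add_typer(nfs, name="nfs")

@nfs.command("one")
def one(endpoint: str, code: str, sink: str = "memory", show: bool = True) -> None:
    """Collect a single ticker, e.g. scraper2 nfs one c101 005930."""
    ...  # hypothetical body; the real implementation lives in scraper2.main

@nfs.command("all")
def all_(endpoint: str, universe: str = "krx300", limit: int = 0, chunk: int = 5) -> None:
    """Collect the whole universe, e.g. scraper2 nfs all c101."""
    ...  # hypothetical body

if __name__ == "__main__":
    app()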
@@ -0,0 +1,103 @@
+ # src/scraper2/adapters/out/playwright/browser.py
+ from __future__ import annotations
+ from typing import Any
+ from io import StringIO
+ import pandas as pd
+ from playwright.async_api import Page
+
+ class PlaywrightBrowser:
+     def __init__(self, page: Page):
+         self.page = page
+
+     async def goto(self, url: str, timeout_ms: int = 10_000) -> None:
+         self.page.set_default_timeout(10_000)
+         await self.page.goto(url, timeout=timeout_ms)
+
+     async def title(self) -> str:
+         return await self.page.title()
+
+     async def current_url(self) -> str:
+         return self.page.url
+
+     async def wait(self, selector: str, timeout_ms: int = 10_000) -> None:
+         await self.page.wait_for_selector(selector, timeout=timeout_ms, state="attached")
+
+     async def text(self, selector: str) -> str:
+         await self.wait(selector)
+         return (await self.page.locator(selector).first.text_content()) or ""
+
+     async def texts(self, selector: str) -> list[str]:
+         await self.wait(selector)
+         loc = self.page.locator(selector)
+         items = await loc.all()
+         out: list[str] = []
+         for it in items:
+             out.append((await it.text_content()) or "")
+         return out
+
+     async def text_first_by_text(self, needle: str) -> str:
+         return (await self.page.get_by_text(needle).first.text_content()) or ""
+
+     async def inner_text(self, selector: str) -> str:
+         await self.wait(selector)
+         return await self.page.locator(selector).first.inner_text()
+
+     async def click(self, selector: str) -> None:
+         await self.wait(selector)
+         await self.page.locator(selector).click()
+
+     async def table_records(
+         self,
+         table_selector: str,
+         *,
+         header: int | list[int] = 0
+     ) -> list[dict[str, Any]]:
+         await self.wait(table_selector)
+
+         table = self.page.locator(table_selector).first
+         html = await table.evaluate("el => el.outerHTML")  # includes the <table> tag itself
+
+         try:
+             df = pd.read_html(StringIO(html), header=header)[0]
+         except Exception as e:
+             # Catch ImportError (missing lxml), ValueError, etc. here so the cause is surfaced
+             raise RuntimeError(f"pd.read_html failed: {type(e).__name__}: {e}") from e
+
+         if header == 0:
+             if "항목" in df.columns:
+                 df["항목"] = df["항목"].astype(str).str.replace("펼치기", "").str.strip()
+
+             df.columns = (
+                 df.columns.astype(str)
+                 .str.replace("연간컨센서스보기", "", regex=False)
+                 .str.replace("연간컨센서스닫기", "", regex=False)
+                 .str.replace("(IFRS연결)", "", regex=False)
+                 .str.replace("(IFRS별도)", "", regex=False)
+                 .str.replace("(GAAP개별)", "", regex=False)
+                 .str.replace("(YoY)", "", regex=False)
+                 .str.replace("(QoQ)", "", regex=False)
+                 .str.replace("(E)", "", regex=False)
+                 .str.replace(".", "", regex=False)
+                 .str.strip()
+             )
+
+         # Convert NaN to None
+         records: list[dict[str, Any]] = df.where(pd.notnull(df), None).to_dict(orient="records")
+         return records
+
+     async def outer_html(self, selector: str) -> str:
+         loc = self.page.locator(selector).first
+         return await loc.evaluate("el => el.outerHTML")
+
+     async def all_texts(self, selector: str) -> list[str]:
+         # selector can be a CSS selector or "xpath=..."
+         loc = self.page.locator(selector)
+         return await loc.all_text_contents()
+
+     async def outer_html_nth(self, selector: str, index: int) -> str:
+         loc = self.page.locator(selector).nth(index)
+         # Playwright raises if index is out of range;
+         # wrap it in a friendlier error here if needed.
+         return await loc.evaluate("el => el.outerHTML")
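
PlaywrightBrowser is a thin wrapper over a Playwright Page; table_records() grabs a table's outer HTML and hands it to pandas.read_html, which is why lxml appears among the new dependencies. A minimal usage sketch under stated assumptions: the URL and table selector below are placeholders, not values used by the package, and the wrapper class is the one defined in the hunk above.

import asyncio
from playwright.async_api import async_playwright

async def demo() -> None:
    async with async_playwright() as pw:
        browser = await pw.chromium.launch(headless=True)
        page = await browser.new_page()
        wrapper = PlaywrightBrowser(page)                      # wrapper from the diff above
        await wrapper.goto("https://example.com/financials")   # placeholder URL
        rows = await wrapper.table_records("table", header=0)  # placeholder selector
        for row in rows[:3]:
            print(row)
        await browser.close()

asyncio.run(demo())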
@@ -0,0 +1,112 @@
+ # scraper2/adapters/out/playwright/browser_factory.py
+ from __future__ import annotations
+
+ import asyncio
+ from contextlib import asynccontextmanager
+ from dataclasses import dataclass
+ from typing import AsyncIterator
+
+ from scraper2.app.ports.browser.browser_factory_port import BrowserFactoryPort
+ from scraper2.app.ports.browser.browser_port import BrowserPort
+ from scraper2.adapters.out.playwright.session import PlaywrightPageSession
+ from scraper2.adapters.out.playwright.browser import PlaywrightBrowser
+
+
+ @dataclass
+ class _LeaseItem:
+     session: PlaywrightPageSession
+     browser: BrowserPort
+
+
+ class PlaywrightBrowserFactory(BrowserFactoryPort):
+     """
+     Pooling strategy:
+     - astart() pre-creates max_concurrency sessions/pages/browsers
+     - lease() hands one out from the queue and takes it back afterwards
+     - aclose() shuts everything down
+     """
+
+     def __init__(self, *, headless: bool, timeout_ms: int, max_concurrency: int = 2):
+         self.headless = headless
+         self.timeout_ms = timeout_ms
+         self.max_concurrency = max_concurrency
+
+         self._pool: asyncio.Queue[_LeaseItem] = asyncio.Queue(maxsize=max_concurrency)
+         self._items: list[_LeaseItem] = []  # references kept for shutdown
+         self._started = False
+         self._start_lock = asyncio.Lock()
+         self._closed = False
+
+     async def astart(self) -> None:
+         """
+         Pre-fill the pool.
+         Guarded so that repeated calls initialize it only once.
+         """
+         if self._started:
+             return
+
+         async with self._start_lock:
+             if self._started:
+                 return
+             if self._closed:
+                 raise RuntimeError("Factory is closed; cannot start again.")
+
+             for _ in range(self.max_concurrency):
+                 session = PlaywrightPageSession(headless=self.headless, timeout_ms=self.timeout_ms)
+                 page = await session.start()
+                 browser = PlaywrightBrowser(page)
+
+                 item = _LeaseItem(session=session, browser=browser)
+                 self._items.append(item)
+                 await self._pool.put(item)
+
+             self._started = True
+
+     @asynccontextmanager
+     async def lease(self) -> AsyncIterator[BrowserPort]:
+         """
+         Lease one browser from the pool.
+         It must always be returned to the pool after use.
+         """
+         if self._closed:
+             raise RuntimeError("Factory is closed; cannot lease.")
+         if not self._started:
+             await self.astart()
+
+         item = await self._pool.get()
+         try:
+             yield item.browser
+         finally:
+             # If the factory is closing, skip the return and leave cleanup to the shutdown flow
+             if not self._closed:
+                 await self._pool.put(item)
+
+     async def aclose(self) -> None:
+         """
+         Close every session in the pool.
+         - If closed before an active lease finishes, only the remaining sessions are
+           closed here; when that lease tries to return its item, _closed=True keeps
+           it from being put back.
+         """
+         if self._closed:
+             return
+         self._closed = True
+
+         # Close all sessions
+         # (items currently leased share the same session objects, so they are closed too)
+         for item in self._items:
+             try:
+                 await item.session.close()
+             except Exception:
+                 # Swallowing exceptions during shutdown is usually the safe choice
+                 pass
+
+         self._items.clear()
+
+         # Drain the queue (drop references)
+         try:
+             while True:
+                 self._pool.get_nowait()
+         except asyncio.QueueEmpty:
+             pass
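
The factory pre-builds max_concurrency page sessions and hands them out through lease(), so callers can fan out work with bounded parallelism and a single shutdown point. A small driving sketch; fetch_one and the URL are placeholders rather than the package's real use cases (those live under app/usecases/fetch).

import asyncio

async def run(codes: list[str]) -> None:
    factory = PlaywrightBrowserFactory(headless=True, timeout_ms=10_000, max_concurrency=2)
    await factory.astart()
    try:
        async def fetch_one(code: str) -> None:
            # borrow a browser from the pool; it is returned when the block exits
            async with factory.lease() as browser:
                await browser.goto(f"https://example.com/{code}")  # placeholder URL
                print(code, await browser.title())

        await asyncio.gather(*(fetch_one(c) for c in codes))
    finally:
        await factory.aclose()

asyncio.run(run(["005930", "000660", "035420"]))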
@@ -0,0 +1,121 @@
+ # src/scraper2/adapters/out/playwright/session.py
+ from __future__ import annotations
+
+ import os
+ import subprocess
+ import sys
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from playwright.async_api import (
+     async_playwright,
+     Browser,
+     BrowserContext,
+     Page,
+     Error as PWError,
+ )
+
+
+ def _install_playwright_browsers(*names: str) -> None:
+     """Run `python -m playwright install [names...]` from code."""
+     subprocess.run([sys.executable, "-m", "playwright", "install", *names], check=True)
+
+     if sys.platform.startswith("linux"):
+         # If install-deps fails, continue anyway (it may be unnecessary or blocked
+         # by permissions depending on the environment)
+         try:
+             subprocess.run(
+                 [sys.executable, "-m", "playwright", "install-deps"], check=True
+             )
+         except Exception:
+             pass
+
+
+ def _need_install(e: Exception) -> bool:
+     msg = str(e)
+     return (
+         "Executable doesn't exist" in msg
+         or "download new browsers" in msg
+         or "playwright install" in msg
+         or "Please run the following command" in msg
+     )
+
+
+ @dataclass
+ class PlaywrightPageSession:
+     """
+     A session that is convenient to use from main:
+         s = PlaywrightPageSession(headless=True)
+         page = await s.start()
+         ...
+         await s.close()
+     """
+
+     headless: bool = True
+     browser_name: str = "chromium"
+     timeout_ms: int = 10_000
+     auto_install: bool = True  # turned off automatically when PW_SKIP_AUTO_INSTALL=1 is set
+
+     # runtime resources
+     pw: Optional[object] = None
+     browser: Optional[Browser] = None
+     context: Optional[BrowserContext] = None
+     page: Optional[Page] = None
+
+     async def start(self) -> Page:
+         if self.page is not None:
+             return self.page  # reuse an already started session (remove if not desired)
+
+         self.pw = await async_playwright().start()
+         try:
+             browser_type = getattr(self.pw, self.browser_name)
+
+             try:
+                 self.browser = await browser_type.launch(headless=self.headless)
+             except PWError as e:
+                 should_auto = self.auto_install and os.getenv("PW_SKIP_AUTO_INSTALL") != "1"
+                 if should_auto and _need_install(e):
+                     # stop playwright -> install browsers -> restart playwright
+                     await self.pw.stop()
+                     _install_playwright_browsers(self.browser_name)
+                     self.pw = await async_playwright().start()
+                     browser_type = getattr(self.pw, self.browser_name)
+                     self.browser = await browser_type.launch(headless=self.headless)
+                 else:
+                     raise
+
+             self.context = await self.browser.new_context()
+             self.page = await self.context.new_page()
+             self.page.set_default_timeout(self.timeout_ms)
+             return self.page
+
+         except Exception:
+             # Clean up resources if start() fails midway
+             await self.close()
+             raise
+
+     async def close(self) -> None:
+         # Tear down in reverse order (the page is cleaned up together with the context)
+         if self.context is not None:
+             try:
+                 await self.context.close()
+             except Exception:
+                 pass
+             finally:
+                 self.context = None
+                 self.page = None
+
+         if self.browser is not None:
+             try:
+                 await self.browser.close()
+             except Exception:
+                 pass
+             finally:
+                 self.browser = None
+
+         if self.pw is not None:
+             try:
+                 await self.pw.stop()
+             except Exception:
+                 pass
+             finally:
+                 self.pw = None
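
For one-off scripts the session can also be used on its own, as the docstring suggests. A short sketch; the URL is a placeholder, and on a machine without the Chromium binaries start() falls back to running playwright install unless PW_SKIP_AUTO_INSTALL=1 is set.

import asyncio

async def main() -> None:
    session = PlaywrightPageSession(headless=True, timeout_ms=10_000)
    page = await session.start()            # may auto-install browsers on first run
    try:
        await page.goto("https://example.com")  # placeholder URL
        print(await page.title())
    finally:
        await session.close()

asyncio.run(main())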