analyser_hj3415 2.0.2__py2.py3-none-any.whl → 2.1.0__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analyser_hj3415/.DS_Store +0 -0
- analyser_hj3415/analysers/eval.py +289 -0
- analyser_hj3415/{report.py → analysers/report.py} +23 -59
- analyser_hj3415/{score.py → analysers/score.py} +56 -61
- analyser_hj3415/{db/evaltools.py → tools.py} +102 -79
- analyser_hj3415/trash.py +210 -0
- {analyser_hj3415-2.0.2.dist-info → analyser_hj3415-2.1.0.dist-info}/METADATA +5 -9
- analyser_hj3415-2.1.0.dist-info/RECORD +14 -0
- analyser_hj3415/db/.DS_Store +0 -0
- analyser_hj3415/db/chk_db.py +0 -240
- analyser_hj3415/db/mongo.py +0 -934
- analyser_hj3415/eval.py +0 -382
- analyser_hj3415-2.0.2.dist-info/RECORD +0 -16
- /analyser_hj3415/{db/__init__.py → run.py} +0 -0
- {analyser_hj3415-2.0.2.dist-info → analyser_hj3415-2.1.0.dist-info}/LICENSE +0 -0
- {analyser_hj3415-2.0.2.dist-info → analyser_hj3415-2.1.0.dist-info}/WHEEL +0 -0
- {analyser_hj3415-2.0.2.dist-info → analyser_hj3415-2.1.0.dist-info}/entry_points.txt +0 -0
analyser_hj3415/db/chk_db.py
DELETED
@@ -1,240 +0,0 @@
|
|
1
|
-
import time
|
2
|
-
import sys
|
3
|
-
from _datetime import datetime
|
4
|
-
from typing import Dict, List, Tuple, Callable
|
5
|
-
from multiprocessing import Process, Queue
|
6
|
-
from . import mongo
|
7
|
-
from utils_hj3415 import utils, noti
|
8
|
-
from scraper2_hj3415.krx import krx
|
9
|
-
from scraper2_hj3415.nfscrapy import run as nfsrun
|
10
|
-
from pymongo import MongoClient
|
11
|
-
from selenium.webdriver.chrome.webdriver import WebDriver
|
12
|
-
|
13
|
-
import logging
|
14
|
-
# Module-level logger: emits "<LEVEL>: [<logger name>] <message>" lines through
# its own stream handler, at INFO level and above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

formatter = logging.Formatter('%(levelname)s: [%(name)s] %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)
|
20
|
-
|
21
|
-
|
22
|
-
"""
|
23
|
-
chk_integrity_corps 함수로 종목코드를 데이터베이스명으로 가지는 DB의 유효성을 검사한다.
|
24
|
-
"""
|
25
|
-
|
26
|
-
|
27
|
-
def test_corp_one(client1: MongoClient, code: str, driver: WebDriver = None, waiting_time: int = 10) -> Dict[str, list]:
    """
    Validate the collections of a single corp database.

    Two checks are performed:
      1. every expected collection exists in the database;
      2. the quarterly ('q') and yearly ('y') variants of each page hold the
         same number of documents.

    ``driver`` and ``waiting_time`` are not used here; they are accepted only
    so that the signature matches the other test functions.

    Returns a dict mapping ``code`` to the list of 4-character page prefixes
    that failed a check, e.g. {'005930': ['c104', 'c103']}. The list is empty
    when nothing is wrong.
    """
    expected_collections = {'c101', 'c104y', 'c104q', 'c106y', 'c106q', 'c103손익계산서q', 'c103재무상태표q',
                            'c103현금흐름표q', 'c103손익계산서y', 'c103재무상태표y', 'c103현금흐름표y'}

    # (page prefix, collection pairs to compare) in the order they are checked.
    paired_checks = (
        ('c104', (('c104y', 'c104q'),)),
        ('c106', (('c106y', 'c106q'),)),
        ('c103', (('c103손익계산서q', 'c103손익계산서y'),
                  ('c103재무상태표q', 'c103재무상태표y'),
                  ('c103현금흐름표y', 'c103현금흐름표q'))),
    )

    def is_same_count_of_docs(first: str, second: str) -> bool:
        # Point the shared Corps object at each collection in turn and compare counts.
        logger.debug(f"In is_same_count_of_docs {code}/ {first}, {second}")
        corp_one.page = first
        first_count = corp_one.count_docs_in_col()
        corp_one.page = second
        second_count = corp_one.count_docs_in_col()
        return first_count == second_count

    logger.debug('In test_corp_one function...')
    return_dict = {}
    logger.debug(f'return_dict is ... {return_dict}')

    corp_one = mongo.Corps(client1, code, 'c101')

    # Set difference: expected collections that are missing from the database.
    missing = expected_collections - set(corp_one.list_collection_names())
    logger.debug(f'After take a set of difference : {missing}')

    # Keep only the first four characters of each name (e.g. c103손익계산서q -> c103).
    problems = {name[:4] for name in missing}

    # A page fails as soon as one of its q/y pairs differs in document count.
    for prefix, pairs in paired_checks:
        if any(not is_same_count_of_docs(left, right) for left, right in pairs):
            problems.add(prefix)

    # Callers expect a list, not a set.
    return_dict[code] = list(problems)
    logger.debug(f'Going out test_corp_one : {return_dict}')
    return return_dict
|
81
|
-
|
82
|
-
|
83
|
-
def test_corp_one_is_modified(client: MongoClient, code: str, driver: WebDriver, waiting_time: int) -> Dict[str, bool]:
    """
    Compare the 'c103손익계산서y' page scraped from the web with the copy
    stored in the database.

    Returns {code: True} when the two DataFrames differ (a refresh is needed)
    and {code: False} when they are equal.
    """
    scraped = nfsrun.scrape_c103_first_page(driver, code, waiting_time=waiting_time)
    stored = mongo.C103(client, code=code, page='c103손익계산서y').load_df()

    for frame in (scraped, stored):
        logger.debug(frame)

    needs_refresh = not scraped.equals(stored)
    return {code: needs_refresh}
|
96
|
-
|
97
|
-
|
98
|
-
def working_with_parts(test_func: Callable[[MongoClient, str, WebDriver, int], dict], db_addr: str,
                       divided_code_list: list, my_q: Queue, waiting_time: int):
    """
    Worker entry point: run ``test_func`` over one slice of codes and report
    the failures through ``my_q``.

    test_func         - called as test_func(client, code, driver, waiting_time);
                        must return {code: <truthy when the code failed>}.
    db_addr           - address passed to mongo.connect_mongo().
    divided_code_list - the codes this worker is responsible for.
    my_q              - queue that receives exactly one dict per worker,
                        holding only the failed codes.
    waiting_time      - forwarded unchanged to test_func.

    The result is put on the queue exactly once, even when setup or the loop
    fails, because the parent does one blocking ``q.get()`` per worker and
    would otherwise wait forever.
    """
    failed_dict_part = {}
    driver = None
    try:
        # Per the original author's note: each process must create its own DB
        # client, otherwise errors occur.
        client = mongo.connect_mongo(db_addr)
        driver = utils.get_driver()
        total = len(divided_code_list)

        for i, code in enumerate(divided_code_list):
            try:
                failed_one_dict = test_func(client, code, driver, waiting_time)
            except Exception as e:
                # Best effort: report the code that failed and keep going.
                print(f"{code} has a error : {e}", file=sys.stderr)
                continue
            print(f'{i + 1}/{total} {failed_one_dict}')
            if failed_one_dict[code]:
                # Non-empty list, or C103 changed: remember it.
                failed_dict_part.update(failed_one_dict)
    finally:
        try:
            # Like a function return, put on the queue only once per worker.
            my_q.put(failed_dict_part)
        finally:
            if driver is not None:
                # NOTE(review): close() is kept from the original; confirm
                # whether quit() is needed to end the browser session.
                driver.close()
|
120
|
-
|
121
|
-
|
122
|
-
# 멀티프로세싱을 사용하기 위해서 독립된 함수로 제작하였음(피클링이 가능해야함)
|
123
|
-
def chk_integrity_corps(client: MongoClient, code: str = 'all') -> Dict[str, list]:
    """
    Check the integrity of the corp databases in MongoDB and return the codes
    that have problems.

    A database is considered broken when an expected collection is missing or
    when the yearly and quarterly collections of a page hold a different
    number of documents (see test_corp_one).

    code - 'all' checks every code in the DB using one process per slice
           returned by utils.code_divider_by_cpu_core(); any other value
           checks just that code in the current process.

    Returns {'code': ['cxxx', ...], ...} containing only the failing codes.
    If ``code`` is not in the DB, an error is logged and an empty dict is
    returned.
    """
    failed_codes = {}
    codes_in_db = mongo.Corps.get_all_codes(client)
    if code == 'all':
        print('*' * 25, "Check all Corp db integrity using multiprocess", '*' * 25)
        print(f'Total {len(codes_in_db)} items..')
        n, divided_list = utils.code_divider_by_cpu_core(codes_in_db)

        addr = mongo.extract_addr_from_client(client)

        start_time = time.time()
        q = Queue()
        # waiting_time is 0: test_corp_one does not use it.
        workers = [
            Process(target=working_with_parts, args=(test_corp_one, addr, divided_list[i], q, 0))
            for i in range(n)
        ]
        for worker in workers:
            worker.start()

        # Drain the queue before joining; each worker puts exactly one dict.
        for _ in workers:
            failed_codes.update(q.get())

        for worker in workers:
            worker.join()

        logger.debug(f"failed_codes : {failed_codes}")
        print(f'Total spent time : {round(time.time() - start_time, 2)} sec.')
    else:
        print('*' * 25, f"Check {code} db integrity", '*' * 25)
        if code in codes_in_db:
            result_dict = test_corp_one(client, code)
            print(f'{code} : {result_dict[code]}')
            if result_dict[code]:  # non-empty list means something is wrong
                failed_codes.update(result_dict)
        else:
            # The original built an Exception here without raising it, so the
            # condition went unnoticed. Log it instead of raising so callers
            # that rely on the empty-dict return keep working.
            logger.error(f'{code} is not in db..')
    return failed_codes
|
165
|
-
|
166
|
-
|
167
|
-
def chk_modifying_corps(client, code: str = 'all', waiting_time: int = 60) -> Dict[str, bool]:
    """
    Compare each code's 'C103손익계산서y' page on the web with the copy in the
    DB and report which codes have changed and need a refresh.

    code         - 'all' checks every code in the DB using one process per
                   slice returned by utils.code_divider_by_cpu_core(); any
                   other value checks just that code in the current process.
    waiting_time - forwarded to test_corp_one_is_modified for the scrape.

    Returns {code: True, ...} containing only the codes that need updating.
    If ``code`` is not in the DB, an error is logged and an empty dict is
    returned.
    """
    failed_codes = {}
    codes_in_db = mongo.Corps.get_all_codes(client)
    if code == 'all':
        print('*' * 25, "Check all Corp db need for updating using multiprocess", '*' * 25)
        print(f'Total {len(codes_in_db)} items..')
        n, divided_list = utils.code_divider_by_cpu_core(codes_in_db)

        addr = mongo.extract_addr_from_client(client)

        start_time = time.time()
        q = Queue()
        workers = [
            Process(target=working_with_parts,
                    args=(test_corp_one_is_modified, addr, divided_list[i], q, waiting_time))
            for i in range(n)
        ]
        for worker in workers:
            worker.start()

        # Drain the queue before joining; each worker puts exactly one dict.
        for _ in workers:
            failed_codes.update(q.get())

        for worker in workers:
            worker.join()

        logger.debug(f"failed_codes : {failed_codes}")
        print(f'Total spent time : {round(time.time() - start_time, 2)} sec.')
    else:
        print('*' * 25, f"Check {code} db need for updating ", '*' * 25)
        if code in codes_in_db:
            # Create the browser only when it is needed, and always release it.
            driver = utils.get_driver()
            try:
                # waiting_time is a required argument; the original call
                # omitted it and therefore always raised TypeError.
                result_dict = test_corp_one_is_modified(client, code, driver, waiting_time)
            finally:
                driver.close()
            print(f'{code} : {result_dict[code]}')
            if result_dict[code]:
                failed_codes.update(result_dict)
        else:
            # The original built an Exception here without raising it, so the
            # condition went unnoticed. Log it instead of raising so callers
            # that rely on the empty-dict return keep working.
            logger.error(f'{code} is not in db..')
    return failed_codes
|
208
|
-
|
209
|
-
|
210
|
-
def sync_mongo_with_krx(client):
    """
    Bring the set of corp databases in MongoDB in line with the KRX code list.

    Steps: read the codes currently in MongoDB, rebuild krx.db, read the KRX
    codes, delete databases whose code is no longer in KRX, then scrape
    c103/c104/c106 for the codes that are in KRX but not yet in MongoDB.

    If the MongoDB code list cannot be measured with len() (TypeError), an
    error is logged, a telegram notification is sent, and nothing is changed.
    """
    stars = '*' * 20
    print(stars, 'Sync with krx and mongodb', stars)
    all_codes_in_db = mongo.Corps.get_all_codes(client)

    print(stars, 'Refreshing krx.db...', stars)
    krx.make_db()
    print('*' * 80)

    all_codes_in_krx = krx.get_codes()
    print('\tThe number of codes in krx: ', len(all_codes_in_krx))
    logger.debug(all_codes_in_krx)

    try:
        print('\tThe number of dbs in mongo: ', len(all_codes_in_db))
        logger.debug(all_codes_in_db)
    except TypeError:
        err_msg = "Error while sync mongo data...it's possible mongo db doesn't set yet.."
        logger.error(err_msg)
        noti.telegram_to(botname='manager', text=err_msg)
        return

    db_codes = set(all_codes_in_db)
    krx_codes = set(all_codes_in_krx)
    del_targets = list(db_codes - krx_codes)   # in MongoDB but no longer in KRX
    add_targets = list(krx_codes - db_codes)   # in KRX but not yet in MongoDB
    print('\tDelete target: ', del_targets)
    print('\tAdd target: ', add_targets)

    for stale_code in del_targets:
        mongo.Corps.del_db(client, stale_code)

    if not add_targets:
        return

    print(f'Starting.. c10346 scraper.. items : {len(add_targets)}')
    addr = mongo.extract_addr_from_client(client)
    # Run the c103, c104 and c106 scrapers in that order for the new codes.
    for page in ('c103', 'c104', 'c106'):
        getattr(nfsrun, page)(add_targets, addr)
|