analyser_hj3415 2.0.1__py2.py3-none-any.whl → 2.1.0__py2.py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- analyser_hj3415/.DS_Store +0 -0
- analyser_hj3415/analysers/eval.py +289 -0
- analyser_hj3415/{report.py → analysers/report.py} +23 -59
- analyser_hj3415/{score.py → analysers/score.py} +56 -61
- analyser_hj3415/{db/evaltools.py → tools.py} +102 -79
- analyser_hj3415/trash.py +210 -0
- {analyser_hj3415-2.0.1.dist-info → analyser_hj3415-2.1.0.dist-info}/METADATA +5 -9
- analyser_hj3415-2.1.0.dist-info/RECORD +14 -0
- analyser_hj3415/db/.DS_Store +0 -0
- analyser_hj3415/db/chk_db.py +0 -240
- analyser_hj3415/db/mongo.py +0 -934
- analyser_hj3415/eval.py +0 -382
- analyser_hj3415-2.0.1.dist-info/RECORD +0 -16
- /analyser_hj3415/{db/__init__.py → run.py} +0 -0
- {analyser_hj3415-2.0.1.dist-info → analyser_hj3415-2.1.0.dist-info}/LICENSE +0 -0
- {analyser_hj3415-2.0.1.dist-info → analyser_hj3415-2.1.0.dist-info}/WHEEL +0 -0
- {analyser_hj3415-2.0.1.dist-info → analyser_hj3415-2.1.0.dist-info}/entry_points.txt +0 -0
analyser_hj3415/db/chk_db.py
DELETED
@@ -1,240 +0,0 @@
|
|
1
|
-
import time
|
2
|
-
import sys
|
3
|
-
from _datetime import datetime
|
4
|
-
from typing import Dict, List, Tuple, Callable
|
5
|
-
from multiprocessing import Process, Queue
|
6
|
-
from . import mongo
|
7
|
-
from utils_hj3415 import utils, noti
|
8
|
-
from scraper2_hj3415.krx import krx
|
9
|
-
from scraper2_hj3415.nfscrapy import run as nfsrun
|
10
|
-
from pymongo import MongoClient
|
11
|
-
from selenium.webdriver.chrome.webdriver import WebDriver
|
12
|
-
|
13
|
-
import logging

# Module-level logger: records at INFO and above are written through a stream
# handler as "LEVEL: [logger name] message".
logger = logging.getLogger(__name__)
formatter = logging.Formatter('%(levelname)s: [%(name)s] %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(formatter)
# logging.getLogger() returns the same logger object every time this module is
# executed (e.g. on importlib.reload), so attach the handler only once;
# otherwise every message would be emitted once per accumulated handler.
if not logger.handlers:
    logger.addHandler(ch)
logger.setLevel(logging.INFO)
|
20
|
-
|
21
|
-
|
22
|
-
"""
|
23
|
-
chk_integrity_corps 함수로 종목코드를 데이터베이스명으로 가지는 DB의 유효성을 검사한다.
|
24
|
-
"""
|
25
|
-
|
26
|
-
|
27
|
-
def test_corp_one(client1: MongoClient, code: str, driver: WebDriver = None, waiting_time: int = 10) -> Dict[str, list]:
    """
    Check the collections of one company database and report what is deficient.

    A page group ('c101', 'c103', 'c104' or 'c106') is reported when one of its
    expected collections is missing from the database. 'c103', 'c104' and
    'c106' are additionally reported when their quarterly ('q') and yearly
    ('y') collections hold a different number of documents.

    driver and waiting_time are not used here; they are accepted only so that
    the signature matches the other checker handed to working_with_parts.

    Returns:
        {code: [page group, ...]} with the groups sorted, e.g.
        {'005930': ['c103', 'c104']}. The list is empty when nothing is wrong.
    """
    expected_collections = {'c101', 'c104y', 'c104q', 'c106y', 'c106q', 'c103손익계산서q', 'c103재무상태표q',
                            'c103현금흐름표q', 'c103손익계산서y', 'c103재무상태표y', 'c103현금흐름표y'}
    # Page group -> collection pairs whose document counts must agree.
    count_checks = (
        ('c104', (('c104y', 'c104q'),)),
        ('c106', (('c106y', 'c106q'),)),
        ('c103', (('c103손익계산서q', 'c103손익계산서y'),
                  ('c103재무상태표q', 'c103재무상태표y'),
                  ('c103현금흐름표y', 'c103현금흐름표q'))),
    )

    logger.debug('In test_corp_one function...')
    corp_one = mongo.Corps(client1, code, 'c101')

    def is_same_count_of_docs(col_name1: str, col_name2: str) -> bool:
        # Point corp_one at each collection in turn and compare the counts.
        logger.debug(f"In is_same_count_of_docs {code}/ {col_name1}, {col_name2}")
        corp_one.page = col_name1
        count_doc1 = corp_one.count_docs_in_col()
        corp_one.page = col_name2
        return count_doc1 == corp_one.count_docs_in_col()

    # Set difference: expected collections that do not exist in this database.
    missing_collections = expected_collections - set(corp_one.list_collection_names())
    logger.debug(f'After take a set of difference : {missing_collections}')

    # Keep only the first four characters of each name (c103손익계산서q -> c103).
    deficient = {name[:4] for name in missing_collections}

    for group, pairs in count_checks:
        # any() stops at the first mismatching pair, like the original `or` chain.
        if any(not is_same_count_of_docs(first, second) for first, second in pairs):
            deficient.add(group)

    # Sorted so the result does not depend on set iteration order.
    return_dict = {code: sorted(deficient)}
    logger.debug(f'Going out test_corp_one : {return_dict}')
    return return_dict
|
81
|
-
|
82
|
-
|
83
|
-
def test_corp_one_is_modified(client: MongoClient, code: str, driver: WebDriver, waiting_time: int) -> Dict[str, bool]:
    """
    Tell whether the stored 'c103손익계산서y' data of one company is out of date.

    The first c103 page is scraped from the web with the given driver and
    compared, via .equals(), with the frame loaded from MongoDB
    (both are presumably pandas DataFrames - confirm against the helpers).

    Returns:
        {code: True} when the two frames differ (a refresh is needed),
        {code: False} when they are equal.
    """
    scraped = nfsrun.scrape_c103_first_page(driver, code, waiting_time=waiting_time)
    stored = mongo.C103(client, code=code, page='c103손익계산서y').load_df()

    for frame in (scraped, stored):
        logger.debug(frame)

    needs_refresh = not scraped.equals(stored)
    return {code: needs_refresh}
|
96
|
-
|
97
|
-
|
98
|
-
def working_with_parts(test_func: Callable[[MongoClient, str, WebDriver, int], dict], db_addr: str, divided_code_list: list, my_q: Queue, waiting_time: int):
    """
    Worker-process body: run test_func on every code of one share of the codes.

    Args:
        test_func: checker called as test_func(client, code, driver, waiting_time);
            it must return a dict keyed by the code.
        db_addr: address handed to mongo.connect_mongo.
        divided_code_list: the codes this worker is responsible for.
        my_q: queue on which the collected result dict is put exactly once.
        waiting_time: passed through to test_func unchanged.

    The dict put on my_q contains only the entries whose value is truthy
    (a non-empty list of deficient pages, or True for "modified"). Codes for
    which test_func raised are reported on stderr and skipped.
    """
    failed_dict_part = {}
    driver = None
    try:
        # Every worker process has to create its own DB client; sharing one
        # across processes causes errors.
        client = mongo.connect_mongo(db_addr)
        driver = utils.get_driver()
        total = len(divided_code_list)

        for position, code in enumerate(divided_code_list, start=1):
            try:
                failed_one_dict = test_func(client, code, driver, waiting_time)
            except Exception as e:
                # Best effort per code: report it and carry on with the rest.
                print(f"{code} has a error : {e}", file=sys.stderr)
                continue
            print(f'{position}/{total} {failed_one_dict}')
            if failed_one_dict[code]:
                failed_dict_part.update(failed_one_dict)
    finally:
        # Like a return value, put on the queue exactly once. Doing it in
        # `finally` guarantees the parent, which waits for one item per worker,
        # is never left blocked when setup or the loop fails.
        my_q.put(failed_dict_part)
        if driver is not None:
            # quit() ends the whole WebDriver session; close() would only close
            # the current window.
            driver.quit()
|
120
|
-
|
121
|
-
|
122
|
-
# 멀티프로세싱을 사용하기 위해서 독립된 함수로 제작하였음(피클링이 가능해야함)
|
123
|
-
def chk_integrity_corps(client: MongoClient, code: str = 'all') -> Dict[str, list]:
    """
    Check the integrity of the corp databases and return the ones with problems.

    A database is considered faulty when an expected collection is missing or
    when the yearly and quarterly collections of a page hold a different number
    of documents (see test_corp_one).

    Args:
        client: MongoDB client.
        code: a single code to check, or 'all' to check every code in the db
            using one worker process per share returned by
            utils.code_divider_by_cpu_core.

    Returns:
        {'code': ['cxxx', ...], ...} containing only codes that have problems.

    Raises:
        ValueError: a single code was requested that is not in the db.
            (The original built this exception without raising it, so an
            unknown code was silently reported as healthy.)
    """
    failed_codes = {}
    codes_in_db = mongo.Corps.get_all_codes(client)

    if code == 'all':
        print('*' * 25, "Check all Corp db integrity using multiprocess", '*' * 25)
        print(f'Total {len(codes_in_db)} items..')
        n, divided_list = utils.code_divider_by_cpu_core(codes_in_db)

        addr = mongo.extract_addr_from_client(client)

        start_time = time.time()
        q = Queue()
        # test_corp_one ignores waiting_time, hence the 0.
        workers = [
            Process(target=working_with_parts, args=(test_corp_one, addr, divided_list[i], q, 0))
            for i in range(n)
        ]
        for worker in workers:
            worker.start()

        # Collect one result per worker *before* joining: a process that has
        # put data on a queue may not terminate until that data is consumed.
        for _ in workers:
            failed_codes.update(q.get())

        for worker in workers:
            worker.join()

        logger.debug(f"failed_codes : {failed_codes}")
        print(f'Total spent time : {round(time.time() - start_time, 2)} sec.')
    else:
        print('*' * 25, f"Check {code} db integrity", '*' * 25)
        if code not in codes_in_db:
            raise ValueError(f'{code} is not in db..')
        result_dict = test_corp_one(client, code)
        print(f'{code} : {result_dict[code]}')
        if result_dict[code]:  # non-empty list -> something is deficient
            failed_codes.update(result_dict)
    return failed_codes
|
165
|
-
|
166
|
-
|
167
|
-
def chk_modifying_corps(client, code: str = 'all', waiting_time: int = 60) -> Dict[str, bool]:
    """
    Compare the web and DB versions of 'c103손익계산서y' and report stale codes.

    Args:
        client: MongoDB client.
        code: a single code to check, or 'all' to check every code in the db
            using one worker process per share returned by
            utils.code_divider_by_cpu_core.
        waiting_time: passed through to test_corp_one_is_modified.

    Returns:
        {code: True, ...} containing only the codes that need a refresh.

    Raises:
        ValueError: a single code was requested that is not in the db.
            (The original built this exception without raising it.)
    """
    failed_codes = {}
    codes_in_db = mongo.Corps.get_all_codes(client)

    if code == 'all':
        print('*' * 25, "Check all Corp db need for updating using multiprocess", '*' * 25)
        print(f'Total {len(codes_in_db)} items..')
        n, divided_list = utils.code_divider_by_cpu_core(codes_in_db)

        addr = mongo.extract_addr_from_client(client)

        start_time = time.time()
        q = Queue()
        workers = [
            Process(target=working_with_parts,
                    args=(test_corp_one_is_modified, addr, divided_list[i], q, waiting_time))
            for i in range(n)
        ]
        for worker in workers:
            worker.start()

        # Collect one result per worker *before* joining: a process that has
        # put data on a queue may not terminate until that data is consumed.
        for _ in workers:
            failed_codes.update(q.get())

        for worker in workers:
            worker.join()

        logger.debug(f"failed_codes : {failed_codes}")
        print(f'Total spent time : {round(time.time() - start_time, 2)} sec.')
    else:
        print('*' * 25, f"Check {code} db need for updating ", '*' * 25)
        # Validate before starting a browser so no driver is created for an
        # unknown code.
        if code not in codes_in_db:
            raise ValueError(f'{code} is not in db..')
        driver = utils.get_driver()
        try:
            # waiting_time is a required argument; the original call omitted it
            # and therefore failed with TypeError.
            result_dict = test_corp_one_is_modified(client, code, driver, waiting_time)
        finally:
            # End the WebDriver session even when scraping fails.
            driver.quit()
        print(f'{code} : {result_dict[code]}')
        if result_dict[code]:
            failed_codes.update(result_dict)
    return failed_codes
|
208
|
-
|
209
|
-
|
210
|
-
def sync_mongo_with_krx(client):
    """
    Make the set of corp databases in MongoDB match the KRX code list.

    Steps: read the codes present in MongoDB, rebuild the krx db
    (krx.make_db) and read its codes, delete every database whose code is no
    longer listed, then scrape c103/c104/c106 for every newly listed code.

    Returns None. The sync is abandoned (logged and sent to the 'manager'
    telegram bot) when the MongoDB code list has no length, or when the KRX
    code list is empty - in the latter case every database would otherwise
    become a delete target.
    """
    print('*' * 20, 'Sync with krx and mongodb', '*' * 20)
    all_codes_in_db = mongo.Corps.get_all_codes(client)
    print('*' * 20, 'Refreshing krx.db...', '*' * 20)
    krx.make_db()
    print('*' * 80)
    all_codes_in_krx = krx.get_codes()
    print('\tThe number of codes in krx: ', len(all_codes_in_krx))
    logger.debug(all_codes_in_krx)

    try:
        # len() raises TypeError when get_all_codes returned something unsized
        # (presumably None when the db is not set up yet - confirm).
        print('\tThe number of dbs in mongo: ', len(all_codes_in_db))
    except TypeError:
        err_msg = "Error while sync mongo data...it's possible mongo db doesn't set yet.."
        logger.error(err_msg)
        noti.telegram_to(botname='manager', text=err_msg)
        return
    logger.debug(all_codes_in_db)

    if not all_codes_in_krx:
        # Safety guard: with no KRX codes the set difference below would mark
        # every existing database for deletion.
        err_msg = "Error while sync mongo data...krx code list is empty, so nothing was deleted or added.."
        logger.error(err_msg)
        noti.telegram_to(botname='manager', text=err_msg)
        return

    codes_db = set(all_codes_in_db)
    codes_krx = set(all_codes_in_krx)
    del_targets = list(codes_db - codes_krx)
    add_targets = list(codes_krx - codes_db)
    print('\tDelete target: ', del_targets)
    print('\tAdd target: ', add_targets)

    for target in del_targets:
        mongo.Corps.del_db(client, target)

    if add_targets:
        print(f'Starting.. c10346 scraper.. items : {len(add_targets)}')
        addr = mongo.extract_addr_from_client(client)
        nfsrun.c103(add_targets, addr)
        nfsrun.c104(add_targets, addr)
        nfsrun.c106(add_targets, addr)
|