analyser_hj3415 2.0.1__py2.py3-none-any.whl → 2.1.0__py2.py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,240 +0,0 @@
1
- import time
2
- import sys
3
- from _datetime import datetime
4
- from typing import Dict, List, Tuple, Callable
5
- from multiprocessing import Process, Queue
6
- from . import mongo
7
- from utils_hj3415 import utils, noti
8
- from scraper2_hj3415.krx import krx
9
- from scraper2_hj3415.nfscrapy import run as nfsrun
10
- from pymongo import MongoClient
11
- from selenium.webdriver.chrome.webdriver import WebDriver
12
-
13
- import logging
14
# Module-level logger that writes "<LEVEL>: [<logger name>] <message>" lines
# through its own stream handler, at INFO level and above.
logger = logging.getLogger(__name__)
ch = logging.StreamHandler()
formatter = logging.Formatter('%(levelname)s: [%(name)s] %(message)s')
ch.setFormatter(formatter)
logger.setLevel(logging.INFO)
logger.addHandler(ch)
20
-
21
-
22
- """
23
- chk_integrity_corps 함수로 종목코드를 데이터베이스명으로 가지는 DB의 유효성을 검사한다.
24
- """
25
-
26
-
27
def test_corp_one(client1: MongoClient, code: str, driver: WebDriver = None, waiting_time: int = 10) -> Dict[str, list]:
    """
    Check the collections of one stock-code database and report which pages look incomplete.

    A page is reported when one of its expected collections is missing from the
    database, or when its yearly ('y') and quarterly ('q') collections hold a
    different number of documents.

    driver and waiting_time are not used here; they are accepted only so that this
    function has the same call signature as the other checker functions.

    Returns a dict with a single key, e.g. {'005930': ['c104', 'c103', ...]};
    the list is empty when nothing was found.
    """
    proper_collections = {'c101', 'c104y', 'c104q', 'c106y', 'c106q', 'c103손익계산서q', 'c103재무상태표q',
                          'c103현금흐름표q', 'c103손익계산서y', 'c103재무상태표y', 'c103현금흐름표y'}

    # Pages paired with the collection pairs whose document counts must agree.
    count_checks = (
        ('c104', (('c104y', 'c104q'),)),
        ('c106', (('c106y', 'c106q'),)),
        ('c103', (('c103손익계산서q', 'c103손익계산서y'),
                  ('c103재무상태표q', 'c103재무상태표y'),
                  ('c103현금흐름표y', 'c103현금흐름표q'))),
    )

    logger.debug('In test_corp_one function...')
    return_dict = {}

    logger.debug(f'return_dict is ... {return_dict}')
    # Accessor for the database of this single stock code.
    corp_one = mongo.Corps(client1, code, 'c101')

    def counts_match(col_name1: str, col_name2: str) -> bool:
        """Return True when both collections contain the same number of documents."""
        logger.debug(f"In is_same_count_of_docs {code}/ {col_name1}, {col_name2}")
        corp_one.page = col_name1
        first_count = corp_one.count_docs_in_col()
        corp_one.page = col_name2
        second_count = corp_one.count_docs_in_col()
        return first_count == second_count

    # Set difference: expected collections that do not exist in the database.
    missing_collections = proper_collections - set(corp_one.list_collection_names())

    logger.debug(f'After take a set of difference : {missing_collections}')

    flagged_pages = set()
    # Keep only the first four characters of each name (e.g. c103손익계산서q -> c103).
    for collection_name in missing_collections:
        flagged_pages.add(collection_name[:4])

    # all() stops at the first mismatching pair, so later pairs of a page are
    # not counted once a mismatch has been found.
    for page, pairs in count_checks:
        if not all(counts_match(left, right) for left, right in pairs):
            flagged_pages.add(page)

    # Hand the result back as a list rather than a set.
    return_dict[code] = list(flagged_pages)
    logger.debug(f'Going out test_corp_one : {return_dict}')
    return return_dict
81
-
82
-
83
def test_corp_one_is_modified(client: MongoClient, code: str, driver: WebDriver, waiting_time: int) -> Dict[str, bool]:
    """
    Tell whether the c103손익계산서y data on the web differs from the copy stored in the database.

    The first c103 page is scraped with the given driver and compared, via
    .equals(), with what mongo.C103 loads for the same code (both are presumably
    pandas DataFrames - confirm against the scraper and mongo modules).

    Returns {code: True} when the two differ (an update is needed), otherwise {code: False}.
    """
    scraped = nfsrun.scrape_c103_first_page(driver, code, waiting_time=waiting_time)
    stored = mongo.C103(client, code=code, page='c103손익계산서y').load_df()

    for frame in (scraped, stored):
        logger.debug(frame)

    needs_update = not scraped.equals(stored)
    return {code: needs_update}
96
-
97
-
98
def working_with_parts(test_func: Callable[[MongoClient, str, WebDriver, int], dict], db_addr: str, divided_code_list: list, my_q: Queue, waiting_time: int):
    """
    Run test_func over one slice of stock codes and report the flagged codes through a queue.

    Intended to be the target of a worker process (see the callers that wrap it
    in multiprocessing.Process).

    Args:
        test_func: checker called as test_func(client, code, driver, waiting_time).
            It must return a dict keyed by the code; a truthy value marks the
            code as flagged.
        db_addr: address passed to mongo.connect_mongo to open this worker's client.
        divided_code_list: the stock codes assigned to this worker.
        my_q: queue that receives exactly one dict holding the flagged results.
        waiting_time: forwarded unchanged to test_func.

    A code whose check raises is reported on stderr and skipped. Errors raised
    while opening the client or the driver still propagate, but only after an
    (empty) result has been queued, so a parent blocked on my_q.get() is released.
    """
    failed_dict_part = {}
    driver = None
    try:
        # Every worker must create its own DB client; reusing the parent's raises errors.
        client = mongo.connect_mongo(db_addr)
        driver = utils.get_driver()
        t = len(divided_code_list)

        for i, code in enumerate(divided_code_list):
            try:
                failed_one_dict = test_func(client, code, driver, waiting_time)
            except Exception as e:
                print(f"{code} has a error : {e}", file=sys.stderr)
                continue
            print(f'{i + 1}/{t} {failed_one_dict}')
            if failed_one_dict[code]:
                # Non-empty list, or C103 has changed: remember this code.
                failed_dict_part.update(failed_one_dict)
    finally:
        # Like a return value, the queue is written exactly once per worker -
        # including on failure, because the parent calls get() once per worker.
        my_q.put(failed_dict_part)
        if driver is not None:
            # quit() also ends the driver process; close() only closes the window.
            driver.quit()
120
-
121
-
122
- # 멀티프로세싱을 사용하기 위해서 독립된 함수로 제작하였음(피클링이 가능해야함)
123
def chk_integrity_corps(client: MongoClient, code: str = 'all') -> Dict[str, list]:
    """
    Check the integrity of the corp databases and return the codes that have problems.

    A database is flagged when an expected collection is missing or when the
    yearly and quarterly collections of a page hold different document counts
    (see test_corp_one).

    Args:
        client: MongoDB client used to list the codes and, for a single code, to run the check.
        code: 'all' to check every code in the database using one worker process
            per slice returned by utils.code_divider_by_cpu_core, or one stock code.

    Returns:
        {'code': ['cxxx', ...], 'code': ['cxxx', ...], ...} - only codes with a
        non-empty problem list are included.

    Raises:
        ValueError: a single code was requested that is not in the database.
    """
    failed_codes = {}
    codes_in_db = mongo.Corps.get_all_codes(client)
    if code == 'all':
        print('*' * 25, "Check all Corp db integrity using multiprocess", '*' * 25)
        print(f'Total {len(codes_in_db)} items..')
        n, divided_list = utils.code_divider_by_cpu_core(codes_in_db)

        # Workers receive the address, not the client, and open their own connection.
        addr = mongo.extract_addr_from_client(client)

        start_time = time.time()
        q = Queue()
        workers = [
            Process(target=working_with_parts, args=(test_corp_one, addr, divided_list[i], q, 0))
            for i in range(n)
        ]
        for worker in workers:
            worker.start()

        # Each worker puts exactly one dict; collect them all before joining so
        # no worker is left waiting on an undrained queue.
        for _ in workers:
            failed_codes.update(q.get())

        for worker in workers:
            worker.join()

        logger.debug(f"failed_codes : {failed_codes}")
        print(f'Total spent time : {round(time.time() - start_time, 2)} sec.')
    else:
        print('*' * 25, f"Check {code} db integrity", '*' * 25)
        if code not in codes_in_db:
            # The original built this exception but never raised it, so an
            # unknown code was silently reported as healthy.
            raise ValueError(f'{code} is not in db..')
        result_dict = test_corp_one(client, code)
        print(f'{code} : {result_dict[code]}')
        if result_dict[code]:  # the problem list is not empty
            failed_codes.update(result_dict)
    return failed_codes
165
-
166
-
167
def chk_modifying_corps(client, code: str = 'all', waiting_time: int = 60) -> Dict[str, bool]:
    """
    Compare each stock's C103손익계산서y on the web with the database copy and report which need a refresh.

    Args:
        client: MongoDB client used to list the codes and, for a single code, to load the stored data.
        code: 'all' to check every code in the database using one worker process
            per slice returned by utils.code_divider_by_cpu_core, or one stock code.
        waiting_time: forwarded to the scraper through test_corp_one_is_modified.

    Returns:
        {'code': True, ...} containing only the codes whose data has changed.

    Raises:
        ValueError: a single code was requested that is not in the database.
    """
    failed_codes = {}
    codes_in_db = mongo.Corps.get_all_codes(client)
    if code == 'all':
        print('*' * 25, "Check all Corp db need for updating using multiprocess", '*' * 25)
        print(f'Total {len(codes_in_db)} items..')
        n, divided_list = utils.code_divider_by_cpu_core(codes_in_db)

        # Workers receive the address, not the client, and open their own connection.
        addr = mongo.extract_addr_from_client(client)

        start_time = time.time()
        q = Queue()
        workers = [
            Process(target=working_with_parts,
                    args=(test_corp_one_is_modified, addr, divided_list[i], q, waiting_time))
            for i in range(n)
        ]
        for worker in workers:
            worker.start()

        # Each worker puts exactly one dict; collect them all before joining so
        # no worker is left waiting on an undrained queue.
        for _ in workers:
            failed_codes.update(q.get())

        for worker in workers:
            worker.join()

        logger.debug(f"failed_codes : {failed_codes}")
        print(f'Total spent time : {round(time.time() - start_time, 2)} sec.')
    else:
        print('*' * 25, f"Check {code} db need for updating ", '*' * 25)
        if code not in codes_in_db:
            # The original built this exception but never raised it; checking
            # first also avoids starting a browser for an unknown code.
            raise ValueError(f'{code} is not in db..')
        driver = utils.get_driver()
        try:
            # waiting_time is a required argument of the checker; omitting it
            # made this branch fail with TypeError on every call.
            result_dict = test_corp_one_is_modified(client, code, driver, waiting_time)
        finally:
            # Always release the browser, even when scraping fails.
            driver.quit()
        print(f'{code} : {result_dict[code]}')
        if result_dict[code]:
            failed_codes.update(result_dict)
    return failed_codes
208
-
209
-
210
def sync_mongo_with_krx(client):
    """
    Bring the set of corp databases in MongoDB in line with the codes listed by krx.

    Databases whose code is no longer returned by krx.get_codes() are deleted,
    and codes that have no database yet are scraped (c103, c104, c106).
    If the codes stored in MongoDB cannot be measured (TypeError), the error is
    logged, sent through noti.telegram_to and the function returns without
    changing anything.
    """
    stars = '*' * 20
    print(stars, 'Sync with krx and mongodb', stars)
    db_codes = mongo.Corps.get_all_codes(client)
    print(stars, 'Refreshing krx.db...', stars)
    krx.make_db()
    print('*' * 80)
    krx_codes = krx.get_codes()
    print('\tThe number of codes in krx: ', len(krx_codes))
    logger.debug(krx_codes)
    try:
        print('\tThe number of dbs in mongo: ', len(db_codes))
        logger.debug(db_codes)
    except TypeError:
        err_msg = "Error while sync mongo data...it's possible mongo db doesn't set yet.."
        logger.error(err_msg)
        noti.telegram_to(botname='manager', text=err_msg)
        return

    in_db = set(db_codes)
    in_krx = set(krx_codes)
    del_targets = list(in_db - in_krx)  # present in mongo only
    add_targets = list(in_krx - in_db)  # present in krx only
    print('\tDelete target: ', del_targets)
    print('\tAdd target: ', add_targets)

    for stale_code in del_targets:
        mongo.Corps.del_db(client, stale_code)

    if not add_targets:
        return

    print(f'Starting.. c10346 scraper.. items : {len(add_targets)}')
    addr = mongo.extract_addr_from_client(client)
    nfsrun.c103(add_targets, addr)
    nfsrun.c104(add_targets, addr)
    nfsrun.c106(add_targets, addr)