utilskit 0.1.2__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {utilskit-0.1.2 → utilskit-0.2.0}/PKG-INFO +5 -1
- {utilskit-0.1.2 → utilskit-0.2.0}/README.md +3 -0
- {utilskit-0.1.2 → utilskit-0.2.0}/setup.py +3 -2
- utilskit-0.2.0/test/test.py +321 -0
- {utilskit-0.1.2 → utilskit-0.2.0}/utilskit/classificationutils/classificationutils.py +30 -8
- utilskit-0.2.0/utilskit/dataframeutils/__init__.py +1 -0
- utilskit-0.2.0/utilskit/dataframeutils/dataframeutils.py +255 -0
- utilskit-0.2.0/utilskit/dbutils/__init__.py +0 -0
- utilskit-0.2.0/utilskit/logutils/__init__.py +1 -0
- utilskit-0.2.0/utilskit/logutils/logutils.py +215 -0
- utilskit-0.2.0/utilskit/plotutils/__init__.py +1 -0
- {utilskit-0.1.2/utilskit → utilskit-0.2.0/utilskit/plotutils}/plotutils.py +57 -38
- utilskit-0.2.0/utilskit/repeatutils/__init__.py +1 -0
- {utilskit-0.1.2/utilskit → utilskit-0.2.0/utilskit/repeatutils}/repeatutils.py +33 -17
- utilskit-0.2.0/utilskit/timeutils/__init__.py +1 -0
- utilskit-0.2.0/utilskit/timeutils/timeutils.py +48 -0
- utilskit-0.2.0/utilskit/utils/__init__.py +1 -0
- {utilskit-0.1.2/utilskit → utilskit-0.2.0/utilskit/utils}/utils.py +6 -17
- {utilskit-0.1.2 → utilskit-0.2.0}/utilskit.egg-info/PKG-INFO +5 -1
- utilskit-0.2.0/utilskit.egg-info/SOURCES.txt +26 -0
- {utilskit-0.1.2 → utilskit-0.2.0}/utilskit.egg-info/requires.txt +1 -0
- utilskit-0.1.2/utilskit/dataframeutils.py +0 -328
- utilskit-0.1.2/utilskit/logutils.py +0 -109
- utilskit-0.1.2/utilskit/timeutils.py +0 -40
- utilskit-0.1.2/utilskit.egg-info/SOURCES.txt +0 -18
- {utilskit-0.1.2 → utilskit-0.2.0}/MANIFEST.in +0 -0
- {utilskit-0.1.2 → utilskit-0.2.0}/setup.cfg +0 -0
- {utilskit-0.1.2 → utilskit-0.2.0}/utilskit/__init__.py +0 -0
- {utilskit-0.1.2 → utilskit-0.2.0}/utilskit/classificationutils/__init__.py +0 -0
- {utilskit-0.1.2/utilskit → utilskit-0.2.0/utilskit/dbutils}/dbutils.py +0 -0
- {utilskit-0.1.2 → utilskit-0.2.0}/utilskit.egg-info/dependency_links.txt +0 -0
- {utilskit-0.1.2 → utilskit-0.2.0}/utilskit.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: utilskit
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: description
|
|
5
5
|
Author: Kimyh
|
|
6
6
|
Author-email: kim_yh663927@naver.com
|
|
@@ -15,6 +15,7 @@ Requires-Dist: pandas==2.3.1
|
|
|
15
15
|
Requires-Dist: PyMySQL==1.1.1
|
|
16
16
|
Requires-Dist: SQLAlchemy==2.0.41
|
|
17
17
|
Requires-Dist: tqdm==4.67.1
|
|
18
|
+
Requires-Dist: xlrd==2.0.2
|
|
18
19
|
Dynamic: author
|
|
19
20
|
Dynamic: author-email
|
|
20
21
|
Dynamic: classifier
|
|
@@ -24,6 +25,9 @@ Dynamic: requires-dist
|
|
|
24
25
|
Dynamic: requires-python
|
|
25
26
|
Dynamic: summary
|
|
26
27
|
|
|
28
|
+
0.2.0
|
|
29
|
+
- 정식 최초 배포버전
|
|
30
|
+
- 각 함수의 사용성 강화 및 비활성 함수 지정
|
|
27
31
|
0.1.2
|
|
28
32
|
- repeatutils 의 get_repeat_section 에서 하나의 값이 여러 구간에서 반복될때 마지막 구간만 나오는 부분 수정
|
|
29
33
|
- repeatutils 의 get_repeat_section 및 get_stan_repeat_section 에서 추출되는 구간의 마지막 값이 +1 이 되는 부분 수정
|
|
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
|
|
|
2
2
|
|
|
3
3
|
setup(
|
|
4
4
|
name="utilskit", # 패키지 이름 (pip install 시 사용될 이름)
|
|
5
|
-
version="0.1.2",
|
|
5
|
+
version="0.2.0", # 버전
|
|
6
6
|
packages=find_packages(), # textbasic 폴더 내 모든 패키지 포함
|
|
7
7
|
include_package_data=True, # 이 설정을 통해 패키지 내 데이터 파일을 포함시킬 수 있음
|
|
8
8
|
package_data={
|
|
@@ -13,7 +13,8 @@ setup(
|
|
|
13
13
|
"pandas==2.3.1",
|
|
14
14
|
"PyMySQL==1.1.1",
|
|
15
15
|
"SQLAlchemy==2.0.41",
|
|
16
|
-
"tqdm==4.67.1"
|
|
16
|
+
"tqdm==4.67.1",
|
|
17
|
+
"xlrd==2.0.2"
|
|
17
18
|
],
|
|
18
19
|
# install_requires=[
|
|
19
20
|
# "pandas>=1.3.0,<2.0.0", # 버전 범위 설정 방법
|
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
# import time
|
|
4
|
+
|
|
5
|
+
sys.path.append('/home/kimyh/library/utilskit')
|
|
6
|
+
def main():
    """Exercise ``classificationutils.confusion_matrix`` with an id->label dict.

    Prints the type of the class dictionary and the returned matrix, then
    writes the matrix to ``cm.csv`` in the current working directory.
    """
    from utilskit import classificationutils as clu

    id2label_dict = {
        0: '고양이',
        1: '개',
    }
    # Ground truth and predictions are given as the integer keys of the dict.
    t = [1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0]
    p = [1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1]

    print(type(id2label_dict))
    cm = clu.confusion_matrix(
        class_dict=id2label_dict,
        true_list=t,
        pred_list=p,
        ignore_idx=None,
        round_num=2,
        percentage=True,
    )
    print(cm)
    cm.to_csv('cm.csv', encoding='utf-8-sig')
|
|
32
|
+
|
|
33
|
+
def main2():
    """Exercise ``dataframeutils.utc2kor`` on six one-second timestamps.

    Prints the frame before and after the conversion.
    """
    import pandas as pd
    import numpy as np
    from datetime import datetime, timedelta
    from utilskit import dataframeutils as dfu

    start_time = datetime.strptime('2025-07-22 10:05:15', '%Y-%m-%d %H:%M:%S')
    end_time = start_time + timedelta(seconds=5)
    # 's' is the current alias for one-second steps; the upper-case 'S'
    # spelling is deprecated in recent pandas releases.
    time_range = pd.date_range(start=start_time, end=end_time, freq='s')
    value_ary = np.random.randint(10, 20, len(time_range))
    df = pd.DataFrame({
        'time': time_range,
        'value': value_ary,
    })
    print(df)
    df = dfu.utc2kor(
        dataframe=df,
        column='time',
        extend=True,
    )
    print(df)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def main3():
    """Exercise ``dataframeutils.adnormal2nan`` on ``col1`` with bounds 2..7.

    Prints the returned frame.
    """
    import pandas as pd
    from utilskit import dataframeutils as dfu

    value_ary1 = [1, 6, 3, 8, 5]
    value_ary2 = [5, 7, 2, 6, 9]
    df = pd.DataFrame({'col1': value_ary1, 'col2': value_ary2})
    df = dfu.adnormal2nan(
        dataframe=df,
        column='col1',
        max_value=7,
        min_value=2,
    )
    print(df)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def main4():
    """Exercise ``dataframeutils.time_filling`` and ``dataframeutils.isdfvalid``.

    Prints the sparse frame, the frame returned by ``time_filling`` for the
    03:45:10 - 03:45:20 range, and a message when ``isdfvalid`` is truthy.
    """
    import pandas as pd
    from utilskit import dataframeutils as dfu

    time_ary = ['2024-05-11 03:45:12', '2024-05-11 03:45:15', '2024-05-11 03:45:16']
    value_ary = [1, 5, 6]
    df = pd.DataFrame({
        'time': time_ary,
        'value': value_ary,
    })
    print(df)
    df = dfu.time_filling(
        dataframe=df,
        start='2024-05-11 03:45:10',
        end='2024-05-11 03:45:20',
        column='time',
    )
    print(df)

    df = pd.DataFrame([1, 2, 3, 4], columns=['value'])
    if dfu.isdfvalid(df, ['value']):
        print('컬럼이 전부 존재합니다.')
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def main5():
    """Exercise ``dataframeutils.fill_repeat_nan`` on a column with NaN runs.

    Prints the frame before and after the call.
    """
    import pandas as pd
    import numpy as np
    from utilskit import dataframeutils as dfu

    gappy = [1, np.nan, np.nan, 2, 3, np.nan, np.nan, np.nan]
    noise = np.random.randint(0, 10, size=len(gappy))
    frame = pd.DataFrame({'value1': gappy, 'value2': noise})
    print(frame)

    frame = dfu.fill_repeat_nan(dataframe=frame, column='value1', repeat=3)
    print(frame)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def main6():
    """Print a sample series with its neighbours, then the isolated ``1`` rows.

    A row counts as isolated when its value is 1 and neither the previous
    nor the next value equals 1 (a missing neighbour also counts as "not 1").
    """
    import pandas as pd

    data = [19, 19, 20, 20, 1, 21, 21, 22, 1, 1, 2, 1]
    df = pd.DataFrame({'val': data})

    # shift() lines each value up with its previous / next neighbour.
    df['prev'] = df['val'].shift(1)
    df['next'] = df['val'].shift(-1)
    print(df)

    # Keep only rows equal to 1 whose neighbours are both different from 1.
    isolated_ones = df[(df['val'] == 1) & (df['prev'] != 1) & (df['next'] != 1)]

    print(isolated_ones)
|
|
150
|
+
|
|
151
|
+
def main7():
    """Exercise ``logutils.get_logger`` at every level, then ``logutils.log_sort``.

    The logger is created for the ``./log3`` directory, and the same
    directory is passed to ``log_sort`` afterwards.
    """
    from utilskit import logutils as lu

    log_dir = './log3'
    log = lu.get_logger(log_path=log_dir, log_name='whole', rollover=True)

    # One message per level, emitted from lowest to highest severity.
    messages = (
        ('debug', "DEBUG 메시지입니다."),
        ('info', "INFO 메시지입니다."),
        ('warning', "WARNING 메시지입니다."),
        ('error', "ERROR 메시지입니다."),
        ('critical', "CRITICAL 메시지입니다."),
    )
    for level, text in messages:
        getattr(log, level)(text)

    lu.log_sort(log_dir)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def main8():
    """Exercise ``plotutils.draw_plot`` with three seeded random series.

    The first series is passed as the main line, the other two as
    additional lines; the output location given is ``./image``.
    """
    import numpy as np
    from utilskit import plotutils as plu

    np.random.seed(42)
    x = np.arange(100)
    # Drawn in this order so the seeded values stay reproducible.
    main_series = np.random.randint(5, 20, size=100)
    extra_a = np.random.randint(5, 20, size=100)
    extra_b = np.random.randint(5, 20, size=100)

    options = {
        'title': 'whole2',
        'x': x,
        'y': main_series,
        'fig_size': (30, 8),
        'x_range': (-10, 120),
        'y_range': (0, 25),
        'x_label': 'x data',
        'y_label': 'y data',
        'legend': True,
        'title_font': 25,
        'x_font': 20,
        'y_font': 20,
        'x_label_font': 23,
        'y_label_font': 23,
        'line_style': 'dash',
        'line_size': 3,
        'marker_style': 'circle',
        'marker_size': 10,
        'marker_color': 'white',
        'marker_border_size': 2,
        'marker_border_color': 'black',
        'add_x_list': [x, x],
        'add_y_list': [extra_a, extra_b],
        'add_color_list': ['red', 'violet'],
        'focus_list': [(22, 27), (42, 53), (70, 76)],
        'focus_color_list': ['red', 'red', 'blue'],
        'alpha_list': [0.1, 0.5, 1],
        'save_path': './image',
    }
    plu.draw_plot(**options)
|
|
206
|
+
|
|
207
|
+
def main9():
    """Exercise ``plotutils.draw_subplot`` with three seeded random series.

    Each series gets its own sub-title; the same three focus ranges are
    passed for highlighting, and output goes to ``./sub_image`` under the
    name ``sub-focus``.
    """
    import numpy as np
    from utilskit import plotutils as plu

    np.random.seed(42)
    x = np.arange(100)
    # Dict order is insertion order, so the seeded draws happen in the
    # sequence data -> data1 -> data2.
    series = {
        'data': np.random.randint(5, 20, size=100),
        'data1': np.random.randint(50, 90, size=100),
        'data2': np.random.randint(180, 190, size=100),
    }

    plu.draw_subplot(
        sub_title_list=list(series.keys()),
        x_list=[x] * len(series),
        y_list=list(series.values()),
        focus_list=[(22, 27), (42, 53), (70, 76)],
        focus_color_list=['red', 'red', 'green'],
        alpha_list=[0.2, 0.2, 0.2],
        save_path='./sub_image',
        save_name='sub-focus',
    )
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def main10():
    """Exercise ``repeatutils.get_repeat_section`` on a list of strings.

    Prints the input list and whatever the helper returns for ``repeat=4``
    with ``except_nan=False``.
    """
    from utilskit import repeatutils as rpu

    # Five repeats of one label followed by two of another.
    data = ['아', '아', '아', '아', '아', '바', '바']
    print(data)
    repeat_section = rpu.get_repeat_section(
        data=data,
        repeat=4,
        except_nan=False,
    )
    print(repeat_section)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def main11():
    """Exercise ``repeatutils.get_stan_repeat_section`` for the value 1.

    Builds a 38-element float array made of runs of 1, 2, 3, 4 and NaN and
    prints what the helper returns for ``repeat=4``, ``mode='a'`` and
    ``reverse=True``.
    """
    import numpy as np
    from utilskit import repeatutils as rpu

    # (value, run length) pairs; the index ranges are noted on the right.
    runs = [
        (1, 5),        # 0 ~ 4
        (2, 4),        # 5 ~ 8
        (3, 2),        # 9 ~ 10
        (4, 3),        # 11 ~ 13
        (np.nan, 4),   # 14 ~ 17
        (1, 4),        # 18 ~ 21
        (3, 3),        # 22 ~ 24
        (np.nan, 5),   # 25 ~ 29
        (1, 7),        # 30 ~ 36
        (np.nan, 1),   # 37
    ]
    values = np.array([value for value, _ in runs])
    lengths = [length for _, length in runs]
    data = np.repeat(values, lengths)

    repeat_section = rpu.get_stan_repeat_section(
        data=data,
        value=1,
        repeat=4,
        mode='a',
        reverse=True,
    )
    print(repeat_section)
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def main12():
    """Exercise ``timeutils.get_now``, ``time_measure`` and ``get_date_list``.

    The inputs are edge cases: a negative duration for ``time_measure`` and
    an end day (33) beyond any month length for ``get_date_list``.
    """
    from utilskit import timeutils as tiu

    # The return value is not needed here; the call only checks that this
    # format string is accepted.
    tiu.get_now('년|분|시|월|초|일')

    hh, mm, ss = tiu.time_measure(-1)
    print(f'입력된 값은 {hh}시간 {mm}분 {ss}초 입니다.')

    date_list = tiu.get_date_list(
        year=2025,
        mon_list=[2],
        start_day_list=[25],
        end_day_list=[33],
    )
    print(date_list)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def main13():
    """Exercise ``utils.envs_setting`` and ``utils.get_error_info``.

    An ``int + str`` addition is used to raise a ``TypeError`` on purpose;
    the error information is fetched and printed inside the handler.
    """
    from utilskit import utils as u

    u.envs_setting()

    number = 1
    text = '2'
    try:
        number + text  # always raises TypeError
    except TypeError:
        print(u.get_error_info())
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
if __name__ == '__main__':
|
|
321
|
+
main13()
|
|
@@ -5,7 +5,7 @@ import sys
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import numpy as np
|
|
7
7
|
|
|
8
|
-
__all__ = ["
|
|
8
|
+
__all__ = ["confusion_matrix"]
|
|
9
9
|
|
|
10
10
|
def get_max_2nd_n_reliability(pred):
|
|
11
11
|
pred_min = np.expand_dims(np.min(pred, axis=1), axis=1)
|
|
@@ -79,16 +79,38 @@ def matrix2confusion(matrix, uni_label_list, round_num=4, show_percentage=True):
|
|
|
79
79
|
|
|
80
80
|
# confusion matrix
|
|
81
81
|
confusion_matrix = pd.DataFrame(matrix, index=index_list, columns=column_list)
|
|
82
|
-
confusion_matrix['accuracy'][0] = whole_accuracy
|
|
82
|
+
# confusion_matrix['accuracy'][0] = whole_accuracy
|
|
83
|
+
confusion_matrix.iloc[0, confusion_matrix.columns.get_loc('accuracy')] = whole_accuracy
|
|
83
84
|
|
|
84
85
|
return confusion_matrix
|
|
85
86
|
|
|
86
87
|
|
|
87
|
-
def
|
|
88
|
+
def confusion_matrix(class_dict, true_list, pred_list,
|
|
89
|
+
ignore_idx=None, round_num=2, percentage=True):
|
|
90
|
+
|
|
91
|
+
# 모드, 데이터, dict 간 호환성 검증
|
|
92
|
+
key_list = list(class_dict.keys())
|
|
93
|
+
value_list = list(class_dict.values())
|
|
94
|
+
try:
|
|
95
|
+
_ = int(value_list[0]) # value 값이 id (정수) 인 경우
|
|
96
|
+
mode = 'label2id'
|
|
97
|
+
except ValueError:
|
|
98
|
+
try:
|
|
99
|
+
_ = int(key_list[0])
|
|
100
|
+
except ValueError:
|
|
101
|
+
raise ValueError('id 값은 정수형이어야합니다.')
|
|
102
|
+
mode = 'id2label'
|
|
103
|
+
|
|
104
|
+
t_unique_list = np.unique(true_list).tolist()
|
|
105
|
+
p_unique_list = np.unique(pred_list).tolist()
|
|
106
|
+
if not set(t_unique_list).issubset(key_list) or not set(p_unique_list).issubset(key_list):
|
|
107
|
+
raise ValueError(f'입력된 정답 데이터({t_unique_list}) 또는 예측 데이터({p_unique_list}) 가 클래스 사전의 key({key_list}) 값과 일치하지 않습니다.')
|
|
108
|
+
|
|
109
|
+
|
|
88
110
|
if mode == 'label2id':
|
|
89
|
-
uni_label_list =
|
|
111
|
+
uni_label_list = key_list.copy()
|
|
90
112
|
elif mode == 'id2label':
|
|
91
|
-
uni_label_list =
|
|
113
|
+
uni_label_list = value_list.copy()
|
|
92
114
|
|
|
93
115
|
# matrix
|
|
94
116
|
matrix = []
|
|
@@ -100,8 +122,8 @@ def make_confusion_matrix(mode, true_list, pred_list, ignore_idx=None, round_num
|
|
|
100
122
|
# count
|
|
101
123
|
if mode == 'label2id':
|
|
102
124
|
for t, p in zip(true_list, pred_list):
|
|
103
|
-
t_i =
|
|
104
|
-
p_i =
|
|
125
|
+
t_i = class_dict[t]
|
|
126
|
+
p_i = class_dict[p]
|
|
105
127
|
matrix[t_i][p_i] += 1
|
|
106
128
|
|
|
107
129
|
elif mode == 'id2label':
|
|
@@ -117,7 +139,7 @@ def make_confusion_matrix(mode, true_list, pred_list, ignore_idx=None, round_num
|
|
|
117
139
|
matrix=matrix,
|
|
118
140
|
uni_label_list=uni_label_list,
|
|
119
141
|
round_num=round_num,
|
|
120
|
-
show_percentage=
|
|
142
|
+
show_percentage=percentage
|
|
121
143
|
)
|
|
122
144
|
|
|
123
145
|
return confusion_matrix
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .dataframeutils import *
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import csv
|
|
7
|
+
import warnings
|
|
8
|
+
warnings.filterwarnings('ignore')
|
|
9
|
+
|
|
10
|
+
# from utilskit import utils as u
|
|
11
|
+
from utilskit import repeatutils as rpu
|
|
12
|
+
|
|
13
|
+
__all__ = ['read_df', 'utc2kor', 'adnormal2nan', 'time_filling',
|
|
14
|
+
'isdfvalid', 'fill_repeat_nan', 'pin2nan']
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def read_df(path):
    """Read a CSV, Excel or comma-separated text file into a DataFrame.

    The file type is taken from the text after the last ``.`` in ``path``,
    compared case-insensitively.

    * ``csv``: read with ``utf-8-sig`` and, if that cannot decode the file,
      with ``cp949``. If pandas cannot tokenize the file, it is re-read with
      the ``csv`` module and the first row is used as the header.
    * ``xlsx`` / ``xls``: read with ``pandas.read_excel``.
    * ``txt``: each line is split on ``,``; the first line is the header.
      An empty file gives an empty DataFrame.

    Args:
        path: Path of the file to read, as a string.

    Returns:
        The file content as a ``pandas.DataFrame``.

    Raises:
        ValueError: The extension is not one of the supported ones.
        UnicodeDecodeError: A CSV file decodes in neither ``utf-8-sig`` nor
            ``cp949``.
    """
    extention = path.split('.')[-1]
    kind = extention.lower()

    if kind in ('xlsx', 'xls'):
        return pd.read_excel(path)

    if kind == 'txt':
        with open(path, 'r', encoding='utf-8-sig') as f:
            # Always split, so single-column files also yield list rows.
            rows = [line.replace('\n', '').split(',') for line in f]
        if not rows:
            return pd.DataFrame()
        return pd.DataFrame(rows[1:], columns=rows[0])

    if kind != 'csv':
        raise ValueError(f'{extention}은(는) 잘못되거나 지정되지 않은 확장자입니다.')

    # Try each encoding exactly once, so an undecodable file raises instead
    # of being retried forever.
    decode_error = None
    for encoding in ('utf-8-sig', 'cp949'):
        try:
            return pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError as err:
            decode_error = err
        except pd.errors.ParserError:
            # Fall back to the csv module, which accepts rows of differing
            # length; pandas pads short rows when building the frame.
            with open(path, encoding=encoding, newline='') as f:
                csv_list = list(csv.reader(f))
            data_df = pd.DataFrame(csv_list)
            data_df.columns = data_df.iloc[0].to_list()
            # Drop the header row that was promoted to column names.
            return data_df.drop(index=data_df.index[0])
    raise decode_error
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def utc2kor(dataframe, column='time', extend=True):
    """Shift a timestamp column by +9 hours and sort the frame by the result.

    Values are turned into strings, every ``T`` is replaced by a space and
    every ``Z`` removed, and the text is parsed as ``%Y-%m-%d %H:%M:%S``.
    The shifted timestamps are stored back as strings.

    Args:
        dataframe: Input frame; it is copied, never modified.
        column: Name of the column holding the timestamps.
        extend: When true the result goes into a new ``{column}_kor``
            column; otherwise ``column`` itself is overwritten.

    Returns:
        A copy of the frame sorted in ascending order of the converted
        column. An empty input is returned as an unchanged copy.
    """
    df = dataframe.copy()
    if df.empty:
        return df

    new_column = f'{column}_kor' if extend else column
    nine_hours = timedelta(hours=9)

    def _shift(raw):
        # Normalise the ISO-style separators before strict parsing.
        cleaned = raw.replace('T', ' ').replace('Z', '')
        return datetime.strptime(cleaned, '%Y-%m-%d %H:%M:%S') + nine_hours

    df[new_column] = df[column].astype('str').apply(_shift).astype('str')
    return df.sort_values(by=new_column, ascending=True)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def adnormal2nan(dataframe, column, max_value=None, min_value=None):
    """Replace out-of-range values in one column with NaN.

    Args:
        dataframe: Input frame; it is copied, never modified.
        column: Name of the column to check.
        max_value: Values strictly greater than this become NaN. ``None``
            disables the upper bound.
        min_value: Values strictly smaller than this become NaN. ``None``
            disables the lower bound.

    Returns:
        A copy of the frame with the out-of-range entries of ``column``
        set to NaN (an integer column becomes float as a result).
    """
    df = dataframe.copy()
    # Series.mask assigns through the frame itself; the chained form
    # df[column][cond] = ... writes to a temporary object and is not
    # guaranteed to reach the frame.
    if max_value is not None:
        df[column] = df[column].mask(df[column] > max_value)
    if min_value is not None:
        df[column] = df[column].mask(df[column] < min_value)
    return df
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def time_filling(dataframe, start, end, column='time'):
    """Expand a frame so it has one row per second between two timestamps.

    A one-second range from ``start`` to ``end`` (both inclusive) is built
    and converted to strings, then right-joined with the input on
    ``column``. Seconds without a matching input row get NaN in the other
    columns; input rows outside the range are dropped.

    Args:
        dataframe: Input frame; it is copied, never modified.
        start: First timestamp of the range (anything ``pandas.date_range``
            accepts).
        end: Last timestamp of the range.
        column: Name of the timestamp column. Its values are matched
            against the string form of the generated timestamps.

    Returns:
        The filled frame, or an unchanged copy when the input is empty.
    """
    df = dataframe.copy()
    if df.empty:
        return df

    # 's' is the current one-second alias; upper-case 'S' is deprecated.
    time_range = pd.date_range(start=start, end=end, freq='s')
    time_range_df = pd.DataFrame(time_range, columns=[column]).astype('str')

    # Join explicitly on the time column rather than on "all shared columns".
    return pd.merge(df, time_range_df, how='right', on=column)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def drop_nan(df, stan_col):
    """Drop the rows whose ``stan_col`` value is missing.

    Args:
        df: Input frame.
        stan_col: Name of the column to check for missing values.

    Returns:
        A frame without the rows where ``stan_col`` is NaN. When the lookup
        of ``stan_col`` raises ``KeyError``, the input is returned as is.
    """
    try:
        return df.dropna(subset=[stan_col])
    except KeyError:
        # Best effort: nothing to drop when the column is not there.
        return df
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def isdfvalid(dataframe, column_list):
    """Tell whether ``dataframe`` can be indexed with ``column_list``.

    Args:
        dataframe: Frame to check.
        column_list: Column name(s) passed to ``dataframe[...]``.

    Returns:
        ``True`` when the selection succeeds, ``False`` when it raises
        ``KeyError``.
    """
    try:
        dataframe[column_list]
    except KeyError:
        return False
    else:
        return True
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def fill_repeat_nan(dataframe, column, repeat=5):
    """Fill runs of NaN in ``column`` from the values next to each run.

    The runs are the ones ``rpu.get_stan_repeat_section`` reports for
    ``value='nan'``, ``repeat=repeat`` and ``mode='a'``; per the original
    note these are the NaN runs repeated at least ``repeat`` times. Each
    reported section is read as an inclusive ``(start, end)`` pair of row
    positions. A run is forward-filled from the row just before it and, if
    values are still missing (a run at the very start), back-filled from
    the row just after it.

    Args:
        dataframe: Input frame; it is copied, never modified.
        column: Name of the column to repair.
        repeat: Run-length threshold forwarded to the section helper.

    Returns:
        A copy of the frame with the reported NaN runs filled.
    """
    df = dataframe.copy()
    series = df[column].copy()

    # The array is passed positionally because this keyword is spelled
    # `ary=` here but `data=` in the package's own test script.
    # NOTE(review): assumes the array is the helper's first parameter -
    # confirm against repeatutils.
    repeat_section = rpu.get_stan_repeat_section(
        series.values,
        value='nan',
        repeat=repeat,
        mode='a'
    )

    for section in repeat_section:
        nan_si, nan_ei = int(section[0]), int(section[1])
        # Work by position (iloc) so frames without a default 0..n-1 index
        # behave the same; clamp so a run at row 0 does not wrap around.
        before = max(nan_si - 1, 0)
        series.iloc[before:nan_ei + 1] = series.iloc[before:nan_ei + 1].ffill()
        series.iloc[nan_si:nan_ei + 2] = series.iloc[nan_si:nan_ei + 2].bfill()

    df[column] = series
    return df
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def pin2nan(dataframe, column, max_diff=0.1, repeat=3):
    """Turn short local spikes ("pins") in ``column`` into NaN - not implemented.

    Intended behaviour, from the original description: values that are
    inside the allowed range but should be treated as outliers given the
    flow of the data are replaced with NaN, for example::

        input : 20, 20, 20, 20, [  1], 20, 20, 20, 1, 1, 2, 1
        output: 20, 20, 20, 20, [NaN], 20, 20, 20, 1, 1, 2, 1

    The previous body was an unfinished draft: it printed intermediate
    values and called ``sys.exit()``, which terminated the caller's
    interpreter on every call, and the code after that used names that
    were never defined. Until the detection rule is settled, the function
    fails with an ordinary exception and leaves its input untouched.

    Args:
        dataframe: Input frame (not modified).
        column: Name of the column to inspect.
        max_diff: Reserved for the spike threshold.
        repeat: Reserved for the run-length limit.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        'pin2nan 은(는) 아직 구현이 완료되지 않은 비활성 함수입니다.'
    )
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .logutils import *
|