vos-data-utils 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vos-data-utils might be problematic. Click here for more details.
- vdutils/__init__.py +21 -45
- vdutils/bjd.py +40 -10
- vdutils/bjdconnector.py +322 -55
- vdutils/convaddr.py +121 -26
- vdutils/cordate.py +1 -1
- vdutils/data/__init__.py +10 -9
- vdutils/genpnu.py +623 -0
- vdutils/library/__init__.py +42 -0
- vdutils/library/data.py +51 -1
- vdutils/tests/test_convaddr.py +1 -3
- vdutils/tests/test_cordate.py +6 -1
- vdutils/tests/test_genpnu.py +1004 -0
- vdutils/tests/test_vid.py +247 -0
- vdutils/tests/tests.py +15 -5
- vdutils/vid.py +157 -99
- {vos_data_utils-0.0.2.dist-info → vos_data_utils-0.0.4.dist-info}/METADATA +3 -2
- vos_data_utils-0.0.4.dist-info/RECORD +21 -0
- vdutils/data/bjd.txt +0 -49844
- vdutils/data/bjd_changed.txt +0 -8579
- vdutils/data/bjd_connectors.pkl +0 -0
- vdutils/data/bjd_current.txt +0 -20560
- vdutils/data/bjd_frequency_dictionary.txt +0 -11290
- vdutils/data/bjd_smallest.txt +0 -9786
- vdutils/data/date_dictionary.txt +0 -738978
- vdutils/data/full_bjd_connectors.pkl +0 -0
- vdutils/data/multiple_word_sgg_list.txt +0 -65
- vdutils/data/pnu/bjd_20230701.pkl +0 -0
- vdutils/data/pnu/bjd_20240101.pkl +0 -0
- vdutils/data/pnu/bjd_20240118.pkl +0 -0
- vdutils/pnu.py +0 -221
- vos_data_utils-0.0.2.dist-info/RECORD +0 -31
- {vos_data_utils-0.0.2.dist-info → vos_data_utils-0.0.4.dist-info}/WHEEL +0 -0
- {vos_data_utils-0.0.2.dist-info → vos_data_utils-0.0.4.dist-info}/entry_points.txt +0 -0
- {vos_data_utils-0.0.2.dist-info → vos_data_utils-0.0.4.dist-info}/top_level.txt +0 -0
vdutils/convaddr.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import re
|
|
2
|
+
import pkg_resources
|
|
2
3
|
import pandas as pd
|
|
3
4
|
from typing import (
|
|
4
5
|
List,
|
|
@@ -6,15 +7,41 @@ from typing import (
|
|
|
6
7
|
Optional
|
|
7
8
|
)
|
|
8
9
|
from dataclasses import dataclass
|
|
9
|
-
from vdutils import Log
|
|
10
|
-
from vdutils.
|
|
10
|
+
from vdutils.library import Log
|
|
11
|
+
from vdutils.data import (
|
|
12
|
+
__sep__,
|
|
13
|
+
__index__,
|
|
14
|
+
__encoding__,
|
|
15
|
+
_get_folder_names
|
|
16
|
+
)
|
|
11
17
|
|
|
12
18
|
|
|
13
19
|
@dataclass
|
|
14
20
|
class ConvAddr():
|
|
15
21
|
|
|
16
22
|
|
|
17
|
-
def __init__(
|
|
23
|
+
def __init__(
|
|
24
|
+
self,
|
|
25
|
+
base_dt: Optional[str] = None,
|
|
26
|
+
is_inherit: bool = False
|
|
27
|
+
):
|
|
28
|
+
|
|
29
|
+
if base_dt is not None:
|
|
30
|
+
if not isinstance(base_dt, str):
|
|
31
|
+
raise TypeError("type of object('base_dt') must be string")
|
|
32
|
+
|
|
33
|
+
if not base_dt.isdigit():
|
|
34
|
+
raise ValueError("object('base_dt') should be a string consisting of numbers")
|
|
35
|
+
|
|
36
|
+
if len(base_dt) != 8:
|
|
37
|
+
raise ValueError("object('base_dt') should be a string consisting of exactly 8(YYYYMMDD) digits")
|
|
38
|
+
else: pass
|
|
39
|
+
|
|
40
|
+
self.sep = __sep__
|
|
41
|
+
self.index = __index__
|
|
42
|
+
self.encoding = __encoding__
|
|
43
|
+
self.base_dt: str = base_dt
|
|
44
|
+
self.is_inherit: bool = is_inherit
|
|
18
45
|
self.bjd_current_dic: Dict[str, str] = None
|
|
19
46
|
self.bjd_smallest_list: List[str] = None
|
|
20
47
|
self.bjd_current_bjd_nm_list: List[str] = None
|
|
@@ -27,7 +54,77 @@ class ConvAddr():
|
|
|
27
54
|
self.bjd_changed_dic: Dict[str, str] = None
|
|
28
55
|
self.bjd_changed_old_bjd_nm_list: List[str] = None
|
|
29
56
|
self.logger = Log('ConvertAddress').stream_handler("INFO")
|
|
57
|
+
self._get_base_dt()
|
|
58
|
+
self._get_file_names()
|
|
30
59
|
self._prepare()
|
|
60
|
+
self.base_dt_print: str = f"{self.base_dt[:4]}-{self.base_dt[4:6]}-{self.base_dt[6:8]}"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _find_latest_base_dt(
    self,
    base_dts: List[str]
) -> str:
    """
    Choose the data snapshot date to apply for ``self.base_dt``.

    Returns the most recent entry of ``base_dts`` that is strictly earlier
    than ``self.base_dt``. When no entry is earlier, an info message is
    logged and the most recent entry overall is returned instead.

    The result does not depend on the order of ``base_dts``.

    Args:
        base_dts: Candidate snapshot dates. Entries are compared as plain
            strings, so they are assumed to be fixed-width ``YYYYMMDD``
            digits (same format as ``base_dt``) -- TODO confirm against
            the data folder layout.

    Returns:
        The selected snapshot date string.

    Raises:
        IndexError: If ``base_dts`` is empty.
    """

    if not base_dts:
        # Same exception type the original `base_dts[0]` produced, but with
        # a message that names the actual problem.
        raise IndexError("object('base_dts') must contain at least one date")

    # NOTE(review): the comparison is strict, so a snapshot dated exactly
    # `self.base_dt` is not selected -- confirm this is intended.
    earlier = [date for date in base_dts if date < self.base_dt]
    if earlier:
        return max(earlier)

    # No snapshot is earlier than the requested date: fall back to the
    # newest snapshot available.
    self.logger.info("입력된 날짜보다 이전 시점의 법정동 데이터가 존재하지 않습니다. 보유한 데이터중 최신 데이터를 적용합니다.")
    return max(base_dts)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _get_base_dt(self):
    """
    Resolve which data snapshot date (``self.base_dt``) this instance uses.

    - If ``self.is_inherit`` is true, ``self.base_dt`` is left untouched
      and returned.
    - If ``self.base_dt`` is None, the most recent snapshot folder name is
      stored in ``self.base_dt``.
    - Otherwise ``self.base_dt`` is replaced by the result of
      ``_find_latest_base_dt`` over the available snapshot folders.

    Returns:
        ``self.base_dt`` when ``is_inherit`` is set; otherwise None (the
        resolved value is only stored on ``self.base_dt``).
    """

    if self.is_inherit:
        return self.base_dt

    # Locate the snapshot folder inside the installed package. A bare
    # 'vdutils/data/bjd' would be resolved against the current working
    # directory and fail whenever the process is started elsewhere.
    bjd_root = pkg_resources.resource_filename("vdutils", "data/bjd")
    base_dts = sorted(
        _get_folder_names(base_folder_path=bjd_root),
        reverse=True
    )
    try:
        if self.base_dt is None:
            # Newest snapshot first because of the descending sort.
            self.base_dt = base_dts[0]
        else:
            self.base_dt = self._find_latest_base_dt(base_dts=base_dts)
    finally:
        # Logged whether or not the selection above raised.
        self.logger.info(f"적용 법정동 데이터 시점: {self.base_dt}")
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _get_file_names(self):
    """
    Build the paths of the data files for the snapshot ``self.base_dt``.

    Every path is resolved inside the ``vdutils`` package under
    ``data/bjd/<base_dt>/`` and stored on the matching ``file_name_*``
    attribute of the instance.
    """

    def locate(file_name: str) -> str:
        # Resolve one file of the selected snapshot folder.
        return pkg_resources.resource_filename(
            "vdutils",
            f"data/bjd/{self.base_dt}/{file_name}"
        )

    self.file_name_bjd = locate("bjd.txt")
    self.file_name_bjd_current = locate("bjd_current.txt")
    self.file_name_bjd_changed = locate("bjd_changed.txt")
    self.file_name_bjd_smallest = locate("bjd_smallest.txt")
    self.file_name_bjd_frequency_dictionary = locate("bjd_frequency_dictionary.txt")
    self.file_name_multiple_word_sgg_list = locate("multiple_word_sgg_list.txt")
|
|
127
|
+
|
|
31
128
|
|
|
32
129
|
@staticmethod
|
|
33
130
|
def _concat_sido_sgg(
|
|
@@ -41,6 +138,7 @@ class ConvAddr():
|
|
|
41
138
|
else:
|
|
42
139
|
return None
|
|
43
140
|
|
|
141
|
+
|
|
44
142
|
def _create_bjd_changed_dictionary(
|
|
45
143
|
self,
|
|
46
144
|
bjd_changed_df: pd.DataFrame
|
|
@@ -59,32 +157,23 @@ class ConvAddr():
|
|
|
59
157
|
bjd_changed_dictionary[old_bjd_nm] = new_bjd_nm
|
|
60
158
|
return bjd_changed_dictionary
|
|
61
159
|
|
|
160
|
+
|
|
62
161
|
def _prepare(self):
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
file_name_bjd_current: str = cls_bjd.file_name_bjd_current
|
|
66
|
-
file_name_bjd_changed: str = cls_bjd.file_name_bjd_changed
|
|
67
|
-
file_name_bjd_smallest: str = cls_bjd.file_name_bjd_smallest
|
|
68
|
-
file_name_bjd_frequency_dictionary: str = cls_bjd.file_name_bjd_frequency_dictionary
|
|
69
|
-
file_name_multiple_word_sgg_list: str = cls_bjd.file_name_multiple_word_sgg_list
|
|
70
|
-
input_encoding = cls_bjd.output_encoding
|
|
71
|
-
input_index = cls_bjd.output_index
|
|
72
|
-
input_sep = cls_bjd.output_sep
|
|
73
|
-
|
|
74
|
-
with open(file_name_bjd_current, 'r') as file_bjd_current:
|
|
162
|
+
|
|
163
|
+
with open(self.file_name_bjd_current, 'r') as file_bjd_current:
|
|
75
164
|
self.bjd_current_dic: Dict[str, str] = dict((line.split('\t')[2], line.split('\t')[9].replace('\n', '')) for line in file_bjd_current)
|
|
76
165
|
|
|
77
|
-
with open(file_name_bjd_smallest, 'r') as file_bjd_smallest:
|
|
166
|
+
with open(self.file_name_bjd_smallest, 'r') as file_bjd_smallest:
|
|
78
167
|
self.bjd_smallest_list: List[str] = [line.strip() for line in file_bjd_smallest]
|
|
79
168
|
|
|
80
|
-
with open(file_name_multiple_word_sgg_list, 'r') as file_multiple_word_sgg_list:
|
|
169
|
+
with open(self.file_name_multiple_word_sgg_list, 'r') as file_multiple_word_sgg_list:
|
|
81
170
|
self.multiple_word_sgg_list: List[str] = [line.strip() for line in file_multiple_word_sgg_list]
|
|
82
171
|
|
|
83
172
|
bjd_df: pd.DataFrame = pd.read_csv(
|
|
84
|
-
file_name_bjd,
|
|
85
|
-
sep=
|
|
173
|
+
self.file_name_bjd,
|
|
174
|
+
sep=self.sep,
|
|
86
175
|
engine='python',
|
|
87
|
-
encoding=
|
|
176
|
+
encoding=self.encoding,
|
|
88
177
|
dtype={
|
|
89
178
|
'과거법정동코드': str,
|
|
90
179
|
'법정동코드': str
|
|
@@ -96,10 +185,11 @@ class ConvAddr():
|
|
|
96
185
|
self.ri_list: List[str] = list(ri for ri in bjd_df['리명'].unique() if isinstance(ri, str))
|
|
97
186
|
|
|
98
187
|
bjd_current_df: pd.DataFrame = pd.read_csv(
|
|
99
|
-
file_name_bjd_current,
|
|
100
|
-
sep=
|
|
188
|
+
self.file_name_bjd_current,
|
|
189
|
+
sep=self.sep,
|
|
101
190
|
engine='python',
|
|
102
|
-
encoding=
|
|
191
|
+
encoding=self.encoding
|
|
192
|
+
)
|
|
103
193
|
self.bjd_current_bjd_nm_list: List[str] = list(bjd_nm for bjd_nm in bjd_current_df['법정동명'] if bjd_nm is not None)
|
|
104
194
|
bjd_current_df['시도시군구명'] = bjd_current_df[['시도명', '시군구명']].apply(lambda x: self._concat_sido_sgg(*x), axis=1)
|
|
105
195
|
self.current_sido_sgg_list: List[str] = list(sido_sgg for sido_sgg in bjd_current_df['시도시군구명'].unique() if isinstance(sido_sgg, str))
|
|
@@ -110,10 +200,10 @@ class ConvAddr():
|
|
|
110
200
|
self.bjd_current_df = bjd_current_df
|
|
111
201
|
|
|
112
202
|
bjd_changed_df: pd.DataFrame = pd.read_csv(
|
|
113
|
-
file_name_bjd_changed,
|
|
114
|
-
sep=
|
|
203
|
+
self.file_name_bjd_changed,
|
|
204
|
+
sep=self.sep,
|
|
115
205
|
engine='python',
|
|
116
|
-
encoding=
|
|
206
|
+
encoding=self.encoding,
|
|
117
207
|
dtype={
|
|
118
208
|
'과거법정동코드': str,
|
|
119
209
|
'법정동코드': str
|
|
@@ -125,6 +215,7 @@ class ConvAddr():
|
|
|
125
215
|
self.bjd_changed_dic: Dict[str, str] = self._create_bjd_changed_dictionary(sub_bjd_changed_df)
|
|
126
216
|
self.bjd_changed_old_bjd_nm_list: List[str] = list(self.bjd_changed_dic.keys())
|
|
127
217
|
|
|
218
|
+
|
|
128
219
|
@staticmethod
|
|
129
220
|
def correct_simple_spacing(
|
|
130
221
|
addr: str
|
|
@@ -148,6 +239,7 @@ class ConvAddr():
|
|
|
148
239
|
|
|
149
240
|
return re.sub(r'\s+', ' ', addr)
|
|
150
241
|
|
|
242
|
+
|
|
151
243
|
# 가장 작은 법정동명 뒤 번지가 띄어쓰기 없이 붙어있을 경우,
|
|
152
244
|
# 가장 작은 법정동명에 포함된 숫자중 2자리수는 없음. 예 당산동1가, 을지로5가 등
|
|
153
245
|
def correct_smallest_bjd_spacing(
|
|
@@ -181,6 +273,7 @@ class ConvAddr():
|
|
|
181
273
|
return addr
|
|
182
274
|
return addr
|
|
183
275
|
|
|
276
|
+
|
|
184
277
|
@staticmethod
|
|
185
278
|
def union_similar_changed_bjd(
|
|
186
279
|
changed_bjd_list: List[str]
|
|
@@ -199,6 +292,7 @@ class ConvAddr():
|
|
|
199
292
|
else:
|
|
200
293
|
return changed_bjd_list
|
|
201
294
|
|
|
295
|
+
|
|
202
296
|
def correct_changed_bjd(
|
|
203
297
|
self,
|
|
204
298
|
addr: str,
|
|
@@ -249,6 +343,7 @@ class ConvAddr():
|
|
|
249
343
|
|
|
250
344
|
return addr
|
|
251
345
|
|
|
346
|
+
|
|
252
347
|
def correct_bjd(
|
|
253
348
|
self,
|
|
254
349
|
addr: str,
|
vdutils/cordate.py
CHANGED
|
@@ -19,7 +19,7 @@ class CorDate():
|
|
|
19
19
|
self.sym_spell = SymSpell()
|
|
20
20
|
self.dictionary_path = pkg_resources.resource_filename(
|
|
21
21
|
"vdutils",
|
|
22
|
-
"data/date_dictionary.txt"
|
|
22
|
+
"data/date/date_dictionary.txt"
|
|
23
23
|
)
|
|
24
24
|
self.this_year: int = datetime.now().year
|
|
25
25
|
self.this_year_two_length: int = int(str(self.this_year)[2:])
|
vdutils/data/__init__.py
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import os
|
|
2
|
-
import
|
|
2
|
+
from typing import (
|
|
3
|
+
List
|
|
4
|
+
)
|
|
3
5
|
|
|
4
6
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
with open(data_file_path, "rb") as f:
|
|
9
|
-
return pickle.load(f)
|
|
7
|
+
__sep__ = '\t'
|
|
8
|
+
__encoding__: str = 'utf-8'
|
|
9
|
+
__index__: bool = False
|
|
10
10
|
|
|
11
|
-
def
|
|
12
|
-
|
|
13
|
-
|
|
11
|
+
def _get_folder_names(
    base_folder_path: str
) -> List[str]:
    """
    Return the names of the directories directly inside ``base_folder_path``.

    Only immediate children are considered and non-directory entries are
    skipped. The names are returned in the order ``os.scandir`` yields
    them, which is not sorted; callers that need an order must sort.

    Args:
        base_folder_path: Path of the folder to list. A relative path is
            resolved against the current working directory.

    Returns:
        The directory names (not full paths).

    Raises:
        OSError: Propagated from ``os.scandir`` when the folder cannot be
            listed (for example ``FileNotFoundError``).
    """
    # The context manager closes the scandir handle even if is_dir() raises
    # part-way through the listing.
    with os.scandir(base_folder_path) as entries:
        return [entry.name for entry in entries if entry.is_dir()]
|