vos-data-utils 0.0.2__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of vos-data-utils might be problematic. Click here for more details.

vdutils/convaddr.py CHANGED
@@ -1,4 +1,5 @@
1
1
  import re
2
+ import pkg_resources
2
3
  import pandas as pd
3
4
  from typing import (
4
5
  List,
@@ -6,15 +7,41 @@ from typing import (
6
7
  Optional
7
8
  )
8
9
  from dataclasses import dataclass
9
- from vdutils import Log
10
- from vdutils.bjd import Bjd
10
+ from vdutils.library import Log
11
+ from vdutils.data import (
12
+ __sep__,
13
+ __index__,
14
+ __encoding__,
15
+ _get_folder_names
16
+ )
11
17
 
12
18
 
13
19
  @dataclass
14
20
  class ConvAddr():
15
21
 
16
22
 
17
- def __init__(self):
23
+ def __init__(
24
+ self,
25
+ base_dt: Optional[str] = None,
26
+ is_inherit: bool = False
27
+ ):
28
+
29
+ if base_dt is not None:
30
+ if not isinstance(base_dt, str):
31
+ raise TypeError("type of object('base_dt') must be string")
32
+
33
+ if not base_dt.isdigit():
34
+ raise ValueError("object('base_dt') should be a string consisting of numbers")
35
+
36
+ if len(base_dt) != 8:
37
+ raise ValueError("object('base_dt') should be a string consisting of exactly 8(YYYYMMDD) digits")
38
+ else: pass
39
+
40
+ self.sep = __sep__
41
+ self.index = __index__
42
+ self.encoding = __encoding__
43
+ self.base_dt: str = base_dt
44
+ self.is_inherit: bool = is_inherit
18
45
  self.bjd_current_dic: Dict[str, str] = None
19
46
  self.bjd_smallest_list: List[str] = None
20
47
  self.bjd_current_bjd_nm_list: List[str] = None
@@ -27,7 +54,77 @@ class ConvAddr():
27
54
  self.bjd_changed_dic: Dict[str, str] = None
28
55
  self.bjd_changed_old_bjd_nm_list: List[str] = None
29
56
  self.logger = Log('ConvertAddress').stream_handler("INFO")
57
+ self._get_base_dt()
58
+ self._get_file_names()
30
59
  self._prepare()
60
+ self.base_dt_print: str = f"{self.base_dt[:4]}-{self.base_dt[4:6]}-{self.base_dt[6:8]}"
61
+
62
+
63
def _find_latest_base_dt(
    self,
    base_dts: List[str]
) -> str:

    """
    Choose which data snapshot date to apply for the requested ``self.base_dt``.

    Compares ``self.base_dt`` (a ``YYYYMMDD`` string) against the available
    snapshot dates and returns the most recent snapshot that is strictly
    earlier than it. Dates are compared as plain strings, which matches
    chronological order for fixed-width ``YYYYMMDD`` values.

    If no snapshot is earlier than ``self.base_dt``, an info message is logged
    and the most recent available snapshot is returned instead.

    Args:
        base_dts: Available snapshot dates as ``YYYYMMDD`` strings, in any order.

    Returns:
        The selected snapshot date.

    Raises:
        ValueError: If ``base_dts`` is empty.
    """

    if not base_dts:
        raise ValueError("object('base_dts') must contain at least one date(YYYYMMDD)")

    # NOTE(review): the comparison is strict, so a snapshot dated exactly
    # `self.base_dt` is skipped in favour of an older one. This follows the
    # original documented contract ("earlier than the input date") -- confirm
    # that excluding the same-day snapshot is really intended.
    earlier_dts = [date for date in base_dts if date < self.base_dt]
    if earlier_dts:
        # max() instead of "first match" so the result does not depend on the
        # caller having sorted the list in descending order.
        return max(earlier_dts)

    # No snapshot is older than the requested date.
    self.logger.info("입력된 날짜보다 이전 시점의 법정동 데이터가 존재하지 않습니다. 보유한 데이터중 최신 데이터를 적용합니다.")
    return max(base_dts)
79
+
80
+
81
def _get_base_dt(self):

    """
    Resolve ``self.base_dt`` to one of the snapshot dates shipped with the package.

    - When ``self.is_inherit`` is true, ``self.base_dt`` is returned untouched
      and no folder lookup is performed.
    - When ``self.base_dt`` is ``None``, the most recent snapshot folder name
      is used.
    - Otherwise the choice is delegated to ``self._find_latest_base_dt``.

    The selected date is stored on ``self.base_dt``, logged, and returned.

    Raises:
        FileNotFoundError: If the packaged ``data/bjd`` directory contains no
            snapshot folders.
    """

    if self.is_inherit:
        return self.base_dt

    # Resolve the data directory relative to the installed package rather than
    # the current working directory, the same way _get_file_names locates the
    # individual data files.
    bjd_root = pkg_resources.resource_filename("vdutils", "data/bjd")

    # Newest first: _find_latest_base_dt is handed the list in this order.
    base_dts = sorted(_get_folder_names(base_folder_path=bjd_root), reverse=True)
    if not base_dts:
        raise FileNotFoundError(f"no bjd data folders found under '{bjd_root}'")

    if self.base_dt is None:
        self.base_dt = base_dts[0]
    else:
        self.base_dt = self._find_latest_base_dt(base_dts=base_dts)

    # Logged only after a date was actually selected.
    self.logger.info(f"적용 법정동 데이터 시점: {self.base_dt}")
    return self.base_dt
100
+
101
+
102
def _get_file_names(self):

    """
    Store the packaged data file paths for snapshot ``self.base_dt`` on the instance.

    Each ``file_name_*`` attribute is set to the path that
    ``pkg_resources.resource_filename`` reports for the matching file under
    ``data/bjd/<base_dt>/`` of the ``vdutils`` package.
    """

    snapshot_dir = f"data/bjd/{self.base_dt}"

    # attribute name -> file name inside the snapshot folder
    # (listed in the order the attributes are assigned)
    resource_files = (
        ("file_name_bjd", "bjd.txt"),
        ("file_name_bjd_current", "bjd_current.txt"),
        ("file_name_bjd_changed", "bjd_changed.txt"),
        ("file_name_bjd_smallest", "bjd_smallest.txt"),
        ("file_name_bjd_frequency_dictionary", "bjd_frequency_dictionary.txt"),
        ("file_name_multiple_word_sgg_list", "multiple_word_sgg_list.txt"),
    )

    for attr_name, file_name in resource_files:
        resolved_path = pkg_resources.resource_filename(
            "vdutils",
            f"{snapshot_dir}/{file_name}"
        )
        setattr(self, attr_name, resolved_path)
127
+
31
128
 
32
129
  @staticmethod
33
130
  def _concat_sido_sgg(
@@ -41,6 +138,7 @@ class ConvAddr():
41
138
  else:
42
139
  return None
43
140
 
141
+
44
142
  def _create_bjd_changed_dictionary(
45
143
  self,
46
144
  bjd_changed_df: pd.DataFrame
@@ -59,32 +157,23 @@ class ConvAddr():
59
157
  bjd_changed_dictionary[old_bjd_nm] = new_bjd_nm
60
158
  return bjd_changed_dictionary
61
159
 
160
+
62
161
  def _prepare(self):
63
- cls_bjd = Bjd()
64
- file_name_bjd: str = cls_bjd.file_name_bjd
65
- file_name_bjd_current: str = cls_bjd.file_name_bjd_current
66
- file_name_bjd_changed: str = cls_bjd.file_name_bjd_changed
67
- file_name_bjd_smallest: str = cls_bjd.file_name_bjd_smallest
68
- file_name_bjd_frequency_dictionary: str = cls_bjd.file_name_bjd_frequency_dictionary
69
- file_name_multiple_word_sgg_list: str = cls_bjd.file_name_multiple_word_sgg_list
70
- input_encoding = cls_bjd.output_encoding
71
- input_index = cls_bjd.output_index
72
- input_sep = cls_bjd.output_sep
73
-
74
- with open(file_name_bjd_current, 'r') as file_bjd_current:
162
+
163
+ with open(self.file_name_bjd_current, 'r') as file_bjd_current:
75
164
  self.bjd_current_dic: Dict[str, str] = dict((line.split('\t')[2], line.split('\t')[9].replace('\n', '')) for line in file_bjd_current)
76
165
 
77
- with open(file_name_bjd_smallest, 'r') as file_bjd_smallest:
166
+ with open(self.file_name_bjd_smallest, 'r') as file_bjd_smallest:
78
167
  self.bjd_smallest_list: List[str] = [line.strip() for line in file_bjd_smallest]
79
168
 
80
- with open(file_name_multiple_word_sgg_list, 'r') as file_multiple_word_sgg_list:
169
+ with open(self.file_name_multiple_word_sgg_list, 'r') as file_multiple_word_sgg_list:
81
170
  self.multiple_word_sgg_list: List[str] = [line.strip() for line in file_multiple_word_sgg_list]
82
171
 
83
172
  bjd_df: pd.DataFrame = pd.read_csv(
84
- file_name_bjd,
85
- sep=input_sep,
173
+ self.file_name_bjd,
174
+ sep=self.sep,
86
175
  engine='python',
87
- encoding=input_encoding,
176
+ encoding=self.encoding,
88
177
  dtype={
89
178
  '과거법정동코드': str,
90
179
  '법정동코드': str
@@ -96,10 +185,11 @@ class ConvAddr():
96
185
  self.ri_list: List[str] = list(ri for ri in bjd_df['리명'].unique() if isinstance(ri, str))
97
186
 
98
187
  bjd_current_df: pd.DataFrame = pd.read_csv(
99
- file_name_bjd_current,
100
- sep=input_sep,
188
+ self.file_name_bjd_current,
189
+ sep=self.sep,
101
190
  engine='python',
102
- encoding=input_encoding)
191
+ encoding=self.encoding
192
+ )
103
193
  self.bjd_current_bjd_nm_list: List[str] = list(bjd_nm for bjd_nm in bjd_current_df['법정동명'] if bjd_nm is not None)
104
194
  bjd_current_df['시도시군구명'] = bjd_current_df[['시도명', '시군구명']].apply(lambda x: self._concat_sido_sgg(*x), axis=1)
105
195
  self.current_sido_sgg_list: List[str] = list(sido_sgg for sido_sgg in bjd_current_df['시도시군구명'].unique() if isinstance(sido_sgg, str))
@@ -110,10 +200,10 @@ class ConvAddr():
110
200
  self.bjd_current_df = bjd_current_df
111
201
 
112
202
  bjd_changed_df: pd.DataFrame = pd.read_csv(
113
- file_name_bjd_changed,
114
- sep=input_sep,
203
+ self.file_name_bjd_changed,
204
+ sep=self.sep,
115
205
  engine='python',
116
- encoding=input_encoding,
206
+ encoding=self.encoding,
117
207
  dtype={
118
208
  '과거법정동코드': str,
119
209
  '법정동코드': str
@@ -125,6 +215,7 @@ class ConvAddr():
125
215
  self.bjd_changed_dic: Dict[str, str] = self._create_bjd_changed_dictionary(sub_bjd_changed_df)
126
216
  self.bjd_changed_old_bjd_nm_list: List[str] = list(self.bjd_changed_dic.keys())
127
217
 
218
+
128
219
  @staticmethod
129
220
  def correct_simple_spacing(
130
221
  addr: str
@@ -148,6 +239,7 @@ class ConvAddr():
148
239
 
149
240
  return re.sub(r'\s+', ' ', addr)
150
241
 
242
+
151
243
  # 가장 작은 법정동명 뒤 번지가 띄어쓰기 없이 붙어있을 경우,
152
244
  # 가장 작은 법정동명에 포함된 숫자중 2자리수는 없음. 예 당산동1가, 을지로5가 등
153
245
  def correct_smallest_bjd_spacing(
@@ -181,6 +273,7 @@ class ConvAddr():
181
273
  return addr
182
274
  return addr
183
275
 
276
+
184
277
  @staticmethod
185
278
  def union_similar_changed_bjd(
186
279
  changed_bjd_list: List[str]
@@ -199,6 +292,7 @@ class ConvAddr():
199
292
  else:
200
293
  return changed_bjd_list
201
294
 
295
+
202
296
  def correct_changed_bjd(
203
297
  self,
204
298
  addr: str,
@@ -249,6 +343,7 @@ class ConvAddr():
249
343
 
250
344
  return addr
251
345
 
346
+
252
347
  def correct_bjd(
253
348
  self,
254
349
  addr: str,
vdutils/cordate.py CHANGED
@@ -19,7 +19,7 @@ class CorDate():
19
19
  self.sym_spell = SymSpell()
20
20
  self.dictionary_path = pkg_resources.resource_filename(
21
21
  "vdutils",
22
- "data/date_dictionary.txt"
22
+ "data/date/date_dictionary.txt"
23
23
  )
24
24
  self.this_year: int = datetime.now().year
25
25
  self.this_year_two_length: int = int(str(self.this_year)[2:])
vdutils/data/__init__.py CHANGED
@@ -1,13 +1,14 @@
1
1
  import os
2
- import pickle
2
+ from typing import (
3
+ List
4
+ )
3
5
 
4
6
 
5
- def get_data_from_pnu(file_name):
6
- data_dir_path = os.path.abspath(__file__).replace("__init__.py", '')
7
- data_file_path = f"{data_dir_path}/pnu/{file_name}"
8
- with open(data_file_path, "rb") as f:
9
- return pickle.load(f)
7
+ __sep__ = '\t'
8
+ __encoding__: str = 'utf-8'
9
+ __index__: bool = False
10
10
 
11
- def get_files_from_pnu():
12
- data_dir_path = os.path.abspath(__file__).replace("__init__.py", '')
13
- return os.listdir(f"{data_dir_path}/pnu")
11
def _get_folder_names(
    base_folder_path: str
) -> List[str]:
    """
    Return the names of the directories located directly under ``base_folder_path``.

    Only immediate children are inspected; regular files are ignored. The
    names are returned in ascending order so the result does not depend on the
    order in which the filesystem lists entries.

    Raises:
        OSError: Propagated from ``os.scandir`` (e.g. ``FileNotFoundError``
            when ``base_folder_path`` does not exist).
    """
    # The context manager closes the scandir iterator even if is_dir() raises
    # part-way through the listing.
    with os.scandir(base_folder_path) as entries:
        return sorted(entry.name for entry in entries if entry.is_dir())