ssdlab-region-parser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ssdlab_region_parser-0.1.0/LICENSE +21 -0
- ssdlab_region_parser-0.1.0/PKG-INFO +15 -0
- ssdlab_region_parser-0.1.0/README.md +2 -0
- ssdlab_region_parser-0.1.0/pyproject.toml +24 -0
- ssdlab_region_parser-0.1.0/setup.cfg +4 -0
- ssdlab_region_parser-0.1.0/setup.py +26 -0
- ssdlab_region_parser-0.1.0/src/SSDLab_region_parser/__init__.py +3 -0
- ssdlab_region_parser-0.1.0/src/SSDLab_region_parser/data/__init__.py +0 -0
- ssdlab_region_parser-0.1.0/src/SSDLab_region_parser/models.py +75 -0
- ssdlab_region_parser-0.1.0/src/SSDLab_region_parser/parser.py +330 -0
- ssdlab_region_parser-0.1.0/src/SSDLab_region_parser/resources.py +136 -0
- ssdlab_region_parser-0.1.0/src/SSDLab_region_parser.egg-info/PKG-INFO +15 -0
- ssdlab_region_parser-0.1.0/src/SSDLab_region_parser.egg-info/SOURCES.txt +24 -0
- ssdlab_region_parser-0.1.0/src/SSDLab_region_parser.egg-info/dependency_links.txt +1 -0
- ssdlab_region_parser-0.1.0/src/SSDLab_region_parser.egg-info/requires.txt +3 -0
- ssdlab_region_parser-0.1.0/src/SSDLab_region_parser.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 SSDLab
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ssdlab-region-parser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Chinese administrative region parser, not include Honkong, Macau and Taiwan yet.
|
|
5
|
+
Author: ChufanHe
|
|
6
|
+
Author-email: sthechufan@gmail.com
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: pandas>=2.0
|
|
11
|
+
Requires-Dist: jieba>=0.42.1
|
|
12
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
13
|
+
|
|
14
|
+
# region_parser
|
|
15
|
+
Chinese administrative region parser, not include Honkong, Macau and Taiwan yet.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ssdlab-region-parser"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Chinese administrative region parser; does not include Hong Kong, Macau and Taiwan yet."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"pandas>=2.0",
|
|
13
|
+
"jieba>=0.42.1",
|
|
14
|
+
"openpyxl>=3.1.0"
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[tool.setuptools]
|
|
18
|
+
package-dir = {"" = "src"}
|
|
19
|
+
|
|
20
|
+
[tool.setuptools.packages.find]
|
|
21
|
+
where = ["src"]
|
|
22
|
+
|
|
23
|
+
[tool.setuptools.package-data]
|
|
24
|
+
"SSDLab_region_parser.data" = ["*.xlsx"]
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Packaging script for ssdlab-region-parser (kept alongside pyproject.toml)."""
from setuptools import setup, find_packages

# Long description comes straight from the README shipped in the sdist.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name="ssdlab-region-parser",
    version="0.1.0",
    author="ChufanHe",
    author_email="sthechufan@gmail.com",
    description="Chinese administrative region parser, not include Hong Kong, Macau and Taiwan yet.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    package_dir={"": "src"},
    packages=find_packages(where="src"),
    python_requires=">=3.10",
    install_requires=[
        "pandas>=2.0",
        "jieba>=0.42.1",
        "openpyxl>=3.1.0",
    ],
    # BUG FIX: the package_data key must be the importable package that holds
    # the data files. The directory in this distribution is
    # src/SSDLab_region_parser/data, so the previous key "region_parser.data"
    # never matched and the .xlsx reference table was left out of built wheels.
    package_data={
        "SSDLab_region_parser.data": ["*.xlsx"],
    },
    include_package_data=True,
)
|
|
File without changes
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@dataclass
class CN_RegionParseResult:
    """Result of parsing one piece of text into province / city / county.

    Name and code fields default to ``pd.NA`` when a level could not be
    resolved; the word lists and ``best_match`` carry debugging detail.
    """

    province_name: object = pd.NA
    province_code: object = pd.NA
    city_name: object = pd.NA
    city_code: object = pd.NA
    county_name: object = pd.NA
    county_code: object = pd.NA

    matched_words: list[str] = field(default_factory=list)
    kept_words: list[str] = field(default_factory=list)
    dropped_words: list[str] = field(default_factory=list)
    best_match: object = pd.NA

    def to_dict(self) -> dict:
        """Names and codes keyed by their Chinese column labels."""
        labels = ("省级名称", "省级代码", "城市名称", "城市代码", "区县名称", "区县代码")
        return dict(zip(labels, self.to_list()))

    def to_series(self) -> pd.Series:
        """Same content as ``to_dict`` wrapped in a pandas Series."""
        return pd.Series(self.to_dict())

    def to_tuple(self) -> tuple:
        """(province_name, province_code, city_name, city_code, county_name, county_code)."""
        return tuple(self.to_list())

    def to_list(self) -> list:
        """Names and codes as a flat 6-element list, province first."""
        return [
            self.province_name,
            self.province_code,
            self.city_name,
            self.city_code,
            self.county_name,
            self.county_code,
        ]

    def to_str(self) -> str:
        """Human-readable one-line summary with Chinese labels."""
        segments = [
            f"省级名称:{self.province_name} 省份代码:{self.province_code}",
            f"城市名称:{self.city_name} 城市代码:{self.city_code}",
            f"区县名称:{self.county_name} 区县代码:{self.county_code}",
        ]
        return " | ".join(segments)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass
class CN_RegionResources:
    """Container for every lookup structure derived from the region reference table."""

    # Full reference table loaded from the bundled Excel file.
    df_region_code: pd.DataFrame
    # Keyword lists per administrative level (built longest-first by split_keywords).
    province_keywords: list[str]
    city_keywords: list[str]
    county_keywords: list[str]
    # Union of the three keyword lists, sorted longest-first.
    region_keywords: list[str]
    # Same keywords as a set, for O(1) membership tests during tokenization.
    word_set: set[str]
    # keyword -> standard name, one mapping per level.
    province_dict: dict
    city_dict: dict
    county_dict: dict
    # Merged keyword -> standard-name mapping across all three levels.
    region_mapping_dict: dict
    # Name/keyword columns plus a "path" column ("province|city|county").
    df_paths: pd.DataFrame
    # keyword -> set of path strings the keyword may belong to.
    word_to_paths: dict
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
import jieba
|
|
2
|
+
jieba.setLogLevel(jieba.logging.WARN)
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
from .resources import build_region_resources, load_keywords_into_jieba
|
|
6
|
+
from .models import CN_RegionParseResult
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class CN_RegionParser:
    """Parse free-form Chinese text into standardized province/city/county fields.

    Pipeline (see ``parse``): tokenize the text with jieba in three cut modes,
    keep only tokens that are known region keywords, filter the tokens down to
    a mutually consistent administrative path, pick the best (finest) match,
    then expand it into standard names plus their region codes.
    """

    def __init__(self):
        # Lookup tables / keyword indexes built from the bundled Excel file.
        self.resources = build_region_resources()
        # Register every region keyword with jieba so the tokenizer keeps them whole.
        load_keywords_into_jieba(self.resources.region_keywords)

    @staticmethod
    def _deduplicate_keep_order(words: list[str]) -> list[str]:
        """Drop NA/blank entries and duplicates while preserving first-seen order."""
        seen = set()
        result = []
        for word in words:
            if pd.isna(word):
                continue
            word = str(word).strip()
            if not word:
                continue
            if word not in seen:
                seen.add(word)
                result.append(word)
        return result

    def extract_region(self, text: str) -> str:
        """Space-joined region keywords found via jieba's default (accurate) cut.

        NOTE(review): returns ``pd.NA`` for NA/blank input or when nothing
        matches, despite the ``str`` return annotation.
        """
        if pd.isna(text):
            return pd.NA
        text = str(text).strip()
        if not text:
            return pd.NA

        words = jieba.lcut(text)
        matched = [w for w in words if w in self.resources.word_set]
        matched = self._deduplicate_keep_order(matched)
        return " ".join(matched) if matched else pd.NA

    def extract_region_lcut_all(self, text: str) -> str:
        """Same as ``extract_region`` but using jieba's full-mode cut (cut_all=True)."""
        if pd.isna(text):
            return pd.NA
        text = str(text).strip()
        if not text:
            return pd.NA

        words = jieba.lcut(text, cut_all=True)
        matched = [w for w in words if w in self.resources.word_set]
        matched = self._deduplicate_keep_order(matched)
        return " ".join(matched) if matched else pd.NA

    def extract_region_lcut_for_search(self, text: str) -> str:
        """Same as ``extract_region`` but using jieba's search-engine cut."""
        if pd.isna(text):
            return pd.NA
        text = str(text).strip()
        if not text:
            return pd.NA

        words = jieba.lcut_for_search(text)
        matched = [w for w in words if w in self.resources.word_set]
        matched = self._deduplicate_keep_order(matched)
        return " ".join(matched) if matched else pd.NA

    def extract_candidates(self, text: str) -> dict:
        """Run all three cut modes and merge their keyword hits.

        Returns the three per-mode results plus ``merged_words``: the
        order-preserving, deduplicated union of all hits.
        """
        a1 = self.extract_region(text)
        a2 = self.extract_region_lcut_all(text)
        a3 = self.extract_region_lcut_for_search(text)

        a1_list = a1.split() if pd.notna(a1) else []
        a2_list = a2.split() if pd.notna(a2) else []
        a3_list = a3.split() if pd.notna(a3) else []

        merged = self._deduplicate_keep_order(a1_list + a2_list + a3_list)

        return {
            "extract_region": a1,
            "extract_region_lcut_all": a2,
            "extract_region_lcut_for_search": a3,
            "merged_words": merged,
        }

    def can_coexist_in_same_path(self, w1: str, w2: str) -> bool:
        """True when the two keywords share at least one province|city|county path."""
        paths1 = self.resources.word_to_paths.get(w1, set())
        paths2 = self.resources.word_to_paths.get(w2, set())
        return len(paths1 & paths2) > 0

    def keep_words_by_chain(self, words: list[str]) -> dict:
        """Greedily keep keywords that remain path-consistent with those kept so far.

        ``current_paths`` is the intersection of the path sets of every kept
        word; a word whose paths do not overlap it is dropped.  Returns
        kept/dropped word lists and the final ``common_paths`` intersection.
        """
        if not words:
            return {
                "kept_words": [],
                "dropped_words": [],
                "common_paths": set(),
            }

        kept = []
        dropped = []

        current_paths = None

        for word in words:
            word_paths = self.resources.word_to_paths.get(word, set())

            # Unknown keyword: cannot be placed on any path.
            if not word_paths:
                dropped.append(word)
                continue

            # First usable word seeds the candidate path set.
            if current_paths is None:
                kept.append(word)
                current_paths = set(word_paths)
                continue

            overlap = current_paths & word_paths
            if overlap:
                kept.append(word)
                current_paths = overlap
            else:
                dropped.append(word)

        return {
            "kept_words": kept,
            "dropped_words": dropped,
            "common_paths": current_paths if current_paths is not None else set(),
        }

    def parse(self, text: str) -> CN_RegionParseResult:
        """Full pipeline: text -> CN_RegionParseResult with names and codes."""
        candidates = self.extract_candidates(text)
        filtered = self.keep_words_by_chain(candidates["merged_words"])

        best = self.find_best_match(
            kept_words=filtered["kept_words"],
            common_paths=filtered["common_paths"],
        )

        # matched_words = self.map_words_to_standard_names(filtered["kept_words"])

        prov, city, county = self.fill_region_by_best(
            best=best,
            context_list=filtered["kept_words"],
        )

        region_info = self.expand_region_codes(prov, city, county)

        return CN_RegionParseResult(
            province_name = region_info["省级名称"],
            province_code = region_info["省级代码"],
            city_name = region_info["城市名称"],
            city_code = region_info["城市代码"],
            county_name = region_info["区县名称"],
            county_code = region_info["区县代码"],
            # matched_words = matched_words,
            # kept_words = filtered["kept_words"],
            # dropped_words = filtered["dropped_words"],
            # best_match = best,
        )

    def map_words_to_standard_names(self, words: list[str]) -> list[str]:
        """Map raw keywords to standard region names (unknown words pass through),
        deduplicated while preserving order."""
        mapped = []
        for word in words:
            std = self.resources.region_mapping_dict.get(word, word)
            if std not in mapped:
                mapped.append(std)
        return mapped

    @staticmethod
    def _split_path(path: str) -> tuple[str, str, str]:
        """Split a "province|city|county" path, padding missing levels with ""."""
        parts = str(path).split("|")
        parts = parts + [""] * (3 - len(parts))
        return parts[0], parts[1], parts[2]

    def find_best_match(self, kept_words: list[str], common_paths: set) -> object:
        """Pick the single best (finest-level) standard name, or pd.NA."""
        if not kept_words:
            return pd.NA

        std_words = self.map_words_to_standard_names(kept_words)

        province_names = set(self.resources.df_region_code["省级名称"].dropna().astype(str))
        city_names = set(self.resources.df_region_code["城市名称"].dropna().astype(str))
        county_names = set(self.resources.df_region_code["区县名称"].dropna().astype(str))

        provs = [w for w in std_words if w in province_names]
        cities = [w for w in std_words if w in city_names]
        counties = [w for w in std_words if w in county_names]

        # Case 1: exactly one consistent path — return its finest filled level.
        if len(common_paths) == 1:
            path = next(iter(common_paths))
            prov, city, county = self._split_path(path)
            if county:
                return county
            if city:
                return city
            if prov:
                return prov

        # Case 2: an unambiguous county hit wins.
        if len(counties) == 1:
            return counties[0]

        # Case 3: an unambiguous city hit.
        if len(cities) == 1:
            return cities[0]

        # Case 4: only the province is unambiguous.
        if len(provs) == 1:
            return provs[0]

        # Case 5: multiple candidates — scan from the end of the kept words
        # for the finest level (latest mention presumed most specific).
        for w in reversed(std_words):
            if w in county_names:
                return w
        for w in reversed(std_words):
            if w in city_names:
                return w
        for w in reversed(std_words):
            if w in province_names:
                return w

        return pd.NA

    def fill_region_by_best(self, best: object, context_list: list[str]) -> tuple[object, object, object]:
        """Resolve ``best`` to a (province, city, county) name triple.

        When ``best`` is ambiguous across rows (duplicate city/county names),
        the row whose names overlap ``context_list`` the most is chosen.
        """
        if pd.isna(best):
            return pd.NA, pd.NA, pd.NA

        df = self.resources.df_region_code
        std_context = self.map_words_to_standard_names(context_list)

        province_names = set(df["省级名称"].dropna().astype(str))
        city_names = set(df["城市名称"].dropna().astype(str))
        county_names = set(df["区县名称"].dropna().astype(str))

        # 1) best is a province: return only the province.
        if best in province_names:
            return best, pd.NA, pd.NA

        # 2) best is a city: return province + city; leave county empty.
        if best in city_names:
            rows = df[df["城市名称"] == best].copy()
            if rows.empty:
                return pd.NA, best, pd.NA

            best_score = -1
            best_row = None

            # Score each candidate row by how many of its names appear in context.
            for _, row in rows.iterrows():
                score = 0
                if row["省级名称"] in std_context:
                    score += 1
                if row["城市名称"] in std_context:
                    score += 1

                if score > best_score:
                    best_score = score
                    best_row = row

            if best_row is None:
                best_row = rows.iloc[0]

            return best_row["省级名称"], best_row["城市名称"], pd.NA

        # 3) best is a county: return the full province + city + county.
        if best in county_names:
            rows = df[df["区县名称"] == best].copy()
            if rows.empty:
                return pd.NA, pd.NA, best

            best_score = -1
            best_row = None

            for _, row in rows.iterrows():
                score = 0
                if row["省级名称"] in std_context:
                    score += 1
                if row["城市名称"] in std_context:
                    score += 1
                if row["区县名称"] in std_context:
                    score += 1

                if score > best_score:
                    best_score = score
                    best_row = row

            if best_row is None:
                best_row = rows.iloc[0]

            return best_row["省级名称"], best_row["城市名称"], best_row["区县名称"]

        return pd.NA, pd.NA, pd.NA

    def expand_region_codes(self, prov: object, city: object, county: object) -> dict:
        """Look up the region codes for the resolved names.

        NOTE(review): the code column for provinces is "省份代码" while the
        output key is "省级代码" — presumably intentional (source-table column
        name vs. output label); confirm against the Excel schema.  Also note
        the province+city branch falls through to the province-only lookup,
        which re-fetches 省级代码 from the first row of that province —
        presumably the same value for every row of one province.
        """
        df = self.resources.df_region_code

        result = {
            "省级名称": prov,
            "省级代码": pd.NA,
            "城市名称": city,
            "城市代码": pd.NA,
            "区县名称": county,
            "区县代码": pd.NA,
        }

        # Full triple known: take all three codes from the exact matching row.
        if pd.notna(prov) and pd.notna(city) and pd.notna(county):
            row = df[
                (df["省级名称"] == prov) &
                (df["城市名称"] == city) &
                (df["区县名称"] == county)
            ]
            if not row.empty:
                row = row.iloc[0]
                result["省级代码"] = row.get("省份代码", pd.NA)
                result["城市代码"] = row.get("城市代码", pd.NA)
                result["区县代码"] = row.get("区县代码", pd.NA)
            return result

        # Province + city known: fill their two codes.
        if pd.notna(prov) and pd.notna(city):
            row = df[
                (df["省级名称"] == prov) &
                (df["城市名称"] == city)
            ]
            if not row.empty:
                row = row.iloc[0]
                result["省级代码"] = row.get("省份代码", pd.NA)
                result["城市代码"] = row.get("城市代码", pd.NA)

        # Province known (also reached after the branch above — see NOTE).
        if pd.notna(prov):
            row = df[df["省级名称"] == prov]
            if not row.empty:
                row = row.iloc[0]
                result["省级代码"] = row.get("省份代码", pd.NA)

        return result
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
from importlib.resources import files
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import jieba
|
|
4
|
+
|
|
5
|
+
from .models import CN_RegionResources
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# BUG FIX: the importable package containing the bundled Excel file is
# SSDLab_region_parser.data (the source tree is src/SSDLab_region_parser/data).
# The previous value "region_parser.data" names a package that does not exist
# in this distribution, so importlib.resources.files() would raise
# ModuleNotFoundError at runtime.
RESOURCE_PACKAGE = "SSDLab_region_parser.data"
# File name of the bundled province/city/county reference table.
REGION_EXCEL_NAME = "CN_Region_Reference_Library.xlsx"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def split_keywords(series: pd.Series) -> list[str]:
    """Split '|'-delimited keyword cells into a flat, deduplicated list.

    NA cells are skipped and blank fragments discarded; the result keeps
    first-occurrence order among equal lengths but is sorted longest-first
    overall (so longer keywords take precedence downstream).
    """
    # Dict used as an ordered set: insertion order is first occurrence.
    unique_words: dict[str, None] = {}
    for cell in series.dropna():
        for fragment in str(cell).split("|"):
            fragment = fragment.strip()
            if fragment:
                unique_words.setdefault(fragment, None)
    # sorted() is stable, so ties keep first-occurrence order.
    return sorted(unique_words, key=len, reverse=True)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def load_region_code_table() -> pd.DataFrame:
    """Load the bundled region reference table as a DataFrame.

    Resolves the Excel file via importlib.resources so it works from an
    installed wheel.  NOTE(review): confirm RESOURCE_PACKAGE matches the
    installed package that actually contains the data directory.
    """
    excel_path = files(RESOURCE_PACKAGE).joinpath(REGION_EXCEL_NAME)
    return pd.read_excel(excel_path)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def build_word_to_paths(df_paths: pd.DataFrame) -> dict:
    """Map every keyword to the set of "province|city|county" paths it occurs on.

    Reads the three '|'-joined keyword columns of each row; NA cells are
    skipped and blank fragments discarded.
    """
    keyword_columns = ("省级关键词", "市级关键词", "区县关键词")
    mapping: dict = {}

    for _, row in df_paths.iterrows():
        path = row["path"]
        for column in keyword_columns:
            cell = row.get(column)
            if pd.isna(cell):
                continue
            for fragment in str(cell).split("|"):
                keyword = fragment.strip()
                if keyword:
                    mapping.setdefault(keyword, set()).add(path)

    return mapping
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def build_region_resources() -> CN_RegionResources:
    """Load the reference Excel table and derive every lookup structure.

    Returns a fully populated CN_RegionResources.  Refactored: the three
    byte-identical keyword-explosion pipelines (province / city / county)
    were copy-pasted; they now share the _explode_keyword_mapping helper.
    """
    df_region_code = load_region_code_table()

    # Longest-first keyword lists per administrative level.
    province_keywords = split_keywords(df_region_code["省级关键词"])
    city_keywords = split_keywords(df_region_code["市级关键词"])
    county_keywords = split_keywords(df_region_code["区县关键词"])

    # Union across levels, also longest-first.
    region_keywords = sorted(
        set(province_keywords + city_keywords + county_keywords),
        key=len,
        reverse=True,
    )

    word_set = set(region_keywords)

    # keyword -> standard name, one mapping per level.
    province_dict = _explode_keyword_mapping(df_region_code, "省级名称", "省级关键词", "省份代码")
    city_dict = _explode_keyword_mapping(df_region_code, "城市名称", "市级关键词", "城市代码")
    county_dict = _explode_keyword_mapping(df_region_code, "区县名称", "区县关键词", "区县代码")

    region_mapping_dict = {**province_dict, **city_dict, **county_dict}

    df_paths = df_region_code[
        ["省级名称", "城市名称", "区县名称", "省级关键词", "市级关键词", "区县关键词"]
    ].copy()

    # "province|city|county" key used to test whether keywords can coexist.
    df_paths["path"] = (
        df_paths["省级名称"].fillna("") + "|" +
        df_paths["城市名称"].fillna("") + "|" +
        df_paths["区县名称"].fillna("")
    )

    word_to_paths = build_word_to_paths(df_paths)

    return CN_RegionResources(
        df_region_code=df_region_code,
        province_keywords=province_keywords,
        city_keywords=city_keywords,
        county_keywords=county_keywords,
        region_keywords=region_keywords,
        word_set=word_set,
        province_dict=province_dict,
        city_dict=city_dict,
        county_dict=county_dict,
        region_mapping_dict=region_mapping_dict,
        df_paths=df_paths,
        word_to_paths=word_to_paths,
    )


def _explode_keyword_mapping(
    df: pd.DataFrame, name_col: str, keyword_col: str, code_col: str
) -> dict:
    """One row per keyword: split the '|'-joined keyword column, strip blanks,
    and map each keyword to its standard name in ``name_col``.

    ``code_col`` is retained during de-duplication so rows that differ only by
    code are preserved, matching the original copy-pasted pipelines exactly.
    """
    exploded = (
        df[[name_col, keyword_col, code_col]]
        .dropna(subset=[keyword_col])
        .assign(**{keyword_col: lambda x: x[keyword_col].str.split("|", regex=False)})
        .explode(keyword_col)
        .assign(**{keyword_col: lambda x: x[keyword_col].astype(str).str.strip()})
        .query(f"{keyword_col} != ''")
        .drop_duplicates()
        .reset_index(drop=True)
    )
    return exploded.set_index(keyword_col)[name_col].to_dict()
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def load_keywords_into_jieba(words: list[str]) -> None:
    """Register every region keyword with jieba so the tokenizer keeps them whole.

    A high frequency (1000) biases jieba toward emitting each keyword as a
    single token instead of splitting it.
    """
    for keyword in words:
        jieba.add_word(keyword, freq=1000)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ssdlab-region-parser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Chinese administrative region parser, not include Honkong, Macau and Taiwan yet.
|
|
5
|
+
Author: ChufanHe
|
|
6
|
+
Author-email: sthechufan@gmail.com
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: pandas>=2.0
|
|
11
|
+
Requires-Dist: jieba>=0.42.1
|
|
12
|
+
Requires-Dist: openpyxl>=3.1.0
|
|
13
|
+
|
|
14
|
+
# region_parser
|
|
15
|
+
Chinese administrative region parser, not include Honkong, Macau and Taiwan yet.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
src/SSDLab_region_parser/__init__.py
|
|
6
|
+
src/SSDLab_region_parser/models.py
|
|
7
|
+
src/SSDLab_region_parser/parser.py
|
|
8
|
+
src/SSDLab_region_parser/resources.py
|
|
9
|
+
src/SSDLab_region_parser.egg-info/PKG-INFO
|
|
10
|
+
src/SSDLab_region_parser.egg-info/SOURCES.txt
|
|
11
|
+
src/SSDLab_region_parser.egg-info/dependency_links.txt
|
|
12
|
+
src/SSDLab_region_parser.egg-info/requires.txt
|
|
13
|
+
src/SSDLab_region_parser.egg-info/top_level.txt
|
|
14
|
+
src/SSDLab_region_parser/data/__init__.py
|
|
15
|
+
src/ssdlab_region_parser/__init__.py
|
|
16
|
+
src/ssdlab_region_parser/models.py
|
|
17
|
+
src/ssdlab_region_parser/parser.py
|
|
18
|
+
src/ssdlab_region_parser/resources.py
|
|
19
|
+
src/ssdlab_region_parser.egg-info/PKG-INFO
|
|
20
|
+
src/ssdlab_region_parser.egg-info/SOURCES.txt
|
|
21
|
+
src/ssdlab_region_parser.egg-info/dependency_links.txt
|
|
22
|
+
src/ssdlab_region_parser.egg-info/requires.txt
|
|
23
|
+
src/ssdlab_region_parser.egg-info/top_level.txt
|
|
24
|
+
src/ssdlab_region_parser/data/__init__.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ssdlab_region_parser
|