geoparser 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 dguzh
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.1
2
+ Name: geoparser
3
+ Version: 0.1.0
4
+ Summary: A geoparsing library for English texts
5
+ Author: Diego Gomes
6
+ Author-email: diego.gomes@uzh.ch
7
+ Classifier: Development Status :: 2 - Pre-Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Requires-Python: >=3.9
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+
14
+ # geoparser
15
+ A geoparsing library for English texts
@@ -0,0 +1,2 @@
1
+ # geoparser
2
+ A geoparsing library for English texts
@@ -0,0 +1 @@
1
+ from .geoparser import Geoparser
@@ -0,0 +1,42 @@
1
+ from typing import List
2
+
3
class Location:
    """A resolved gazetteer entry together with its administrative hierarchy.

    The constructor stores every argument unchanged on the instance under
    the same name; no validation or conversion is performed.
    """

    def __init__(self, geonameid: int, name: str, admin2_geonameid: int, admin2_name: str,
                 admin1_geonameid: int, admin1_name: str, country_geonameid: int,
                 country_name: str, feature_name: str, latitude: float, longitude: float,
                 elevation: int, population: int):
        # The entry itself.
        self.geonameid, self.name = geonameid, name
        # Administrative parents, from the most specific to the country.
        self.admin2_geonameid, self.admin2_name = admin2_geonameid, admin2_name
        self.admin1_geonameid, self.admin1_name = admin1_geonameid, admin1_name
        self.country_geonameid, self.country_name = country_geonameid, country_name
        # Feature type label and physical attributes.
        self.feature_name = feature_name
        self.latitude, self.longitude = latitude, longitude
        self.elevation, self.population = elevation, population

    def __str__(self):
        """Return the name followed by the geonames.org URL built from the id."""
        return f"{self.name} (https://www.geonames.org/{self.geonameid})"
24
+
25
class Toponym:
    """A place-name mention together with its position and surrounding context.

    Args:
        name: Surface form of the mention.
        start_char: Character offset where the mention starts.
        end_char: Character offset where the mention ends.
        context: Text used as context when the mention is resolved.
    """

    def __init__(self, name: str, start_char: int, end_char: int, context: str):
        self.name = name
        self.start_char = start_char
        self.end_char = end_char
        self.context = context
        # Stays None until a resolver assigns a location to this mention.
        self.location = None
        # Candidate geonameids. Geoparser.resolve_toponyms overwrites this
        # attribute; declaring it here means it can be read safely on a
        # toponym that has not been resolved yet.
        self.candidates = set()

    def __str__(self):
        """Return the mention followed by its ``start:end`` character span."""
        return f"{self.name} ({self.start_char}:{self.end_char})"
35
+
36
class Document:
    """A piece of input text plus the toponyms collected for it."""

    def __init__(self, text: str):
        self.text = text
        # Starts empty; entries are appended by whoever extracts toponyms.
        self.toponyms: List[Toponym] = list()

    def __str__(self):
        """Return the stored text unchanged."""
        return self.text
@@ -0,0 +1,101 @@
1
+ import pkg_resources
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
class Gazetteer:
    """Loads selected rows of the GeoNames dump and enriches them with names.

    ``load`` keeps only the requested geonameids, then joins country, admin1,
    admin2 and feature-code names onto them and derives a ``pseudotext``
    description per row. The result is stored in ``self.data``.
    """

    # Only the empty string counts as missing. With pandas' default NA list the
    # literal code "NA" would be read as NaN, so rows carrying that country
    # code would lose it before the joins below.
    _READ_NA_KWARGS = {'keep_default_na': False, 'na_values': ['']}

    def __init__(self):
        self.geonames_file = pkg_resources.resource_filename('geoparser', 'geonames/allCountries.txt')
        self.admin1_file = pkg_resources.resource_filename('geoparser', 'geonames/admin1CodesASCII.txt')
        self.admin2_file = pkg_resources.resource_filename('geoparser', 'geonames/admin2Codes.txt')
        self.country_info_file = pkg_resources.resource_filename('geoparser', 'geonames/countryInfo.txt')
        self.feature_codes_file = pkg_resources.resource_filename('geoparser', 'geonames/featureCodes_en.txt')
        # Populated by load(); left untouched when no requested id is found.
        self.data = None

    def load(self, all_candidates):
        """Read the rows whose ``geonameid`` is in ``all_candidates`` and enrich them.

        Args:
            all_candidates: Collection of geonameids to keep.

        The main file is read in chunks of 500000 rows so that only matching
        rows are retained in memory.
        """
        cols = ['geonameid', 'name', 'asciiname', 'alternatenames', 'latitude', 'longitude',
                'feature_class', 'feature_code', 'country_code', 'cc2', 'admin1_code', 'admin2_code',
                'admin3_code', 'admin4_code', 'population', 'elevation', 'dem', 'timezone',
                'modification_date']

        cols_to_load = ['geonameid', 'name', 'latitude', 'longitude', 'feature_code',
                        'country_code', 'admin1_code', 'admin2_code', 'population', 'elevation']

        # Codes are kept as strings so they compare equal to the split codes
        # produced by the admin loaders.
        dtype = {
            'geonameid': 'Int64',
            'feature_code': str,
            'country_code': str,
            'admin1_code': str,
            'admin2_code': str,
        }

        chunks = pd.read_csv(self.geonames_file, delimiter='\t', header=None,
                             names=cols, usecols=cols_to_load, low_memory=False, chunksize=500000,
                             dtype=dtype, **self._READ_NA_KWARGS)

        filtered_chunks = []
        for chunk in chunks:
            filtered_chunk = chunk[chunk['geonameid'].isin(all_candidates)]
            if not filtered_chunk.empty:
                filtered_chunks.append(filtered_chunk)

        if filtered_chunks:
            self.data = pd.concat(filtered_chunks)
            self.data.reset_index(drop=True, inplace=True)
            self.enrich_data()

    def enrich_data(self):
        """Join the lookup tables onto ``self.data`` and add the ``pseudotext`` column."""
        country_df = self.load_country_data(self.country_info_file)
        admin1_df = self.load_admin1_data(self.admin1_file)
        admin2_df = self.load_admin2_data(self.admin2_file)
        feature_df = self.load_feature_data(self.feature_codes_file)

        self.merge_data(country_df, admin1_df, admin2_df, feature_df)

        # Rows without a country code must not carry a country name.
        self.data.loc[self.data['country_code'].isna(), 'country_name'] = np.nan

        self.data['pseudotext'] = self.data.apply(self.pseudotext_generator, axis=1)

    def load_country_data(self, country_info_file):
        """Return ``country_code``, ``country_name`` and ``country_geonameid`` per country.

        The first 50 lines of the file are skipped.
        NOTE(review): this presumably matches the comment header of
        countryInfo.txt; confirm whenever the upstream file layout changes.
        """
        cols = ['country_code', 'ISO3', 'ISO-Numeric', 'fips', 'country_name', 'Capital', 'Area(in sq km)',
                'Population', 'Continent', 'tld', 'CurrencyCode', 'CurrencyName', 'Phone', 'Postal Code Format',
                'Postal Code Regex', 'Languages', 'country_geonameid', 'neighbours', 'EquivalentFipsCode']
        dtype = {'country_geonameid': 'Int64'}
        country_df = pd.read_csv(country_info_file, sep='\t', header=None, skiprows=50, names=cols,
                                 dtype=dtype, **self._READ_NA_KWARGS)
        return country_df[['country_code', 'country_name', 'country_geonameid']]

    def load_admin1_data(self, admin1_file):
        """Return admin1 names and ids keyed by ``country_code`` and ``admin1_code``."""
        cols = ['admin1_full_code', 'admin1_name', 'ascii_name', 'admin1_geonameid']
        dtype = {'admin1_geonameid': 'Int64'}
        admin1_df = pd.read_csv(admin1_file, sep='\t', header=None, names=cols, dtype=dtype,
                                **self._READ_NA_KWARGS)
        # n=1 caps the split at two parts so it always fits the two target columns.
        admin1_df[['country_code', 'admin1_code']] = (
            admin1_df['admin1_full_code'].str.split('.', n=1, expand=True).astype(str)
        )
        return admin1_df[['country_code', 'admin1_code', 'admin1_name', 'admin1_geonameid']]

    def load_admin2_data(self, admin2_file):
        """Return admin2 names and ids keyed by country, admin1 and admin2 code."""
        cols = ['admin2_full_code', 'admin2_name', 'ascii_name', 'admin2_geonameid']
        dtype = {'admin2_geonameid': 'Int64'}
        admin2_df = pd.read_csv(admin2_file, sep='\t', header=None, names=cols, dtype=dtype,
                                **self._READ_NA_KWARGS)
        # n=2 caps the split at three parts so it always fits the three target columns.
        admin2_df[['country_code', 'admin1_code', 'admin2_code']] = (
            admin2_df['admin2_full_code'].str.split('.', n=2, expand=True).astype(str)
        )
        return admin2_df[['country_code', 'admin1_code', 'admin2_code', 'admin2_name', 'admin2_geonameid']]

    def load_feature_data(self, feature_codes_file):
        """Return ``feature_code`` and ``feature_name`` for every feature code."""
        cols = ['feature_full_code', 'feature_name', 'feature_description']
        feature_df = pd.read_csv(feature_codes_file, sep='\t', header=None, names=cols)
        feature_df[['feature_class', 'feature_code']] = (
            feature_df['feature_full_code'].str.split('.', n=1, expand=True).astype(str)
        )
        return feature_df[['feature_code', 'feature_name']]

    def merge_data(self, country_df, admin1_df, admin2_df, feature_df):
        """Left-join the four lookup tables onto ``self.data`` in place."""
        self.data = self.data.merge(country_df, on='country_code', how='left')
        self.data = self.data.merge(admin1_df, on=['country_code', 'admin1_code'], how='left')
        self.data = self.data.merge(admin2_df, on=['country_code', 'admin1_code', 'admin2_code'], how='left')
        self.data = self.data.merge(feature_df, on='feature_code', how='left')

    def pseudotext_generator(self, row):
        """Build ``"<name> (<feature>) in <admin2>, <admin1>, <country>"`` for one row.

        Missing parts are left out; the ``" in ..."`` suffix is dropped entirely
        when no administrative or country name is available.
        """
        components = [row['name']]
        for field in ['admin2_name', 'admin1_name', 'country_name']:
            if pd.notna(row[field]):
                components.append(row[field])
        location_str = " in " + ", ".join(components[1:]) if len(components) > 1 else ""
        feature_str = f" ({row['feature_name']})" if pd.notna(row['feature_name']) else ""

        return f"{components[0]}{feature_str}{location_str}"
@@ -0,0 +1,199 @@
1
+ import os
2
+ import re
3
+ import pickle
4
+ import unicodedata
5
+ import logging
6
+ import pkg_resources
7
+ import spacy
8
+ from tqdm.auto import tqdm
9
+ from typing import List, Set
10
+ from sentence_transformers import SentenceTransformer, util
11
+ import torch
12
+
13
+ from .entities import Document, Toponym, Location
14
+ from .gazetteer import Gazetteer
15
+
16
+ # Suppress token length warnings from transformers
17
+ logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
18
+
19
class Geoparser:
    """Finds toponyms in texts and links each one to a gazetteer entry.

    spaCy entities labelled GPE, LOC or FAC become toponyms. A name index
    built from the GeoNames dump proposes candidate geonameids, and a
    SentenceTransformer picks the candidate whose generated description is
    most similar (cosine) to the toponym's context.
    """

    def __init__(self, spacy_model='en_core_web_trf', transformer_model='dguzh/geo-all-distilroberta-v1'):
        self.ensure_spacy_model(spacy_model)
        self.nlp = spacy.load(spacy_model)
        self.transformer = SentenceTransformer(transformer_model)
        self.index_file = pkg_resources.resource_filename('geoparser', 'index.pkl')
        self.geonames_file = pkg_resources.resource_filename('geoparser', 'geonames/allCountries.txt')
        self.index = self.load_index()
        self.tokenizer = self.transformer.tokenizer
        self.model_max_length = self.tokenizer.model_max_length

    def ensure_spacy_model(self, model_name):
        """Download the spaCy model if it is not installed as a package."""
        if not spacy.util.is_package(model_name):
            print(f"Downloading spaCy model '{model_name}'...")
            spacy.cli.download(model_name)

    def normalize_name(self, name):
        """Return ``name`` as ASCII, without punctuation, single-spaced and lower-cased."""
        name = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode('ascii')
        name = re.sub(r"[^\w\s]", "", name)  # remove all punctuation
        name = re.sub(r"\s+", " ", name).strip()  # normalize whitespaces and strip
        return name.lower()  # convert to lowercase

    def load_index(self):
        """Return the name index, building and caching it on first use.

        NOTE: ``pickle.load`` can execute code embedded in the file. This is
        acceptable only because the file is a cache written by this method;
        never point ``index_file`` at data from an untrusted source.
        """
        if os.path.exists(self.index_file):
            with open(self.index_file, 'rb') as file:
                return pickle.load(file)

        index = self.build_index()
        # Write to a sibling file first so an interrupted dump cannot leave a
        # truncated cache behind that would break every later load.
        temp_file = self.index_file + '.tmp'
        with open(temp_file, 'wb') as file:
            pickle.dump(index, file)
        os.replace(temp_file, self.index_file)
        return index

    def build_index(self):
        """Map every normalized name and alternate name to its set of geonameids."""
        index = {}
        # First pass only counts lines so the progress bar has a total.
        with open(self.geonames_file, 'r', encoding='utf-8') as file:
            total_lines = sum(1 for line in file)
        with open(self.geonames_file, 'r', encoding='utf-8') as file:
            for line in tqdm(file, total=total_lines, desc="Building index", unit=" lines"):
                # Strip only the newline: tabs separate fields and must survive.
                columns = line.rstrip('\n').split('\t')
                if len(columns) < 4:
                    continue  # not a complete record
                geonameid = int(columns[0])
                # Column 1 is the name, column 3 the comma-separated alternates.
                names = [columns[1]] + columns[3].split(',')
                for name in names:
                    normalized_name = self.normalize_name(name)
                    if normalized_name:
                        index.setdefault(normalized_name, set()).add(geonameid)
        return index

    def parse(self, texts: List[str]):
        """Return one resolved ``Document`` per input text."""
        documents = [Document(text) for text in texts]
        for document in documents:
            self.extract_toponyms(document)
        self.resolve_toponyms(documents)
        return documents

    def extract_toponyms(self, document: "Document"):
        """Append a ``Toponym`` to ``document`` for every GPE, LOC or FAC entity.

        The whole text serves as context when its token count fits the model
        limit; otherwise a sentence window around the entity is used.
        """
        doc = self.nlp(document.text)
        sentences = list(doc.sents)
        total_tokens = len(self.tokenizer.tokenize(document.text))

        for ent in doc.ents:
            if ent.label_ in ['GPE', 'LOC', 'FAC']:
                text, start_char, end_char = self.clean_ent(ent)
                context = document.text if total_tokens <= self.model_max_length else self.truncate_context(sentences, ent)
                document.toponyms.append(Toponym(text, start_char, end_char, context))

    def clean_ent(self, ent):
        """Strip a leading lowercase "the" and a trailing "'s" from an entity.

        Returns:
            ``(text, start_char, end_char)`` with offsets adjusted to the
            cleaned text.
        """
        # remove leading lowercase 'the'
        original_text = ent.text
        without_article = re.sub(r"^the\s+", "", original_text)
        start_char = ent.start_char + (len(original_text) - len(without_article))

        # remove trailing possessive 's
        new_text = re.sub(r"\'s$", "", without_article)
        if new_text != without_article:
            new_text = new_text.rstrip()
        end_char = start_char + len(new_text)

        return new_text, start_char, end_char

    def truncate_context(self, sentences, ent):
        """Return the entity's sentence plus as many neighbours as fit the token limit.

        Returns an empty string when no single sentence contains the entity.
        """
        # Find the sentence containing the toponym
        target_index = next(
            (idx for idx, sentence in enumerate(sentences)
             if sentence.start_char <= ent.start_char and sentence.end_char >= ent.end_char),
            None,
        )
        if target_index is None:
            return ""

        token_limit = self.model_max_length
        context_sentences = [sentences[target_index].text]
        tokens_count = len(self.tokenizer.tokenize(sentences[target_index].text))

        # Expand context by adding sentences before and after the toponym sentence
        i, j = target_index, target_index
        while True:
            expanded = False
            if i > 0:
                prev_tokens = self.tokenizer.tokenize(sentences[i - 1].text)
                if tokens_count + len(prev_tokens) < token_limit:
                    context_sentences.insert(0, sentences[i - 1].text)
                    tokens_count += len(prev_tokens)
                    i -= 1
                    expanded = True

            if j < len(sentences) - 1:
                next_tokens = self.tokenizer.tokenize(sentences[j + 1].text)
                if tokens_count + len(next_tokens) < token_limit:
                    context_sentences.append(sentences[j + 1].text)
                    tokens_count += len(next_tokens)
                    j += 1
                    expanded = True

            # Break if no sentences were added in the last iteration
            if not expanded:
                break

        return ' '.join(context_sentences)

    def query_index(self, toponym: str) -> set:
        """Return the geonameids indexed under the normalized form of ``toponym``."""
        normalized_toponym = self.normalize_name(toponym)
        return self.index.get(normalized_toponym, set())

    def resolve_toponyms(self, documents: List["Document"]):
        """Assign the best-matching ``Location`` to every toponym that has candidates.

        Toponyms without a usable candidate keep ``location`` unchanged.
        """
        toponyms = [toponym for document in documents for toponym in document.toponyms]

        all_candidates = set()
        for toponym in toponyms:
            toponym.candidates = self.query_index(toponym.name)
            all_candidates.update(toponym.candidates)

        if not all_candidates:
            return

        gazetteer = Gazetteer()
        gazetteer.load(all_candidates)
        if gazetteer.data is None:
            # None of the indexed ids was found in the gazetteer file.
            return

        pseudotexts = gazetteer.data['pseudotext'].tolist()
        candidate_embeddings = self.transformer.encode(pseudotexts, batch_size=8, show_progress_bar=True, convert_to_tensor=True)
        candidate_embeddings_lookup = dict(zip(gazetteer.data['geonameid'], candidate_embeddings))

        contexts = [toponym.context for toponym in toponyms]
        toponym_embeddings = self.transformer.encode(contexts, batch_size=8, show_progress_bar=True, convert_to_tensor=True)

        for toponym, toponym_embedding in zip(toponyms, toponym_embeddings):
            # Keep only ids that have an embedding, so that the argmax position
            # below indexes this same list and cannot select a different id.
            candidates = [geonameid for geonameid in toponym.candidates
                          if geonameid in candidate_embeddings_lookup]
            if not candidates:
                continue

            stacked = torch.stack([candidate_embeddings_lookup[geonameid] for geonameid in candidates])
            similarities = util.cos_sim(toponym_embedding, stacked)
            predicted_geonameid = candidates[torch.argmax(similarities).item()]

            # Create a Location object for the best match and assign to the toponym
            predicted_location = gazetteer.data.loc[gazetteer.data['geonameid'] == predicted_geonameid].iloc[0]
            toponym.location = Location(
                geonameid=predicted_geonameid,
                name=predicted_location['name'],
                admin2_geonameid=predicted_location['admin2_geonameid'],
                admin2_name=predicted_location['admin2_name'],
                admin1_geonameid=predicted_location['admin1_geonameid'],
                admin1_name=predicted_location['admin1_name'],
                country_geonameid=predicted_location['country_geonameid'],
                country_name=predicted_location['country_name'],
                feature_name=predicted_location['feature_name'],
                latitude=predicted_location['latitude'],
                longitude=predicted_location['longitude'],
                elevation=predicted_location['elevation'],
                population=predicted_location['population']
            )
@@ -0,0 +1,15 @@
1
+ Metadata-Version: 2.1
2
+ Name: geoparser
3
+ Version: 0.1.0
4
+ Summary: A geoparsing library for English texts
5
+ Author: Diego Gomes
6
+ Author-email: diego.gomes@uzh.ch
7
+ Classifier: Development Status :: 2 - Pre-Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Requires-Python: >=3.9
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+
14
+ # geoparser
15
+ A geoparsing library for English texts
@@ -0,0 +1,12 @@
1
+ LICENSE
2
+ README.md
3
+ setup.py
4
+ geoparser/__init__.py
5
+ geoparser/entities.py
6
+ geoparser/gazetteer.py
7
+ geoparser/geoparser.py
8
+ geoparser.egg-info/PKG-INFO
9
+ geoparser.egg-info/SOURCES.txt
10
+ geoparser.egg-info/dependency_links.txt
11
+ geoparser.egg-info/requires.txt
12
+ geoparser.egg-info/top_level.txt
@@ -0,0 +1,7 @@
1
+ pandas
2
+ numpy
3
+ spacy
4
+ sentence_transformers
5
+ tqdm
6
+ torch
7
+ requests
@@ -0,0 +1 @@
1
+ geoparser
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,74 @@
1
+ import pkg_resources
2
+ import requests
3
+ import zipfile
4
+ import os
5
+ import subprocess
6
+ import sys
7
+ from setuptools import setup, find_packages
8
+ from setuptools.command.install import install
9
+
10
class PostInstallCommand(install):
    """Install command that also fetches the spaCy models and the GeoNames files."""

    def run(self):
        """Run the regular install, then download models and gazetteer data."""
        install.run(self)
        self.download_spacy()
        self.download_geonames()

    def download_spacy(self):
        """Download both spaCy models with the interpreter running the install."""
        for model in ("en_core_web_sm", "en_core_web_trf"):
            subprocess.check_call([sys.executable, "-m", "spacy", "download", model])

    def download_geonames(self):
        """Download the GeoNames files into ``<install dir>/geoparser/geonames``.

        Raises:
            requests.HTTPError: if a download answers with an error status.
        """
        import sysconfig

        # Use the directory this command installs modules into. A hard-coded
        # ``<prefix>/lib/site-packages`` is not the site-packages directory on
        # every platform, so the package could not find its data afterwards.
        install_root = getattr(self, 'install_lib', None) or sysconfig.get_paths()['purelib']
        data_dir = os.path.join(install_root, 'geoparser', 'geonames')
        os.makedirs(data_dir, exist_ok=True)

        file_links = [
            "http://download.geonames.org/export/dump/allCountries.zip",
            "http://download.geonames.org/export/dump/admin1CodesASCII.txt",
            "http://download.geonames.org/export/dump/admin2Codes.txt",
            "http://download.geonames.org/export/dump/countryInfo.txt",
            "http://download.geonames.org/export/dump/featureCodes_en.txt"
        ]

        for url in file_links:
            filename = url.split('/')[-1]
            file_path = os.path.join(data_dir, filename)

            # Stream to disk in chunks instead of holding a whole archive in
            # memory, and fail loudly rather than saving an error page.
            with requests.get(url, stream=True, timeout=(30, 300)) as response:
                response.raise_for_status()
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)

            if filename.endswith('.zip'):
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    zip_ref.extractall(data_dir)
                # Only the extracted contents are needed afterwards.
                os.remove(file_path)
46
+
47
# Read the long description with an explicit encoding and close the file
# straight away instead of leaving an open handle to the garbage collector.
with open('README.md', encoding='utf-8') as readme_file:
    long_description = readme_file.read()

setup(
    name='geoparser',
    version='0.1.0',
    author='Diego Gomes',
    author_email='diego.gomes@uzh.ch',
    packages=find_packages(),
    description='A geoparsing library for English texts',
    long_description=long_description,
    long_description_content_type='text/markdown',
    install_requires=[
        'pandas',
        'numpy',
        'spacy',
        'sentence_transformers',
        'tqdm',
        'torch',
        'requests'
    ],
    python_requires='>=3.9',
    classifiers=[
        'Development Status :: 2 - Pre-Alpha',
        'Intended Audience :: Science/Research',
        'License :: OSI Approved :: MIT License',
    ],
    # Replace the stock install command so the data downloads run after it.
    cmdclass={
        'install': PostInstallCommand,
    }
)