PyPI - geoparser - Versions diffs - 0.1.0__tar.gz - Mend

geoparser 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

geoparser-0.1.0/LICENSE +21 -0
geoparser-0.1.0/PKG-INFO +15 -0
geoparser-0.1.0/README.md +2 -0
geoparser-0.1.0/geoparser/__init__.py +1 -0
geoparser-0.1.0/geoparser/entities.py +42 -0
geoparser-0.1.0/geoparser/gazetteer.py +101 -0
geoparser-0.1.0/geoparser/geoparser.py +199 -0
geoparser-0.1.0/geoparser.egg-info/PKG-INFO +15 -0
geoparser-0.1.0/geoparser.egg-info/SOURCES.txt +12 -0
geoparser-0.1.0/geoparser.egg-info/dependency_links.txt +1 -0
geoparser-0.1.0/geoparser.egg-info/requires.txt +7 -0
geoparser-0.1.0/geoparser.egg-info/top_level.txt +1 -0
geoparser-0.1.0/setup.cfg +4 -0
geoparser-0.1.0/setup.py +74 -0

geoparser-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2024 dguzh
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

geoparser-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,15 @@
+Metadata-Version: 2.1
+Name: geoparser
+Version: 0.1.0
+Summary: A geoparsing library for English texts
+Author: Diego Gomes
+Author-email: diego.gomes@uzh.ch
+Classifier: Development Status :: 2 - Pre-Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+# geoparser
+A geoparsing library for English texts

geoparser-0.1.0/README.md ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # geoparser
2	+ A geoparsing library for English texts

geoparser-0.1.0/geoparser/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ from .geoparser import Geoparser

geoparser-0.1.0/geoparser/entities.py ADDED Viewed

@@ -0,0 +1,42 @@
+from typing import List
+class Location:
+    """A gazetteer entry that a toponym has been resolved to.
+    Instances are built by ``Geoparser.resolve_toponyms`` from one row of the
+    gazetteer table, so the numeric fields may hold pandas/NumPy scalars (or a
+    missing-value marker when a left merge found no match) rather than the
+    plain ``int``/``float`` values the annotations suggest.
+    """
+    def __init__(self, geonameid: int, name: str, admin2_geonameid: int, admin2_name: str,
+                 admin1_geonameid: int, admin1_name: str, country_geonameid: int,
+                 country_name: str, feature_name: str, latitude: float, longitude: float,
+                 elevation: int, population: int):
+        # Identity of the place itself.
+        self.geonameid = geonameid
+        self.name = name
+        # Administrative hierarchy, from the most specific level to the country.
+        self.admin2_geonameid = admin2_geonameid
+        self.admin2_name = admin2_name
+        self.admin1_geonameid = admin1_geonameid
+        self.admin1_name = admin1_name
+        self.country_geonameid = country_geonameid
+        self.country_name = country_name
+        # Human-readable feature type and physical attributes.
+        self.feature_name = feature_name
+        self.latitude = latitude
+        self.longitude = longitude
+        self.elevation = elevation
+        self.population = population
+    def __repr__(self):
+        # Compact, unambiguous form for debugging and interactive sessions.
+        return f"{type(self).__name__}(geonameid={self.geonameid!r}, name={self.name!r})"
+    def __str__(self):
+        # The name followed by the public GeoNames URL of the entry.
+        return f"{self.name} (https://www.geonames.org/{self.geonameid})"
+class Toponym:
+    """A place-name mention found in a document.
+    ``start_char`` is the offset of the first character of the mention and
+    ``end_char`` is ``start_char`` plus the length of ``name`` (exclusive end).
+    ``context`` is the text that is embedded to disambiguate the mention.
+    """
+    def __init__(self, name: str, start_char: int, end_char: int, context: str):
+        self.name = name
+        self.start_char = start_char
+        self.end_char = end_char
+        self.context = context
+        # Both attributes are filled in by Geoparser.resolve_toponyms. Giving them
+        # defaults here means a Toponym that was never resolved can still be
+        # inspected without an AttributeError.
+        self.candidates = set()
+        self.location = None
+    def __str__(self):
+        # The mention followed by its character span.
+        return f"{self.name} ({self.start_char}:{self.end_char})"
+class Document:
+    # A text to be geoparsed together with the toponyms found in it.
+    def __init__(self, text: str):
+        # The raw input text; character offsets of toponyms refer to this string.
+        self.text = text
+        # Starts empty; Geoparser.extract_toponyms appends one Toponym per
+        # recognised location entity.
+        self.toponyms: List[Toponym] = []
+    def __str__(self):
+        # A document prints as its raw text.
+        return self.text

geoparser-0.1.0/geoparser/gazetteer.py ADDED Viewed

@@ -0,0 +1,101 @@
+import pkg_resources
+import pandas as pd
+import numpy as np
+class Gazetteer:
+    """Loads the GeoNames rows for a set of candidate ids and enriches them.
+    After a successful ``load``, ``data`` is a DataFrame with one row per matched
+    geonameid, extended with country/admin1/admin2/feature names and a
+    ``pseudotext`` description. ``data`` is ``None`` when nothing matched.
+    """
+    # GeoNames uses the literal code "NA" for Namibia. pandas' default
+    # missing-value markers would silently turn that code into NaN, so only the
+    # empty string is treated as missing when reading GeoNames tables.
+    _NA_VALUES = ['']
+    def __init__(self):
+        # Paths of the GeoNames dump files shipped inside the installed package.
+        self.geonames_file = pkg_resources.resource_filename('geoparser', 'geonames/allCountries.txt')
+        self.admin1_file = pkg_resources.resource_filename('geoparser', 'geonames/admin1CodesASCII.txt')
+        self.admin2_file = pkg_resources.resource_filename('geoparser', 'geonames/admin2Codes.txt')
+        self.country_info_file = pkg_resources.resource_filename('geoparser', 'geonames/countryInfo.txt')
+        self.feature_codes_file = pkg_resources.resource_filename('geoparser', 'geonames/featureCodes_en.txt')
+        self.data = None
+    def load(self, all_candidates):
+        """Read the rows whose geonameid is in ``all_candidates`` and enrich them.
+        The dump is streamed in chunks so the full table never has to fit in
+        memory. ``data`` is reset to ``None`` first, so a call that matches
+        nothing does not leave results of an earlier call behind.
+        """
+        cols = ['geonameid', 'name', 'asciiname', 'alternatenames', 'latitude', 'longitude',
+                 'feature_class', 'feature_code', 'country_code', 'cc2', 'admin1_code', 'admin2_code',
+                 'admin3_code', 'admin4_code', 'population', 'elevation', 'dem', 'timezone',
+                 'modification_date']
+        cols_to_load = ['geonameid', 'name', 'latitude', 'longitude', 'feature_code',
+                        'country_code', 'admin1_code', 'admin2_code', 'population', 'elevation']
+        dtype = {
+            'geonameid': 'Int64',
+            'feature_code': str,
+            'country_code': str,
+            'admin1_code': str,
+            'admin2_code': str
+        }
+        self.data = None
+        chunks = pd.read_csv(self.geonames_file, delimiter='\t', header=None,
+                             names=cols, usecols=cols_to_load, low_memory=False, chunksize=500000,
+                             dtype=dtype, keep_default_na=False, na_values=self._NA_VALUES)
+        filtered_chunks = []
+        for chunk in chunks:
+            filtered_chunk = chunk[chunk['geonameid'].isin(all_candidates)]
+            if not filtered_chunk.empty:
+                filtered_chunks.append(filtered_chunk)
+        if filtered_chunks:
+            self.data = pd.concat(filtered_chunks, ignore_index=True)
+            self.enrich_data()
+    def enrich_data(self):
+        """Join the lookup tables onto ``data`` and add the ``pseudotext`` column."""
+        country_df = self.load_country_data(self.country_info_file)
+        admin1_df = self.load_admin1_data(self.admin1_file)
+        admin2_df = self.load_admin2_data(self.admin2_file)
+        feature_df = self.load_feature_data(self.feature_codes_file)
+        self.merge_data(country_df, admin1_df, admin2_df, feature_df)
+        # Defensive guard: pandas merges match missing keys with each other, so
+        # never report a country name for a row that has no country code.
+        self.data['country_name'] = self.data['country_name'].where(self.data['country_code'].notna())
+        self.data['pseudotext'] = self.data.apply(self.pseudotext_generator, axis=1)
+    def load_country_data(self, country_info_file):
+        """Return country code, name and geonameid for every country in the file."""
+        cols = ['country_code', 'ISO3', 'ISO-Numeric', 'fips', 'country_name', 'Capital', 'Area(in sq km)',
+                'Population', 'Continent', 'tld', 'CurrencyCode', 'CurrencyName', 'Phone', 'Postal Code Format',
+                'Postal Code Regex', 'Languages', 'country_geonameid', 'neighbours', 'EquivalentFipsCode']
+        dtype = {'country_geonameid': 'Int64'}
+        # NOTE(review): skiprows=50 assumes the file's leading comment block is
+        # exactly 50 lines long; confirm against the downloaded countryInfo.txt.
+        country_df = pd.read_csv(country_info_file, sep='\t', header=None, skiprows=50, names=cols, dtype=dtype,
+                                 keep_default_na=False, na_values=self._NA_VALUES)
+        return country_df[['country_code', 'country_name', 'country_geonameid']]
+    def load_admin1_data(self, admin1_file):
+        """Return first-order administrative divisions keyed by country and admin1 code."""
+        cols = ['admin1_full_code', 'admin1_name', 'ascii_name', 'admin1_geonameid']
+        dtype = {'admin1_geonameid': 'Int64'}
+        admin1_df = pd.read_csv(admin1_file, sep='\t', header=None, names=cols, dtype=dtype,
+                                keep_default_na=False, na_values=self._NA_VALUES)
+        # The full code has the form "<country>.<admin1>".
+        admin1_df[['country_code', 'admin1_code']] = admin1_df['admin1_full_code'].str.split('.', expand=True).astype(str)
+        return admin1_df[['country_code', 'admin1_code', 'admin1_name', 'admin1_geonameid']]
+    def load_admin2_data(self, admin2_file):
+        """Return second-order administrative divisions keyed by country, admin1 and admin2 code."""
+        cols = ['admin2_full_code', 'admin2_name', 'ascii_name', 'admin2_geonameid']
+        dtype = {'admin2_geonameid': 'Int64'}
+        admin2_df = pd.read_csv(admin2_file, sep='\t', header=None, names=cols, dtype=dtype,
+                                keep_default_na=False, na_values=self._NA_VALUES)
+        # The full code has the form "<country>.<admin1>.<admin2>".
+        admin2_df[['country_code', 'admin1_code', 'admin2_code']] = admin2_df['admin2_full_code'].str.split('.', expand=True).astype(str)
+        return admin2_df[['country_code', 'admin1_code', 'admin2_code', 'admin2_name', 'admin2_geonameid']]
+    def load_feature_data(self, feature_codes_file):
+        """Return the feature code to feature name mapping."""
+        cols = ['feature_full_code', 'feature_name', 'feature_description']
+        feature_df = pd.read_csv(feature_codes_file, sep='\t', header=None, names=cols)
+        # The full code has the form "<feature class>.<feature code>".
+        feature_df[['feature_class', 'feature_code']] = feature_df['feature_full_code'].str.split('.', expand=True).astype(str)
+        return feature_df[['feature_code', 'feature_name']]
+    def merge_data(self, country_df, admin1_df, admin2_df, feature_df):
+        """Left-join every lookup table onto ``data`` so unmatched rows are kept."""
+        self.data = self.data.merge(country_df, on='country_code', how='left')
+        self.data = self.data.merge(admin1_df, on=['country_code', 'admin1_code'], how='left')
+        self.data = self.data.merge(admin2_df, on=['country_code', 'admin1_code', 'admin2_code'], how='left')
+        self.data = self.data.merge(feature_df, on='feature_code', how='left')
+    def pseudotext_generator(self, row):
+        """Describe one row as ``"<name> (<feature>) in <admin2>, <admin1>, <country>"``.
+        Parts whose value is missing are left out; the " in ..." suffix is omitted
+        entirely when no administrative or country name is available.
+        """
+        components = [row['name']]
+        for field in ['admin2_name', 'admin1_name', 'country_name']:
+            if pd.notna(row[field]):
+                components.append(row[field])
+        location_str = " in " + ", ".join(components[1:]) if len(components) > 1 else ""
+        feature_str = f" ({row['feature_name']})" if pd.notna(row['feature_name']) else ""
+        return f"{components[0]}{feature_str}{location_str}"

geoparser-0.1.0/geoparser/geoparser.py ADDED Viewed

@@ -0,0 +1,199 @@
+import os
+import re
+import pickle
+import unicodedata
+import logging
+import pkg_resources
+import spacy
+from tqdm.auto import tqdm
+from typing import List, Set
+from sentence_transformers import SentenceTransformer, util
+import torch
+from .entities import Document, Toponym, Location
+from .gazetteer import Gazetteer
+# Suppress token length warnings from transformers
+logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
+class Geoparser:
+    def __init__(self, spacy_model='en_core_web_trf', transformer_model='dguzh/geo-all-distilroberta-v1'):
+        # Set up everything needed for parsing: the spaCy pipeline used for entity
+        # recognition, the sentence-transformer used for disambiguation, and the
+        # name -> geonameids index.
+        # The spaCy model is downloaded first if it is not installed as a package.
+        self.ensure_spacy_model(spacy_model)
+        self.nlp = spacy.load(spacy_model)
+        self.transformer = SentenceTransformer(transformer_model)
+        # Both paths point inside the installed geoparser package.
+        self.index_file = pkg_resources.resource_filename('geoparser', 'index.pkl')
+        self.geonames_file = pkg_resources.resource_filename('geoparser', 'geonames/allCountries.txt')
+        # Must come after the two paths above: load_index reads index_file and, when
+        # the cache is missing, build_index reads geonames_file.
+        self.index = self.load_index()
+        # The transformer's own tokenizer and length limit decide whether a whole
+        # document can be used as context or has to be truncated.
+        self.tokenizer = self.transformer.tokenizer
+        self.model_max_length = self.tokenizer.model_max_length
+    def ensure_spacy_model(self, model_name):
+        """Download the spaCy model ``model_name`` unless it is already installed as a package."""
+        if spacy.util.is_package(model_name):
+            return
+        print(f"Downloading spaCy model '{model_name}'...")
+        spacy.cli.download(model_name)
+    def normalize_name(self, name):
+        """Return the index key for ``name``.
+        The name is reduced to ASCII (accents are decomposed and dropped, characters
+        with no ASCII form disappear), stripped of punctuation, collapsed to single
+        spaces and lower-cased. The result may be an empty string.
+        """
+        decomposed = unicodedata.normalize('NFKD', name)
+        ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
+        # Drop every character that is neither a word character nor whitespace.
+        without_punctuation = re.sub(r"[^\w\s]", "", ascii_only)
+        # Collapse whitespace runs and trim both ends.
+        collapsed = re.sub(r"\s+", " ", without_punctuation).strip()
+        return collapsed.lower()
+    def load_index(self):
+        """Return the name index, building and caching it on first use.
+        If ``index_file`` exists it is unpickled and returned. Otherwise the index
+        is built from the GeoNames dump and written to ``index_file``; a failure
+        to write the cache is logged and the freshly built index is still returned.
+        """
+        if os.path.exists(self.index_file):
+            # NOTE: unpickling can execute arbitrary code. The cache must only ever
+            # be a file written by this method, never one from an untrusted source.
+            with open(self.index_file, 'rb') as file:
+                return pickle.load(file)
+        index = self.build_index()
+        # Write to a sibling temp file and rename it into place, so an interrupted
+        # run cannot leave a truncated index.pkl that breaks every later load.
+        temp_file = f"{self.index_file}.{os.getpid()}.tmp"
+        try:
+            with open(temp_file, 'wb') as file:
+                pickle.dump(index, file)
+            os.replace(temp_file, self.index_file)
+        except OSError as error:
+            # The package directory may be read-only (e.g. a system-wide install).
+            # The index is still usable for this process, so do not fail here.
+            logging.getLogger(__name__).warning("Could not cache index at %s: %s", self.index_file, error)
+            try:
+                os.remove(temp_file)
+            except OSError:
+                # Best-effort cleanup: the temp file may never have been created.
+                pass
+        return index
+    def build_index(self):
+        """Build a mapping from normalised name to the set of geonameids that use it.
+        Every line of the GeoNames dump contributes its primary name (column 1) and
+        each comma-separated alternate name (column 3). Names that normalise to
+        an empty string are skipped.
+        """
+        # First pass only counts lines so the progress bar can show a total.
+        with open(self.geonames_file, 'r', encoding='utf-8') as file:
+            line_count = sum(1 for _ in file)
+        lookup = {}
+        with open(self.geonames_file, 'r', encoding='utf-8') as file:
+            progress = tqdm(file, total=line_count, desc="Building index", unit=" lines")
+            for record in progress:
+                fields = record.strip().split('\t')
+                geonameid = int(fields[0])
+                for raw_name in [fields[1], *fields[3].split(',')]:
+                    key = self.normalize_name(raw_name)
+                    if not key:
+                        continue
+                    lookup.setdefault(key, set()).add(geonameid)
+        return lookup
+    def parse(self, texts: List[str]):
+        """Geoparse ``texts`` and return one Document per input, in the same order.
+        Toponyms are extracted per document; resolution then runs once over all
+        documents together.
+        """
+        documents = []
+        for text in texts:
+            document = Document(text)
+            self.extract_toponyms(document)
+            documents.append(document)
+        self.resolve_toponyms(documents)
+        return documents
+    def extract_toponyms(self, document: Document):
+        """Append a Toponym to ``document.toponyms`` for every GPE, LOC or FAC entity.
+        The whole document text is used as context when its token count is within
+        the transformer's limit; otherwise a sentence window around the entity is
+        used instead.
+        """
+        parsed = self.nlp(document.text)
+        sentence_spans = list(parsed.sents)
+        fits_in_model = len(self.tokenizer.tokenize(document.text)) <= self.model_max_length
+        location_labels = ['GPE', 'LOC', 'FAC']
+        for entity in parsed.ents:
+            if entity.label_ not in location_labels:
+                continue
+            name, start, end = self.clean_ent(entity)
+            if fits_in_model:
+                context = document.text
+            else:
+                context = self.truncate_context(sentence_spans, entity)
+            document.toponyms.append(Toponym(name, start, end, context))
+    def clean_ent(self, ent):
+        """Strip a leading lowercase "the" and a trailing possessive "'s" from an entity.
+        Returns ``(text, start_char, end_char)`` where the offsets are adjusted to
+        the cleaned text: ``start_char`` moves right by the number of characters
+        removed at the front and ``end_char`` is ``start_char + len(text)``.
+        """
+        raw_text = ent.text
+        # Leading article: only the lowercase form is removed.
+        without_article = re.sub(r"^the\s+", "", raw_text)
+        if without_article != raw_text:
+            without_article = without_article.lstrip()
+        start_char = ent.start_char + len(raw_text) - len(without_article)
+        # Trailing possessive marker.
+        cleaned = re.sub(r"\'s$", "", without_article)
+        if cleaned != without_article:
+            cleaned = cleaned.rstrip()
+        end_char = start_char + len(cleaned)
+        return cleaned, start_char, end_char
+    def truncate_context(self, sentences, ent):
+        # Build a context for `ent` that stays under the model's token limit: start
+        # with the sentence containing the entity and grow outwards, alternating
+        # between the previous and the next sentence, while the budget allows.
+        # Returns the chosen sentences joined by single spaces, or "" when no
+        # sentence fully contains the entity.
+        # Find the sentence containing the toponym
+        target_sentence = next((s for s in sentences if s.start_char <= ent.start_char and s.end_char >= ent.end_char), None)
+        if not target_sentence:
+            return ""
+        target_index = sentences.index(target_sentence)
+        token_limit = self.model_max_length
+        # The target sentence is always included, even if it alone exceeds the limit.
+        context_sentences = [target_sentence.text]
+        tokens_count = len(self.tokenizer.tokenize(target_sentence.text))
+        # Expand context by adding sentences before and after the toponym sentence
+        # i and j are the indices of the first and last sentence currently included.
+        i, j = target_index, target_index
+        while True:
+            expanded = False
+            if i > 0:
+                prev_tokens = self.tokenizer.tokenize(sentences[i - 1].text)
+                # Strict "<": the running total must stay below the limit.
+                if tokens_count + len(prev_tokens) < token_limit:
+                    context_sentences.insert(0, sentences[i - 1].text)
+                    tokens_count += len(prev_tokens)
+                    i -= 1
+                    expanded = True
+            if j < len(sentences) - 1:
+                next_tokens = self.tokenizer.tokenize(sentences[j + 1].text)
+                # Uses the total already updated by the backward step above.
+                if tokens_count + len(next_tokens) < token_limit:
+                    context_sentences.append(sentences[j + 1].text)
+                    tokens_count += len(next_tokens)
+                    j += 1
+                    expanded = True
+            # Break if no sentences were added in the last iteration
+            if not expanded:
+                break
+        return ' '.join(context_sentences)
+    def query_index(self, toponym: str) -> set:
+        """Return the geonameids indexed under the normalised form of ``toponym``.
+        An empty set is returned when the name is not in the index.
+        """
+        key = self.normalize_name(toponym)
+        if key in self.index:
+            return self.index[key]
+        return set()
+    def resolve_toponyms(self, documents: List[Document]):
+        """Assign the best-matching Location to every toponym in ``documents``.
+        Each toponym gets a ``candidates`` set from the name index. The gazetteer
+        rows of all candidates are loaded once, their pseudotexts and the toponym
+        contexts are embedded, and each toponym is linked to the candidate whose
+        pseudotext embedding has the highest cosine similarity to its context.
+        Toponyms without any usable candidate keep ``location`` unchanged.
+        """
+        all_candidates = set()
+        for document in documents:
+            for toponym in document.toponyms:
+                toponym.candidates = self.query_index(toponym.name)
+                all_candidates.update(toponym.candidates)
+        if not all_candidates:
+            return
+        gazetteer = Gazetteer()
+        gazetteer.load(all_candidates)
+        # load() leaves data as None when none of the indexed ids occur in the
+        # GeoNames dump (e.g. a cached index built from another dump version).
+        if gazetteer.data is None:
+            return
+        pseudotexts = gazetteer.data['pseudotext'].tolist()
+        pseudotext_embeddings = self.transformer.encode(pseudotexts, batch_size=8, show_progress_bar=True, convert_to_tensor=True)
+        embedding_by_geonameid = dict(zip(gazetteer.data['geonameid'], pseudotext_embeddings))
+        # Flatten in document order so position k in `toponyms` matches row k of
+        # the context embeddings.
+        toponyms = [toponym for document in documents for toponym in document.toponyms]
+        contexts = [toponym.context for toponym in toponyms]
+        toponym_embeddings = self.transformer.encode(contexts, batch_size=8, show_progress_bar=True, convert_to_tensor=True)
+        for position, toponym in enumerate(toponyms):
+            # Keep only candidates that have an embedding, and index into this
+            # same filtered list below: the argmax position refers to the stacked
+            # embeddings, not to the unfiltered candidate set.
+            known_candidates = [geonameid for geonameid in toponym.candidates if geonameid in embedding_by_geonameid]
+            if not known_candidates:
+                continue
+            stacked = torch.stack([embedding_by_geonameid[geonameid] for geonameid in known_candidates])
+            similarities = util.cos_sim(toponym_embeddings[position], stacked)
+            predicted_geonameid = known_candidates[torch.argmax(similarities).item()]
+            # Build the Location from the first gazetteer row with the winning id.
+            predicted_location = gazetteer.data.loc[gazetteer.data['geonameid'] == predicted_geonameid].iloc[0]
+            toponym.location = Location(
+                geonameid=predicted_geonameid,
+                name=predicted_location['name'],
+                admin2_geonameid=predicted_location['admin2_geonameid'],
+                admin2_name=predicted_location['admin2_name'],
+                admin1_geonameid=predicted_location['admin1_geonameid'],
+                admin1_name=predicted_location['admin1_name'],
+                country_geonameid=predicted_location['country_geonameid'],
+                country_name=predicted_location['country_name'],
+                feature_name=predicted_location['feature_name'],
+                latitude=predicted_location['latitude'],
+                longitude=predicted_location['longitude'],
+                elevation=predicted_location['elevation'],
+                population=predicted_location['population']
+            )

geoparser-0.1.0/geoparser.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,15 @@
+Metadata-Version: 2.1
+Name: geoparser
+Version: 0.1.0
+Summary: A geoparsing library for English texts
+Author: Diego Gomes
+Author-email: diego.gomes@uzh.ch
+Classifier: Development Status :: 2 - Pre-Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Requires-Python: >=3.9
+Description-Content-Type: text/markdown
+License-File: LICENSE
+# geoparser
+A geoparsing library for English texts

geoparser-0.1.0/geoparser.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,12 @@
+LICENSE
+README.md
+setup.py
+geoparser/__init__.py
+geoparser/entities.py
+geoparser/gazetteer.py
+geoparser/geoparser.py
+geoparser.egg-info/PKG-INFO
+geoparser.egg-info/SOURCES.txt
+geoparser.egg-info/dependency_links.txt
+geoparser.egg-info/requires.txt
+geoparser.egg-info/top_level.txt

geoparser-0.1.0/geoparser.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

geoparser-0.1.0/geoparser.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,7 @@
+pandas
+numpy
+spacy
+sentence_transformers
+tqdm
+torch
+requests

geoparser-0.1.0/geoparser.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ geoparser

geoparser-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

geoparser-0.1.0/setup.py ADDED Viewed

@@ -0,0 +1,74 @@
+import pkg_resources
+import requests
+import zipfile
+import os
+import subprocess
+import sys
+from setuptools import setup, find_packages
+from setuptools.command.install import install
+class PostInstallCommand(install):
+    """Install command that also fetches the spaCy models and the GeoNames data.
+    NOTE(review): this hook only runs when setup.py's ``install`` command is
+    executed; confirm that the supported installation paths actually trigger it.
+    """
+    def run(self):
+        # Run the regular installation first so install_lib is finalised and the
+        # package directory exists before data is downloaded into it.
+        install.run(self)
+        self.download_spacy()
+        self.download_geonames()
+    def download_spacy(self):
+        """Download the spaCy models with the interpreter that runs the install."""
+        models = ["en_core_web_sm", "en_core_web_trf"]
+        for model in models:
+            subprocess.check_call([sys.executable, "-m", "spacy", "download", model])
+    def download_geonames(self):
+        """Download the GeoNames files into ``<install dir>/geoparser/geonames``.
+        Raises ``requests.HTTPError`` if a download does not succeed, so a failed
+        request is never saved to disk as if it were data.
+        """
+        # install_lib is the directory this command installs packages into. Unlike
+        # a hand-built "<prefix>/lib/site-packages" path it is valid on every
+        # platform and follows --user/--prefix and virtual environments.
+        data_dir = os.path.join(self.install_lib, 'geoparser', 'geonames')
+        os.makedirs(data_dir, exist_ok=True)
+        file_links = [
+            "https://download.geonames.org/export/dump/allCountries.zip",
+            "https://download.geonames.org/export/dump/admin1CodesASCII.txt",
+            "https://download.geonames.org/export/dump/admin2Codes.txt",
+            "https://download.geonames.org/export/dump/countryInfo.txt",
+            "https://download.geonames.org/export/dump/featureCodes_en.txt"
+        ]
+        for url in file_links:
+            filename = url.split('/')[-1]
+            file_path = os.path.join(data_dir, filename)
+            # Stream to disk in chunks: the archive can be large, so avoid holding
+            # the whole response body in memory. The timeout keeps a stalled
+            # connection from hanging the installation forever.
+            with requests.get(url, stream=True, timeout=60) as response:
+                response.raise_for_status()
+                with open(file_path, 'wb') as f:
+                    for chunk in response.iter_content(chunk_size=1024 * 1024):
+                        f.write(chunk)
+            if filename.endswith('.zip'):
+                with zipfile.ZipFile(file_path, 'r') as zip_ref:
+                    zip_ref.extractall(data_dir)
+                # Only the extracted text file is needed afterwards.
+                os.remove(file_path)
+# Read the long description with an explicit encoding and close the file
+# deterministically instead of leaving the handle to the garbage collector.
+with open('README.md', encoding='utf-8') as readme_file:
+    long_description = readme_file.read()
+setup(
+    name='geoparser',
+    version='0.1.0',
+    author='Diego Gomes',
+    author_email='diego.gomes@uzh.ch',
+    packages=find_packages(),
+    description='A geoparsing library for English texts',
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    install_requires=[
+        'pandas',
+        'numpy',
+        'spacy',
+        'sentence_transformers',
+        'tqdm',
+        'torch',
+        'requests'
+    ],
+    python_requires='>=3.9',
+    classifiers=[
+        'Development Status :: 2 - Pre-Alpha',
+        'Intended Audience :: Science/Research',
+        'License :: OSI Approved :: MIT License',
+    ],
+    # Replace the stock install command so the model and data downloads run
+    # as part of installation.
+    cmdclass={
+        'install': PostInstallCommand,
+    }
+)