nomenklatura-mpt 4.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,94 @@
|
|
1
|
+
import logging
|
2
|
+
from requests import Session
|
3
|
+
from importlib import import_module
|
4
|
+
from typing import Iterable, Generator, Optional, Type, cast
|
5
|
+
from followthemoney import DS, SE
|
6
|
+
|
7
|
+
from nomenklatura.cache import Cache
|
8
|
+
from nomenklatura.matching import DefaultAlgorithm
|
9
|
+
from nomenklatura.enrich.common import Enricher, EnricherConfig
|
10
|
+
from nomenklatura.enrich.common import EnrichmentAbort, EnrichmentException
|
11
|
+
from nomenklatura.judgement import Judgement
|
12
|
+
from nomenklatura.matching.types import ScoringConfig
|
13
|
+
from nomenklatura.resolver import Resolver
|
14
|
+
|
15
|
+
log = logging.getLogger(__name__)
|
16
|
+
__all__ = [
|
17
|
+
"Enricher",
|
18
|
+
"EnrichmentAbort",
|
19
|
+
"EnrichmentException",
|
20
|
+
"make_enricher",
|
21
|
+
"enrich",
|
22
|
+
"match",
|
23
|
+
]
|
24
|
+
|
25
|
+
|
26
|
+
def make_enricher(
    dataset: DS,
    cache: Cache,
    config: EnricherConfig,
    http_session: Optional[Session] = None,
) -> Enricher[DS]:
    """Instantiate the enricher class named by ``config["type"]``.

    The type is given as a ``module.path:ClassName`` import path. The
    remaining config is passed through to the enricher's constructor.

    Raises:
        RuntimeError: if the import path is malformed, or the named object
            does not exist or is not an ``Enricher`` subclass.
    """
    enricher_type = config.pop("type")
    if ":" not in enricher_type:
        raise RuntimeError("Invalid import path: %r" % enricher_type)
    module_name, clazz_name = enricher_type.split(":", 1)
    module = import_module(module_name)
    # Use a None default so a missing attribute surfaces as the uniform
    # RuntimeError below instead of an undocumented AttributeError.
    clazz = getattr(module, clazz_name, None)
    if clazz is None or not issubclass(clazz, Enricher):
        raise RuntimeError("Invalid enricher: %r" % enricher_type)
    enr_clazz = cast(Type[Enricher[DS]], clazz)
    return enr_clazz(dataset, cache, config, session=http_session)
|
42
|
+
|
43
|
+
|
44
|
+
# nk match -i entities.json -o entities-with-matches.json -r resolver.json
# then:
# nk dedupe -i entities-with-matches.json -r resolver.json
def match(
    enricher: Enricher[DS],
    resolver: Resolver[SE],
    entities: Iterable[SE],
    config: Optional[ScoringConfig] = None,
) -> Generator[SE, None, None]:
    """Run the enricher against each entity, score the candidates it returns
    and record them as suggestions on the resolver.

    Every input entity is re-yielded unchanged, followed by any scored
    candidate entities (tagged with the enricher's dataset and passed through
    the resolver). EnrichmentException aborts matching for the current entity
    only; it is logged and the iteration continues.
    """
    if config is None:
        config = ScoringConfig.defaults()
    for entity in entities:
        yield entity
        try:
            # NB: loop variable renamed from `match` so it no longer shadows
            # this function's own name.
            for candidate in enricher.match_wrapped(entity):
                if entity.id is None or candidate.id is None:
                    continue
                # Skip pairs already decided (or blocked) in the resolver:
                if not resolver.check_candidate(entity.id, candidate.id):
                    continue
                if not entity.schema.can_match(candidate.schema):
                    continue
                result = DefaultAlgorithm.compare(entity, candidate, config)
                log.info("Match [%s]: %.2f -> %s", entity, result.score, candidate)
                resolver.suggest(entity.id, candidate.id, result.score)
                candidate.datasets.add(enricher.dataset.name)
                candidate = resolver.apply(candidate)
                yield candidate
        except EnrichmentException:
            # Lazy logging args instead of eager %-formatting:
            log.exception("Failed to match: %r", entity)
|
73
|
+
|
74
|
+
|
75
|
+
# nk enrich -i entities.json -r resolver.json -o combined.json
def enrich(
    enricher: Enricher[DS], resolver: Resolver[SE], entities: Iterable[SE]
) -> Generator[SE, None, None]:
    """For every entity with a positively-judged match in the resolver, fetch
    and yield the adjacent entities produced by the enricher.

    Unlike ``match``, input entities are NOT re-yielded; only expansion
    results are. EnrichmentException is logged per entity and iteration
    continues with the next one.
    """
    for entity in entities:
        try:
            for match in enricher.match_wrapped(entity):
                if entity.id is None or match.id is None:
                    continue
                # Only expand matches a human (or prior run) confirmed:
                judgement = resolver.get_judgement(match.id, entity.id)
                if judgement != Judgement.POSITIVE:
                    continue

                log.info("Enrich [%s]: %r", entity, match)
                for adjacent in enricher.expand_wrapped(entity, match):
                    adjacent.datasets.add(enricher.dataset.name)
                    adjacent = resolver.apply(adjacent)
                    yield adjacent
        except EnrichmentException:
            # Lazy logging args instead of eager %-formatting:
            log.exception("Failed to enrich: %r", entity)
|
@@ -0,0 +1,141 @@
|
|
1
|
+
import os
|
2
|
+
import uuid
|
3
|
+
import logging
|
4
|
+
from banal import is_mapping, ensure_list, hash_data
|
5
|
+
from typing import Any, Dict, cast, Generator, Optional
|
6
|
+
from urllib.parse import urljoin
|
7
|
+
from functools import cached_property
|
8
|
+
from followthemoney.exc import InvalidData
|
9
|
+
from followthemoney.namespace import Namespace
|
10
|
+
from followthemoney import DS, SE
|
11
|
+
from requests import Session
|
12
|
+
from rigour.urls import build_url
|
13
|
+
|
14
|
+
from nomenklatura.cache import Cache
|
15
|
+
from nomenklatura.enrich.common import Enricher, EnricherConfig
|
16
|
+
|
17
|
+
log = logging.getLogger(__name__)
|
18
|
+
|
19
|
+
|
20
|
+
class AlephEnricher(Enricher[DS]):
    """Enricher backed by the HTTP API of an Aleph instance
    (https://aleph.occrp.org by default)."""

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config, session)
        # Host and API key default to environment variables; explicit config
        # values (with env-var expansion) take precedence:
        self._host: str = os.environ.get("ALEPH_HOST", "https://aleph.occrp.org/")
        self._host = self.get_config_expand("host") or self._host
        self._base_url: str = urljoin(self._host, "/api/2/")
        self._collection: Optional[str] = self.get_config_expand("collection")
        self._ns: Optional[Namespace] = None
        if self.get_config_bool("strip_namespace"):
            self._ns = Namespace()
        self._api_key: Optional[str] = os.environ.get("ALEPH_API_KEY")
        self._api_key = self.get_config_expand("api_key") or self._api_key
        if self._api_key is not None:
            self.session.headers["Authorization"] = f"ApiKey {self._api_key}"
        # Tag every request from this run with one session UUID:
        self.session.headers["X-Aleph-Session"] = str(uuid.uuid4())

    @cached_property
    def collection_id(self) -> Optional[str]:
        """Resolve the configured collection foreign_id to the numeric
        collection ID used by the API; None if unconfigured or not found."""
        if self._collection is None:
            return None
        url = urljoin(self._base_url, "collections")
        url = build_url(url, {"filter:foreign_id": self._collection})
        res = self.session.get(url)
        res.raise_for_status()
        response = res.json()
        # Use the first search result, if any:
        for result in response.get("results", []):
            return cast(str, result["id"])
        return None

    def load_aleph_entity(self, entity: SE, data: Dict[str, Any]) -> Optional[SE]:
        """Convert an Aleph API entity payload into an entity proxy of the
        same sub-type as the query entity; None on schema mismatch."""
        data["referents"] = [data["id"]]
        try:
            proxy = super().load_entity(entity, data)
        except InvalidData:
            # The server may run a different FtM model version than we do:
            log.warning("Server model mismatch: %s", data.get("schema"))
            return None
        links = data.get("links", {})
        proxy.add("alephUrl", links.get("self"), quiet=True, cleaned=True)
        collection = data.get("collection", {})
        proxy.add("publisher", collection.get("label"), quiet=True, cleaned=True)
        return proxy

    def convert_nested(
        self, entity: SE, data: Dict[str, Any]
    ) -> Generator[SE, None, None]:
        """Yield the entity for the given payload, plus any entities nested
        inside its property values."""
        proxy = self.load_aleph_entity(entity, data)
        if proxy is not None:
            if self._ns is not None:
                # Bug fix: strip the namespace from the entity that is
                # actually yielded. Previously the namespace was applied to
                # the query entity and the result discarded, making the
                # strip_namespace option a no-op here.
                proxy = self._ns.apply(proxy)
            yield proxy
        properties = data.get("properties", {})
        for _, values in properties.items():
            for value in ensure_list(values):
                if is_mapping(value):
                    nested = self.load_aleph_entity(entity, value)
                    if nested is not None:
                        yield nested

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Query the Aleph matching API for candidate entities."""
        if not entity.schema.matchable:
            return
        url = urljoin(self._base_url, "match")
        if self.collection_id is not None:
            url = build_url(url, {"collection_ids": self.collection_id})
        query = {
            "schema": entity.schema.name,
            "properties": entity.properties,
        }
        # Entities without an ID are cached under a hash of the query:
        cache_id = entity.id or hash_data(query)
        cache_key = f"{url}:{cache_id}"
        response = self.http_post_json_cached(url, cache_key, query)
        for result in response.get("results", []):
            proxy = self.load_aleph_entity(entity, result)
            if proxy is not None:
                if self._ns is not None:
                    # Bug fix: apply the namespace to the yielded candidate,
                    # not to the (discarded copy of the) query entity.
                    proxy = self._ns.apply(proxy)
                yield proxy

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Fetch the full record for a confirmed match and yield it together
        with its nested entities."""
        url = urljoin(self._base_url, f"entities/{match.id}")
        for aleph_url in match.get("alephUrl", quiet=True):
            if aleph_url.startswith(self._base_url):
                # NOTE(review): URLs that start with self._base_url already
                # contain /api/2/, so this replace produces a doubled
                # /api/2/api/2/ path. Confirm whether this condition should
                # test self._host instead.
                url = aleph_url.replace("/entities/", "/api/2/entities/")
        response = self.http_get_json_cached(url)
        yield from self.convert_nested(match, response)
|
@@ -0,0 +1,219 @@
|
|
1
|
+
import os
|
2
|
+
import json
|
3
|
+
import logging
|
4
|
+
import traceback
|
5
|
+
from banal import as_bool
|
6
|
+
from typing import Union, Any, Dict, Optional, Generator, Generic
|
7
|
+
from abc import ABC, abstractmethod
|
8
|
+
from requests import Session
|
9
|
+
from requests.exceptions import RequestException, ChunkedEncodingError
|
10
|
+
from followthemoney import DS, registry
|
11
|
+
from followthemoney import StatementEntity, SE
|
12
|
+
from followthemoney.types.topic import TopicType
|
13
|
+
from rigour.urls import build_url, ParamsType
|
14
|
+
|
15
|
+
from nomenklatura import __version__
|
16
|
+
from nomenklatura.cache import Cache
|
17
|
+
from nomenklatura.util import HeadersType
|
18
|
+
|
19
|
+
EnricherConfig = Dict[str, Any]
|
20
|
+
log = logging.getLogger(__name__)
|
21
|
+
|
22
|
+
|
23
|
+
class EnrichmentException(Exception):
    """A single enrichment lookup failed; callers (see the match/enrich
    loops in nomenklatura.enrich) log this and continue with the next
    entity."""

    pass
|
25
|
+
|
26
|
+
|
27
|
+
class EnrichmentAbort(Exception):
    """A non-recoverable enricher failure, raised on HTTP 401/403
    authorization errors; unlike EnrichmentException it is not caught by
    the enrichment loops and aborts the run."""

    pass
|
29
|
+
|
30
|
+
|
31
|
+
class BaseEnricher(Generic[DS]):
    """Configuration handling and entity filtering shared by all enrichers.

    Pops ``cache_days``, ``schemata`` and ``topics`` out of the config dict;
    the remaining keys are left for subclasses to consume."""

    def __init__(self, dataset: DS, cache: Cache, config: EnricherConfig):
        self.dataset = dataset
        self.cache = cache
        self.config = config
        # How long cached HTTP responses stay valid; 0 disables cache writes:
        self.cache_days = int(config.pop("cache_days", 90))
        self._filter_schemata = config.pop("schemata", [])
        self._filter_topics = config.pop("topics", [])

    def get_config_expand(
        self, name: str, default: Optional[str] = None
    ) -> Optional[str]:
        """Return a config value with environment variables expanded, or
        None if the key is absent and no default is given."""
        value = self.config.get(name, default)
        if value is None:
            return None
        return str(os.path.expandvars(value))

    def get_config_int(self, name: str, default: Union[int, str]) -> int:
        """Return a config value coerced to int."""
        return int(self.config.get(name, default))

    def get_config_bool(self, name: str, default: Union[bool, str] = False) -> bool:
        """Return a config value coerced to bool.

        (Return annotation fixed: ``as_bool`` yields a bool, not an int.)"""
        return as_bool(self.config.get(name, default))

    def _filter_entity(self, entity: StatementEntity) -> bool:
        """Return True if the entity passes the configured schema and topic
        filters (i.e. should be processed), False if it is filtered out.

        The previous docstring suggested the opposite polarity; see the
        ``if not self._filter_entity(entity)`` guards in the wrappers."""
        if len(self._filter_schemata):
            if entity.schema.name not in self._filter_schemata:
                return False
        _filter_topics = set(self._filter_topics)
        if "all" in _filter_topics:
            # "all" expands to every topic known to the registry:
            assert isinstance(registry.topic, TopicType)
            _filter_topics.update(registry.topic.names.keys())
        if len(_filter_topics):
            topics = set(entity.get_type_values(registry.topic))
            if not len(topics.intersection(_filter_topics)):
                return False
        return True
|
69
|
+
|
70
|
+
|
71
|
+
class Enricher(BaseEnricher[DS], ABC):
    """Abstract enricher base: adds a lazily-created HTTP session and
    cache-backed GET/POST helpers on top of BaseEnricher's config handling.

    Subclasses implement ``match`` (find candidate entities for a query
    entity) and ``expand`` (fetch further entities for a confirmed match)."""

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config)
        # Externally supplied session, or None until first use of `session`:
        self._session: Optional[Session] = session

    @property
    def session(self) -> Session:
        """The HTTP session, created on first access when none was passed in.

        Note the User-Agent header is only set on self-created sessions, not
        on one injected via the constructor."""
        if self._session is None:
            self._session = Session()
            self._session.headers["User-Agent"] = f"nomenklatura/{__version__}"
        return self._session

    def http_get_cached(
        self,
        url: str,
        params: ParamsType = None,
        hidden: ParamsType = None,
        cache_days: Optional[int] = None,
    ) -> str:
        """GET the URL and return the response body text, cached for
        ``cache_days`` (default ``self.cache_days``; <= 0 skips the write).

        ``hidden`` params (e.g. API keys) are added to the fetched URL but
        are deliberately kept out of the cache key.

        Raises:
            EnrichmentAbort: on HTTP 401/403 responses.
            EnrichmentException: on any other request failure.
        """
        url = build_url(url, params=params)
        cache_days_ = self.cache_days if cache_days is None else cache_days
        response = self.cache.get(url, max_age=cache_days_)
        if response is None:
            log.debug("HTTP GET: %s", url)
            # Hidden params go on the request URL only, not the cache key:
            hidden_url = build_url(url, params=hidden)
            try:
                resp = self.session.get(hidden_url)
                resp.raise_for_status()
            except RequestException as rex:
                if rex.response is not None and rex.response.status_code in (401, 403):
                    raise EnrichmentAbort("Authorization failure: %s" % url) from rex
                msg = "HTTP fetch failed [%s]: %s" % (url, rex)
                log.info(f"{msg}\n{traceback.format_exc()}")
                raise EnrichmentException(msg) from rex
            response = resp.text
            if cache_days_ > 0:
                self.cache.set(url, response)
        return response

    def http_remove_cache(self, url: str, params: ParamsType = None) -> None:
        """Drop the cached response for the given URL/params combination."""
        url = build_url(url, params=params)
        self.cache.delete(url)

    def http_get_json_cached(
        self,
        url: str,
        params: ParamsType = None,
        hidden: ParamsType = None,
        cache_days: Optional[int] = None,
    ) -> Any:
        """Like ``http_get_cached``, but parse the body as JSON."""
        res = self.http_get_cached(url, params, hidden=hidden, cache_days=cache_days)
        return json.loads(res)

    def http_post_json_cached(
        self,
        url: str,
        cache_key: str,
        json: Any = None,
        data: Any = None,
        headers: HeadersType = None,
        cache_days: Optional[int] = None,
        retry_chunked_encoding_error: int = 1,
    ) -> Any:
        """POST to the URL and return the parsed JSON response, cached under
        the caller-supplied ``cache_key`` (POST bodies are not part of the
        URL, so callers must build a stable key themselves).

        ``retry_chunked_encoding_error`` bounds the recursive retry below.

        Raises:
            EnrichmentAbort: on HTTP 401/403 responses.
            EnrichmentException: on any other request failure.
        """
        cache_days_ = self.cache_days if cache_days is None else cache_days
        resp_data = self.cache.get_json(cache_key, max_age=cache_days_)
        if resp_data is None:
            try:
                resp = self.session.post(url, json=json, data=data, headers=headers)
                resp.raise_for_status()
            except ChunkedEncodingError as rex:
                # Due to https://github.com/urllib3/urllib3/issues/2751#issuecomment-2567630065,
                # urllib3's Retry strategy will not retry on chunked encoding errors.
                # Since urllib won't retry it, retry it here (via a bounded
                # recursive call). urllib does close the connection.
                if (
                    "Response ended prematurely" in str(rex)
                    and retry_chunked_encoding_error > 0
                ):
                    log.info("Retrying due to chunked encoding error: %s", rex)
                    return self.http_post_json_cached(
                        url,
                        cache_key,
                        json=json,
                        data=data,
                        headers=headers,
                        cache_days=cache_days,
                        retry_chunked_encoding_error=retry_chunked_encoding_error - 1,
                    )

                msg = "HTTP POST failed [%s]: %s" % (url, rex)
                raise EnrichmentException(msg) from rex
            except RequestException as rex:
                if rex.response is not None and rex.response.status_code in (401, 403):
                    raise EnrichmentAbort("Authorization failure: %s" % url) from rex

                msg = "HTTP POST failed [%s]: %s" % (url, rex)
                log.info(f"{msg}\n{traceback.format_exc()}")
                raise EnrichmentException(msg) from rex
            resp_data = resp.json()
            if cache_days_ > 0:
                self.cache.set_json(cache_key, resp_data)
        return resp_data

    def _make_data_entity(
        self, entity: SE, data: Dict[str, Any], cleaned: bool = True
    ) -> SE:
        """Create an entity which is of the same sub-type of SE as the given
        query entity."""
        return type(entity).from_data(self.dataset, data, cleaned=cleaned)

    def load_entity(self, entity: SE, data: Dict[str, Any]) -> SE:
        """Build an entity proxy from raw API data, dropping stub
        properties."""
        proxy = self._make_data_entity(entity, data, cleaned=False)
        for prop in proxy.iterprops():
            if prop.stub:
                proxy.pop(prop)
        return proxy

    def make_entity(self, entity: SE, schema: str) -> SE:
        """Create a new entity of the given schema."""
        return self._make_data_entity(entity, {"schema": schema})

    def match_wrapped(self, entity: SE) -> Generator[SE, None, None]:
        """Run ``match``, skipping entities excluded by the configured
        schema/topic filters."""
        if not self._filter_entity(entity):
            return
        yield from self.match(entity)

    def expand_wrapped(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Run ``expand``, skipping entities excluded by the configured
        schema/topic filters."""
        if not self._filter_entity(entity):
            return
        yield from self.expand(entity, match)

    @abstractmethod
    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Yield candidate matches for the given entity."""
        raise NotImplementedError()

    @abstractmethod
    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Yield additional entities related to a confirmed match."""
        raise NotImplementedError()

    def close(self) -> None:
        """Close the response cache and any HTTP session held."""
        self.cache.close()
        if self._session is not None:
            self._session.close()
|
@@ -0,0 +1,72 @@
|
|
1
|
+
import logging
|
2
|
+
from normality import squash_spaces
|
3
|
+
from typing import Any, Dict, Iterable, Generator, Optional
|
4
|
+
|
5
|
+
from requests import Session
|
6
|
+
from followthemoney import DS, SE
|
7
|
+
from followthemoney import StatementEntity
|
8
|
+
|
9
|
+
from nomenklatura.cache import Cache
|
10
|
+
from nomenklatura.enrich.common import Enricher, EnricherConfig
|
11
|
+
|
12
|
+
|
13
|
+
log = logging.getLogger(__name__)
|
14
|
+
NOMINATIM = "https://nominatim.openstreetmap.org/search.php"
|
15
|
+
|
16
|
+
|
17
|
+
class NominatimEnricher(Enricher[DS]):
    """Geocode Address entities via the OpenStreetMap Nominatim search API."""

    def __init__(
        self,
        dataset: DS,
        cache: Cache,
        config: EnricherConfig,
        session: Optional[Session] = None,
    ):
        super().__init__(dataset, cache, config, session)
        # Warm the cache with all previously stored Nominatim responses:
        self.cache.preload(f"{NOMINATIM}%")

    def search_nominatim(self, address: StatementEntity) -> Iterable[Dict[str, Any]]:
        """Yield Nominatim results for the first usable 'full' value of the
        given address entity."""
        for raw_full in address.get("full"):
            normalized = squash_spaces(raw_full)
            if len(normalized) < 5:
                log.warning("Skipping tiny address: %s", raw_full)
                continue
            query = {
                "q": normalized,
                "countrycodes": address.get("country"),
                "format": "jsonv2",
                "accept-language": "en",
                "addressdetails": 1,
            }
            hits = self.http_get_json_cached(NOMINATIM, query)
            log.info("OpenStreetMap geocoded [%s]: %d results", raw_full, len(hits))
            yield from hits
            # FIXME: only best result for now.
            return

    def match(self, entity: SE) -> Generator[SE, None, None]:
        """Yield geocoded Address entities for an Address query entity."""
        if not entity.schema.is_a("Address"):
            return

        for hit in self.search_nominatim(entity):
            located = self.make_entity(entity, "Address")
            osm_type = hit.get("osm_type")
            osm_id = hit.get("osm_id")
            if osm_id is None or osm_type is None:
                continue
            located.id = f"osm-{osm_type}-{osm_id}"
            located.add("full", hit["display_name"])
            details: Dict[str, str] = hit.get("address", {})
            located.add("country", details.get("country"))
            located.add("country", details.get("country_code"))
            located.add("city", details.get("city"))
            located.add("state", details.get("state"))
            located.add("postalCode", details.get("postcode"))
            yield located

    def expand(self, entity: SE, match: SE) -> Generator[SE, None, None]:
        """Addresses carry no further graph context; yield the match as-is."""
        yield match
|