nomenklatura-mpt 4.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nomenklatura/__init__.py +11 -0
- nomenklatura/cache.py +194 -0
- nomenklatura/cli.py +260 -0
- nomenklatura/conflicting_match.py +80 -0
- nomenklatura/data/er-unstable.pkl +0 -0
- nomenklatura/data/regression-v1.pkl +0 -0
- nomenklatura/db.py +139 -0
- nomenklatura/delta.py +4 -0
- nomenklatura/enrich/__init__.py +94 -0
- nomenklatura/enrich/aleph.py +141 -0
- nomenklatura/enrich/common.py +219 -0
- nomenklatura/enrich/nominatim.py +72 -0
- nomenklatura/enrich/opencorporates.py +233 -0
- nomenklatura/enrich/openfigi.py +124 -0
- nomenklatura/enrich/permid.py +201 -0
- nomenklatura/enrich/wikidata.py +268 -0
- nomenklatura/enrich/yente.py +116 -0
- nomenklatura/exceptions.py +9 -0
- nomenklatura/index/__init__.py +5 -0
- nomenklatura/index/common.py +24 -0
- nomenklatura/index/entry.py +89 -0
- nomenklatura/index/index.py +170 -0
- nomenklatura/index/tokenizer.py +92 -0
- nomenklatura/judgement.py +21 -0
- nomenklatura/kv.py +40 -0
- nomenklatura/matching/__init__.py +47 -0
- nomenklatura/matching/bench.py +32 -0
- nomenklatura/matching/compare/__init__.py +0 -0
- nomenklatura/matching/compare/addresses.py +71 -0
- nomenklatura/matching/compare/countries.py +15 -0
- nomenklatura/matching/compare/dates.py +83 -0
- nomenklatura/matching/compare/gender.py +15 -0
- nomenklatura/matching/compare/identifiers.py +30 -0
- nomenklatura/matching/compare/names.py +157 -0
- nomenklatura/matching/compare/util.py +51 -0
- nomenklatura/matching/compat.py +66 -0
- nomenklatura/matching/erun/__init__.py +0 -0
- nomenklatura/matching/erun/countries.py +42 -0
- nomenklatura/matching/erun/identifiers.py +64 -0
- nomenklatura/matching/erun/misc.py +71 -0
- nomenklatura/matching/erun/model.py +110 -0
- nomenklatura/matching/erun/names.py +126 -0
- nomenklatura/matching/erun/train.py +135 -0
- nomenklatura/matching/erun/util.py +28 -0
- nomenklatura/matching/logic_v1/__init__.py +0 -0
- nomenklatura/matching/logic_v1/identifiers.py +104 -0
- nomenklatura/matching/logic_v1/model.py +76 -0
- nomenklatura/matching/logic_v1/multi.py +21 -0
- nomenklatura/matching/logic_v1/phonetic.py +142 -0
- nomenklatura/matching/logic_v2/__init__.py +0 -0
- nomenklatura/matching/logic_v2/identifiers.py +124 -0
- nomenklatura/matching/logic_v2/model.py +98 -0
- nomenklatura/matching/logic_v2/names/__init__.py +3 -0
- nomenklatura/matching/logic_v2/names/analysis.py +51 -0
- nomenklatura/matching/logic_v2/names/distance.py +181 -0
- nomenklatura/matching/logic_v2/names/magic.py +60 -0
- nomenklatura/matching/logic_v2/names/match.py +195 -0
- nomenklatura/matching/logic_v2/names/pairing.py +81 -0
- nomenklatura/matching/logic_v2/names/util.py +89 -0
- nomenklatura/matching/name_based/__init__.py +4 -0
- nomenklatura/matching/name_based/misc.py +86 -0
- nomenklatura/matching/name_based/model.py +59 -0
- nomenklatura/matching/name_based/names.py +59 -0
- nomenklatura/matching/pairs.py +42 -0
- nomenklatura/matching/regression_v1/__init__.py +0 -0
- nomenklatura/matching/regression_v1/misc.py +75 -0
- nomenklatura/matching/regression_v1/model.py +110 -0
- nomenklatura/matching/regression_v1/names.py +63 -0
- nomenklatura/matching/regression_v1/train.py +87 -0
- nomenklatura/matching/regression_v1/util.py +31 -0
- nomenklatura/matching/svm_v1/__init__.py +5 -0
- nomenklatura/matching/svm_v1/misc.py +94 -0
- nomenklatura/matching/svm_v1/model.py +168 -0
- nomenklatura/matching/svm_v1/names.py +81 -0
- nomenklatura/matching/svm_v1/train.py +186 -0
- nomenklatura/matching/svm_v1/util.py +30 -0
- nomenklatura/matching/types.py +227 -0
- nomenklatura/matching/util.py +62 -0
- nomenklatura/publish/__init__.py +0 -0
- nomenklatura/publish/dates.py +49 -0
- nomenklatura/publish/edges.py +32 -0
- nomenklatura/py.typed +0 -0
- nomenklatura/resolver/__init__.py +6 -0
- nomenklatura/resolver/common.py +2 -0
- nomenklatura/resolver/edge.py +107 -0
- nomenklatura/resolver/identifier.py +60 -0
- nomenklatura/resolver/linker.py +101 -0
- nomenklatura/resolver/resolver.py +565 -0
- nomenklatura/settings.py +17 -0
- nomenklatura/store/__init__.py +41 -0
- nomenklatura/store/base.py +130 -0
- nomenklatura/store/level.py +272 -0
- nomenklatura/store/memory.py +102 -0
- nomenklatura/store/redis_.py +131 -0
- nomenklatura/store/sql.py +219 -0
- nomenklatura/store/util.py +48 -0
- nomenklatura/store/versioned.py +371 -0
- nomenklatura/tui/__init__.py +17 -0
- nomenklatura/tui/app.py +294 -0
- nomenklatura/tui/app.tcss +52 -0
- nomenklatura/tui/comparison.py +81 -0
- nomenklatura/tui/util.py +35 -0
- nomenklatura/util.py +26 -0
- nomenklatura/versions.py +119 -0
- nomenklatura/wikidata/__init__.py +14 -0
- nomenklatura/wikidata/client.py +122 -0
- nomenklatura/wikidata/lang.py +94 -0
- nomenklatura/wikidata/model.py +139 -0
- nomenklatura/wikidata/props.py +70 -0
- nomenklatura/wikidata/qualified.py +49 -0
- nomenklatura/wikidata/query.py +66 -0
- nomenklatura/wikidata/value.py +87 -0
- nomenklatura/xref.py +125 -0
- nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
- nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
- nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
- nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
- nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,122 @@
|
|
1
|
+
import json
|
2
|
+
import logging
|
3
|
+
from functools import lru_cache
|
4
|
+
from typing import Any, List, Optional, Dict
|
5
|
+
from requests import Session
|
6
|
+
from normality import squash_spaces
|
7
|
+
from rigour.urls import build_url
|
8
|
+
from nomenklatura.cache import Cache
|
9
|
+
from nomenklatura.wikidata.lang import LangText
|
10
|
+
from nomenklatura.wikidata.model import Item
|
11
|
+
from nomenklatura.wikidata.query import SparqlResponse
|
12
|
+
|
13
|
+
log = logging.getLogger(__name__)
|
14
|
+
|
15
|
+
|
16
|
+
class WikidataClient(object):
    """HTTP client for the Wikidata APIs, backed by a persistent `Cache`.

    Fetches items via the `wbgetentities` action API, resolves entity labels,
    and runs SPARQL queries against the public query service. All responses
    are cached (via `self.cache`) and additionally memoized in-process with
    `lru_cache` on the methods below.
    """

    WD_API = "https://www.wikidata.org/w/api.php"
    QUERY_API = "https://query.wikidata.org/sparql"
    QUERY_HEADERS = {
        "Accept": "application/sparql-results+json",
    }
    # Cache lifetimes, in days:
    CACHE_SHORT = 1
    CACHE_MEDIUM = CACHE_SHORT * 7
    CACHE_LONG = CACHE_SHORT * 30

    # Key prefix under which resolved labels are stored in the cache:
    LABEL_PREFIX = "wd:lb:"
    LABEL_CACHE_DAYS = 100

    def __init__(
        self, cache: Cache, session: Optional[Session] = None, cache_days: int = 14
    ) -> None:
        """Create a client using `cache` for persistence; an HTTP `session`
        is created on demand when none is given."""
        self.cache = cache
        self.session = session or Session()
        self.cache_days = cache_days
        # self.cache.preload(f"{self.LABEL_PREFIX}%")

    @lru_cache(maxsize=1000)
    def fetch_item(self, qid: str) -> Optional[Item]:
        """Fetch the full entity data for `qid`, or None when the API
        response contains no entity for that ID."""
        # https://www.mediawiki.org/wiki/Wikibase/API
        # https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities
        params = {"format": "json", "ids": qid, "action": "wbgetentities"}
        url = build_url(self.WD_API, params=params)
        raw = self.cache.get(url, max_age=self.cache_days)
        if raw is None:
            res = self.session.get(url)
            res.raise_for_status()
            raw = res.text
            self.cache.set(url, raw)
        data = json.loads(raw)
        entity = data.get("entities", {}).get(qid)
        if entity is None:
            return None
        return Item(self, entity)

    @lru_cache(maxsize=100000)
    def get_label(self, qid: str) -> LangText:
        """Resolve the preferred-language label for `qid`.

        Labels are cached under `LABEL_PREFIX` keys; when no label exists,
        the QID itself is used as the label text.
        """
        cache_key = f"{self.LABEL_PREFIX}{qid}"
        cached = self.cache.get_json(cache_key, max_age=self.LABEL_CACHE_DAYS)
        if cached is not None:
            return LangText.parse(cached)
        params = {
            "format": "json",
            "ids": qid,
            "action": "wbgetentities",
            "props": "labels",
        }
        url = build_url(self.WD_API, params=params)
        res = self.session.get(url)
        res.raise_for_status()
        data: Dict[str, Any] = res.json()
        entity = data.get("entities", {}).get(qid)
        if entity is None:
            return LangText(None)
        labels = LangText.from_dict(entity.get("labels", {}))
        label = LangText.pick(labels)
        if label is None:
            label = LangText(qid)
        # Record the QID as the original value so consumers can trace the
        # label back to the entity it names:
        label.original = qid
        self.cache.set_json(cache_key, label.pack())
        return label

    def query(self, query_text: str) -> SparqlResponse:
        """Query the Wikidata SPARQL endpoint."""
        clean_text = squash_spaces(query_text)
        if len(clean_text) == 0:
            raise RuntimeError("Invalid query: %r" % query_text)
        params = {"query": clean_text}
        url = build_url(self.QUERY_API, params=params)
        raw = self.cache.get(url, max_age=self.cache_days)
        if raw is None:
            res = self.session.get(url, headers=self.QUERY_HEADERS)
            res.raise_for_status()
            raw = res.text
            self.cache.set(url, raw)
        try:
            data = json.loads(raw)
        except json.JSONDecodeError as err:
            # Evict the broken payload so the next call re-fetches it:
            self.cache.delete(url)
            log.exception("Failed to parse JSON: %s", err)
            return SparqlResponse(clean_text, {})
        return SparqlResponse(clean_text, data)

    @lru_cache(maxsize=30000)
    def _type_props(self, qid: str) -> List[str]:
        """Return the direct `instance of` (P31) / `subclass of` (P279)
        type QIDs of `qid`, skipping claims with an end date."""
        item = self.fetch_item(qid)
        if item is None:
            return []
        types: List[str] = []
        for claim in item.claims:
            # historical countries are always historical:
            ended = claim.qualifiers.get("P582") is not None and claim.qid != "Q3024240"
            if ended or claim.qid is None:
                continue
            if claim.property in ("P31", "P279"):
                types.append(claim.qid)
        return types

    def __repr__(self) -> str:
        return "<WikidataClient()>"

    def __hash__(self) -> int:
        # NOTE(review): constant hash — presumably so instances work as
        # `lru_cache` keys for the memoized methods above; combined with the
        # default identity-based __eq__, caches remain per-instance. Confirm
        # before changing.
        return 42
|
@@ -0,0 +1,94 @@
|
|
1
|
+
import logging
|
2
|
+
from rigour.langs import PREFERRED_LANGS
|
3
|
+
from typing import Callable, Dict, Iterable, List, Optional, Any, Set
|
4
|
+
from followthemoney import registry, StatementEntity
|
5
|
+
from normality.cleaning import remove_unsafe_chars
|
6
|
+
|
7
|
+
log = logging.getLogger(__name__)
|
8
|
+
|
9
|
+
|
10
|
+
class LangText(object):
    """A piece of text together with its language code and the original
    value it was derived from."""

    __slots__ = ["text", "lang", "original"]

    def __init__(
        self,
        text: Optional[str],
        lang: Optional[str] = None,
        original: Optional[str] = None,
    ) -> None:
        """Normalise `text` (empty/whitespace-only becomes None) and clean
        the language code through the FtM registry."""
        if text is None or len(text.strip()) == 0:
            text = None
        if text is not None:
            text = remove_unsafe_chars(text)
        self.text = text
        self.lang: Optional[str] = None
        if lang is not None:
            self.lang = registry.language.clean_text(lang)
        if lang is not None and self.lang is None:
            # Language is given, but it is not one supported by the FtM ecosystem:
            self.text = None
        self.original = original or self.text

    def apply(
        self,
        entity: StatementEntity,
        prop: str,
        clean: Optional[Callable[[str], Optional[str]]] = None,
    ) -> None:
        """Add this text to `prop` on `entity`, optionally passing the text
        through the `clean` callable first. No-op when the text is empty."""
        if self.text is None:
            return
        clean_text = self.text if clean is None else clean(self.text)
        if clean_text is None or clean_text.strip() == "":
            return
        entity.add(prop, clean_text, lang=self.lang, original_value=self.original)

    def pack(self) -> Dict[str, Optional[str]]:
        """Serialise to a compact dict for caching (inverse of `parse`)."""
        data = {"t": self.text, "l": self.lang}
        # Only store the original when it differs from the text itself:
        if self.original is not None and self.original != self.text:
            data["o"] = self.original
        return data

    @classmethod
    def parse(cls, data: Dict[str, Optional[str]]) -> "LangText":
        """Re-create a LangText from a dict produced by `pack`."""
        return LangText(data["t"], data["l"], original=data.get("o"))

    @classmethod
    def pick(cls, texts: Iterable["LangText"]) -> Optional["LangText"]:
        """Pick the text in the most preferred language, falling back to
        any available text, or None when `texts` is empty."""
        # Materialize once: `texts` may be a one-shot iterator, and we need
        # to scan it once per preferred language below.
        candidates = list(texts)
        for lang in PREFERRED_LANGS:
            for lt in candidates:
                if lt.lang == lang:
                    return lt
        for lt in candidates:
            return lt
        return None

    @classmethod
    def from_dict(cls, data: Dict[str, List[Dict[str, str]]]) -> Set["LangText"]:
        """Build a set of LangTexts from a Wikidata labels/aliases mapping
        of language -> value object(s)."""
        langs: Set[LangText] = set()
        for objs in data.values():
            # Labels are single objects, aliases are lists of objects:
            if not isinstance(objs, list):
                objs = [objs]
            for obj in objs:
                value = obj["value"]
                if value is None:
                    continue
                lang = obj["language"]
                lt = LangText(value, lang, original=value)
                if lt.text is None:
                    continue
                langs.add(lt)
        return langs

    def __str__(self) -> str:
        if self.text is None:
            return ""
        return self.text

    def __hash__(self) -> int:
        return hash((self.text, self.lang, self.original))

    def __eq__(self, other: Any) -> bool:
        # Compare the actual attributes rather than hashes: equal hashes do
        # not imply equal values (hash collisions made the previous
        # hash-based comparison report spurious equality).
        if not isinstance(other, LangText):
            return False
        return (self.text, self.lang, self.original) == (
            other.text,
            other.lang,
            other.original,
        )

    def __repr__(self) -> str:
        return f"<LangText({self.text!r}, {self.lang!r}, {self.original!r})>"
|
@@ -0,0 +1,139 @@
|
|
1
|
+
from normality import stringify
|
2
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
|
3
|
+
|
4
|
+
from nomenklatura.wikidata.value import snak_value_to_string
|
5
|
+
from nomenklatura.wikidata.lang import LangText
|
6
|
+
|
7
|
+
if TYPE_CHECKING:
|
8
|
+
from nomenklatura.wikidata.client import WikidataClient
|
9
|
+
|
10
|
+
|
11
|
+
class Snak(object):
    """Some Notation About Knowledge (TM)."""

    def __init__(self, client: "WikidataClient", data: Dict[str, Any]):
        self.client = client
        # Consume the known keys off the raw snak payload:
        self.snaktype = data.pop("snaktype", None)
        self.property: Optional[str] = data.pop("property", None)
        self.type = data.pop("datatype", None)
        data.pop("hash", None)
        datavalue = data.pop("datavalue", {})
        self.value_type: str = datavalue.pop("type", None)
        self._value = datavalue.pop("value", None)
        # self._data = data

    @property
    def property_label(self) -> LangText:
        """Human-readable label of this snak's property."""
        return self.client.get_label(self.property)

    @property
    def qid(self) -> Optional[str]:
        """The target entity QID, when the value is an entity reference."""
        if self.value_type != "wikibase-entityid":
            return None
        return stringify(self._value.get("id"))

    @property
    def text(self) -> LangText:
        """The snak value rendered to a language-tagged string."""
        return snak_value_to_string(self.client, self.value_type, self._value)

    def __repr__(self) -> str:
        return f"<Snak({self.qid}, {self.property}, {self.value_type})>"
|
41
|
+
|
42
|
+
|
43
|
+
class Reference(object):
    """A set of sourcing snaks attached to a claim, keyed by property."""

    def __init__(self, client: "WikidataClient", data: Dict[str, Any]) -> None:
        self.snaks: Dict[str, List[Snak]] = {
            prop: [Snak(client, raw) for raw in raw_snaks]
            for prop, raw_snaks in data.pop("snaks", {}).items()
        }

    def get(self, prop: str) -> List[Snak]:
        """All snaks for the given property, or an empty list."""
        return self.snaks.get(prop, [])
|
51
|
+
|
52
|
+
|
53
|
+
class Claim(Snak):
    """A top-level snak on an item, carrying qualifiers and references.

    `prop` is the property under which the claim was listed on the item;
    it is used as a fallback when the main snak carries no property.
    """

    def __init__(
        self, client: "WikidataClient", data: Dict[str, Any], prop: str
    ) -> None:
        self.id = data.pop("id")
        self.rank = data.pop("rank")
        super().__init__(client, data.pop("mainsnak"))
        self.qualifiers: Dict[str, List[Snak]] = {}
        # Use a dedicated loop variable: re-using `prop` here would shadow
        # the parameter, so the fallback assignment below could pick up the
        # last qualifier's property instead of the claim's listing property.
        for qual_prop, snaks in data.pop("qualifiers", {}).items():
            self.qualifiers[qual_prop] = [Snak(client, s) for s in snaks]

        self.references = [Reference(client, r) for r in data.pop("references", [])]
        # Fall back to the property the claim was listed under:
        self.property = self.property or prop

    def get_qualifier(self, prop: str) -> List[Snak]:
        """All qualifier snaks for the given property, or an empty list."""
        return self.qualifiers.get(prop, [])

    @property
    def is_ended(self) -> bool:
        """True when the claim carries an `end time` (P582) qualifier."""
        snak = self.qualifiers.get("P582")
        if snak is not None and len(snak) > 0:
            return True
        return False

    def __repr__(self) -> str:
        return f"<Claim({self.qid}, {self.property}, {self.value_type})>"

    def __hash__(self) -> int:
        return hash((self.qid, self.property, self.id))
|
82
|
+
|
83
|
+
|
84
|
+
class Item(object):
    """A wikidata item (or entity)."""

    def __init__(self, client: "WikidataClient", data: Dict[str, Any]) -> None:
        self.client = client
        self.id: str = data.pop("id")
        self.modified: Optional[str] = data.pop("modified", None)

        self.labels: Set[LangText] = LangText.from_dict(data.pop("labels", {}))
        self.aliases: Set[LangText] = LangText.from_dict(data.pop("aliases", {}))

        descriptions = LangText.from_dict(data.pop("descriptions", {}))
        self.description = LangText.pick(descriptions)

        # Flatten the property -> [claim, ...] mapping into one list:
        claim_data: Dict[str, List[Dict[str, Any]]] = data.pop("claims", {})
        self.claims: List[Claim] = [
            Claim(client, raw, prop)
            for prop, raw_claims in claim_data.items()
            for raw in raw_claims
        ]

        # TODO: get back to this later:
        data.pop("sitelinks", None)

    @property
    def label(self) -> Optional[LangText]:
        """The preferred label, falling back to an alias when no label is
        available in any preferred language."""
        picked = LangText.pick(self.labels)
        if picked is not None:
            return picked
        return LangText.pick(self.aliases)

    def is_instance(self, qid: str) -> bool:
        """True when the item has an `instance of` (P31) claim of `qid`."""
        return any(c.property == "P31" and c.qid == qid for c in self.claims)

    def _types(self, path: List[str]) -> Set[str]:
        """Recursively collect type QIDs along `path`, capping depth."""
        current = path[-1]
        collected: Set[str] = {current}
        # Stop expanding very deep type hierarchies:
        if len(path) > 6:
            return collected
        for parent in self.client._type_props(current):
            if parent not in path:
                collected.update(self._types(path + [parent]))
        return collected

    @property
    def types(self) -> Set[str]:
        """Get all the `instance of` and `subclass of` types for an item."""
        return self._types([self.id])

    def __repr__(self) -> str:
        return f"<Item({self.id})>"

    def __hash__(self) -> int:
        return hash(self.id)
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# Query: https://w.wiki/4Z73
# Wikidata property IDs (Pxxx) describing family relationships between
# people, mapped to a human-readable relationship label.
PROPS_FAMILY = {
    "P7": "sibling",
    "P9": "sibling",
    "P22": "parent",
    "P26": "spouse",
    "P25": "parent",
    "P40": "child",
    "P43": "stepparent",
    "P44": "stepparent",
    "P451": "unmarried partner",
    "P1038": "relative",
    "P1290": "godparent",
    "P3373": "sibling",
    "P3448": "stepparent",
    "P8810": "unspecified parent",
}

# Wikidata properties describing non-family associations between people.
PROPS_ASSOCIATION = {
    "P1327": "partner in business or sport",
    "P3342": "significant person",
}

# https://www.wikidata.org/wiki/Wikidata:List_of_properties/human
# Wikidata properties mapped directly to FollowTheMoney property names.
PROPS_DIRECT = {
    "P1477": "alias",  # birth name
    "P1813": "alias",  # short name
    "P2561": "alias",  # name
    "P1559": "alias",  # name in native language
    "P2562": "alias",  # married name
    "P511": "title",
    "P735": "firstName",
    "P734": "lastName",
    "P1950": "lastName",
    "P21": "gender",
    "P39": "position",
    "P140": "religion",
    "P106": "topics",
    "P569": "birthDate",
    "P5056": "fatherName",
    "P570": "deathDate",
    "P19": "birthPlace",
    "P856": "website",
    "P512": "education",
    "P69": "education",
    "P27": "citizenship",
    "P742": "weakAlias",
    "P172": "ethnicity",
    "P973": "sourceUrl",
    "P1278": "leiCode",
    "P17": "country",
    "P571": "incorporationDate",
    "P1454": "legalForm",
}

# FtM property names (values of PROPS_DIRECT) whose values should be
# qualified with date ranges.
PROPS_QUALIFIED = (
    "position",
    "education",
)

# Wikidata type QIDs mapped to topic tags.
PROPS_TOPICS = {
    "Q82955": "role.pep",
    "Q193391": "role.diplo",
    # "Q392651": "role.spy",
    "Q14886050": "crime.terror",
    "Q16533": "role.judge",
    "Q17276321": "role.pep",  # member of the state duma
    "Q189290": "mil",  # military officer
    "Q47064": "mil",  # military personnel
}
|
@@ -0,0 +1,49 @@
|
|
1
|
+
from typing import Set
|
2
|
+
from followthemoney.helpers import dates_years
|
3
|
+
|
4
|
+
from nomenklatura.wikidata.model import Claim
|
5
|
+
from nomenklatura.wikidata.lang import LangText
|
6
|
+
|
7
|
+
|
8
|
+
def post_summary(
    position: LangText,
    start_dates: Set[str],
    end_dates: Set[str],
    dates: Set[str],
) -> LangText:
    """Make a string summary for a Post object.

    Renders e.g. "Minister (2004-2009)". When only undated point-in-time
    qualifiers exist, the years are joined with commas instead. The returned
    LangText keeps the position's language and original value.
    """
    start = min(dates_years(start_dates), default="")
    # Use the latest end year: with several end dates the range should
    # extend to the final one, not be truncated to the earliest (min).
    end = max(dates_years(end_dates), default="")
    date_range = None
    if len(start) or len(end):
        date_range = f"{start}-{end}"
    dates_ = dates_years(dates)
    if date_range is None and len(dates_):
        date_range = ", ".join(sorted(dates_))

    label = position.text
    if date_range:
        label = f"{label} ({date_range})"
    original = position.text or position.original
    return LangText(label, position.lang, original=original)
|
29
|
+
|
30
|
+
|
31
|
+
def qualify_value(value: LangText, claim: Claim) -> LangText:
    """Augment `value` with a date range derived from the claim's start
    (P580), end (P582) and point-in-time (P585) qualifiers."""
    if value.text is None:
        return value

    def _qualifier_texts(prop: str) -> Set[str]:
        # Collect the non-empty rendered texts of all qualifiers of `prop`:
        found: Set[str] = set()
        for qual in claim.get_qualifier(prop):
            text = qual.text.text
            if text is not None:
                found.add(text)
        return found

    starts = _qualifier_texts("P580")
    ends = _qualifier_texts("P582")
    dates = _qualifier_texts("P585")
    return post_summary(value, starts, ends, dates)
|
@@ -0,0 +1,66 @@
|
|
1
|
+
from typing import Dict, Any, List, Optional
|
2
|
+
|
3
|
+
|
4
|
+
class SparqlValue(object):
    """A single value cell of a SPARQL result row; Wikidata entity URIs
    are shortened to their bare QID."""

    WD_PREFIX = "http://www.wikidata.org/entity/"

    __slots__ = ["type", "value", "lang"]

    def __init__(self, data: Dict[str, Any]) -> None:
        self.type: str = data["type"]
        raw: str = data["value"]
        if self.type == "uri" and raw.startswith(self.WD_PREFIX):
            # Strip the entity namespace, keeping only the QID:
            raw = raw[len(self.WD_PREFIX) :]
        self.value = raw
        self.lang: Optional[str] = data.get("xml:lang")

    def __str__(self) -> str:
        return self.value

    def __repr__(self) -> str:
        return f"<SV({self.type!r}, {self.value!r})>"

    def __hash__(self) -> int:
        return hash(repr(self))
|
24
|
+
|
25
|
+
|
26
|
+
class SparqlBinding(object):
    """One result row of a SPARQL response, mapping variables to values."""

    def __init__(self, response: "SparqlResponse", data: Dict[str, Any]) -> None:
        self.response = response
        self.values: Dict[str, SparqlValue] = {
            var: SparqlValue(raw) for var, raw in data.items()
        }

    def wrapped(self, var: str) -> Optional[SparqlValue]:
        """The wrapped value bound to `var`, or None when unbound.

        Raises KeyError when `var` is not a declared query variable."""
        if var not in self.response.vars:
            raise KeyError("No such var: %s (in: %r)" % (var, self.response.vars))
        return self.values.get(var)

    def plain(self, var: str) -> Optional[str]:
        """The string form of the value bound to `var`, or None when unbound.

        Raises KeyError when `var` is not a declared query variable."""
        value = self.wrapped(var)
        if value is None:
            return None
        return str(value)

    def __repr__(self) -> str:
        return f"<SparqlBinding({self.values!r})>"
|
52
|
+
|
53
|
+
|
54
|
+
class SparqlResponse(object):
    """Parsed SPARQL JSON results: declared variables plus result rows."""

    def __init__(self, query: str, response: Dict[str, Any]) -> None:
        self.query = query
        # Tolerate an empty or partial payload: the client constructs
        # SparqlResponse(query, {}) on a JSON decode failure, which would
        # previously crash here with a KeyError on "head".
        self.vars: List[str] = response.get("head", {}).get("vars", [])
        self.results: List[SparqlBinding] = []
        for bind in response.get("results", {}).get("bindings", []):
            self.results.append(SparqlBinding(self, bind))

    def __len__(self) -> int:
        return len(self.results)

    def __repr__(self) -> str:
        return f"<SparqlResponse({self.vars!r}, {len(self)})>"
|
@@ -0,0 +1,87 @@
|
|
1
|
+
import logging
|
2
|
+
from prefixdate import Precision
|
3
|
+
from typing import TYPE_CHECKING, Set, cast, Any, Dict, Optional
|
4
|
+
from rigour.ids.wikidata import is_qid
|
5
|
+
from rigour.text.cleaning import remove_emoji, remove_bracketed_text
|
6
|
+
from rigour.names import is_name
|
7
|
+
# from rigour.text.distance import is_levenshtein_plausible
|
8
|
+
|
9
|
+
from nomenklatura.wikidata.lang import LangText
|
10
|
+
|
11
|
+
if TYPE_CHECKING:
|
12
|
+
from nomenklatura.wikidata.client import WikidataClient
|
13
|
+
|
14
|
+
|
15
|
+
log = logging.getLogger(__name__)
|
16
|
+
PRECISION = {
|
17
|
+
11: Precision.DAY,
|
18
|
+
10: Precision.MONTH,
|
19
|
+
9: Precision.YEAR,
|
20
|
+
}
|
21
|
+
|
22
|
+
|
23
|
+
def snak_value_to_string(
    client: "WikidataClient", value_type: Optional[str], value: Dict[str, Any]
) -> LangText:
    """Render a Wikidata snak datavalue into a language-tagged string.

    Handles times (with precision truncation), entity references (resolved
    to labels via `client`), monolingual texts and quantities. Always
    returns a LangText; unknown value types are logged and yield an empty
    LangText(None).
    """
    if value_type is None:
        return LangText(None)
    elif value_type == "time":
        raw_time = cast(Optional[str], value.get("time"))
        if raw_time is None:
            return LangText(None)
        time = raw_time.strip("+")
        prec_id = cast(int, value.get("precision"))
        prec = PRECISION.get(prec_id, Precision.DAY)
        time = time[: prec.value]

        # Remove Jan 01, because it seems to be in input failure pattern
        # with Wikidata (probably from bots that don't get "precision").
        if time.endswith("-01-01"):
            time = time[:4]

        # Date limit in FtM. These will be removed by the death filter:
        time = max("1001", time)
        return LangText(time, original=raw_time)
    elif value_type == "wikibase-entityid":
        qid = value.get("id")
        if qid is None:
            # Guard against an entity reference without a target ID:
            return LangText(None)
        return client.get_label(qid)
    elif value_type == "monolingualtext":
        text = value.get("text")
        if isinstance(text, str):
            return LangText(text, lang=value.get("language"))
        # Explicit empty result: previously this branch could fall through
        # and implicitly return None, violating the declared return type.
        return LangText(None)
    elif value_type == "quantity":
        # Resolve unit name and make into string:
        raw_amount = cast(str, value.get("amount", ""))
        amount = raw_amount.lstrip("+")
        unit = value.get("unit", "")
        unit = unit.split("/")[-1]
        if is_qid(unit):
            unit = client.get_label(unit)
            amount = f"{amount} {unit}"
        return LangText(amount, original=raw_amount)
    elif isinstance(value, str):
        return LangText(value)
    else:
        log.warning("Unhandled value [%s]: %s", value_type, value)
    return LangText(None)
|
67
|
+
|
68
|
+
|
69
|
+
def clean_name(name: str) -> Optional[str]:
    """Clean a name for storage, try to throw out dangerous user inputs."""
    if not is_name(name):
        return None
    # Prefer the name with bracketed text removed, but only if it still
    # qualifies as a name afterwards:
    cleaned = remove_bracketed_text(name)
    if not is_name(cleaned):
        cleaned = name
    return remove_emoji(cleaned)
|
77
|
+
|
78
|
+
|
79
|
+
def is_alias_strong(alias: str, names: Set[str]) -> bool:
    """Check if an alias is a plausible nickname for a person, ie. shows some
    similarity to the actual name."""
    # Single-token aliases are always considered weak; multi-token ones are
    # accepted (the similarity check against `names` is currently disabled).
    # for name in names:
    #     if is_levenshtein_plausible(alias, name, max_edits=None, max_percent=0.7):
    #         return True
    return " " in alias
|