nomenklatura-mpt 4.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118) hide show
  1. nomenklatura/__init__.py +11 -0
  2. nomenklatura/cache.py +194 -0
  3. nomenklatura/cli.py +260 -0
  4. nomenklatura/conflicting_match.py +80 -0
  5. nomenklatura/data/er-unstable.pkl +0 -0
  6. nomenklatura/data/regression-v1.pkl +0 -0
  7. nomenklatura/db.py +139 -0
  8. nomenklatura/delta.py +4 -0
  9. nomenklatura/enrich/__init__.py +94 -0
  10. nomenklatura/enrich/aleph.py +141 -0
  11. nomenklatura/enrich/common.py +219 -0
  12. nomenklatura/enrich/nominatim.py +72 -0
  13. nomenklatura/enrich/opencorporates.py +233 -0
  14. nomenklatura/enrich/openfigi.py +124 -0
  15. nomenklatura/enrich/permid.py +201 -0
  16. nomenklatura/enrich/wikidata.py +268 -0
  17. nomenklatura/enrich/yente.py +116 -0
  18. nomenklatura/exceptions.py +9 -0
  19. nomenklatura/index/__init__.py +5 -0
  20. nomenklatura/index/common.py +24 -0
  21. nomenklatura/index/entry.py +89 -0
  22. nomenklatura/index/index.py +170 -0
  23. nomenklatura/index/tokenizer.py +92 -0
  24. nomenklatura/judgement.py +21 -0
  25. nomenklatura/kv.py +40 -0
  26. nomenklatura/matching/__init__.py +47 -0
  27. nomenklatura/matching/bench.py +32 -0
  28. nomenklatura/matching/compare/__init__.py +0 -0
  29. nomenklatura/matching/compare/addresses.py +71 -0
  30. nomenklatura/matching/compare/countries.py +15 -0
  31. nomenklatura/matching/compare/dates.py +83 -0
  32. nomenklatura/matching/compare/gender.py +15 -0
  33. nomenklatura/matching/compare/identifiers.py +30 -0
  34. nomenklatura/matching/compare/names.py +157 -0
  35. nomenklatura/matching/compare/util.py +51 -0
  36. nomenklatura/matching/compat.py +66 -0
  37. nomenklatura/matching/erun/__init__.py +0 -0
  38. nomenklatura/matching/erun/countries.py +42 -0
  39. nomenklatura/matching/erun/identifiers.py +64 -0
  40. nomenklatura/matching/erun/misc.py +71 -0
  41. nomenklatura/matching/erun/model.py +110 -0
  42. nomenklatura/matching/erun/names.py +126 -0
  43. nomenklatura/matching/erun/train.py +135 -0
  44. nomenklatura/matching/erun/util.py +28 -0
  45. nomenklatura/matching/logic_v1/__init__.py +0 -0
  46. nomenklatura/matching/logic_v1/identifiers.py +104 -0
  47. nomenklatura/matching/logic_v1/model.py +76 -0
  48. nomenklatura/matching/logic_v1/multi.py +21 -0
  49. nomenklatura/matching/logic_v1/phonetic.py +142 -0
  50. nomenklatura/matching/logic_v2/__init__.py +0 -0
  51. nomenklatura/matching/logic_v2/identifiers.py +124 -0
  52. nomenklatura/matching/logic_v2/model.py +98 -0
  53. nomenklatura/matching/logic_v2/names/__init__.py +3 -0
  54. nomenklatura/matching/logic_v2/names/analysis.py +51 -0
  55. nomenklatura/matching/logic_v2/names/distance.py +181 -0
  56. nomenklatura/matching/logic_v2/names/magic.py +60 -0
  57. nomenklatura/matching/logic_v2/names/match.py +195 -0
  58. nomenklatura/matching/logic_v2/names/pairing.py +81 -0
  59. nomenklatura/matching/logic_v2/names/util.py +89 -0
  60. nomenklatura/matching/name_based/__init__.py +4 -0
  61. nomenklatura/matching/name_based/misc.py +86 -0
  62. nomenklatura/matching/name_based/model.py +59 -0
  63. nomenklatura/matching/name_based/names.py +59 -0
  64. nomenklatura/matching/pairs.py +42 -0
  65. nomenklatura/matching/regression_v1/__init__.py +0 -0
  66. nomenklatura/matching/regression_v1/misc.py +75 -0
  67. nomenklatura/matching/regression_v1/model.py +110 -0
  68. nomenklatura/matching/regression_v1/names.py +63 -0
  69. nomenklatura/matching/regression_v1/train.py +87 -0
  70. nomenklatura/matching/regression_v1/util.py +31 -0
  71. nomenklatura/matching/svm_v1/__init__.py +5 -0
  72. nomenklatura/matching/svm_v1/misc.py +94 -0
  73. nomenklatura/matching/svm_v1/model.py +168 -0
  74. nomenklatura/matching/svm_v1/names.py +81 -0
  75. nomenklatura/matching/svm_v1/train.py +186 -0
  76. nomenklatura/matching/svm_v1/util.py +30 -0
  77. nomenklatura/matching/types.py +227 -0
  78. nomenklatura/matching/util.py +62 -0
  79. nomenklatura/publish/__init__.py +0 -0
  80. nomenklatura/publish/dates.py +49 -0
  81. nomenklatura/publish/edges.py +32 -0
  82. nomenklatura/py.typed +0 -0
  83. nomenklatura/resolver/__init__.py +6 -0
  84. nomenklatura/resolver/common.py +2 -0
  85. nomenklatura/resolver/edge.py +107 -0
  86. nomenklatura/resolver/identifier.py +60 -0
  87. nomenklatura/resolver/linker.py +101 -0
  88. nomenklatura/resolver/resolver.py +565 -0
  89. nomenklatura/settings.py +17 -0
  90. nomenklatura/store/__init__.py +41 -0
  91. nomenklatura/store/base.py +130 -0
  92. nomenklatura/store/level.py +272 -0
  93. nomenklatura/store/memory.py +102 -0
  94. nomenklatura/store/redis_.py +131 -0
  95. nomenklatura/store/sql.py +219 -0
  96. nomenklatura/store/util.py +48 -0
  97. nomenklatura/store/versioned.py +371 -0
  98. nomenklatura/tui/__init__.py +17 -0
  99. nomenklatura/tui/app.py +294 -0
  100. nomenklatura/tui/app.tcss +52 -0
  101. nomenklatura/tui/comparison.py +81 -0
  102. nomenklatura/tui/util.py +35 -0
  103. nomenklatura/util.py +26 -0
  104. nomenklatura/versions.py +119 -0
  105. nomenklatura/wikidata/__init__.py +14 -0
  106. nomenklatura/wikidata/client.py +122 -0
  107. nomenklatura/wikidata/lang.py +94 -0
  108. nomenklatura/wikidata/model.py +139 -0
  109. nomenklatura/wikidata/props.py +70 -0
  110. nomenklatura/wikidata/qualified.py +49 -0
  111. nomenklatura/wikidata/query.py +66 -0
  112. nomenklatura/wikidata/value.py +87 -0
  113. nomenklatura/xref.py +125 -0
  114. nomenklatura_mpt-4.1.9.dist-info/METADATA +159 -0
  115. nomenklatura_mpt-4.1.9.dist-info/RECORD +118 -0
  116. nomenklatura_mpt-4.1.9.dist-info/WHEEL +4 -0
  117. nomenklatura_mpt-4.1.9.dist-info/entry_points.txt +3 -0
  118. nomenklatura_mpt-4.1.9.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,122 @@
1
+ import json
2
+ import logging
3
+ from functools import lru_cache
4
+ from typing import Any, List, Optional, Dict
5
+ from requests import Session
6
+ from normality import squash_spaces
7
+ from rigour.urls import build_url
8
+ from nomenklatura.cache import Cache
9
+ from nomenklatura.wikidata.lang import LangText
10
+ from nomenklatura.wikidata.model import Item
11
+ from nomenklatura.wikidata.query import SparqlResponse
12
+
13
+ log = logging.getLogger(__name__)
14
+
15
+
16
class WikidataClient(object):
    """HTTP client for the Wikidata APIs with a persistent response cache.

    Wraps both the MediaWiki `wbgetentities` API (for fetching items and
    labels) and the SPARQL query service. Raw responses are stored in the
    given `Cache`; parsed items and labels are additionally memoized
    in-process via `lru_cache`.
    """

    WD_API = "https://www.wikidata.org/w/api.php"
    QUERY_API = "https://query.wikidata.org/sparql"
    QUERY_HEADERS = {
        "Accept": "application/sparql-results+json",
    }
    # Cache lifetimes, in days, for callers to pass as `max_age`:
    CACHE_SHORT = 1
    CACHE_MEDIUM = CACHE_SHORT * 7
    CACHE_LONG = CACHE_SHORT * 30

    # Key prefix and lifetime for labels cached by `get_label`:
    LABEL_PREFIX = "wd:lb:"
    LABEL_CACHE_DAYS = 100

    def __init__(
        self, cache: Cache, session: Optional[Session] = None, cache_days: int = 14
    ) -> None:
        self.cache = cache
        # A fresh requests Session is created when none is supplied:
        self.session = session or Session()
        # Default max age (days) for cached API/SPARQL responses:
        self.cache_days = cache_days
        # self.cache.preload(f"{self.LABEL_PREFIX}%")

    # NOTE(review): `lru_cache` on instance methods keys on `self` and keeps
    # the client alive for the cache's lifetime (ruff B019) — acceptable only
    # if the client is a long-lived singleton; confirm.
    @lru_cache(maxsize=1000)
    def fetch_item(self, qid: str) -> Optional[Item]:
        """Fetch a full Wikidata item by QID, or None if it does not exist."""
        # https://www.mediawiki.org/wiki/Wikibase/API
        # https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities
        params = {"format": "json", "ids": qid, "action": "wbgetentities"}
        url = build_url(self.WD_API, params=params)
        raw = self.cache.get(url, max_age=self.cache_days)
        if raw is None:
            res = self.session.get(url)
            res.raise_for_status()
            raw = res.text
            # Cache the raw response body, not the parsed entity:
            self.cache.set(url, raw)
        data = json.loads(raw)
        entity = data.get("entities", {}).get(qid)
        if entity is None:
            return None
        return Item(self, entity)

    @lru_cache(maxsize=100000)
    def get_label(self, qid: str) -> LangText:
        """Fetch the preferred-language label for a QID.

        Falls back to the QID itself when no usable label exists. Results
        are cached under `LABEL_PREFIX` for `LABEL_CACHE_DAYS` days.
        """
        cache_key = f"{self.LABEL_PREFIX}{qid}"
        cached = self.cache.get_json(cache_key, max_age=self.LABEL_CACHE_DAYS)
        if cached is not None:
            return LangText.parse(cached)
        params = {
            "format": "json",
            "ids": qid,
            "action": "wbgetentities",
            "props": "labels",
        }
        url = build_url(self.WD_API, params=params)
        res = self.session.get(url)
        res.raise_for_status()
        data: Dict[str, Any] = res.json()
        entity = data.get("entities", {}).get(qid)
        if entity is None:
            # Missing entity: return an empty label; nothing is cached.
            return LangText(None)
        labels = LangText.from_dict(entity.get("labels", {}))
        label = LangText.pick(labels)
        if label is None:
            # No label in any supported language: use the QID as the label.
            label = LangText(qid)
        # The "original" value is always the QID the label was resolved from:
        label.original = qid
        self.cache.set_json(cache_key, label.pack())
        return label

    def query(self, query_text: str) -> SparqlResponse:
        """Query the Wikidata SPARQL endpoint."""
        clean_text = squash_spaces(query_text)
        if len(clean_text) == 0:
            raise RuntimeError("Invalid query: %r" % query_text)
        params = {"query": clean_text}
        url = build_url(self.QUERY_API, params=params)
        raw = self.cache.get(url, max_age=self.cache_days)
        if raw is None:
            res = self.session.get(url, headers=self.QUERY_HEADERS)
            res.raise_for_status()
            raw = res.text
            self.cache.set(url, raw)
        try:
            data = json.loads(raw)
        except json.JSONDecodeError as err:
            # Evict the bad payload so the next call re-fetches it:
            self.cache.delete(url)
            log.exception("Failed to parse JSON: %s", err)
            # NOTE(review): SparqlResponse reads response["head"]["vars"] —
            # confirm it tolerates this empty dict.
            return SparqlResponse(clean_text, {})
        return SparqlResponse(clean_text, data)

    @lru_cache(maxsize=30000)
    def _type_props(self, qid: str) -> List[str]:
        """Return the direct `instance of`/`subclass of` (P31/P279) QIDs."""
        item = self.fetch_item(qid)
        if item is None:
            return []
        types: List[str] = []
        for claim in item.claims:
            # historical countries are always historical:
            ended = claim.qualifiers.get("P582") is not None and claim.qid != "Q3024240"
            if ended or claim.qid is None:
                continue
            if claim.property in ("P31", "P279"):
                types.append(claim.qid)
        return types

    def __repr__(self) -> str:
        return "<WikidataClient()>"

    def __hash__(self) -> int:
        # Constant hash: all clients are interchangeable as far as
        # `lru_cache` keying on bound methods' `self` is concerned.
        return 42
@@ -0,0 +1,94 @@
1
+ import logging
2
+ from rigour.langs import PREFERRED_LANGS
3
+ from typing import Callable, Dict, Iterable, List, Optional, Any, Set
4
+ from followthemoney import registry, StatementEntity
5
+ from normality.cleaning import remove_unsafe_chars
6
+
7
+ log = logging.getLogger(__name__)
8
+
9
+
10
class LangText(object):
    """A string paired with an optional language code and original value.

    Invalid, empty or unsupported-language texts collapse to `text = None`,
    so a LangText is always safe to construct and to apply to an entity.
    """

    __slots__ = ["text", "lang", "original"]

    def __init__(
        self,
        text: Optional[str],
        lang: Optional[str] = None,
        original: Optional[str] = None,
    ) -> None:
        # Normalize empty/whitespace-only text to None:
        if text is None or len(text.strip()) == 0:
            text = None
        if text is not None:
            text = remove_unsafe_chars(text)
        self.text = text
        self.lang: Optional[str] = None
        if lang is not None:
            self.lang = registry.language.clean_text(lang)
        if lang is not None and self.lang is None:
            # Language is given, but it is not one supported by the FtM ecosystem:
            self.text = None
        self.original = original or self.text

    def apply(
        self,
        entity: StatementEntity,
        prop: str,
        clean: Optional[Callable[[str], Optional[str]]] = None,
    ) -> None:
        """Add this text to `prop` on the entity, optionally cleaning it first.

        No-op when the text is empty, or when `clean` rejects it.
        """
        if self.text is None:
            return
        clean_text = self.text if clean is None else clean(self.text)
        if clean_text is None or clean_text.strip() == "":
            return
        entity.add(prop, clean_text, lang=self.lang, original_value=self.original)

    def pack(self) -> Dict[str, Optional[str]]:
        """Serialize to a compact dict for caching; inverse of `parse`."""
        data = {"t": self.text, "l": self.lang}
        # "o" is only stored when the original differs from the text:
        if self.original is not None and self.original != self.text:
            data["o"] = self.original
        return data

    @classmethod
    def parse(cls, data: Dict[str, Optional[str]]) -> "LangText":
        """Deserialize a dict produced by `pack`."""
        return LangText(data["t"], data["l"], original=data.get("o"))

    @classmethod
    def pick(cls, texts: Iterable["LangText"]) -> Optional["LangText"]:
        """Pick the text in the most preferred language, else any text."""
        for lang in PREFERRED_LANGS:
            for lt in texts:
                if lt.lang == lang:
                    return lt
        # No preferred language matched: return an arbitrary element.
        for lt in texts:
            return lt
        return None

    @classmethod
    def from_dict(cls, data: Dict[str, List[Dict[str, str]]]) -> Set["LangText"]:
        """Build a set of texts from a Wikidata labels/aliases mapping."""
        langs: Set[LangText] = set()
        for objs in data.values():
            # Labels come as a single object, aliases as a list of objects:
            if not isinstance(objs, list):
                objs = [objs]
            for obj in objs:
                value = obj["value"]
                if value is None:
                    continue
                lang = obj["language"]
                lt = LangText(value, lang, original=value)
                if lt.text is None:
                    continue
                langs.add(lt)
        return langs

    def __str__(self) -> str:
        if self.text is None:
            return ""
        return self.text

    def __hash__(self) -> int:
        return hash((self.text, self.lang, self.original))

    def __eq__(self, other: Any) -> bool:
        # Fix: equality previously compared hashes (`hash(self) == hash(other)`),
        # which made any object with a colliding hash compare equal. Compare
        # the actual fields instead; hashing semantics are unchanged.
        if not isinstance(other, LangText):
            return NotImplemented
        return (self.text, self.lang, self.original) == (
            other.text,
            other.lang,
            other.original,
        )

    def __repr__(self) -> str:
        return f"<LangText({self.text!r}, {self.lang!r}, {self.original!r})>"
@@ -0,0 +1,139 @@
1
+ from normality import stringify
2
+ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
3
+
4
+ from nomenklatura.wikidata.value import snak_value_to_string
5
+ from nomenklatura.wikidata.lang import LangText
6
+
7
+ if TYPE_CHECKING:
8
+ from nomenklatura.wikidata.client import WikidataClient
9
+
10
+
11
class Snak(object):
    """Some Notation About Knowledge (TM)."""

    def __init__(self, client: "WikidataClient", data: Dict[str, Any]):
        self.client = client
        # Consume the fields we care about; the payload hash is discarded.
        data.pop("hash", None)
        datavalue = data.pop("datavalue", {})
        self.value_type: str = datavalue.pop("type", None)
        self._value = datavalue.pop("value", None)
        self.type = data.pop("datatype", None)
        self.property: Optional[str] = data.pop("property", None)
        self.snaktype = data.pop("snaktype", None)

    @property
    def property_label(self) -> LangText:
        """Human-readable label of this snak's property.

        NOTE(review): `self.property` may be None — confirm callers only use
        this on snaks that carry a property.
        """
        return self.client.get_label(self.property)

    @property
    def qid(self) -> Optional[str]:
        """The referenced entity QID, for entity-valued snaks only."""
        if self.value_type != "wikibase-entityid":
            return None
        return stringify(self._value.get("id"))

    @property
    def text(self) -> LangText:
        """The snak's value rendered as a language-tagged string."""
        return snak_value_to_string(self.client, self.value_type, self._value)

    def __repr__(self) -> str:
        return f"<Snak({self.qid}, {self.property}, {self.value_type})>"
41
+
42
+
43
class Reference(object):
    """A sourcing reference on a claim, grouping its snaks by property."""

    def __init__(self, client: "WikidataClient", data: Dict[str, Any]) -> None:
        raw_snaks = data.pop("snaks", {})
        self.snaks: Dict[str, List[Snak]] = {
            prop: [Snak(client, raw) for raw in raws]
            for prop, raws in raw_snaks.items()
        }

    def get(self, prop: str) -> List[Snak]:
        """All reference snaks for the given property, or an empty list."""
        return self.snaks.get(prop, [])
51
+
52
+
53
class Claim(Snak):
    """A top-level statement on an item: a main snak plus its qualifiers,
    references and rank."""

    def __init__(
        self, client: "WikidataClient", data: Dict[str, Any], prop: str
    ) -> None:
        self.id = data.pop("id")
        self.rank = data.pop("rank")
        super().__init__(client, data.pop("mainsnak"))
        self.qualifiers: Dict[str, List[Snak]] = {}
        # Fix: the loop variable used to be named `prop`, shadowing the
        # parameter — a mainsnak without its own property would then fall
        # back to the *last qualifier's* property below instead of the
        # property the claim was listed under.
        for qual_prop, snaks in data.pop("qualifiers", {}).items():
            self.qualifiers[qual_prop] = [Snak(client, s) for s in snaks]

        self.references = [Reference(client, r) for r in data.pop("references", [])]
        # Fall back to the property key the claim was listed under:
        self.property = self.property or prop

    def get_qualifier(self, prop: str) -> List[Snak]:
        """All qualifier snaks for the given property, or an empty list."""
        return self.qualifiers.get(prop, [])

    @property
    def is_ended(self) -> bool:
        """True if the claim carries an `end time` (P582) qualifier."""
        snak = self.qualifiers.get("P582")
        if snak is not None and len(snak) > 0:
            return True
        return False

    def __repr__(self) -> str:
        return f"<Claim({self.qid}, {self.property}, {self.value_type})>"

    def __hash__(self) -> int:
        return hash((self.qid, self.property, self.id))
82
+
83
+
84
class Item(object):
    """A wikidata item (or entity)."""

    def __init__(self, client: "WikidataClient", data: Dict[str, Any]) -> None:
        self.client = client
        # Entity QID, e.g. "Q42":
        self.id: str = data.pop("id")
        self.modified: Optional[str] = data.pop("modified", None)

        self.labels: Set[LangText] = LangText.from_dict(data.pop("labels", {}))
        self.aliases: Set[LangText] = LangText.from_dict(data.pop("aliases", {}))

        # Keep only the single preferred-language description:
        descriptions = LangText.from_dict(data.pop("descriptions", {}))
        self.description = LangText.pick(descriptions)

        # Flatten the per-property claim lists into one list of Claims:
        self.claims: List[Claim] = []
        claims: Dict[str, List[Dict[str, Any]]] = data.pop("claims", {})
        for prop, values in claims.items():
            for value in values:
                self.claims.append(Claim(client, value, prop))

        # TODO: get back to this later:
        data.pop("sitelinks", None)

    @property
    def label(self) -> Optional[LangText]:
        """The best label, falling back to an alias when no label exists."""
        label = LangText.pick(self.labels)
        if label is not None:
            return label
        return LangText.pick(self.aliases)

    def is_instance(self, qid: str) -> bool:
        """Check whether the item has an `instance of` (P31) claim of `qid`."""
        for claim in self.claims:
            if claim.property == "P31" and claim.qid == qid:
                return True
        return False

    def _types(self, path: List[str]) -> Set[str]:
        # Recursively walk the type hierarchy via the client's P31/P279
        # lookups. `path` both guards against cycles and caps the recursion
        # depth at 6 hops from the item.
        qid = path[-1]
        types = set([qid])
        if len(path) > 6:
            return types
        for type_ in self.client._type_props(qid):
            if type_ not in path:
                types.update(self._types(path + [type_]))
        return types

    @property
    def types(self) -> Set[str]:
        """Get all the `instance of` and `subclass of` types for an item."""
        return self._types([self.id])

    def __repr__(self) -> str:
        return f"<Item({self.id})>"

    def __hash__(self) -> int:
        return hash(self.id)
@@ -0,0 +1,70 @@
1
# Query: https://w.wiki/4Z73
# Family-relation properties, mapped to a human-readable relationship label:
PROPS_FAMILY = {
    "P7": "sibling",
    "P9": "sibling",
    "P22": "parent",
    "P26": "spouse",
    "P25": "parent",
    "P40": "child",
    "P43": "stepparent",
    "P44": "stepparent",
    "P451": "unmarried partner",
    "P1038": "relative",
    "P1290": "godparent",
    "P3373": "sibling",
    "P3448": "stepparent",
    "P8810": "unspecified parent",
}

# Non-family personal association properties:
PROPS_ASSOCIATION = {
    "P1327": "partner in business or sport",
    "P3342": "significant person",
}

# https://www.wikidata.org/wiki/Wikidata:List_of_properties/human
# Wikidata properties mapped directly onto FtM entity property names:
PROPS_DIRECT = {
    "P1477": "alias",  # birth name
    "P1813": "alias",  # short name
    "P2561": "alias",  # name
    "P1559": "alias",  # name in native language
    "P2562": "alias",  # married name
    "P511": "title",
    "P735": "firstName",
    "P734": "lastName",
    "P1950": "lastName",
    "P21": "gender",
    "P39": "position",
    "P140": "religion",
    "P106": "topics",
    "P569": "birthDate",
    "P5056": "fatherName",
    "P570": "deathDate",
    "P19": "birthPlace",
    "P856": "website",
    "P512": "education",
    "P69": "education",
    "P27": "citizenship",
    "P742": "weakAlias",
    "P172": "ethnicity",
    "P973": "sourceUrl",
    "P1278": "leiCode",
    "P17": "country",
    "P571": "incorporationDate",
    "P1454": "legalForm",
}

# FtM properties whose values should be augmented with date qualifiers:
PROPS_QUALIFIED = (
    "position",
    "education",
)

# Entity types (P31/P279 ancestors) mapped to FtM topic codes:
PROPS_TOPICS = {
    "Q82955": "role.pep",
    "Q193391": "role.diplo",
    # "Q392651": "role.spy",
    "Q14886050": "crime.terror",
    "Q16533": "role.judge",
    "Q17276321": "role.pep",  # member of the state duma
    "Q189290": "mil",  # military officer
    "Q47064": "mil",  # military personnel
}
@@ -0,0 +1,49 @@
1
+ from typing import Set
2
+ from followthemoney.helpers import dates_years
3
+
4
+ from nomenklatura.wikidata.model import Claim
5
+ from nomenklatura.wikidata.lang import LangText
6
+
7
+
8
def post_summary(
    position: LangText,
    start_dates: Set[str],
    end_dates: Set[str],
    dates: Set[str],
) -> LangText:
    """Make a string summary for a Post object.

    The position label is suffixed with a "(start-end)" year range when any
    start or end dates are known, otherwise with the sorted list of years
    the position is dated to.
    """
    start = min(dates_years(start_dates), default="")
    # Fix: the end of the range is the *latest* known end year; `min` here
    # under-reported the range whenever several end dates were present.
    end = max(dates_years(end_dates), default="")
    date_range = None
    if len(start) or len(end):
        date_range = f"{start}-{end}"
    dates_ = dates_years(dates)
    if date_range is None and len(dates_):
        date_range = ", ".join(sorted(dates_))

    label = position.text
    if date_range:
        label = f"{label} ({date_range})"
    # Keep the undated position text as the original value:
    original = position.text or position.original
    return LangText(label, position.lang, original=original)
29
+
30
+
31
def qualify_value(value: LangText, claim: Claim) -> LangText:
    """Augment a value with a date range drawn from the claim's qualifiers."""
    if value.text is None:
        return value

    def _qualifier_dates(prop: str) -> Set[str]:
        # Collect the non-empty date texts of all qualifiers for `prop`.
        found: Set[str] = set()
        for qual in claim.get_qualifier(prop):
            text = qual.text.text
            if text is not None:
                found.add(text)
        return found

    starts = _qualifier_dates("P580")  # start time
    ends = _qualifier_dates("P582")  # end time
    dates = _qualifier_dates("P585")  # point in time
    return post_summary(value, starts, ends, dates)
@@ -0,0 +1,66 @@
1
+ from typing import Dict, Any, List, Optional
2
+
3
+
4
class SparqlValue(object):
    """A single typed value from a SPARQL result binding."""

    WD_PREFIX = "http://www.wikidata.org/entity/"

    __slots__ = ["type", "value", "lang"]

    def __init__(self, data: Dict[str, Any]) -> None:
        self.type: str = data["type"]
        raw: str = data["value"]
        # Entity URIs are shortened to their bare QID:
        if self.type == "uri" and raw.startswith(self.WD_PREFIX):
            raw = raw[len(self.WD_PREFIX) :]
        self.value: str = raw
        self.lang: Optional[str] = data.get("xml:lang")

    def __str__(self) -> str:
        return self.value

    def __repr__(self) -> str:
        return f"<SV({self.type!r}, {self.value!r})>"

    def __hash__(self) -> int:
        return hash(repr(self))
+
25
+
26
class SparqlBinding(object):
    """One result row, mapping variable names to their bound values."""

    def __init__(self, response: "SparqlResponse", data: Dict[str, Any]) -> None:
        self.response = response
        self.values: Dict[str, SparqlValue] = {
            var: SparqlValue(raw) for var, raw in data.items()
        }

    def wrapped(self, var: str) -> Optional[SparqlValue]:
        """The bound value for `var`, or None when unbound.

        Raises KeyError if `var` is not a variable of the query at all.
        """
        if var not in self.response.vars:
            raise KeyError("No such var: %s (in: %r)" % (var, self.response.vars))
        return self.values.get(var)

    def plain(self, var: str) -> Optional[str]:
        """The bound value for `var` as a plain string, or None when unbound."""
        value = self.wrapped(var)
        if value is None:
            return None
        return str(value)

    def __repr__(self) -> str:
        return f"<SparqlBinding({self.values!r})>"
52
+
53
+
54
class SparqlResponse(object):
    """Parsed SPARQL JSON results (head/vars plus results/bindings)."""

    def __init__(self, query: str, response: Dict[str, Any]) -> None:
        self.query = query
        # Fix: tolerate empty or partial payloads. The client constructs
        # `SparqlResponse(query, {})` when the raw response fails to parse,
        # which previously raised a KeyError on `response["head"]` here.
        head: Dict[str, Any] = response.get("head", {})
        self.vars: List[str] = head.get("vars", [])
        self.results: List[SparqlBinding] = []
        results: Dict[str, Any] = response.get("results", {})
        for bind in results.get("bindings", []):
            self.results.append(SparqlBinding(self, bind))

    def __len__(self) -> int:
        return len(self.results)

    def __repr__(self) -> str:
        return f"<SparqlResponse({self.vars!r}, {len(self)})>"
@@ -0,0 +1,87 @@
1
+ import logging
2
+ from prefixdate import Precision
3
+ from typing import TYPE_CHECKING, Set, cast, Any, Dict, Optional
4
+ from rigour.ids.wikidata import is_qid
5
+ from rigour.text.cleaning import remove_emoji, remove_bracketed_text
6
+ from rigour.names import is_name
7
+ # from rigour.text.distance import is_levenshtein_plausible
8
+
9
+ from nomenklatura.wikidata.lang import LangText
10
+
11
+ if TYPE_CHECKING:
12
+ from nomenklatura.wikidata.client import WikidataClient
13
+
14
+
15
+ log = logging.getLogger(__name__)
16
# Wikidata time precision codes mapped to prefixdate precisions:
PRECISION = {
    11: Precision.DAY,
    10: Precision.MONTH,
    9: Precision.YEAR,
}


def snak_value_to_string(
    client: "WikidataClient", value_type: Optional[str], value: Dict[str, Any]
) -> LangText:
    """Render a raw snak datavalue into a `LangText`.

    Handles times, entity references (resolved to their labels via the
    client), monolingual text, quantities and plain strings. Unhandled
    payloads are logged and yield an empty `LangText`.
    """
    if value_type is None:
        return LangText(None)
    elif value_type == "time":
        raw_time = cast(Optional[str], value.get("time"))
        if raw_time is None:
            return LangText(None)
        time = raw_time.strip("+")
        prec_id = cast(int, value.get("precision"))
        prec = PRECISION.get(prec_id, Precision.DAY)
        # Truncate the ISO string to the stated precision:
        time = time[: prec.value]

        # Remove Jan 01, because it seems to be in input failure pattern
        # with Wikidata (probably from bots that don't get "precision").
        if time.endswith("-01-01"):
            time = time[:4]

        # Date limit in FtM. These will be removed by the death filter:
        # (lexicographic clamp — anything before year 1001 becomes "1001")
        time = max("1001", time)
        return LangText(time, original=raw_time)
    elif value_type == "wikibase-entityid":
        qid = value.get("id")
        return client.get_label(qid)
    elif value_type == "monolingualtext":
        text = value.get("text")
        if isinstance(text, str):
            return LangText(text, lang=value.get("language"))
        # Fix: a non-string payload previously fell off the end of the
        # function and returned None, violating the declared return type.
        log.warning("Unhandled value [%s]: %s", value_type, value)
        return LangText(None)
    elif value_type == "quantity":
        # Resolve unit name and make into string:
        raw_amount = cast(str, value.get("amount", ""))
        amount = raw_amount.lstrip("+")
        unit = value.get("unit", "")
        unit = unit.split("/")[-1]
        if is_qid(unit):
            unit = client.get_label(unit)
        # NOTE(review): a missing unit leaves a trailing space — confirm
        # that this is intended downstream.
        amount = f"{amount} {unit}"
        return LangText(amount, original=raw_amount)
    elif isinstance(value, str):
        return LangText(value)
    else:
        log.warning("Unhandled value [%s]: %s", value_type, value)
    return LangText(None)
67
+
68
+
69
def clean_name(name: str) -> Optional[str]:
    """Clean a name for storage, try to throw out dangerous user inputs."""
    if not is_name(name):
        return None
    # Prefer the name with bracketed annotations removed, but only when the
    # stripped form still looks like a name:
    stripped = remove_bracketed_text(name)
    result = stripped if is_name(stripped) else name
    return remove_emoji(result)
77
+
78
+
79
def is_alias_strong(alias: str, names: Set[str]) -> bool:
    """Check if an alias is a plausible nickname for a person, ie. shows some
    similarity to the actual name."""
    # Levenshtein similarity against `names` is currently disabled upstream;
    # any multi-word alias is accepted, single tokens are rejected.
    return " " in alias