personal_knowledge_library-3.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of personal_knowledge_library might be problematic.
- knowledge/__init__.py +91 -0
- knowledge/base/__init__.py +22 -0
- knowledge/base/access.py +167 -0
- knowledge/base/entity.py +267 -0
- knowledge/base/language.py +27 -0
- knowledge/base/ontology.py +2734 -0
- knowledge/base/search.py +473 -0
- knowledge/base/tenant.py +192 -0
- knowledge/nel/__init__.py +11 -0
- knowledge/nel/base.py +495 -0
- knowledge/nel/engine.py +123 -0
- knowledge/ontomapping/__init__.py +667 -0
- knowledge/ontomapping/manager.py +320 -0
- knowledge/public/__init__.py +27 -0
- knowledge/public/cache.py +115 -0
- knowledge/public/helper.py +373 -0
- knowledge/public/relations.py +128 -0
- knowledge/public/wikidata.py +1324 -0
- knowledge/services/__init__.py +128 -0
- knowledge/services/asyncio/__init__.py +7 -0
- knowledge/services/asyncio/base.py +458 -0
- knowledge/services/asyncio/graph.py +1420 -0
- knowledge/services/asyncio/group.py +450 -0
- knowledge/services/asyncio/search.py +439 -0
- knowledge/services/asyncio/users.py +270 -0
- knowledge/services/base.py +533 -0
- knowledge/services/graph.py +1897 -0
- knowledge/services/group.py +819 -0
- knowledge/services/helper.py +142 -0
- knowledge/services/ontology.py +1234 -0
- knowledge/services/search.py +488 -0
- knowledge/services/session.py +444 -0
- knowledge/services/tenant.py +281 -0
- knowledge/services/users.py +445 -0
- knowledge/utils/__init__.py +10 -0
- knowledge/utils/graph.py +417 -0
- knowledge/utils/wikidata.py +197 -0
- knowledge/utils/wikipedia.py +175 -0
- personal_knowledge_library-3.0.0.dist-info/LICENSE +201 -0
- personal_knowledge_library-3.0.0.dist-info/METADATA +1163 -0
- personal_knowledge_library-3.0.0.dist-info/RECORD +42 -0
- personal_knowledge_library-3.0.0.dist-info/WHEEL +4 -0
knowledge/public/helper.py
@@ -0,0 +1,373 @@
# -*- coding: utf-8 -*-
# Copyright © 2023-present Wacom. All rights reserved.
import hashlib
import math
import urllib
from datetime import datetime
from enum import Enum
from typing import Any, Optional, Dict, List

import requests
from requests import Response
from requests.adapters import HTTPAdapter
from urllib3 import Retry

from knowledge import logger


# --------------------------------------- Structures -------------------------------------------------------------------
class Precision(Enum):
    """
    Precision enum for date.
    """

    BILLION_YEARS = 0
    MILLION_YEARS = 3
    HUNDREDS_THOUSAND_YEARS = 4
    TEN_THOUSAND_YEARS = 5
    MILLENIUM = 6
    CENTURY = 7
    DECADE = 8
    YEAR = 9
    MONTH = 10
    DAY = 11


class WikiDataAPIException(Exception):
    """
    WikiDataAPIException
    --------------------
    Exception thrown when accessing WikiData fails.
    """


# --------------------------------------- Tags -------------------------------------------------------------------------
CLASS_TAG: str = "class"
ALIASES_TAG: str = "aliases"
ID_TAG: str = "id"
QID_TAG: str = "qid"
PID_TAG: str = "pid"
LAST_REVID_TAG: str = "lastrevid"
MODIFIED_TAG: str = "modified"
WIKIDATA_LANGUAGE_TAG: str = "language"
LABEL_VALUE_TAG: str = "value"
LABEL_TAG: str = "label"
SUPERCLASSES_TAG: str = "superclasses"
CLAIMS_TAG: str = "claims"
ONTOLOGY_TYPES_TAG: str = "ontology_types"
REVISION_TAG: str = "revision"
SITELINKS_TAG: str = "sitelinks"
TITLES_TAG: str = "titles"
URLS_TAG: str = "urls"
SOURCE_TAG: str = "source"
API_LIMIT: int = 50
# --------------------------------------- API URLs ---------------------------------------------------------------------
THUMB_IMAGE_URL: str = "https://upload.wikimedia.org/wikipedia/commons/thumb/{}/{}/{}/200px-{}"
MULTIPLE_ENTITIES_API: str = "https://www.wikidata.org/w/api.php?action=wbgetentities&ids="
WIKIDATA_LDI_URL: str = "https://www.wikidata.org/wiki/Special:EntityData"
# --------------------------------------- Wikidata Properties ----------------------------------------------------------
STUDENT_OF: str = "P1066"
STUDENT: str = "P802"
INCEPTION: str = "P571"
MOVEMENT: str = "P135"
SUBCLASS_OF: str = "P279"
TITLE: str = "P1476"
COLLECTION: str = "P195"
GENRE: str = "P136"
CREATOR: str = "P170"
LOGO_IMAGE: str = "P154"
FLAG_IMAGE: str = "P41"
GREGORIAN_CALENDAR: str = "Q1985727"
START_TIME: str = "P580"
END_TIME: str = "P582"
FOLLOWS: str = "P155"
FOLLOWED_BY: str = "P156"
COUNTRY_OF_ORIGIN: str = "P495"
COUNTRY: str = "P17"
INSTANCE_OF: str = "P31"
IMAGE: str = "P18"
# URL - Wikidata
GREGORIAN_CALENDAR_URL: str = "http://www.wikidata.org/entity/Q1985786"
# URL - Wikidata service
WIKIDATA_SPARQL_URL: str = "https://query.wikidata.org/sparql"
WIKIDATA_SEARCH_URL: str = "https://www.wikidata.org/w/api.php"

# --------------------------------------- Helper functions -------------------------------------------------------------
def image_url(img: str, dpi: int = 500):
    """
    Helper to generate image URL for Wikipedia.

    Parameters
    ----------
    img: str
        Name of image
    dpi: int
        DPI of the generated URL

    Returns
    -------
    wikimedia_url: str
        URL of wikimedia
    """
    if not (50 <= dpi <= 1000):
        raise ValueError(f"DPI should be within the range of [50-1000]. Value:={dpi}")
    extension: str = ""
    conversion: str = ""
    fixed_img: str = img.replace(" ", "_")
    if fixed_img.lower().endswith("svg"):
        extension: str = ".png"
    if fixed_img.lower().endswith("tif") or fixed_img.lower().endswith("tiff"):
        extension: str = ".jpg"
        conversion: str = "lossy-page1-"
    hash_img: str = hashlib.md5(fixed_img.encode("utf-8")).hexdigest()
    url_img_part: str = urllib.parse.quote_plus(fixed_img)
    return (
        f"https://upload.wikimedia.org/wikipedia/commons/thumb/"
        f"{hash_img[0]}/{hash_img[:2]}/{url_img_part}/{dpi}px-{conversion + url_img_part + extension}"
    )

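An illustrative call to image_url (editor's sketch, not part of the wheel; the file name "Mona Lisa.jpg" is an assumed example). It shows how the helper MD5-hashes the underscore-normalized name to build the thumbnail path.

# Editor's illustration: hypothetical usage of image_url.
from knowledge.public.helper import image_url

url: str = image_url("Mona Lisa.jpg", dpi=300)
# Produces a thumb URL of the form
# https://upload.wikimedia.org/wikipedia/commons/thumb/<h>/<hh>/Mona_Lisa.jpg/300px-Mona_Lisa.jpg
print(url)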
def parse_date(date_string: str) -> Optional[datetime]:
    """
    Parse date string to datetime object.

    Parameters
    ----------
    date_string: str
        Date string

    Returns
    -------
    parsed_date: datetime
        Parsed date
    """
    try:
        parsed_date = datetime.fromisoformat(date_string)
        return parsed_date
    except (TypeError, ValueError):
        date_part, _ = date_string.split("T")
        year, month, day = date_part.split("-")
        if month == "00":
            month = "01"
        if day == "00":
            day = "01"
        iso_date = f"{year}-{month.zfill(2)}-{day.zfill(2)}"
        try:
            parsed_date = datetime.fromisoformat(iso_date)
            return parsed_date
        except (TypeError, ValueError):
            return None

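A quick illustration of how parse_date handles Wikidata-style partial dates, where "00" stands in for an unknown month or day (editor's sketch, not part of the wheel):

# Editor's illustration: parse_date on full and partial date strings.
from datetime import datetime
from knowledge.public.helper import parse_date

assert parse_date("1503-06-15T00:00:00") == datetime(1503, 6, 15)
# Wikidata uses "00" for an unknown month/day; these are coerced to January / the 1st.
assert parse_date("1503-00-00T00:00:00") == datetime(1503, 1, 1)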
def wikidate(param: Dict[str, Any]) -> Dict[str, Any]:
    """
    Parse and extract the Wikidata time structure.

    Parameters
    ----------
    param: Dict[str, Any]
        Wikidata time value of an entity

    Returns
    -------
    result: Dict[str, Any]
        Dict with pretty print of date
    """
    time: str = param["time"]
    timezone: int = param["timezone"]
    before: int = param["before"]
    after: int = param["after"]
    precision: int = param["precision"]
    calendar_model: str = param["calendarmodel"]
    iso_encoded: Optional[str] = None
    after_christ: bool = True
    pretty: str = ""
    if calendar_model != "https://www.wikidata.org/wiki/Q1985727":
        if time.startswith("+"):
            time = time[1:]
        elif time.startswith("-"):
            time = time[1:]
            after_christ = False
        date_obj: Optional[datetime] = parse_date(date_string=time)
        if date_obj:
            if date_obj.day == 0:
                # Set the day component to 1
                date_obj = date_obj.replace(day=1)
            iso_encoded = date_obj.isoformat()
            pretty = date_obj.strftime("%Y-%m-%d")
        return {
            "time": time,
            "timezone": timezone,
            "before": before,
            "after": after,
            "precision": precision,
            "calendar-model": calendar_model,
            "pretty": pretty,
            "after-christ": after_christ,
            "iso": iso_encoded,
        }
    if time.startswith("+"):
        time = time[1:]
    elif time.startswith("-"):
        time = time[1:]
        after_christ = False
    # Probably not necessary
    date_str = time.strip()
    # Remove + sign
    if date_str[0] == "+":
        date_str = date_str[1:]
    # Remove missing month/day
    date_str = date_str.split("-00", maxsplit=1)[0]
    # Parse date
    try:
        if Precision.BILLION_YEARS.value == precision:
            pretty = date_str
        elif Precision.MILLION_YEARS.value == precision:
            pretty = date_str
        elif Precision.HUNDREDS_THOUSAND_YEARS.value == precision:
            pretty = date_str
        elif Precision.MILLENIUM.value == precision:
            pretty = date_str
        elif Precision.TEN_THOUSAND_YEARS.value == precision:
            pretty = date_str
        else:
            dt_obj: Optional[datetime] = parse_date(date_str)
            if dt_obj:
                if Precision.CENTURY.value == precision:
                    century: int = int(math.ceil(dt_obj.year / 100))
                    pretty = f"{century}th century"
                elif Precision.DECADE.value == precision:
                    pretty = f"{dt_obj.year}s{'' if after_christ else ' BC'}"
                elif Precision.YEAR.value == precision:
                    pretty = f"{dt_obj.year}{'' if after_christ else ' BC'}"
                elif Precision.MONTH.value == precision:
                    pretty = dt_obj.strftime("%B %Y")
                elif Precision.DAY.value == precision:
                    pretty = dt_obj.strftime("%-d %B %Y")
                iso_encoded = dt_obj.isoformat()
            else:
                iso_encoded = None
    except Exception as pe:
        logger.error(param)
        logger.exception(pe)

    return {
        "time": time,
        "timezone": timezone,
        "before": before,
        "after": after,
        "precision": precision,
        "calendar-model": calendar_model,
        "pretty": pretty,
        "after_christ": after_christ,
        "iso": iso_encoded,
    }

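An editor's sketch of feeding wikidate a Wikidata-style time value (the time/timezone/before/after/precision/calendarmodel layout is an assumption about the caller's data, not defined in this file). Note that the calendar-model check above compares against the "https://www.wikidata.org/wiki/Q1985727" form, so a value carrying the usual entity URI takes the early-return branch and "pretty" comes back as an ISO-style date.

# Editor's illustration: a Wikidata-style time value for 15 June 1503.
from knowledge.public.helper import wikidate

value = {
    "time": "+1503-06-15T00:00:00Z",
    "timezone": 0,
    "before": 0,
    "after": 0,
    "precision": 11,  # Precision.DAY
    "calendarmodel": "http://www.wikidata.org/entity/Q1985727",  # Gregorian calendar entity URI
}
result = wikidate(value)
print(result["pretty"], result["iso"], result["precision"])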
def __waiting_request__(entity_id: str, base_url: str = WIKIDATA_LDI_URL) -> Dict[str, Any]:
    """
    Send a request with retry policy.

    Parameters
    ----------
    entity_id: str
        Entity QID
    base_url: str
        The base URL

    Returns
    -------
    result_dict: Dict[str, Any]
        Result dict
    """
    url: str = f"{base_url}/{entity_id}.json"

    # Define the retry policy
    retry_policy: Retry = Retry(
        total=3,  # maximum number of retries
        backoff_factor=1,  # factor by which to multiply the delay between retries
        status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
        respect_retry_after_header=True,  # respect the Retry-After header
    )

    # Create a session and mount the retry adapter
    with requests.Session() as session:
        retry_adapter = HTTPAdapter(max_retries=retry_policy)
        session.mount("https://", retry_adapter)

        # Make a request using the session
        response: Response = session.get(url)

        # Check the response status code
        if not response.ok:
            raise WikiDataAPIException(f"Request failed with status code : {response.status_code}. URL:= {url}")
        entity_dict_full: Dict[str, Any] = response.json()
        # remove redundant top level keys
        returned_entity_id: str = next(iter(entity_dict_full["entities"]))
        entity_dict = entity_dict_full["entities"][returned_entity_id]

        if entity_id != returned_entity_id:
            logger.warning(
                f"Wikidata redirect detected. Input entity id={entity_id}. Returned entity id={returned_entity_id}."
            )

        return entity_dict

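A minimal sketch of calling the module-private single-entity helper directly (editor's illustration; callers presumably go through WikiDataAPIClient instead). The "labels"/"en"/"value" path follows Wikidata's EntityData JSON layout rather than anything defined in this file, so treat it as an assumption.

# Editor's illustration: fetch one entity record from Special:EntityData.
from knowledge.public.helper import __waiting_request__

entity = __waiting_request__("Q762")     # Leonardo da Vinci
print(entity["labels"]["en"]["value"])   # English label from the raw Wikidata JSON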
def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_ENTITIES_API) -> List[Dict[str, Any]]:
    """
    Send a request to retrieve multiple entities with retry policy.

    Parameters
    ----------
    entity_ids: List[str]
        Entity QIDs
    base_url: str
        The base URL

    Returns
    -------
    results: List[Dict[str, Any]]
        List of result dicts

    Raises
    ------
    ValueError - Empty list or too many entities
    """
    checked_entity_ids: List[str] = [e for e in entity_ids if e.startswith("Q")]
    if not (0 < len(checked_entity_ids) <= API_LIMIT):
        raise ValueError(
            f"Number of entities must be within [1, {API_LIMIT}]. Number of QIDs: {len(checked_entity_ids)}"
        )
    query: str = "|".join(checked_entity_ids)
    url: str = f"{base_url}{query}&format=json"

    # Define the retry policy
    retry_policy: Retry = Retry(
        total=3,  # maximum number of retries
        backoff_factor=1,  # factor by which to multiply the delay between retries
        status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
        respect_retry_after_header=True,  # respect the Retry-After header
    )

    # Create a session and mount the retry adapter
    with requests.Session() as session:
        retry_adapter = HTTPAdapter(max_retries=retry_policy)
        session.mount("https://", retry_adapter)

        # Make a request using the session
        response: Response = session.get(url)

        # Check the response status code
        if not response.ok:
            raise WikiDataAPIException(f"Request failed with status code : {response.status_code}. URL:= {url}")
        entity_dict_full: Dict[str, Any] = response.json()
        results: List[Dict[str, Any]] = []
        # If no entities found
        if "entities" not in entity_dict_full:
            return results
        for qid, e in entity_dict_full["entities"].items():
            if qid not in entity_ids:
                logger.warning(
                    f"Wikidata redirect detected. Returned entity id={qid} is not in the list of entity ids."
                )
            if "missing" in e:
                logger.warning(f"Missing entity detected. Returned entity id={qid} was not found in Wikidata.")
                continue
            results.append(e)
        return results
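Because __waiting_multi_request__ rejects more than API_LIMIT (50) QIDs per call, a caller has to batch larger lists. A hypothetical batching helper (editor's sketch; fetch_in_batches is an invented name and the QIDs are examples, not part of the package):

# Editor's illustration: batching QIDs to respect API_LIMIT.
from typing import Any, Dict, List
from knowledge.public.helper import API_LIMIT, __waiting_multi_request__

def fetch_in_batches(qids: List[str]) -> List[Dict[str, Any]]:
    """Hypothetical helper: fetch entity dicts in chunks of at most API_LIMIT QIDs."""
    results: List[Dict[str, Any]] = []
    for start in range(0, len(qids), API_LIMIT):
        results.extend(__waiting_multi_request__(qids[start:start + API_LIMIT]))
    return results

entities = fetch_in_batches(["Q5582", "Q762", "Q12418"])  # van Gogh, da Vinci, Mona Lisa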
knowledge/public/relations.py
@@ -0,0 +1,128 @@
# -*- coding: utf-8 -*-
# Copyright © 2023-present Wacom. All rights reserved.
import functools
import multiprocessing
from typing import Any, Dict, Set, Tuple, List, Callable, Optional

from tqdm import tqdm

from knowledge.public.helper import CLAIMS_TAG, PID_TAG, LABEL_TAG, QID_TAG
from knowledge.public.wikidata import LITERALS_TAG, WikidataThing, WikiDataAPIClient


def __relations__(thing: Dict[str, Any], wikidata: Set[str]) -> Tuple[str, List[Dict[str, Any]]]:
    """
    Extracts relations from Wikidata.

    Parameters
    ----------
    thing: Dict[str, Any]
        Wikidata thing
    wikidata: Set[str]
        Set of unique QIDs

    Returns
    -------
    qid: str
        QID of the Wikidata thing
    relations: List[Dict[str, Any]]
        Relations of the Wikidata thing
    """
    relations: List[Dict[str, Any]] = []
    for _, p_value in thing[CLAIMS_TAG].items():
        for v in p_value[LITERALS_TAG]:
            if isinstance(v, dict) and v.get("type") in {"wikibase-entityid", "wikibase-item"}:
                ref_qid = v["value"]["id"]
                prop = p_value[PID_TAG][LABEL_TAG]
                if ref_qid in wikidata:
                    relations.append(
                        {
                            "subject": {
                                "qid": thing[QID_TAG],
                            },
                            "predicate": {"pid": p_value[PID_TAG][PID_TAG], "label": prop},
                            "target": {"qid": ref_qid},
                        }
                    )
    return thing[QID_TAG], relations

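For reference, each relation collected by __relations__ is a plain subject/predicate/target dict. An editor's sketch of its shape, with illustrative QIDs and an assumed English property label:

# Editor's illustration: shape of one relation produced by __relations__.
example_relation = {
    "subject": {"qid": "Q12418"},                       # Mona Lisa
    "predicate": {"pid": "P170", "label": "creator"},   # label as delivered in the claim data
    "target": {"qid": "Q762"},                          # Leonardo da Vinci
}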
def wikidata_extractor_entities(qids: Set[str]) -> Dict[str, WikidataThing]:
    """
    Extracts entities from Wikidata.

    Parameters
    ----------
    qids: Set[str]
        Set of unique QIDs

    Returns
    -------
    wikidata_extractor: Dict[str, WikidataThing]
        Wikidata map
    """
    return {e.qid: e for e in WikiDataAPIClient.retrieve_entities(qids)}


def wikidata_relations_extractor(
    wikidata: Dict[str, WikidataThing],
    progress_relations: Optional[Callable[[int, int], None]] = None,
) -> Dict[str, List[Dict[str, Any]]]:
    """Extracts relations from Wikidata.

    Parameters
    ----------
    wikidata: Dict[str, WikidataThing]
        Wikidata map
    progress_relations: Optional[Callable[[int, int], None]] = None
        Progress callback function.

    Returns
    -------
    relations: Dict[str, List[Dict[str, Any]]]
        Relations map.
    """
    relations: Dict[str, List[Dict[str, Any]]] = {}
    qids: Set[str] = set(wikidata.keys())
    num_processes: int = min(len(wikidata), multiprocessing.cpu_count())
    ctr: int = 0
    tasks: int = len(qids)
    with multiprocessing.Pool(processes=num_processes) as pool:
        # WikidataThing is not supported in multiprocessing
        for qid, rels in pool.map(
            functools.partial(__relations__, wikidata=qids), [e.__dict__() for e in wikidata.values()]
        ):
            relations[qid] = rels
            ctr += 1
            if progress_relations:
                progress_relations(ctr, tasks)
    return relations

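A sketch of driving wikidata_relations_extractor with a progress callback (editor's illustration; report_progress is an invented name, the QIDs are examples, and a live call needs network access to Wikidata):

# Editor's illustration: progress reporting while extracting relations.
from knowledge.public.relations import wikidata_extractor_entities, wikidata_relations_extractor

def report_progress(done: int, total: int) -> None:
    print(f"relations: {done}/{total}")

if __name__ == "__main__":  # guard needed, since the extractor spawns worker processes
    things = wikidata_extractor_entities({"Q762", "Q12418"})  # da Vinci, Mona Lisa
    relations = wikidata_relations_extractor(things, progress_relations=report_progress)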
def wikidata_relations_extractor_qids(
    wikidata: Dict[str, WikidataThing], qids: Set[str]
) -> Dict[str, List[Dict[str, Any]]]:
    """Extracts relations from Wikidata.

    Parameters
    ----------
    wikidata: Dict[str, WikidataThing]
        Wikidata map
    qids: Set[str]
        Set of unique QIDs

    Returns
    -------
    relations: Dict[str, List[Dict[str, Any]]]
        Relations map.
    """
    relations: Dict[str, List[Dict[str, Any]]] = {}
    num_processes: int = min(len(wikidata), multiprocessing.cpu_count())
    with multiprocessing.Pool(processes=num_processes) as pool:
        # WikidataThing is not supported in multiprocessing
        with tqdm(total=round(len(wikidata) / num_processes), desc="Check Wikidata relations.") as pbar:
            for qid, rels in pool.map(
                functools.partial(__relations__, wikidata=qids), [e.__dict__() for e in wikidata.values()]
            ):
                relations[qid] = rels
                pbar.update(1)
    return relations
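Putting the two public helpers together, an end-to-end sketch (editor's illustration; the QIDs are assumed examples and network access to Wikidata is required): fetch a set of entities, then extract only the relations that stay inside that set.

# Editor's illustration: entities plus in-set relations for a small QID set.
from knowledge.public.relations import wikidata_extractor_entities, wikidata_relations_extractor_qids

if __name__ == "__main__":  # guard needed, since the extractor spawns worker processes
    qids = {"Q5582", "Q762", "Q12418"}          # van Gogh, da Vinci, Mona Lisa
    things = wikidata_extractor_entities(qids)  # Dict[str, WikidataThing]
    relations = wikidata_relations_extractor_qids(things, qids)
    for qid, rels in relations.items():
        print(qid, len(rels))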