personal_knowledge_library 3.0.0-py3-none-any.whl → 3.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of personal_knowledge_library might be problematic.
- knowledge/__init__.py +1 -1
- knowledge/ontomapping/__init__.py +33 -115
- knowledge/ontomapping/manager.py +24 -25
- knowledge/public/__init__.py +8 -7
- knowledge/public/cache.py +413 -86
- knowledge/public/client.py +428 -0
- knowledge/public/helper.py +39 -11
- knowledge/public/relations.py +2 -1
- knowledge/public/wikidata.py +47 -381
- knowledge/utils/graph.py +6 -6
- {personal_knowledge_library-3.0.0.dist-info → personal_knowledge_library-3.1.1.dist-info}/METADATA +7 -1
- {personal_knowledge_library-3.0.0.dist-info → personal_knowledge_library-3.1.1.dist-info}/RECORD +14 -13
- {personal_knowledge_library-3.0.0.dist-info → personal_knowledge_library-3.1.1.dist-info}/LICENSE +0 -0
- {personal_knowledge_library-3.0.0.dist-info → personal_knowledge_library-3.1.1.dist-info}/WHEEL +0 -0
knowledge/public/client.py
ADDED

@@ -0,0 +1,428 @@
# -*- coding: utf-8 -*-
# Copyright © 2023-present Wacom. All rights reserved.
import multiprocessing
from collections import deque
from multiprocessing import Pool
from typing import Union, Any, Dict, List, Tuple, Set, Optional, Callable

import requests
from requests import Response
from requests.adapters import HTTPAdapter
from urllib3 import Retry

from knowledge import logger
from knowledge.base.entity import (
    LanguageCode,
)
from knowledge.public.cache import WikidataCache
from knowledge.public.helper import (
    __waiting_request__,
    __waiting_multi_request__,
    WikiDataAPIException,
    WIKIDATA_SPARQL_URL,
    WIKIDATA_SEARCH_URL,
    API_LIMIT,
)
from knowledge.public.wikidata import WikidataClass, WikidataThing, WikidataSearchResult, WikidataProperty

# Constants
QUALIFIERS_TAG: str = "QUALIFIERS"
LITERALS_TAG: str = "LITERALS"
# Cache for wikidata objects
wikidata_cache: WikidataCache = WikidataCache()


def chunks(lst: List[str], chunk_size: int):
    """
    Yield successive chunk_size-sized chunks from lst.

    Parameters
    ----------
    lst: List[str]
        List to split into chunks.
    chunk_size: int
        Chunk size.
    """
    for i in range(0, len(lst), chunk_size):
        yield lst[i : i + chunk_size]


class WikiDataAPIClient:
    """
    WikiDataAPIClient
    -----------------
    Utility class for the WikiData API.
    """

    def __init__(self):
        pass

    @staticmethod
    def sparql_query(query_string: str, wikidata_sparql_url: str = WIKIDATA_SPARQL_URL, max_retries: int = 3) -> dict:
        """Send a SPARQL query and return the JSON-formatted result.

        Parameters
        ----------
        query_string: str
            SPARQL query string
        wikidata_sparql_url: str
            Wikidata SPARQL endpoint to use
        max_retries: int
            Maximum number of retries
        """
        # Define the retry policy
        retry_policy: Retry = Retry(
            total=max_retries,  # maximum number of retries
            backoff_factor=1,  # factor by which to multiply the delay between retries
            status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
            respect_retry_after_header=True,  # respect the Retry-After header
        )

        # Create a session and mount the retry adapter
        with requests.Session() as session:
            retry_adapter = HTTPAdapter(max_retries=retry_policy)
            session.mount("https://", retry_adapter)

            # Make a request using the session
            response: Response = session.get(
                wikidata_sparql_url, params={"query": query_string, "format": "json"}, timeout=10000
            )
            if response.ok:
                return response.json()

        raise WikiDataAPIException(
            f"Failed to query entities. Response code:={response.status_code}, Exception:= {response.content}."
        )

    @staticmethod
    def superclasses(qid: str) -> Dict[str, WikidataClass]:
        """
        Returns the Wikidata class with all its superclasses for the given QID.

        Parameters
        ----------
        qid: str
            Wikidata QID (e.g., 'Q146' for house cat).

        Returns
        -------
        classes: Dict[str, WikidataClass]
            A dictionary of WikidataClass objects, where the keys are QIDs and the values are the corresponding
            classes with their superclasses populated.
        """
        # Fetch superclasses
        query = f"""
        SELECT DISTINCT ?class ?classLabel ?superclass ?superclassLabel
        WHERE
        {{
            wd:{qid} wdt:P279* ?class.
            ?class wdt:P279 ?superclass.
            SERVICE wikibase:label {{bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
        """
        try:
            reply: Dict[str, Any] = WikiDataAPIClient.sparql_query(query)
            wikidata_classes: Dict[str, WikidataClass] = {}
            cycle_detector: Set[Tuple[str, str]] = set()
            adjacency_list: Dict[str, Set[str]] = {}

            if "results" in reply:
                for b in reply["results"]["bindings"]:
                    superclass_qid = b["superclass"]["value"].rsplit("/", 1)[-1]
                    class_qid = b["class"]["value"].rsplit("/", 1)[-1]
                    superclass_label = b["superclassLabel"]["value"]
                    class_label = b["classLabel"]["value"]
                    wikidata_classes.setdefault(class_qid, WikidataClass(class_qid, class_label))
                    wikidata_classes.setdefault(superclass_qid, WikidataClass(superclass_qid, superclass_label))
                    adjacency_list.setdefault(class_qid, set()).add(superclass_qid)
        except Exception as e:
            logger.exception(e)
            return {qid: WikidataClass(qid, f"Class {qid}")}
        queue = deque([qid])
        visited = set()

        while queue:
            current_qid = queue.popleft()
            if current_qid in visited:
                continue
            visited.add(current_qid)

            if current_qid in adjacency_list:
                for superclass_qid in adjacency_list[current_qid]:
                    if (current_qid, superclass_qid) not in cycle_detector:
                        wikidata_classes[current_qid].superclasses.append(wikidata_classes[superclass_qid])
                        queue.append(superclass_qid)
                        cycle_detector.add((current_qid, superclass_qid))

        return wikidata_classes

    @staticmethod
    def subclasses(qid: str) -> Dict[str, WikidataClass]:
        """
        Returns the Wikidata class with all its subclasses for the given QID.

        Parameters
        ----------
        qid: str
            Wikidata QID (e.g., 'Q146' for house cat).

        Returns
        -------
        classes: Dict[str, WikidataClass]
            A dictionary of WikidataClass objects, where the keys are QIDs and the values are the corresponding
            classes with their subclasses populated.
        """
        # Fetch subclasses
        query = f"""
        SELECT DISTINCT ?class ?classLabel ?subclass ?subclassLabel
        WHERE
        {{
            ?subclass wdt:P279 wd:{qid}.
            ?subclass wdt:P279 ?class.
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
        LIMIT 1000
        """
        try:
            reply: Dict[str, Any] = WikiDataAPIClient.sparql_query(query)
            wikidata_classes: Dict[str, WikidataClass] = {}
            cycle_detector: Set[Tuple[str, str]] = set()
            adjacency_list: Dict[str, Set[str]] = {}

            if "results" in reply:
                for b in reply["results"]["bindings"]:
                    subclass_qid = b["subclass"]["value"].rsplit("/", 1)[-1]
                    class_qid = b["class"]["value"].rsplit("/", 1)[-1]
                    subclass_label = b["subclassLabel"]["value"]
                    class_label = b["classLabel"]["value"]

                    wikidata_classes.setdefault(class_qid, WikidataClass(class_qid, class_label))
                    wikidata_classes.setdefault(subclass_qid, WikidataClass(subclass_qid, subclass_label))

                    # subclass -> class relationship (reverse of superclass logic)
                    adjacency_list.setdefault(class_qid, set()).add(subclass_qid)
        except Exception as e:
            logger.exception(e)
            return {qid: WikidataClass(qid, f"Class {qid}")}

        queue = deque([qid])
        visited = set()

        while queue:
            current_qid = queue.popleft()
            if current_qid in visited:
                continue
            visited.add(current_qid)

            # Ensure the starting QID is in the dictionary
            if current_qid not in wikidata_classes:
                # If not present, we might need to fetch its label separately
                wikidata_classes[current_qid] = WikidataClass(current_qid, f"Class {current_qid}")

            if current_qid in adjacency_list:
                for subclass_qid in adjacency_list[current_qid]:
                    if (current_qid, subclass_qid) not in cycle_detector:
                        wikidata_classes[current_qid].subclasses.append(wikidata_classes[subclass_qid])
                        queue.append(subclass_qid)
                        cycle_detector.add((current_qid, subclass_qid))

        return wikidata_classes

    @staticmethod
    def search_term(
        search_term: str, language: LanguageCode, url: str = WIKIDATA_SEARCH_URL
    ) -> List[WikidataSearchResult]:
        """
        Search for a term in the WikiData.

        Parameters
        ----------
        search_term: str
            The term to search for.
        language: LanguageCode
            The language to search in.
        url: str
            The URL of the WikiData search API.

        Returns
        -------
        search_results_dict: List[WikidataSearchResult]
            The search results.
        """
        search_results_dict: List[WikidataSearchResult] = []
        # Define the retry policy
        retry_policy: Retry = Retry(
            total=3,  # maximum number of retries
            backoff_factor=1,  # factor by which to multiply the delay between retries
            status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
            respect_retry_after_header=True,  # respect the Retry-After header
        )

        # Create a session and mount the retry adapter
        with requests.Session() as session:
            retry_adapter = HTTPAdapter(max_retries=retry_policy)
            session.mount("https://", retry_adapter)
            params: Dict[str, str] = {
                "action": "wbsearchentities",
                "format": "json",
                "language": language,
                "search": search_term,
            }
            # Make a request using the session
            response: Response = session.get(url, params=params, timeout=200000)

            # Check the response status code
            if not response.ok:
                raise WikiDataAPIException(
                    f"Search request failed with status code: {response.status_code}. URL:= {url}"
                )
            search_result_dict_full: Dict[str, Any] = response.json()
            for search_result_dict in search_result_dict_full["search"]:
                search_results_dict.append(WikidataSearchResult.from_dict(search_result_dict))
        return search_results_dict

    @staticmethod
    def __wikidata_task__(qid: str) -> WikidataThing:
        """Retrieve a single Wikidata thing.

        Parameters
        ----------
        qid: str
            QID of the entity.

        Returns
        -------
        instance: WikidataThing
            Single wikidata thing
        """
        try:
            if wikidata_cache.qid_in_cache(qid):
                return wikidata_cache.get_wikidata_object(qid)
            w_thing = WikidataThing.from_wikidata(__waiting_request__(qid))
            # Add the thing to the cache
            wikidata_cache.cache_wikidata_object(w_thing)
            return w_thing
        except Exception as e:
            logger.exception(e)
            raise WikiDataAPIException(e) from e

    @staticmethod
    def __wikidata_multiple_task__(qids: List[str]) -> List[WikidataThing]:
        """Retrieve multiple Wikidata things.

        Parameters
        ----------
        qids: List[str]
            QIDs of the entities.

        Returns
        -------
        instances: List[WikidataThing]
            List of wikidata things
        """
        try:
            results: List[WikidataThing] = []
            if len(qids) > 0:
                for e in __waiting_multi_request__(qids):
                    w_thing = WikidataThing.from_wikidata(e)
                    results.append(w_thing)
            return results
        except Exception as e:
            logger.exception(e)
            raise WikiDataAPIException(e) from e

    @staticmethod
    def retrieve_entity(qid: str) -> WikidataThing:
        """
        Retrieve a single Wikidata thing.

        Parameters
        ----------
        qid: str
            QID of the entity.

        Returns
        -------
        instance: WikidataThing
            Single wikidata thing
        """
        return WikiDataAPIClient.__wikidata_task__(qid)

    @staticmethod
    def retrieve_entities(
        qids: Union[List[str], Set[str]], progress: Optional[Callable[[int, int], None]] = None
    ) -> List[WikidataThing]:
        """
        Retrieve multiple Wikidata things.

        Parameters
        ----------
        qids: Union[List[str], Set[str]]
            QIDs of the entities.
        progress: Optional[Callable[[int, int], None]]
            Optional callback function to report progress.

        Returns
        -------
        instances: List[WikidataThing]
            List of wikidata things.
        """
        pulled: List[WikidataThing] = []
        task_size: int = len(qids)
        if len(qids) == 0:
            return []
        missing_qids: List[str] = []
        for qid in qids:
            if not wikidata_cache.qid_in_cache(qid):
                if qid and qid.startswith("Q") and len(qid) > 1:
                    missing_qids.append(qid)
            else:
                pulled.append(wikidata_cache.get_wikidata_object(qid))
        ctr: int = len(pulled)
        if progress:
            progress(len(pulled), task_size)
        jobs: List[List[str]] = list(chunks(list(missing_qids), API_LIMIT))
        num_processes: int = min(len(jobs), multiprocessing.cpu_count())
        if num_processes > 1:
            with Pool(processes=num_processes) as pool:
                # WikidataThing is not supported in multiprocessing, so raw dicts are fetched and converted here
                for lst in pool.imap_unordered(__waiting_multi_request__, jobs):
                    for w_dict in lst:
                        w_thing = WikidataThing.from_wikidata(w_dict)
                        wikidata_cache.cache_wikidata_object(w_thing)
                        pulled.append(w_thing)
                        ctr += 1
                        if progress:
                            progress(ctr, task_size)
        else:
            results = WikiDataAPIClient.__wikidata_multiple_task__(jobs[0])
            for w_thing in results:
                wikidata_cache.cache_wikidata_object(w_thing)
                ctr += 1
                if progress:
                    progress(ctr, task_size)
            pulled.extend(results)
        return pulled

    @staticmethod
    def wikiproperty(pid: str) -> WikidataProperty:
        """
        Retrieve a single Wikidata property.

        Parameters
        ----------
        pid: str
            PID of the property.

        Returns
        -------
        instance: WikidataProperty
            Single wikidata property
        """
        try:
            # if wikidata_cache.get_property(pid):
            #     return wikidata_cache.get_property(pid)
            w_property = WikidataProperty.from_wikidata(__waiting_request__(pid))
            # Add the property to the cache
            wikidata_cache.cache_property(w_property)
            return w_property
        except Exception as e:
            logger.exception(e)
            raise WikiDataAPIException(e) from e
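For orientation, a minimal usage sketch of the new client module (not part of the diff; the QIDs shown and the construction of a LanguageCode from a plain ISO code are illustrative assumptions):

from knowledge.base.entity import LanguageCode
from knowledge.public.client import WikiDataAPIClient

# Full-text search against the Wikidata wbsearchentities API.
hits = WikiDataAPIClient.search_term("house cat", LanguageCode("en"))
print(f"{len(hits)} search results")

# Batch retrieval: cached QIDs are served from the module-level WikidataCache;
# missing QIDs are fetched in chunks of API_LIMIT, in parallel when several
# chunks are pending.
things = WikiDataAPIClient.retrieve_entities(
    ["Q146", "Q42"], progress=lambda done, total: print(f"{done}/{total}")
)

# Class hierarchy: a single SPARQL query over wdt:P279*, followed by a
# breadth-first walk with cycle detection.
classes = WikiDataAPIClient.superclasses("Q146")
print(f"{len(classes)} classes in the superclass hierarchy of Q146")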
knowledge/public/helper.py
CHANGED
@@ -13,6 +13,7 @@ from requests.adapters import HTTPAdapter
 from urllib3 import Retry
 
 from knowledge import logger
+from knowledge.public import DEFAULT_MAX_RETRIES, DEFAULT_TIMEOUT, DEFAULT_BACKOFF_FACTOR, STATUS_FORCE_LIST
 
 
 # --------------------------------------- Structures -------------------------------------------------------------------
@@ -49,10 +50,12 @@ QID_TAG: str = "qid"
 PID_TAG: str = "pid"
 LAST_REVID_TAG: str = "lastrevid"
 MODIFIED_TAG: str = "modified"
+SYNC_TIME_TAG: str = "sync"
 WIKIDATA_LANGUAGE_TAG: str = "language"
 LABEL_VALUE_TAG: str = "value"
 LABEL_TAG: str = "label"
 SUPERCLASSES_TAG: str = "superclasses"
+SUBCLASSES_TAG: str = "subclasses"
 CLAIMS_TAG: str = "claims"
 ONTOLOGY_TYPES_TAG: str = "ontology_types"
 REVISION_TAG: str = "revision"
@@ -262,7 +265,13 @@ def wikidate(param: Dict[str, Any]) -> Dict[str, Any]:
     }
 
 
-def __waiting_request__(entity_id: str, base_url: str = WIKIDATA_LDI_URL) -> Dict[str, Any]:
+def __waiting_request__(
+    entity_id: str,
+    base_url: str = WIKIDATA_LDI_URL,
+    timeout: int = DEFAULT_TIMEOUT,
+    max_retries: int = DEFAULT_MAX_RETRIES,
+    backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
+) -> Dict[str, Any]:
     """
     Send a request with retry policy.
 
@@ -272,18 +281,24 @@ def __waiting_request__(entity_id: str, base_url: str = WIKIDATA_LDI_URL) -> Dict[str, Any]:
         Entity QID
     base_url: Base URL
         The base URL
+    timeout: int
+        Timeout in seconds
+    max_retries: int
+        Maximum number of retries
+    backoff_factor: float
+        Backoff factor for retries.
+
     Returns
     -------
     result_dict: Dict[str, Any]
         Result dict
     """
     url: str = f"{base_url}/{entity_id}.json"
-
     # Define the retry policy
     retry_policy: Retry = Retry(
-        total=
-        backoff_factor=
-        status_forcelist=
+        total=max_retries,  # maximum number of retries
+        backoff_factor=backoff_factor,  # factor by which to multiply the delay between retries
+        status_forcelist=STATUS_FORCE_LIST,  # HTTP status codes to retry on
         respect_retry_after_header=True,  # respect the Retry-After header
     )
 
@@ -293,7 +308,7 @@ def __waiting_request__(entity_id: str, base_url: str = WIKIDATA_LDI_URL) -> Dict[str, Any]:
         session.mount("https://", retry_adapter)
 
         # Make a request using the session
-        response: Response = session.get(url)
+        response: Response = session.get(url, timeout=timeout)
 
         # Check the response status code
         if not response.ok:
@@ -311,7 +326,13 @@ def __waiting_request__(entity_id: str, base_url: str = WIKIDATA_LDI_URL) -> Dict[str, Any]:
     return entity_dict
 
 
-def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_ENTITIES_API) -> List[Dict[str, Any]]:
+def __waiting_multi_request__(
+    entity_ids: List[str],
+    base_url: str = MULTIPLE_ENTITIES_API,
+    timeout: int = DEFAULT_TIMEOUT,
+    max_retries: int = DEFAULT_MAX_RETRIES,
+    backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
+) -> List[Dict[str, Any]]:
     """
     Send a request to retrieve multiple entities with retry policy.
 
@@ -321,6 +342,12 @@ def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_ENTITIES_API) -> List[Dict[str, Any]]:
         Entity QIDs
     base_url: Base URL
         The base URL
+    timeout: int
+        Timeout in seconds
+    max_retries: int
+        Maximum number of retries
+    backoff_factor: float
+        Backoff factor for retries.
     Returns
     -------
     result_dict: Dict[str, Any]
@@ -330,6 +357,7 @@ def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_ENTITIES_API) -> List[Dict[str, Any]]:
         ValueError - Empty list or too many entities
     """
     checked_entity_ids: List[str] = [e for e in entity_ids if e.startswith("Q")]
+
     if not (0 < len(checked_entity_ids) <= API_LIMIT):
         raise ValueError(
             f"Number of entities must be within [1, {API_LIMIT}]. Number of QIDs: {len(checked_entity_ids)}"
@@ -339,9 +367,9 @@ def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_ENTITIES_API) -> List[Dict[str, Any]]:
 
     # Define the retry policy
     retry_policy: Retry = Retry(
-        total=
-        backoff_factor=
-        status_forcelist=
+        total=max_retries,  # maximum number of retries
+        backoff_factor=backoff_factor,  # factor by which to multiply the delay between retries
+        status_forcelist=STATUS_FORCE_LIST,  # HTTP status codes to retry on
         respect_retry_after_header=True,  # respect the Retry-After header
     )
 
@@ -351,7 +379,7 @@ def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_ENTITIES_API) -> List[Dict[str, Any]]:
         session.mount("https://", retry_adapter)
 
         # Make a request using the session
-        response: Response = session.get(url)
+        response: Response = session.get(url, timeout=timeout)
 
         # Check the response status code
         if not response.ok:
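The net effect of these helper changes: the previously hardcoded retry policy is now driven by the new DEFAULT_TIMEOUT, DEFAULT_MAX_RETRIES, DEFAULT_BACKOFF_FACTOR, and STATUS_FORCE_LIST constants from knowledge.public, and both helpers pass an explicit timeout to session.get. A standalone sketch of the pattern (the function name and the constant values below are illustrative assumptions, not the package's actual defaults):

import requests
from requests.adapters import HTTPAdapter
from urllib3 import Retry

DEFAULT_TIMEOUT: int = 60                      # assumed default, in seconds
DEFAULT_MAX_RETRIES: int = 3                   # assumed default
DEFAULT_BACKOFF_FACTOR: float = 1.0            # assumed default
STATUS_FORCE_LIST = [429, 500, 502, 503, 504]  # assumed retryable status codes


def fetch_entity_json(
    entity_id: str,
    base_url: str,
    timeout: int = DEFAULT_TIMEOUT,
    max_retries: int = DEFAULT_MAX_RETRIES,
    backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
) -> dict:
    """Fetch <base_url>/<entity_id>.json with retries and exponential backoff."""
    retry_policy = Retry(
        total=max_retries,
        backoff_factor=backoff_factor,
        status_forcelist=STATUS_FORCE_LIST,
        respect_retry_after_header=True,  # honor the server's Retry-After header
    )
    with requests.Session() as session:
        session.mount("https://", HTTPAdapter(max_retries=retry_policy))
        response = session.get(f"{base_url}/{entity_id}.json", timeout=timeout)
        response.raise_for_status()
        return response.json()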
knowledge/public/relations.py
CHANGED
@@ -7,7 +7,8 @@ from typing import Any, Dict, Set, Tuple, List, Callable, Optional
 from tqdm import tqdm
 
 from knowledge.public.helper import CLAIMS_TAG, PID_TAG, LABEL_TAG, QID_TAG
-from knowledge.public.wikidata import LITERALS_TAG, WikidataThing
+from knowledge.public.wikidata import LITERALS_TAG, WikidataThing
+from knowledge.public.client import WikiDataAPIClient
 
 
 def __relations__(thing: Dict[str, Any], wikidata: Set[str]) -> Tuple[str, List[Dict[str, Any]]]: