personal_knowledge_library 3.0.0__py3-none-any.whl → 3.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of personal_knowledge_library might be problematic. Click here for more details.

@@ -0,0 +1,428 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright © 2023-present Wacom. All rights reserved.
3
+ import multiprocessing
4
+ from collections import deque
5
+ from multiprocessing import Pool
6
+ from typing import Union, Any, Dict, List, Tuple, Set, Optional, Callable
7
+
8
+ import requests
9
+ from requests import Response
10
+ from requests.adapters import HTTPAdapter
11
+ from urllib3 import Retry
12
+
13
+ from knowledge import logger
14
+ from knowledge.base.entity import (
15
+ LanguageCode,
16
+ )
17
+ from knowledge.public.cache import WikidataCache
18
+ from knowledge.public.helper import (
19
+ __waiting_request__,
20
+ __waiting_multi_request__,
21
+ WikiDataAPIException,
22
+ WIKIDATA_SPARQL_URL,
23
+ WIKIDATA_SEARCH_URL,
24
+ API_LIMIT,
25
+ )
26
+ from knowledge.public.wikidata import WikidataClass, WikidataThing, WikidataSearchResult, WikidataProperty
27
+
28
# Constants: tag names used when serializing Wikidata claims/literals.
QUALIFIERS_TAG: str = "QUALIFIERS"
LITERALS_TAG: str = "LITERALS"
# Module-level cache for Wikidata objects; shared by all WikiDataAPIClient
# calls in this process to avoid re-fetching entities.
wikidata_cache: WikidataCache = WikidataCache()
33
+
34
+
35
def chunks(lst: List[str], chunk_size: int):
    """
    Yield successive chunk_size-sized chunks from lst.

    Parameters
    ----------
    lst: List[str]
        Full list to split.
    chunk_size: int
        Maximum size of each chunk (the last chunk may be shorter).

    Yields
    ------
    chunk: List[str]
        Consecutive slices of lst of length at most chunk_size.
    """
    for i in range(0, len(lst), chunk_size):
        yield lst[i : i + chunk_size]
48
+
49
+
50
class WikiDataAPIClient:
    """
    WikiDataAPIClient
    -----------------
    Utility class for the WikiData.

    All methods are static; the class groups SPARQL queries, entity search,
    and (cached) entity/property retrieval against the public Wikidata APIs.
    """

    def __init__(self):
        pass

    @staticmethod
    def sparql_query(query_string: str, wikidata_sparql_url: str = WIKIDATA_SPARQL_URL, max_retries: int = 3) -> dict:
        """Send a SPARQL query and return the JSON formatted result.

        Parameters
        ----------
        query_string: str
            SPARQL query string
        wikidata_sparql_url: str
            Wikidata SPARQL endpoint to use
        max_retries: int
            Maximum number of retries

        Returns
        -------
        result: dict
            JSON-decoded SPARQL result.

        Raises
        ------
        WikiDataAPIException
            If the endpoint responds with a non-OK status after all retries.
        """
        # Retry transient failures (rate limiting / server errors), honouring Retry-After.
        retry_policy: Retry = Retry(
            total=max_retries,  # maximum number of retries
            backoff_factor=1,  # factor by which to multiply the delay between retries
            status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
            respect_retry_after_header=True,  # respect the Retry-After header
        )
        # Create a session and mount the retry adapter
        with requests.Session() as session:
            session.mount("https://", HTTPAdapter(max_retries=retry_policy))
            # NOTE(review): timeout is in seconds, so 10000 is ~2.8 hours — presumably
            # meant as milliseconds; kept as-is to preserve behavior.
            response: Response = session.get(
                wikidata_sparql_url, params={"query": query_string, "format": "json"}, timeout=10000
            )
            if response.ok:
                return response.json()
        raise WikiDataAPIException(
            f"Failed to query entities. " f"Response code:={response.status_code}, Exception:= {response.content}."
        )

    @staticmethod
    def superclasses(qid: str) -> Dict[str, WikidataClass]:
        """
        Returns the Wikidata class with all its superclasses for the given QID.

        Parameters
        ----------
        qid: str
            Wikidata QID (e.g., 'Q146' for house cat).

        Returns
        -------
        classes: Dict[str, WikidataClass]
            A dictionary of WikidataClass objects, where the keys are QIDs and the values are the corresponding
            classes with their superclasses populated.
        """
        # Fetch the full transitive P279 (subclass-of) closure in one query.
        query = f"""
        SELECT DISTINCT ?class ?classLabel ?superclass ?superclassLabel
        WHERE
        {{
            wd:{qid} wdt:P279* ?class.
            ?class wdt:P279 ?superclass.
            SERVICE wikibase:label {{bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
        """
        try:
            reply: Dict[str, Any] = WikiDataAPIClient.sparql_query(query)
            wikidata_classes: Dict[str, WikidataClass] = {}
            # Tracks already-linked (class, superclass) edges so cycles in the
            # Wikidata class graph cannot cause infinite growth.
            cycle_detector: Set[Tuple[str, str]] = set()
            adjacency_list: Dict[str, Set[str]] = {}

            if "results" in reply:
                for b in reply["results"]["bindings"]:
                    superclass_qid = b["superclass"]["value"].rsplit("/", 1)[-1]
                    class_qid = b["class"]["value"].rsplit("/", 1)[-1]
                    superclass_label = b["superclassLabel"]["value"]
                    class_label = b["classLabel"]["value"]
                    wikidata_classes.setdefault(class_qid, WikidataClass(class_qid, class_label))
                    wikidata_classes.setdefault(superclass_qid, WikidataClass(superclass_qid, superclass_label))
                    adjacency_list.setdefault(class_qid, set()).add(superclass_qid)
        except Exception as e:
            logger.exception(e)
            # Degrade gracefully: return a placeholder entry for the queried QID.
            return {qid: WikidataClass(qid, f"Class {qid}")}

        # BFS from the queried class, wiring superclass references.
        queue = deque([qid])
        visited = set()

        # Ensure the starting QID is in the dictionary even when the query
        # returned no bindings (mirrors the guard in subclasses()).
        if qid not in wikidata_classes:
            wikidata_classes[qid] = WikidataClass(qid, f"Class {qid}")

        while queue:
            current_qid = queue.popleft()
            if current_qid in visited:
                continue
            visited.add(current_qid)

            if current_qid in adjacency_list:
                for superclass_qid in adjacency_list[current_qid]:
                    if (current_qid, superclass_qid) not in cycle_detector:
                        wikidata_classes[current_qid].superclasses.append(wikidata_classes[superclass_qid])
                        queue.append(superclass_qid)
                        cycle_detector.add((current_qid, superclass_qid))

        return wikidata_classes

    @staticmethod
    def subclasses(qid: str) -> Dict[str, WikidataClass]:
        """
        Returns the Wikidata class with all its subclasses for the given QID.

        Parameters
        ----------
        qid: str
            Wikidata QID (e.g., 'Q146' for house cat).

        Returns
        -------
        classes: Dict[str, WikidataClass]
            A dictionary of WikidataClass objects, where the keys are QIDs and the values are the corresponding
            classes with their subclasses populated.
        """
        # Fetch direct subclasses (capped at 1000 results by the query).
        query = f"""
        SELECT DISTINCT ?class ?classLabel ?subclass ?subclassLabel
        WHERE
        {{
            ?subclass wdt:P279 wd:{qid}.
            ?subclass wdt:P279 ?class.
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
        LIMIT 1000
        """
        try:
            reply: Dict[str, Any] = WikiDataAPIClient.sparql_query(query)
            wikidata_classes: Dict[str, WikidataClass] = {}
            cycle_detector: Set[Tuple[str, str]] = set()
            adjacency_list: Dict[str, Set[str]] = {}

            if "results" in reply:
                for b in reply["results"]["bindings"]:
                    subclass_qid = b["subclass"]["value"].rsplit("/", 1)[-1]
                    class_qid = b["class"]["value"].rsplit("/", 1)[-1]
                    subclass_label = b["subclassLabel"]["value"]
                    class_label = b["classLabel"]["value"]

                    wikidata_classes.setdefault(class_qid, WikidataClass(class_qid, class_label))
                    wikidata_classes.setdefault(subclass_qid, WikidataClass(subclass_qid, subclass_label))

                    # subclass -> class relationship (reverse of superclass logic)
                    adjacency_list.setdefault(class_qid, set()).add(subclass_qid)
        except Exception as e:
            logger.exception(e)
            return {qid: WikidataClass(qid, f"Class {qid}")}

        # BFS from the queried class, wiring subclass references.
        queue = deque([qid])
        visited = set()

        while queue:
            current_qid = queue.popleft()
            if current_qid in visited:
                continue
            visited.add(current_qid)

            # Ensure the starting QID is in the dictionary
            if current_qid not in wikidata_classes:
                # If not present, we might need to fetch its label separately
                wikidata_classes[current_qid] = WikidataClass(current_qid, f"Class {current_qid}")

            if current_qid in adjacency_list:
                for subclass_qid in adjacency_list[current_qid]:
                    if (current_qid, subclass_qid) not in cycle_detector:
                        wikidata_classes[current_qid].subclasses.append(wikidata_classes[subclass_qid])
                        queue.append(subclass_qid)
                        cycle_detector.add((current_qid, subclass_qid))

        return wikidata_classes

    @staticmethod
    def search_term(
        search_term: str, language: LanguageCode, url: str = WIKIDATA_SEARCH_URL
    ) -> List[WikidataSearchResult]:
        """
        Search for a term in the WikiData.

        Parameters
        ----------
        search_term: str
            The term to search for.
        language: LanguageCode
            The language to search in.
        url: str
            The URL of the WikiData search API.

        Returns
        -------
        search_results_dict: List[WikidataSearchResult]
            The search results.

        Raises
        ------
        WikiDataAPIException
            If the search request fails.
        """
        search_results_dict: List[WikidataSearchResult] = []
        # Retry transient failures, honouring Retry-After.
        retry_policy: Retry = Retry(
            total=3,  # maximum number of retries
            backoff_factor=1,  # factor by which to multiply the delay between retries
            status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
            respect_retry_after_header=True,  # respect the Retry-After header
        )

        # Create a session and mount the retry adapter
        with requests.Session() as session:
            session.mount("https://", HTTPAdapter(max_retries=retry_policy))
            params: Dict[str, str] = {
                "action": "wbsearchentities",
                "format": "json",
                "language": language,
                "search": search_term,
            }
            # NOTE(review): timeout is in seconds; 200000 is effectively unbounded.
            # Kept as-is to preserve behavior.
            response: Response = session.get(url, params=params, timeout=200000)

            if not response.ok:
                raise WikiDataAPIException(
                    f"Search request failed with status code : {response.status_code}. " f"URL:= {url}"
                )
            search_result_dict_full: Dict[str, Any] = response.json()
            for search_result_dict in search_result_dict_full["search"]:
                search_results_dict.append(WikidataSearchResult.from_dict(search_result_dict))
        return search_results_dict

    @staticmethod
    def __wikidata_task__(qid: str) -> WikidataThing:
        """Retrieve a single Wikidata thing, using the module-level cache.

        Parameters
        ----------
        qid: str
            QID of the entity.

        Returns
        -------
        instance: WikidataThing
            Single wikidata thing

        Raises
        ------
        WikiDataAPIException
            If retrieval or parsing fails.
        """
        try:
            if wikidata_cache.qid_in_cache(qid):
                return wikidata_cache.get_wikidata_object(qid)
            w_thing = WikidataThing.from_wikidata(__waiting_request__(qid))
            # Add the thing to the cache
            wikidata_cache.cache_wikidata_object(w_thing)
            return w_thing
        except Exception as e:
            logger.exception(e)
            raise WikiDataAPIException(e) from e

    @staticmethod
    def __wikidata_multiple_task__(qids: List[str]) -> List[WikidataThing]:
        """Retrieve multiple Wikidata things in one batched request.

        Parameters
        ----------
        qids: List[str]
            QIDs of the entities.

        Returns
        -------
        instances: List[WikidataThing]
            List of wikidata things

        Raises
        ------
        WikiDataAPIException
            If retrieval or parsing fails.
        """
        try:
            results: List[WikidataThing] = []
            if len(qids) > 0:
                for e in __waiting_multi_request__(qids):
                    results.append(WikidataThing.from_wikidata(e))
            return results
        except Exception as e:
            logger.exception(e)
            raise WikiDataAPIException(e) from e

    @staticmethod
    def retrieve_entity(qid: str) -> WikidataThing:
        """
        Retrieve a single Wikidata thing.

        Parameters
        ----------
        qid: str
            QID of the entity.

        Returns
        -------
        instance: WikidataThing
            Single wikidata thing
        """
        return WikiDataAPIClient.__wikidata_task__(qid)

    @staticmethod
    def retrieve_entities(qids: Union[List[str], Set[str]], progress: Optional[Callable[[int, int], None]] = None) \
            -> List[WikidataThing]:
        """
        Retrieve multiple Wikidata things.

        Cached entities are returned directly; the remaining QIDs are fetched in
        API_LIMIT-sized batches, in parallel when several CPU cores are available.

        Parameters
        ----------
        qids: Union[List[str], Set[str]]
            QIDs of the entities.
        progress: Optional[Callable[[int, int], None]]
            Optional callback function to report progress as (done, total).

        Returns
        -------
        instances: List[WikidataThing]
            List of wikidata things.
        """
        pulled: List[WikidataThing] = []
        task_size: int = len(qids)
        if len(qids) == 0:
            return []
        missing_qids: List[str] = []
        for qid in qids:
            if not wikidata_cache.qid_in_cache(qid):
                # Only well-formed QIDs ("Q" followed by digits) are fetched.
                if qid and qid.startswith("Q") and len(qid) > 1:
                    missing_qids.append(qid)
            else:
                pulled.append(wikidata_cache.get_wikidata_object(qid))
        ctr: int = len(pulled)
        if progress:
            progress(len(pulled), task_size)
        jobs: List[List[str]] = list(chunks(list(missing_qids), API_LIMIT))
        # BUGFIX: previously an empty job list fell through to `jobs[0]`
        # (IndexError), and on single-core machines only the first batch was
        # processed while the rest were silently dropped.
        if jobs:
            num_processes: int = min(len(jobs), multiprocessing.cpu_count())
            if num_processes > 1:
                with Pool(processes=num_processes) as pool:
                    # WikidataThing is not picklable for multiprocessing, so the
                    # workers return raw dicts which are converted here.
                    for lst in pool.imap_unordered(__waiting_multi_request__, jobs):
                        for w_dict in lst:
                            w_thing = WikidataThing.from_wikidata(w_dict)
                            wikidata_cache.cache_wikidata_object(w_thing)
                            pulled.append(w_thing)
                            ctr += 1
                            if progress:
                                progress(ctr, task_size)
            else:
                # Sequential fallback: process ALL batches, not just the first.
                for job in jobs:
                    results = WikiDataAPIClient.__wikidata_multiple_task__(job)
                    for w_thing in results:
                        wikidata_cache.cache_wikidata_object(w_thing)
                        ctr += 1
                        if progress:
                            progress(ctr, task_size)
                    pulled.extend(results)
        return pulled

    @staticmethod
    def wikiproperty(pid: str) -> WikidataProperty:
        """
        Retrieve a single Wikidata property.

        Parameters
        ----------
        pid: str
            PID of the property.

        Returns
        -------
        instance: WikidataProperty
            Single wikidata property

        Raises
        ------
        WikiDataAPIException
            If retrieval or parsing fails.
        """
        try:
            # NOTE(review): cache *read* for properties is intentionally disabled
            # upstream; properties are still written to the cache below.
            w_property = WikidataProperty.from_wikidata(__waiting_request__(pid))
            # Add the property to the cache
            wikidata_cache.cache_property(w_property)
            return w_property
        except Exception as e:
            logger.exception(e)
            raise WikiDataAPIException(e) from e
@@ -13,6 +13,7 @@ from requests.adapters import HTTPAdapter
13
13
  from urllib3 import Retry
14
14
 
15
15
  from knowledge import logger
16
+ from knowledge.public import DEFAULT_MAX_RETRIES, DEFAULT_TIMEOUT, DEFAULT_BACKOFF_FACTOR, STATUS_FORCE_LIST
16
17
 
17
18
 
18
19
  # --------------------------------------- Structures -------------------------------------------------------------------
@@ -49,10 +50,12 @@ QID_TAG: str = "qid"
49
50
  PID_TAG: str = "pid"
50
51
  LAST_REVID_TAG: str = "lastrevid"
51
52
  MODIFIED_TAG: str = "modified"
53
+ SYNC_TIME_TAG: str = "sync"
52
54
  WIKIDATA_LANGUAGE_TAG: str = "language"
53
55
  LABEL_VALUE_TAG: str = "value"
54
56
  LABEL_TAG: str = "label"
55
57
  SUPERCLASSES_TAG: str = "superclasses"
58
+ SUBCLASSES_TAG: str = "subclasses"
56
59
  CLAIMS_TAG: str = "claims"
57
60
  ONTOLOGY_TYPES_TAG: str = "ontology_types"
58
61
  REVISION_TAG: str = "revision"
@@ -262,7 +265,13 @@ def wikidate(param: Dict[str, Any]) -> Dict[str, Any]:
262
265
  }
263
266
 
264
267
 
265
- def __waiting_request__(entity_id: str, base_url: str = WIKIDATA_LDI_URL) -> Dict[str, Any]:
268
+ def __waiting_request__(
269
+ entity_id: str,
270
+ base_url: str = WIKIDATA_LDI_URL,
271
+ timeout: int = DEFAULT_TIMEOUT,
272
+ max_retries: int = DEFAULT_MAX_RETRIES,
273
+ backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
274
+ ) -> Dict[str, Any]:
266
275
  """
267
276
  Sena a request with retry policy.
268
277
 
@@ -272,18 +281,24 @@ def __waiting_request__(entity_id: str, base_url: str = WIKIDATA_LDI_URL) -> Dic
272
281
  Entity QID
273
282
  base_url: Base URL
274
283
  The base URL
284
+ timeout: int
285
+ Timeout in seconds
286
+ max_retries: int
287
+ Maximum number of retries
288
+ backoff_factor: float
289
+ Backoff factor for retries.
290
+
275
291
  Returns
276
292
  -------
277
293
  result_dict: Dict[str, Any]
278
294
  Result dict
279
295
  """
280
296
  url: str = f"{base_url}/{entity_id}.json"
281
-
282
297
  # Define the retry policy
283
298
  retry_policy: Retry = Retry(
284
- total=3, # maximum number of retries
285
- backoff_factor=1, # factor by which to multiply the delay between retries
286
- status_forcelist=[429, 500, 502, 503, 504], # HTTP status codes to retry on
299
+ total=max_retries, # maximum number of retries
300
+ backoff_factor=backoff_factor, # factor by which to multiply the delay between retries
301
+ status_forcelist=STATUS_FORCE_LIST, # HTTP status codes to retry on
287
302
  respect_retry_after_header=True, # respect the Retry-After header
288
303
  )
289
304
 
@@ -293,7 +308,7 @@ def __waiting_request__(entity_id: str, base_url: str = WIKIDATA_LDI_URL) -> Dic
293
308
  session.mount("https://", retry_adapter)
294
309
 
295
310
  # Make a request using the session
296
- response: Response = session.get(url)
311
+ response: Response = session.get(url, timeout=timeout)
297
312
 
298
313
  # Check the response status code
299
314
  if not response.ok:
@@ -311,7 +326,13 @@ def __waiting_request__(entity_id: str, base_url: str = WIKIDATA_LDI_URL) -> Dic
311
326
  return entity_dict
312
327
 
313
328
 
314
- def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_ENTITIES_API) -> List[Dict[str, Any]]:
329
+ def __waiting_multi_request__(
330
+ entity_ids: List[str],
331
+ base_url: str = MULTIPLE_ENTITIES_API,
332
+ timeout: int = DEFAULT_TIMEOUT,
333
+ max_retries: int = DEFAULT_MAX_RETRIES,
334
+ backoff_factor: float = DEFAULT_BACKOFF_FACTOR,
335
+ ) -> List[Dict[str, Any]]:
315
336
  """
316
337
  Sena a request to retrieve multiple entities with retry policy.
317
338
 
@@ -321,6 +342,12 @@ def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_EN
321
342
  Entity QIDs
322
343
  base_url: Base URL
323
344
  The base URL
345
+ timeout: int
346
+ Timeout in seconds
347
+ max_retries: int
348
+ Maximum number of retries
349
+ backoff_factor: float
350
+ Backoff factor for retries.
324
351
  Returns
325
352
  -------
326
353
  result_dict: Dict[str, Any]
@@ -330,6 +357,7 @@ def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_EN
330
357
  ValueError - Empty list or to many entities
331
358
  """
332
359
  checked_entity_ids: List[str] = [e for e in entity_ids if e.startswith("Q")]
360
+
333
361
  if not (0 < len(checked_entity_ids) <= API_LIMIT):
334
362
  raise ValueError(
335
363
  f"Number of entities must be within [1, {API_LIMIT}]. " f"Number of QIDs: {len(checked_entity_ids)}"
@@ -339,9 +367,9 @@ def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_EN
339
367
 
340
368
  # Define the retry policy
341
369
  retry_policy: Retry = Retry(
342
- total=3, # maximum number of retries
343
- backoff_factor=1, # factor by which to multiply the delay between retries
344
- status_forcelist=[429, 500, 502, 503, 504], # HTTP status codes to retry on
370
+ total=max_retries, # maximum number of retries
371
+ backoff_factor=backoff_factor, # factor by which to multiply the delay between retries
372
+ status_forcelist=STATUS_FORCE_LIST, # HTTP status codes to retry on
345
373
  respect_retry_after_header=True, # respect the Retry-After header
346
374
  )
347
375
 
@@ -351,7 +379,7 @@ def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_EN
351
379
  session.mount("https://", retry_adapter)
352
380
 
353
381
  # Make a request using the session
354
- response: Response = session.get(url)
382
+ response: Response = session.get(url, timeout=timeout)
355
383
 
356
384
  # Check the response status code
357
385
  if not response.ok:
@@ -7,7 +7,8 @@ from typing import Any, Dict, Set, Tuple, List, Callable, Optional
7
7
  from tqdm import tqdm
8
8
 
9
9
  from knowledge.public.helper import CLAIMS_TAG, PID_TAG, LABEL_TAG, QID_TAG
10
- from knowledge.public.wikidata import LITERALS_TAG, WikidataThing, WikiDataAPIClient
10
+ from knowledge.public.wikidata import LITERALS_TAG, WikidataThing
11
+ from knowledge.public.client import WikiDataAPIClient
11
12
 
12
13
 
13
14
  def __relations__(thing: Dict[str, Any], wikidata: Set[str]) -> Tuple[str, List[Dict[str, Any]]]: