personal_knowledge_library-3.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (42)
  1. knowledge/__init__.py +91 -0
  2. knowledge/base/__init__.py +22 -0
  3. knowledge/base/access.py +167 -0
  4. knowledge/base/entity.py +267 -0
  5. knowledge/base/language.py +27 -0
  6. knowledge/base/ontology.py +2734 -0
  7. knowledge/base/search.py +473 -0
  8. knowledge/base/tenant.py +192 -0
  9. knowledge/nel/__init__.py +11 -0
  10. knowledge/nel/base.py +495 -0
  11. knowledge/nel/engine.py +123 -0
  12. knowledge/ontomapping/__init__.py +667 -0
  13. knowledge/ontomapping/manager.py +320 -0
  14. knowledge/public/__init__.py +27 -0
  15. knowledge/public/cache.py +115 -0
  16. knowledge/public/helper.py +373 -0
  17. knowledge/public/relations.py +128 -0
  18. knowledge/public/wikidata.py +1324 -0
  19. knowledge/services/__init__.py +128 -0
  20. knowledge/services/asyncio/__init__.py +7 -0
  21. knowledge/services/asyncio/base.py +458 -0
  22. knowledge/services/asyncio/graph.py +1420 -0
  23. knowledge/services/asyncio/group.py +450 -0
  24. knowledge/services/asyncio/search.py +439 -0
  25. knowledge/services/asyncio/users.py +270 -0
  26. knowledge/services/base.py +533 -0
  27. knowledge/services/graph.py +1897 -0
  28. knowledge/services/group.py +819 -0
  29. knowledge/services/helper.py +142 -0
  30. knowledge/services/ontology.py +1234 -0
  31. knowledge/services/search.py +488 -0
  32. knowledge/services/session.py +444 -0
  33. knowledge/services/tenant.py +281 -0
  34. knowledge/services/users.py +445 -0
  35. knowledge/utils/__init__.py +10 -0
  36. knowledge/utils/graph.py +417 -0
  37. knowledge/utils/wikidata.py +197 -0
  38. knowledge/utils/wikipedia.py +175 -0
  39. personal_knowledge_library-3.0.0.dist-info/LICENSE +201 -0
  40. personal_knowledge_library-3.0.0.dist-info/METADATA +1163 -0
  41. personal_knowledge_library-3.0.0.dist-info/RECORD +42 -0
  42. personal_knowledge_library-3.0.0.dist-info/WHEEL +4 -0
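
The wheel ships a single top-level `knowledge` package whose layout mirrors the list above: `knowledge.public` wraps Wikidata and Wikipedia access, `knowledge.services` (with an asyncio twin) wraps the graph, ontology, search, group, tenant, session and user services, and `knowledge.nel` covers named-entity linking. As a quick orientation, a minimal import sketch; the PyPI project name is assumed from the wheel filename, and only modules visible in the file list are imported:

```python
# Hypothetical smoke test: the project name is assumed from the wheel filename,
# the module paths below are taken from the file list of this release.
#   pip install personal-knowledge-library==3.0.0
import knowledge                     # top-level package (exposes the shared logger)
import knowledge.public.wikidata     # Wikidata API access
import knowledge.public.relations    # relation extraction helpers
import knowledge.services.graph      # knowledge graph service
import knowledge.nel.engine          # named-entity linking engine

print("personal_knowledge_library 3.0.0 modules import cleanly")
```
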
knowledge/public/helper.py
@@ -0,0 +1,373 @@
+ # -*- coding: utf-8 -*-
+ # Copyright © 2023-present Wacom. All rights reserved.
+ import hashlib
+ import math
+ import urllib
+ from datetime import datetime
+ from enum import Enum
+ from typing import Any, Optional, Dict, List
+
+ import requests
+ from requests import Response
+ from requests.adapters import HTTPAdapter
+ from urllib3 import Retry
+
+ from knowledge import logger
+
+
+ # --------------------------------------- Structures -------------------------------------------------------------------
+ class Precision(Enum):
+     """
+     Precision enum for date.
+     """
+
+     BILLION_YEARS = 0
+     MILLION_YEARS = 3
+     HUNDREDS_THOUSAND_YEARS = 4
+     TEN_THOUSAND_YEARS = 5
+     MILLENIUM = 6
+     CENTURY = 7
+     DECADE = 8
+     YEAR = 9
+     MONTH = 10
+     DAY = 11
+
+
+ class WikiDataAPIException(Exception):
+     """
+     WikiDataAPIException
+     --------------------
+     Exception thrown when accessing WikiData fails.
+     """
+
+
+ # --------------------------------------- Tags -------------------------------------------------------------------------
+ CLASS_TAG: str = "class"
+ ALIASES_TAG: str = "aliases"
+ ID_TAG: str = "id"
+ QID_TAG: str = "qid"
+ PID_TAG: str = "pid"
+ LAST_REVID_TAG: str = "lastrevid"
+ MODIFIED_TAG: str = "modified"
+ WIKIDATA_LANGUAGE_TAG: str = "language"
+ LABEL_VALUE_TAG: str = "value"
+ LABEL_TAG: str = "label"
+ SUPERCLASSES_TAG: str = "superclasses"
+ CLAIMS_TAG: str = "claims"
+ ONTOLOGY_TYPES_TAG: str = "ontology_types"
+ REVISION_TAG: str = "revision"
+ SITELINKS_TAG: str = "sitelinks"
+ TITLES_TAG: str = "titles"
+ URLS_TAG: str = "urls"
+ SOURCE_TAG: str = "source"
+ API_LIMIT: int = 50
+ # --------------------------------------- API URLs ---------------------------------------------------------------------
+ THUMB_IMAGE_URL: str = "https://upload.wikimedia.org/wikipedia/commons/thumb/{}/{}/{}/200px-{}"
+ MULTIPLE_ENTITIES_API: str = "https://www.wikidata.org/w/api.php?action=wbgetentities&ids="
+ WIKIDATA_LDI_URL: str = "https://www.wikidata.org/wiki/Special:EntityData"
+ # --------------------------------------- Wikidata Properties ----------------------------------------------------------
+ STUDENT_OF: str = "P1066"
+ STUDENT: str = "P802"
+ INCEPTION: str = "P571"
+ MOVEMENT: str = "P135"
+ SUBCLASS_OF: str = "P279"
+ TITLE: str = "P1476"
+ COLLECTION: str = "P195"
+ GENRE: str = "P136"
+ CREATOR: str = "P170"
+ LOGO_IMAGE: str = "P154"
+ FLAG_IMAGE: str = "P41"
+ GREGORIAN_CALENDAR: str = "Q1985727"
+ START_TIME: str = "P580"
+ END_TIME: str = "P582"
+ FOLLOWS: str = "P155"
+ FOLLOWED_BY: str = "P156"
+ COUNTRY_OF_ORIGIN: str = "P495"
+ COUNTRY: str = "P17"
+ INSTANCE_OF: str = "P31"
+ IMAGE: str = "P18"
+ # URL - Wikidata
+ GREGORIAN_CALENDAR_URL: str = "http://www.wikidata.org/entity/Q1985786"
+ # URL - Wikidata service
+ WIKIDATA_SPARQL_URL: str = "https://query.wikidata.org/sparql"
+ WIKIDATA_SEARCH_URL: str = "https://www.wikidata.org/w/api.php"
+
+
+ # --------------------------------------- Helper functions -------------------------------------------------------------
+ def image_url(img: str, dpi: int = 500):
+     """
+     Helper to generate image URL for Wikipedia.
+
+     Parameters
+     ----------
+     img: str
+         Name of image
+     dpi: int
+         DPI of the generated URL
+     Returns
+     -------
+     wikimedia_url: str
+         URL of wikimedia
+     """
+     if not (50 <= dpi <= 1000):
+         raise ValueError(f"DPI should be within the range of [50-1000]. Value:={dpi}")
+     extension: str = ""
+     conversion: str = ""
+     fixed_img: str = img.replace(" ", "_")
+     if fixed_img.lower().endswith("svg"):
+         extension: str = ".png"
+     if fixed_img.lower().endswith("tif") or fixed_img.lower().endswith("tiff"):
+         extension: str = ".jpg"
+         conversion: str = "lossy-page1-"
+     hash_img: str = hashlib.md5(fixed_img.encode("utf-8")).hexdigest()
+     url_img_part: str = urllib.parse.quote_plus(fixed_img)
+     return (
+         f"https://upload.wikimedia.org/wikipedia/commons/thumb/"
+         f"{hash_img[0]}/{hash_img[:2]}/{url_img_part}/{dpi}px-{conversion + url_img_part + extension}"
+     )
+
+
+ def parse_date(date_string: str) -> Optional[datetime]:
+     """
+     Parse date string to datetime object.
+     Parameters
+     ----------
+     date_string: str
+         Date string
+
+     Returns
+     -------
+     parsed_date: datetime
+         Parsed date
+     """
+     try:
+         parsed_date = datetime.fromisoformat(date_string)
+         return parsed_date
+     except (TypeError, ValueError):
+         date_part, _ = date_string.split("T")
+         year, month, day = date_part.split("-")
+         if month == "00":
+             month = "01"
+         if day == "00":
+             day = "01"
+         iso_date = f"{year}-{month.zfill(2)}-{day.zfill(2)}"
+         try:
+             parsed_date = datetime.fromisoformat(iso_date)
+             return parsed_date
+         except (TypeError, ValueError):
+             return None
+
+
+ def wikidate(param: Dict[str, Any]) -> Dict[str, Any]:
+     """
+     Parse and extract wikidata structure.
+     Parameters
+     ----------
+     param: Dict[str, Any]
+         Entities wikidata
+
+     Returns
+     -------
+     result: Dict[str, Any]
+         Dict with pretty print of date
+     """
+     time: str = param["time"]
+     timezone: int = param["timezone"]
+     before: int = param["before"]
+     after: int = param["after"]
+     precision: int = param["precision"]
+     calendar_model: str = param["calendarmodel"]
+     iso_encoded: Optional[str] = None
+     after_christ: bool = True
+     pretty: str = ""
+     if calendar_model != "https://www.wikidata.org/wiki/Q1985727":
+         if time.startswith("+"):
+             time = time[1:]
+         elif time.startswith("-"):
+             time = time[1:]
+             after_christ = False
+         date_obj: Optional[datetime] = parse_date(date_string=time)
+         if date_obj:
+             if date_obj.day == 0:
+                 # Set the day component to 1
+                 date_obj = date_obj.replace(day=1)
+             iso_encoded = date_obj.isoformat()
+             pretty = date_obj.strftime("%Y-%m-%d")
+         return {
+             "time": time,
+             "timezone": timezone,
+             "before": before,
+             "after": after,
+             "precision": precision,
+             "calendar-model": calendar_model,
+             "pretty": pretty,
+             "after-christ": after_christ,
+             "iso": iso_encoded,
+         }
+     if time.startswith("+"):
+         time = time[1:]
+     elif time.startswith("-"):
+         time = time[1:]
+         after_christ = False
+     # Probably not necessary
+     date_str = time.strip()
+     # Remove + sign
+     if date_str[0] == "+":
+         date_str = date_str[1:]
+     # Remove missing month/day
+     date_str = date_str.split("-00", maxsplit=1)[0]
+     # Parse date
+     try:
+         if Precision.BILLION_YEARS.value == precision:
+             pretty = date_str
+         elif Precision.MILLION_YEARS.value == precision:
+             pretty = date_str
+         elif Precision.HUNDREDS_THOUSAND_YEARS.value == precision:
+             pretty = date_str
+         elif Precision.MILLENIUM.value == precision:
+             pretty = date_str
+         elif Precision.TEN_THOUSAND_YEARS.value == precision:
+             pretty = date_str
+         else:
+             dt_obj: Optional[datetime] = parse_date(date_str)
+             if dt_obj:
+                 if Precision.CENTURY.value == precision:
+                     century: int = int(math.ceil(dt_obj.year / 100))
+                     pretty = f"{century}th century"
+                 elif Precision.DECADE.value == precision:
+                     pretty = f"{dt_obj.year}s{'' if after_christ else ' BC'}"
+                 elif Precision.YEAR.value == precision:
+                     pretty = f"{dt_obj.year}{'' if after_christ else ' BC'}"
+                 elif Precision.MONTH.value == precision:
+                     pretty = dt_obj.strftime("%B %Y")
+                 elif Precision.DAY.value == precision:
+                     pretty = dt_obj.strftime("%-d %B %Y")
+                 iso_encoded = dt_obj.isoformat()
+             else:
+                 iso_encoded = None
+     except Exception as pe:
+         logger.error(param)
+         logger.exception(pe)
+
+     return {
+         "time": time,
+         "timezone": timezone,
+         "before": before,
+         "after": after,
+         "precision": precision,
+         "calendar-model": calendar_model,
+         "pretty": pretty,
+         "after_christ": after_christ,
+         "iso": iso_encoded,
+     }
+
+
+ def __waiting_request__(entity_id: str, base_url: str = WIKIDATA_LDI_URL) -> Dict[str, Any]:
+     """
+     Send a request with retry policy.
+
+     Parameters
+     ----------
+     entity_id: str
+         Entity QID
+     base_url: str
+         The base URL
+     Returns
+     -------
+     result_dict: Dict[str, Any]
+         Result dict
+     """
+     url: str = f"{base_url}/{entity_id}.json"
+
+     # Define the retry policy
+     retry_policy: Retry = Retry(
+         total=3,  # maximum number of retries
+         backoff_factor=1,  # factor by which to multiply the delay between retries
+         status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
+         respect_retry_after_header=True,  # respect the Retry-After header
+     )
+
+     # Create a session and mount the retry adapter
+     with requests.Session() as session:
+         retry_adapter = HTTPAdapter(max_retries=retry_policy)
+         session.mount("https://", retry_adapter)
+
+         # Make a request using the session
+         response: Response = session.get(url)
+
+         # Check the response status code
+         if not response.ok:
+             raise WikiDataAPIException(f"Request failed with status code : {response.status_code}. URL:= {url}")
+         entity_dict_full: Dict[str, Any] = response.json()
+         # remove redundant top level keys
+         returned_entity_id: str = next(iter(entity_dict_full["entities"]))
+         entity_dict = entity_dict_full["entities"][returned_entity_id]
+
+         if entity_id != returned_entity_id:
+             logger.warning(
+                 f"Wikidata redirect detected. Input entity id={entity_id}. Returned entity id={returned_entity_id}."
+             )
+
+         return entity_dict
+
+
+ def __waiting_multi_request__(entity_ids: List[str], base_url: str = MULTIPLE_ENTITIES_API) -> List[Dict[str, Any]]:
+     """
+     Send a request to retrieve multiple entities with retry policy.
+
+     Parameters
+     ----------
+     entity_ids: List[str]
+         Entity QIDs
+     base_url: str
+         The base URL
+     Returns
+     -------
+     results: List[Dict[str, Any]]
+         List of result dicts
+     Raises
+     ------
+     ValueError - Empty list or too many entities
+     """
+     checked_entity_ids: List[str] = [e for e in entity_ids if e.startswith("Q")]
+     if not (0 < len(checked_entity_ids) <= API_LIMIT):
+         raise ValueError(
+             f"Number of entities must be within [1, {API_LIMIT}]. " f"Number of QIDs: {len(checked_entity_ids)}"
+         )
+     query: str = "|".join(checked_entity_ids)
+     url: str = f"{base_url}{query}&format=json"
+
+     # Define the retry policy
+     retry_policy: Retry = Retry(
+         total=3,  # maximum number of retries
+         backoff_factor=1,  # factor by which to multiply the delay between retries
+         status_forcelist=[429, 500, 502, 503, 504],  # HTTP status codes to retry on
+         respect_retry_after_header=True,  # respect the Retry-After header
+     )
+
+     # Create a session and mount the retry adapter
+     with requests.Session() as session:
+         retry_adapter = HTTPAdapter(max_retries=retry_policy)
+         session.mount("https://", retry_adapter)
+
+         # Make a request using the session
+         response: Response = session.get(url)
+
+         # Check the response status code
+         if not response.ok:
+             raise WikiDataAPIException(f"Request failed with status code : {response.status_code}. URL:= {url}")
+         entity_dict_full: Dict[str, Any] = response.json()
+         results: List[Dict[str, Any]] = []
+         # If no entities found
+         if "entities" not in entity_dict_full:
+             return results
+         for qid, e in entity_dict_full["entities"].items():
+             if qid not in entity_ids:
+                 logger.warning(
+                     f"Wikidata redirect detected. " f"Returned entity id={qid} is not in the list of entity ids."
+                 )
+             if "missing" in e:
+                 logger.warning(f"Missing entity detected. Returned entity id={qid} was not found in Wikidata.")
+                 continue
+             results.append(e)
+         return results
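
To make the helpers in this file concrete, here is a minimal usage sketch; the file name and the claim value are made-up illustrations of the inputs the functions expect:

```python
from knowledge.public.helper import image_url, parse_date, wikidate

# Build a Wikimedia Commons thumbnail URL (spaces become underscores; SVG/TIFF
# names additionally get a raster extension appended). The file name is illustrative.
print(image_url("Mona Lisa.jpg", dpi=200))

# Wikidata dates with zeroed-out month/day parse to January 1st of that year.
print(parse_date("1503-00-00T00:00:00Z"))  # -> datetime(1503, 1, 1, 0, 0)

# Hypothetical Wikidata time value (precision 9 = YEAR, proleptic Gregorian calendar).
claim_value = {
    "time": "+1503-00-00T00:00:00Z",
    "timezone": 0,
    "before": 0,
    "after": 0,
    "precision": 9,
    "calendarmodel": "http://www.wikidata.org/entity/Q1985727",
}
result = wikidate(claim_value)
print(result["pretty"], result["iso"])  # pretty-printed date plus ISO-encoded form
```

Note that the two return paths of `wikidate` spell the AD/BC flag differently (`after-christ` vs `after_christ`), so downstream code should check which branch produced the dict.
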
knowledge/public/relations.py
@@ -0,0 +1,128 @@
+ # -*- coding: utf-8 -*-
+ # Copyright © 2023-present Wacom. All rights reserved.
+ import functools
+ import multiprocessing
+ from typing import Any, Dict, Set, Tuple, List, Callable, Optional
+
+ from tqdm import tqdm
+
+ from knowledge.public.helper import CLAIMS_TAG, PID_TAG, LABEL_TAG, QID_TAG
+ from knowledge.public.wikidata import LITERALS_TAG, WikidataThing, WikiDataAPIClient
+
+
+ def __relations__(thing: Dict[str, Any], wikidata: Set[str]) -> Tuple[str, List[Dict[str, Any]]]:
+     """
+     Extracts relations from Wikidata.
+     Parameters
+     ----------
+     thing: Dict[str, Any]
+         Wikidata thing
+     wikidata: Set[str]
+         Set of unique QIDs
+
+     Returns
+     -------
+     qid: str
+         QID of the Wikidata thing
+     relations: List[Dict[str, Any]]
+         Relations of the Wikidata thing
+     """
+     relations: List[Dict[str, Any]] = []
+     for _, p_value in thing[CLAIMS_TAG].items():
+         for v in p_value[LITERALS_TAG]:
+             if isinstance(v, dict) and v.get("type") in {"wikibase-entityid", "wikibase-item"}:
+                 ref_qid = v["value"]["id"]
+                 prop = p_value[PID_TAG][LABEL_TAG]
+                 if ref_qid in wikidata:
+                     relations.append(
+                         {
+                             "subject": {
+                                 "qid": thing[QID_TAG],
+                             },
+                             "predicate": {"pid": p_value[PID_TAG][PID_TAG], "label": prop},
+                             "target": {"qid": ref_qid},
+                         }
+                     )
+     return thing[QID_TAG], relations
+
+
+ def wikidata_extractor_entities(qids: Set[str]) -> Dict[str, WikidataThing]:
+     """
+     Extracts entities from Wikidata.
+
+     Parameters
+     ----------
+     qids: Set[str]
+         Set of unique QIDs
+
+     Returns
+     -------
+     wikidata_extractor: Dict[str, WikidataThing]
+         Wikidata map
+     """
+     return {e.qid: e for e in WikiDataAPIClient.retrieve_entities(qids)}
+
+
+ def wikidata_relations_extractor(
+     wikidata: Dict[str, WikidataThing],
+     progress_relations: Optional[Callable[[int, int], None]] = None,
+ ) -> Dict[str, List[Dict[str, Any]]]:
+     """Extracts relations from Wikidata.
+
+     Parameters
+     ----------
+     wikidata: Dict[str, WikidataThing]
+         Wikidata map
+     progress_relations: Optional[Callable[[int, int], None]] = None
+         Progress callback function.
+
+     Returns
+     -------
+     relations: Dict[str, List[Dict[str, Any]]]
+         Relations map.
+     """
+     relations: Dict[str, List[Dict[str, Any]]] = {}
+     qids: Set[str] = set(wikidata.keys())
+     num_processes: int = min(len(wikidata), multiprocessing.cpu_count())
+     ctr: int = 0
+     tasks: int = len(qids)
+     with multiprocessing.Pool(processes=num_processes) as pool:
+         # WikidataThing is not supported in multiprocessing, so plain dicts are passed to the workers
+         for qid, rels in pool.map(
+             functools.partial(__relations__, wikidata=qids), [e.__dict__() for e in wikidata.values()]
+         ):
+             relations[qid] = rels
+             ctr += 1
+             if progress_relations:
+                 progress_relations(ctr, tasks)
+     return relations
+
+
+ def wikidata_relations_extractor_qids(
+     wikidata: Dict[str, WikidataThing], qids: Set[str]
+ ) -> Dict[str, List[Dict[str, Any]]]:
+     """Extracts relations from Wikidata.
+
+     Parameters
+     ----------
+     wikidata: Dict[str, WikidataThing]
+         Wikidata map
+     qids: Set[str]
+         Set of unique QIDs
+
+     Returns
+     -------
+     relations: Dict[str, List[Dict[str, Any]]]
+         Relations map.
+     """
+     relations: Dict[str, List[Dict[str, Any]]] = {}
+     num_processes: int = min(len(wikidata), multiprocessing.cpu_count())
+     with multiprocessing.Pool(processes=num_processes) as pool:
+         # WikidataThing is not supported in multiprocessing, so plain dicts are passed to the workers
+         with tqdm(total=round(len(wikidata) / num_processes), desc="Check Wikidata relations.") as pbar:
+             for qid, rels in pool.map(
+                 functools.partial(__relations__, wikidata=qids), [e.__dict__() for e in wikidata.values()]
+             ):
+                 relations[qid] = rels
+                 pbar.update(1)
+     return relations
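
Taken together, knowledge/public/helper.py wraps the raw Wikidata endpoints, while knowledge/public/relations.py fans the fetched entities out to a multiprocessing pool and keeps only relations whose target QID is also in the fetched set. A minimal end-to-end sketch: the QIDs are arbitrary examples, entity retrieval goes through `WikiDataAPIClient` from knowledge/public/wikidata.py (listed above but not shown in this hunk), and the `__main__` guard is needed because a `multiprocessing.Pool` is spawned:

```python
from knowledge.public.relations import (
    wikidata_extractor_entities,
    wikidata_relations_extractor,
)


def print_progress(done: int, total: int) -> None:
    # Matches the Callable[[int, int], None] signature expected by progress_relations.
    print(f"relations checked: {done}/{total}")


if __name__ == "__main__":
    # Arbitrary example QIDs: Leonardo da Vinci, Mona Lisa, Louvre.
    qids = {"Q762", "Q12418", "Q19675"}
    # Fetch the entities from Wikidata and key them by QID.
    entities = wikidata_extractor_entities(qids)
    # Extract the relations that point from one fetched entity to another.
    relations = wikidata_relations_extractor(entities, progress_relations=print_progress)
    for qid, rels in relations.items():
        for rel in rels:
            print(qid, rel["predicate"]["label"], "->", rel["target"]["qid"])
```
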