tablassert-7.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
tablassert/lib.py ADDED
@@ -0,0 +1,602 @@
+ from tablassert.enums import EncodingMethods
+ from rich.progress import TaskProgressColumn
+ from rich.progress import TimeElapsedColumn
+ from tablassert.utils import namespace_uuid
+ from tablassert.models import NodeEncoding
+ from tablassert.downloader import from_url
+ from tablassert.ingests import to_sections
+ from tablassert.ingests import from_yaml
+ from tablassert.qc import fullmap_audit
+ from tablassert.fullmap import version4
+ from rich.progress import SpinnerColumn
+ from tablassert.models import Encoding
+ from tablassert.models import Section
+ from rich.progress import TextColumn
+ from tablassert.utils import mkhash
+ from tablassert.enums import Tokens
+ from pydantic import NonNegativeInt
+ from tablassert.models import Graph
+ from rich.progress import BarColumn
+ from rich.progress import Progress
+ from tablassert.enums import Files
+ from tablassert.utils import STORE
+ from tablassert.log import logger
+ from sqlite_utils import Database
+ from pydantic import PositiveInt
+ from multiprocessing import Pool
+ from functools import reduce
+ from os.path import basename
+ from itertools import chain
+ from typing import Callable
+ from typing import Optional
+ from typing import Literal
+ from pydantic import Field
+ from pathlib import Path
+ from typing import Union
+ from operator import add
+ from typing import Self
+ from operator import eq
+ from operator import le
+ from typing import Any
+ import polars as pl
+ import operator
+ import xxhash
+ import duckdb
+ import orjson
+ import typer
+ import math
+
+ # ? Newline To Make Progress Bar More Readable
+ print("\n")
+
+
+ def value(lf: pl.LazyFrame, col: str, x: str) -> pl.LazyFrame:
+     # ? Creates A New Column With A Literal Value
+     return lf.with_columns(pl.lit(x).alias(col))
+
+
+ def contributor_values(lf: pl.LazyFrame, col: str, contributors: list[dict[str, Any]]) -> pl.LazyFrame:
+     # ? Adds Nested Contributors Fields To Column
+     return lf.with_columns(pl.lit([x.model_dump() for x in contributors]).alias(col))  # pyright: ignore
+
+
+ def column(lf: pl.LazyFrame, col: str, x: str) -> pl.LazyFrame:
+     # ? Creates A New Column From An Old Column
+     return lf.with_columns(pl.col(x).alias(col))
+
+
+ def math_op(
+     lf: pl.LazyFrame, col: str, func: str, args: list[Union[Literal[Tokens.VALUES], float, int]]
+ ) -> pl.LazyFrame:
+     # ? Transform Values In A Column With The Math Module
+     # ! Collection Point: Required For map_elements
+     df: pl.DataFrame = lf.collect()
+     expr: pl.Expr = pl.col(col).cast(pl.Float64)
+     attr: Callable[[Any], Any] = getattr(math, func)
+     df = df.with_columns(
+         expr.map_elements(lambda x: attr(*(x if eq(a, Tokens.VALUES) else a for a in args)), return_dtype=pl.Float64).alias(
+             col
+         )
+     )
+     return df.lazy()
+
+
+ def zero(lf: pl.LazyFrame, col: str) -> pl.LazyFrame:
+     # ? Level Zero Text Processing
+     expr: pl.Expr = pl.col(col).cast(pl.String).str.strip_chars().str.to_lowercase()
+     return lf.with_columns(expr.alias(col))
+
+
+ def one(
+     lf: pl.LazyFrame,
+     col: str,  # pyright: ignore
+     regex: str = r"\W+",
+     tag: str = " one",
+ ) -> pl.LazyFrame:
+     # ? Level One Text Processing
+     expr: pl.Expr = pl.col(col).str.replace_all(regex, "")
+     col: str = add(col, tag)
+     return lf.with_columns(expr.alias(col))
+
+
+ def prefix(lf: pl.LazyFrame, col: str, prefix: str) -> pl.LazyFrame:
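+     # ? Prepends A Literal Prefix To Each Value In A Column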
+     expr: pl.Expr = add(pl.lit(prefix), pl.col(col).cast(pl.String))
+     return lf.with_columns(expr.alias(col))
+
+
+ def suffix(lf: pl.LazyFrame, col: str, suffix: str) -> pl.LazyFrame:
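+     # ? Appends A Literal Suffix To Each Value In A Column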
+     expr: pl.Expr = add(pl.col(col).cast(pl.String), pl.lit(suffix))
+     return lf.with_columns(expr.alias(col))
+
+
+ def regex(lf: pl.LazyFrame, col: str, pattern: str, replacement: str = "") -> pl.LazyFrame:
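+     # ? Replaces Regex Matches In A Column, Deleting Them By Default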
+     expr: pl.Expr = pl.col(col).cast(pl.String).str.replace_all(pattern, replacement)
+     return lf.with_columns(expr.alias(col))
+
+
+ def fill(lf: pl.LazyFrame, col: str, method: str) -> pl.LazyFrame:
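+     # ? Fills Nulls In A Column With A Polars Fill Strategy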
+     expr: pl.Expr = pl.col(col).fill_null(strategy=method)  # pyright: ignore
+     return lf.with_columns(expr.alias(col))
+
+
+ def explode(lf: pl.LazyFrame, col: str, delimiter: str) -> pl.LazyFrame:
+     # ? Explodes A Row With Items Into Many Unique Rows By A Delimiter
+     expr: pl.Expr = pl.col(col).cast(pl.String).str.split(delimiter)
+     lf = lf.with_columns(expr.alias(col))
+     return lf.explode(col)
+
+
+ def sig(
+     lf: pl.LazyFrame,
+     cutoff: float = 0.05,  # pyright: ignore
+     col: str = "p value",
+     out: str = "significant",
+ ) -> pl.LazyFrame:
+     # ? Creates The "significant" Column
+     if col in lf.collect_schema().names():
+         expr: pl.Expr = pl.col(col).cast(pl.Float64, strict=False)
+         cond: pl.Expr = le(expr, cutoff)
+         cutoff: pl.Expr = (
+             pl.when(expr.is_null()).then(pl.lit("UNSURE")).when(cond).then(pl.lit("YES")).otherwise(pl.lit("NO"))
+         )
+         return lf.with_columns(cutoff.alias(out))
+
+     else:
+         return lf.with_columns(pl.lit("UNSURE").alias(out))
+
+
+ def idx(lf: pl.LazyFrame, col: str = "row number") -> pl.LazyFrame:
+     # ? Creates An Index Column Of Row Numbers
+     return lf.with_row_index(col)
+
+
+ def csv(p: Path, sep: str) -> pl.LazyFrame:
+     # ? Reads Source From CSV And TSV As LazyFrame
+     return pl.scan_csv(source=p, separator=sep, has_header=False, infer_schema_length=None, truncate_ragged_lines=True)
+
+
+ def excel(p: Path, sheet: str, engine: str = "calamine") -> pl.LazyFrame:
+     # ? Reads Source From Excel As LazyFrame
+     df: pl.DataFrame = pl.read_excel(
+         source=p,
+         sheet_name=sheet,
+         engine=engine,  # pyright: ignore
+         has_header=False,
+         infer_schema_length=None,
+     )
+     return df.lazy()
+
+
+ def crop(lf: pl.LazyFrame, row_slice: list[Union[NonNegativeInt, Literal[Tokens.AUTO]]]) -> pl.LazyFrame:
+     # ? Takes A Slice From A LazyFrame
+     # ! Collection Point: Requires Height Calculation
+     df: pl.DataFrame = lf.collect()
+     n: int = df.select(pl.len()).item()
+     start: Union[int, Literal[Tokens.AUTO]] = row_slice[0]
+     stop: Union[int, Literal[Tokens.AUTO]] = row_slice[1]
+     offset: int = 0 if eq(start, Tokens.AUTO) else start  # pyright: ignore
+     length: int = n if eq(stop, Tokens.AUTO) else (stop - offset)  # pyright: ignore
+     df = df.slice(offset=offset, length=length)
+     return df.lazy()
+
+
+ def pick(lf: pl.LazyFrame, rows: list[int]) -> pl.LazyFrame:
+     # ? Picks A List Of Rows From A LazyFrame
+     # ! Collection Point: take() Requires Eager, Relazy After
+     df: pl.DataFrame = lf.collect()
+     df = df.select(pl.all().take(indices=rows))  # pyright: ignore
+     return df.lazy()
+
+
+ def reindex(df: pl.LazyFrame, col: str, op: Callable, comp: Union[str, int, float], cast: bool = True) -> pl.LazyFrame:
+     # ? Reindex A LazyFrame Based On A Condition
+     expr: pl.Expr = pl.col(col).cast(pl.Float64) if cast else pl.col(col)
+     return df.filter(op(expr, comp))
+
+
+ def idxname(col: Any) -> str:
+     # ? Converts Excel Style Column Names To Polars Column Names
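+     # * Base-26 Letters, For Example: "A" -> column_1, "AB" -> column_28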
+     scol: str = str(col)
+     idx: int = 0
+     for char in scol:
+         idx = idx * 26 + (ord(char) - 65 + 1)
+
+     return f"column_{idx}"
+
+
+ def trim(lf: pl.LazyFrame, regex: str = r"^column_\d+$") -> pl.LazyFrame:
+     # ? Removes Columns With The Excel Naming Conventions From LazyFrame
+     return lf.select(pl.exclude(regex))
+
+
+ def to_store(lf: pl.LazyFrame, p: Path, config_name: str) -> Path:
+     # ? Collects And Writes Section Parquet; Warns If Result Is Empty
+     df: pl.DataFrame = lf.collect()
+
+     if df.height == 0:
+         logger.warning(f"EMPTY SUBGRAPH | STORE: {p.stem} | CONFIG: {config_name}")
+     df.write_parquet(p)
+
+     return p
+
+
+ def with_mesh(lf: pl.LazyFrame, pubmed_db: Path, curie: str) -> pl.LazyFrame:
+     # ? Adds PubMedDB Related MeSH Annotations To LazyFrame
+     # ! Collection Point: SQLite Query Then Per-Row Literal Assignment
+     df: pl.DataFrame = lf.collect()
+     db: object = Database(pubmed_db)
+     query: str = """
+         SELECT
+             mesh.mesh_major,
+             mesh.mesh,
+             info.firstauthor,
+             info.journal,
+             info.title,
+             info.year
+         FROM ids
+         INNER JOIN mesh ON ids.pmid = mesh.pmid
+         INNER JOIN info ON ids.pmid = info.pmid
+         WHERE ids.alt = :curie OR ids.pmid = :curie
+         LIMIT 1
+     """
+     rows: list[dict[str, str]] = list(db.query(query, {"curie": curie})) or []
+     all_ids: list[str] = [add("MESH:", x["mesh"]) for x in rows if x]
+     is_major: list[bool] = [eq(x["mesh_major"], "Y") for x in rows]
+     domain: list[str] = [x for x, y in zip(all_ids, is_major) if y]
+     mesh: list[str] = [x for x in all_ids if x not in domain]
+
+     row: dict[str, str] = rows[0] if rows else {}
+     first_author: Optional[str] = row.get("firstauthor")
+     journal: Optional[str] = row.get("journal")
+     title: Optional[str] = row.get("title")
+     year: Optional[str] = row.get("year")
+
+     if domain:
+         df = df.with_columns(pl.lit(domain).alias("domain"))
+     if mesh:
+         df = df.with_columns(pl.lit(mesh).alias("mesh"))
+     if first_author:
+         df = df.with_columns(pl.lit(first_author).alias("first author"))
+     if journal:
+         df = df.with_columns(pl.lit(journal).alias("journal"))
+     if title:
+         df = df.with_columns(pl.lit(title).alias("title"))
+     if year:
+         df = df.with_columns(pl.lit(year).alias("year published"))
+
+     return df.lazy()
+
+
+ def with_captions(lf: pl.LazyFrame, pmc_db: Path, curie: str, url: str) -> pl.LazyFrame:
+     # ? Adds PMC Caption Annotations To LazyFrame With Filename Heuristic
+     # ! Collection Point: SQLite Query Then Literal Assignment
+     df: pl.DataFrame = lf.collect()
+     db: object = Database(pmc_db)
+     filename: str = basename(url)
+     query: str = """
+         SELECT caption
+         FROM captions
+         WHERE pmc = :curie AND file = :filename
+         LIMIT 1
+     """
+     rows: list[dict[str, str]] = list(db.query(query, {"curie": curie, "filename": filename})) or []
+     row: dict[str, str] = rows[0] if rows else {}
+
+     caption: Optional[str] = row.get("caption")
+     if caption:
+         df = df.with_columns(pl.lit(caption).alias("file caption"))
+
+     return df.lazy()
+
+
+ class Tcode(Section):
+     # ? Extends Section To Compile A KG
+     number: PositiveInt = Field(...)
+     config: Path = Field(...)
+     store: Path = Field(...)
+
+     def encoding(self: Self, x: Encoding, col: str) -> list[Any]:
+         # ? Collect Helper For Encoding Classes
+         return [
+             (value, (col, x.encoding)) if eq(x.method, EncodingMethods.VALUE) else None,
+             (column, (col, idxname(x.encoding))) if eq(x.method, EncodingMethods.COLUMN) else None,
+             (fill, (col, x.fill)) if x.fill else None,
+             (explode, (col, x.explode_by)) if x.explode_by else None,
+             [(regex, (col, r.pattern, r.replacement)) for r in x.regex] if x.regex else None,
+             [(regex, (col, r)) for r in x.remove] if x.remove else None,
+             (prefix, (col, x.prefix)) if x.prefix else None,
+             (suffix, (col, x.suffix)) if x.suffix else None,
+             [(math_op, (col, t.function, t.arguments)) for t in x.transformations] if x.transformations else None,
+         ]
+
+     def node(self: Self, x: NodeEncoding, col: str, conn: object) -> list[Any]:
+         # ? Collect Helper For NodeEncoding Classes
+         encoding: list[Any] = self.encoding(x, col)
+         node: list[Any] = [
+             (column, (add("original ", col), col)),
+             (zero, (col,)),
+             (one, (col,)),
+             (version4, (col, conn, x.taxon, x.prioritize, x.avoid, self.store.stem, self.config.name)),
+             (fullmap_audit, (col, self.store.stem, self.config.name)),
+         ]
+         return add(encoding, node)
+
+     def clean(self: Self, tcode: list[tuple[Callable, Any]]) -> list[tuple[Callable, tuple[Any]]]:
+         # ? Cleans Tcode So It Can Be Used With reduce From functools
+         result: list[tuple[Callable, tuple[Any]]] = []
+         for x in tcode:
+             if not x:
+                 continue
+             elif isinstance(x, list):
+                 result.extend(self.clean(x))
+             else:
+                 result.append(x)
+         return result
+
+     def collect(
+         self: Self, conn: object, pubmed_db: Optional[Path], pmc_db: Optional[Path]
+     ) -> Union[list[tuple[Callable, tuple[Any]]], Path]:
+         # ? Code That Tells Tablassert What Actions To Take While Transforming Data
+
+         if self.store.is_file():
+             # * Quick Exit If Subgraph Already Exists
+             return self.store
+
+         else:
+             # * Returns A List Of: (Function, (Arguments))
+             tcode: Optional[list[Any]] = [
+                 (from_url, (str(self.source.url), self.source.local)),
+                 (csv, (self.source.delimiter,)) if eq(self.source.kind, Files.TEXT) else None,  # pyright: ignore
+                 (excel, (self.source.sheet,)) if eq(self.source.kind, Files.EXCEL) else None,  # pyright: ignore
+                 (idx, ()),
+                 (crop, (self.source.row_slice,)) if self.source.row_slice else None,
+                 (pick, (self.source.rows,)) if self.source.rows else None,
+                 [
+                     (reindex, (idxname(x.column), getattr(operator, x.comparison), x.comparator))
+                     if x.comparison not in ["ne", "eq"]
+                     else (reindex, (idxname(x.column), getattr(operator, x.comparison), x.comparator, False))
+                     for x in self.source.reindex
+                 ]
+                 if self.source.reindex
+                 else None,
+                 [op for x in self.annotations for op in self.encoding(x, x.annotation)] if self.annotations else None,
+                 self.node(self.statement.subject, "subject", conn),
+                 self.node(self.statement.object, "object", conn),
+                 (value, ("predicate", self.statement.predicate)),
+                 [op for x in self.statement.qualifiers for op in self.node(x, x.qualifier, conn)]
+                 if self.statement.qualifiers
+                 else None,
+                 (value, ("syntax", self.syntax)),
+                 (value, ("configuration file", self.config.name)),
+                 (value, ("section number", self.number)),
+                 (value, ("status", self.status)),
+                 (value, ("repository", self.provenance.repo)),
+                 (value, ("publication", (self.provenance.repo + ":" + self.provenance.publication))),
+                 (contributor_values, ("contributors", self.provenance.contributors)),
+                 (value, ("url", str(self.source.url))),
+                 (value, ("section hash", self.store.stem)),
+                 (with_mesh, (pubmed_db, self.provenance.publication)) if pubmed_db else None,
+                 (with_captions, (pmc_db, self.provenance.publication, str(self.source.url))) if pmc_db else None,
+                 (sig, ()),
+                 (trim, ()),
+                 (to_store, (self.store, self.config.name)),
+             ]
+             return self.clean(tcode)
+
+
+ def compile_subgraph(tcode: list[tuple[Callable, tuple[Any]]]) -> Path:
+     # ? Executes Tcode To Build Subgraphs As Parquets
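+     # * The Accumulator Starts As None, So The First Op Runs Bare And Its Result Seeds The Chain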
+     return reduce(lambda acc, op: op[0](acc, *op[1]) if acc is not None else op[0](*op[1]), tcode, None)  # pyright: ignore
+
+
+ def normalize(
+     edges: pl.LazyFrame, col: str, names: list[str] = ["id", "name", "category", "taxon", "source", "source version"]
+ ) -> tuple[pl.LazyFrame, pl.LazyFrame]:
+     # ? Normalizes Disparate Node Columns To A Unified Format And Removes Them From Edges
+     # * Returns Partial Nodes And Modified Edges As LazyFrames
+     cols: list[str] = [
+         col,
+         add(col, " name"),
+         add(col, " category"),
+         add(col, " taxon"),
+         add(col, " source"),
+         add(col, " source version"),
+     ]
+     nodes: pl.LazyFrame = edges.select(cols).unique().rename({k: v for k, v in zip(cols, names)})
+     edges_out: pl.LazyFrame = edges.drop(cols[1:])
+     return nodes, edges_out
+
+
+ def publications(
+     edges: pl.LazyFrame, names: list[str] = ["id", "name", "first author", "journal", "year published"]
+ ) -> tuple[pl.LazyFrame, pl.LazyFrame]:
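+     # ? Splits Publication Metadata Into biolink:Publication Nodes And Drops It From Edges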
+     cols: list[str] = ["publication", "title", "first author", "journal", "year published"]
+     cols = [x for x in cols if x in edges.collect_schema().names()]
+     nodes: pl.LazyFrame = edges.select(cols).unique().rename({k: v for k, v in zip(cols, names)})
+     nodes = nodes.with_columns(pl.lit("biolink:Publication").alias("category"))
+     edges_out: pl.LazyFrame = edges.drop(cols[1:])
+     return nodes, edges_out
+
+
+ def label_edge(r: object, domain: str = "TABLASSERT", out: str = "uuid") -> object:
+     # ? Gives Edges A Unique UUID In The Tablassert Namespace
+     r[out] = namespace_uuid(domain, *r.values())  # pyright: ignore
+     return r
+
+
+ def strip_nulls(r: object, bad: set[str] = {"na", "nan", "null", "none", ""}) -> dict:
+     # ? Recursively Removes Null-Like Values From An NDJSON Record
+     return {
+         k: [strip_nulls(i) if isinstance(i, dict) else i for i in v]
+         if isinstance(v, list)
+         else strip_nulls(v)
+         if isinstance(v, dict)
+         else v
+         for k, v in r.items()  # pyright: ignore
+         if v and str(v).strip().lower() not in bad
+     }
+
+
+ def dedup_stream(p_in: Path, is_edges: bool) -> None:
+     # ? Removes Null Values From And Deduplicates NDJSON
+     # * Also Adds UUIDs To Edges
+     p_out: Path = p_in.with_suffix("")
+
+     if p_out.is_file():
+         p_out.unlink()
+
+     seen: set[bytes] = set()
+     with p_in.open("rb") as f_in, p_out.open("wb") as f_out:
+         for line in f_in:
+             r: object = orjson.loads(line)  # pyright: ignore
+             r = strip_nulls(r)
+
+             if r:
+                 b: bytes = orjson.dumps(r)
+                 h: bytes = xxhash.xxh64(b).digest()
+                 if h not in seen:
+                     seen |= {h}
+
+                     if is_edges:
+                         r = label_edge(r)
+                         b = orjson.dumps(r)
+
+                     b = b + ("\n").encode("utf-8")
+                     f_out.write(b)
+
+     p_in.unlink()
+
+
+ def compile_graph(subgraphs: list[Path], name: str, version: str, fmt: str = "mixed", precision: int = 4) -> None:
+     # ? Aggregates Parquets For NDJSON KGX Export Using Lazy Scan
+     p: Path = Path(f"./{name}_{version}")
+
+     e: Path = p.with_suffix(".edges.ndjson.temp")  # ! For Labeling
+     if e.exists():
+         e.unlink()
+
+     n: Path = p.with_suffix(".nodes.ndjson.temp")
+     if n.exists():
+         n.unlink()
+
+     subnodes: list[pl.LazyFrame] = []
+     subedges: list[pl.LazyFrame] = []
+     for s in subgraphs:
+         lf: pl.LazyFrame = pl.scan_parquet(s)
+
+         node_cols: list[str] = [col.replace("original ", "") for col in lf.collect_schema().names() if "original " in col]
+         for col in node_cols:
+             partial, lf = normalize(lf, col)
+             subnodes.append(partial)
+
+         partial, lf = publications(lf)
+         subnodes.append(partial)
+         subedges.append(lf)
+
+     # ! Collection Point: Appending To Output Files
+     with n.open("a") as f:
+         for subnode in subnodes:
+             eagernode: pl.DataFrame = subnode.collect().unique()
+             eagernode.write_ndjson(f)
+
+     with e.open("a") as f:
+         with pl.Config(set_fmt_float=fmt):  # pyright: ignore
+             with pl.Config(float_precision=precision):
+                 for subedge in subedges:
+                     eageredge: pl.DataFrame = subedge.collect().unique()
+                     eageredge.write_ndjson(f)
+
+     dedup_stream(e, is_edges=True)
+     dedup_stream(n, is_edges=False)
+
+
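+ # ? Shared Typer App And Rich Progress Display For The CLI Commands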
+ CLI: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False)
+ PROGRESS: Progress = Progress(
+     SpinnerColumn(),
+     TextColumn("[progress.description]{task.description}"),
+     BarColumn(),
+     TaskProgressColumn(),
+     TimeElapsedColumn(),
+ )
+
+
+ def track(task_id: Any, iterable: Any) -> Any:
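+     # ? Yields Each Item, Advancing The Given Progress Task As It Goes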
+     for item in iterable:
+         yield item
+         PROGRESS.advance(task_id)
+
+
+ @CLI.command()
+ def build_knowledge_graph(
+     graph_configuration_file: Path = typer.Argument(..., help="Knowledge Graph Configuration -- See Docs"),
+ ) -> None:
+     """Build A KGX Compliant Knowledge Graph From A Graph Configuration File"""
+     # TODO: Make MeSH A Node (Micro Version)
+     # TODO: Add FullMap Column Context Flag
+     r: object = from_yaml(graph_configuration_file)
+     g: Graph = Graph.model_validate(r)
+
+     with PROGRESS:
+         # ? Load Tables
+         t1: Any = PROGRESS.add_task("Loading Tables...", total=None)
+         with Pool() as pool:
+             raw: list[object] = pool.map(from_yaml, g.tables)
+         PROGRESS.update(t1, total=1, completed=1)
+
+         # ? Extract Sections
+         t2: Any = PROGRESS.add_task("Extracting Sections...", total=None)
+         with Pool() as pool:
+             temp: list[list[dict[str, Any]]] = pool.starmap(to_sections, zip(raw, g.tables))  # pyright: ignore
+         PROGRESS.update(t2, total=1, completed=1)
+         sections: list[dict[str, Any]] = list(chain.from_iterable(temp))
+         n: int = len(sections)
+
+         # ? Build Tcodes
+         t3: Any = PROGRESS.add_task("Building TCode...", total=n)
+         tcode: list[Tcode] = [
+             Tcode.model_validate({**s, "number": idx, "store": (STORE / f"{mkhash(s)}.parquet")})
+             for idx, s in track(t3, enumerate(sections, start=1))
+         ]
+         with duckdb.connect(g.dbssert, read_only=True) as conn:
+             # ? Collect Instructions
+             t4: Any = PROGRESS.add_task("Collecting Instructions...", total=n)
+             instructions: list[Union[list[tuple[Callable, tuple[Any, ...]]], Path]] = [
+                 x.collect(conn, g.pubmed_db, g.pmc_db) for x in track(t4, tcode)
+             ]  # pyright: ignore
+
+             # ? Build Subgraphs
+             t5: Any = PROGRESS.add_task("Building Subgraphs...", total=n)
+             subgraphs: list[Path] = [op if isinstance(op, Path) else compile_subgraph(op) for op in track(t5, instructions)]  # pyright: ignore
+
+         # ? Compile Graph
+         t6: Any = PROGRESS.add_task("Compiling Graph...", total=None)
+         compile_graph(subgraphs, g.name, g.version)
+         PROGRESS.update(t6, total=1, completed=1)
+
+         PROGRESS.add_task("[bold green]Finished!", total=1, completed=1)
+
+
+ @CLI.command()
+ def verify_table_configuration_syntax(
+     table_configuration_file: Path = typer.Argument(..., help="Table Configuration -- See Docs"),
+ ) -> None:
+     """Verify The Syntax Of A Declarative Table Configuration File"""
+     with PROGRESS:
+         # ? Load Tables
+         t1: Any = PROGRESS.add_task("Loading Tables...", total=None)
+         r: object = from_yaml(table_configuration_file)
+         PROGRESS.update(t1, total=1, completed=1)
+
+         # ? Extract Sections
+         t2: Any = PROGRESS.add_task("Extracting Sections...", total=None)
+         sections: list[dict[str, Any]] = to_sections(r)  # pyright: ignore
+         n: int = len(sections)
+         PROGRESS.update(t2, total=1, completed=1)
+
+         # ? Validating Section Syntax
+         t3: Any = PROGRESS.add_task("Validating Section Syntax...", total=n)
+         for s in track(t3, sections):
+             Section.model_validate(s)
+         PROGRESS.update(t3, total=1, completed=1)
+
+         PROGRESS.add_task("[bold green]Finished!", total=1, completed=1)
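
For reference, both commands above are registered on the `CLI` Typer app, so Typer derives the CLI names `build-knowledge-graph` and `verify-table-configuration-syntax` from the function names. A minimal invocation sketch (the `graph.yaml` path is a hypothetical placeholder, not part of this package):

    # Sketch only: drives the Typer app through its test runner
    from typer.testing import CliRunner
    from tablassert.lib import CLI

    runner = CliRunner()
    result = runner.invoke(CLI, ["build-knowledge-graph", "graph.yaml"])  # "graph.yaml" is hypothetical
    print(result.output)
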
tablassert/log.py ADDED
@@ -0,0 +1,15 @@
+ from loguru import logger
+ from pathlib import Path
+
+ LOGASSERT: Path = Path("./.logassert")
+ LOGASSERT.mkdir(parents=True, exist_ok=True)
+
+ logger.remove()
+ logger.add(
+     (LOGASSERT / "logassert.log"),
+     level="INFO",
+     format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
+     rotation="100 MB",
+     encoding="utf-8",
+     mode="w",
+ )