tablassert 7.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tablassert/__init__.py +0 -0
- tablassert/downloader.py +35 -0
- tablassert/enums.py +521 -0
- tablassert/fullmap.py +167 -0
- tablassert/ingests.py +43 -0
- tablassert/lib.py +602 -0
- tablassert/log.py +15 -0
- tablassert/models.py +131 -0
- tablassert/qc.py +124 -0
- tablassert/utils.py +43 -0
- tablassert-7.0.0.dist-info/METADATA +141 -0
- tablassert-7.0.0.dist-info/RECORD +15 -0
- tablassert-7.0.0.dist-info/WHEEL +4 -0
- tablassert-7.0.0.dist-info/entry_points.txt +2 -0
- tablassert-7.0.0.dist-info/licenses/LICENSE +201 -0
tablassert/lib.py
ADDED
|
@@ -0,0 +1,602 @@
|
|
|
1
|
+
from tablassert.enums import EncodingMethods
|
|
2
|
+
from rich.progress import TaskProgressColumn
|
|
3
|
+
from rich.progress import TimeElapsedColumn
|
|
4
|
+
from tablassert.utils import namespace_uuid
|
|
5
|
+
from tablassert.models import NodeEncoding
|
|
6
|
+
from tablassert.downloader import from_url
|
|
7
|
+
from tablassert.ingests import to_sections
|
|
8
|
+
from tablassert.ingests import from_yaml
|
|
9
|
+
from tablassert.qc import fullmap_audit
|
|
10
|
+
from tablassert.fullmap import version4
|
|
11
|
+
from rich.progress import SpinnerColumn
|
|
12
|
+
from tablassert.models import Encoding
|
|
13
|
+
from tablassert.models import Section
|
|
14
|
+
from rich.progress import TextColumn
|
|
15
|
+
from tablassert.utils import mkhash
|
|
16
|
+
from tablassert.enums import Tokens
|
|
17
|
+
from pydantic import NonNegativeInt
|
|
18
|
+
from tablassert.models import Graph
|
|
19
|
+
from rich.progress import BarColumn
|
|
20
|
+
from rich.progress import Progress
|
|
21
|
+
from tablassert.enums import Files
|
|
22
|
+
from tablassert.utils import STORE
|
|
23
|
+
from tablassert.log import logger
|
|
24
|
+
from sqlite_utils import Database
|
|
25
|
+
from pydantic import PositiveInt
|
|
26
|
+
from multiprocessing import Pool
|
|
27
|
+
from functools import reduce
|
|
28
|
+
from os.path import basename
|
|
29
|
+
from itertools import chain
|
|
30
|
+
from typing import Callable
|
|
31
|
+
from typing import Optional
|
|
32
|
+
from typing import Literal
|
|
33
|
+
from pydantic import Field
|
|
34
|
+
from pathlib import Path
|
|
35
|
+
from typing import Union
|
|
36
|
+
from operator import add
|
|
37
|
+
from typing import Self
|
|
38
|
+
from operator import eq
|
|
39
|
+
from operator import le
|
|
40
|
+
from typing import Any
|
|
41
|
+
import polars as pl
|
|
42
|
+
import operator
|
|
43
|
+
import xxhash
|
|
44
|
+
import duckdb
|
|
45
|
+
import orjson
|
|
46
|
+
import typer
|
|
47
|
+
import math
|
|
48
|
+
|
|
49
|
+
# ? Newline To Make Progress Bar More Readable
|
|
50
|
+
print("\n")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def value(lf: pl.LazyFrame, col: str, x: str) -> pl.LazyFrame:
    """Attach a constant-valued column named *col* holding literal *x*."""
    literal: pl.Expr = pl.lit(x).alias(col)
    return lf.with_columns(literal)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def contributor_values(lf: pl.LazyFrame, col: str, contributors: list[dict[str, Any]]) -> pl.LazyFrame:
    """Store the serialized contributor records as a nested literal column."""
    dumped: list[dict[str, Any]] = [contributor.model_dump() for contributor in contributors]
    return lf.with_columns(pl.lit(dumped).alias(col))  # pyright: ignore
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def column(lf: pl.LazyFrame, col: str, x: str) -> pl.LazyFrame:
    """Duplicate existing column *x* under the new name *col*."""
    copied: pl.Expr = pl.col(x).alias(col)
    return lf.with_columns(copied)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def math_op(
    lf: pl.LazyFrame, col: str, func: str, args: list[Union[Literal[Tokens.VALUES], float, int]]
) -> pl.LazyFrame:
    """Apply a ``math``-module function to every value in *col*.

    Each ``Tokens.VALUES`` placeholder in *args* is replaced with the cell
    value; all other arguments pass through unchanged.
    """
    # ! Collection Point: map_elements requires an eager frame.
    eager: pl.DataFrame = lf.collect()
    fn: Callable[[Any], Any] = getattr(math, func)

    def apply(cell: Any) -> Any:
        # * Substitute the cell for every VALUES token, keep the rest as-is.
        return fn(*(cell if eq(a, Tokens.VALUES) else a for a in args))

    numeric: pl.Expr = pl.col(col).cast(pl.Float64)
    eager = eager.with_columns(numeric.map_elements(apply, return_dtype=pl.Float64).alias(col))
    return eager.lazy()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def zero(lf: pl.LazyFrame, col: str) -> pl.LazyFrame:
    """Level-zero text normalisation: stringify, strip, lowercase in place."""
    normalised: pl.Expr = pl.col(col).cast(pl.String).str.strip_chars().str.to_lowercase()
    return lf.with_columns(normalised.alias(col))
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def one(
    lf: pl.LazyFrame,
    col: str,  # pyright: ignore
    regex: str = r"\W+",
    tag: str = " one",
) -> pl.LazyFrame:
    """Level-one text processing: strip non-word runs into a tagged copy of *col*."""
    cleaned: pl.Expr = pl.col(col).str.replace_all(regex, "")
    # * Result lands in a new "<col><tag>" column; the original is untouched.
    return lf.with_columns(cleaned.alias(add(col, tag)))
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def prefix(lf: pl.LazyFrame, col: str, prefix: str) -> pl.LazyFrame:
    """Prepend a literal string to every value of *col*."""
    combined: pl.Expr = add(pl.lit(prefix), pl.col(col).cast(pl.String))
    return lf.with_columns(combined.alias(col))
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def suffix(lf: pl.LazyFrame, col: str, suffix: str) -> pl.LazyFrame:
    """Append a literal string to every value of *col*."""
    combined: pl.Expr = add(pl.col(col).cast(pl.String), pl.lit(suffix))
    return lf.with_columns(combined.alias(col))
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def regex(lf: pl.LazyFrame, col: str, pattern: str, replacement: str = "") -> pl.LazyFrame:
    """Regex-replace (delete, by default) matches of *pattern* inside *col*."""
    rewritten: pl.Expr = pl.col(col).cast(pl.String).str.replace_all(pattern, replacement)
    return lf.with_columns(rewritten.alias(col))
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def fill(lf: pl.LazyFrame, col: str, method: str) -> pl.LazyFrame:
    """Fill nulls in *col* using a polars fill strategy (e.g. "forward")."""
    filled: pl.Expr = pl.col(col).fill_null(strategy=method)  # pyright: ignore
    return lf.with_columns(filled.alias(col))
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def explode(lf: pl.LazyFrame, col: str, delimiter: str) -> pl.LazyFrame:
    """Split delimited values in *col* and fan each item out to its own row."""
    split: pl.Expr = pl.col(col).cast(pl.String).str.split(delimiter)
    return lf.with_columns(split.alias(col)).explode(col)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def sig(
    lf: pl.LazyFrame,
    cutoff: float = 0.05,  # pyright: ignore
    col: str = "p value",
    out: str = "significant",
) -> pl.LazyFrame:
    """Derive the "significant" column from a p-value column.

    YES when p <= cutoff, NO when p > cutoff, UNSURE when the p value is
    missing/unparseable — or for every row when *col* is absent entirely.
    """
    if col not in lf.collect_schema().names():
        # * No p-value column at all: everything is UNSURE.
        return lf.with_columns(pl.lit("UNSURE").alias(out))

    p: pl.Expr = pl.col(col).cast(pl.Float64, strict=False)
    verdict: pl.Expr = (
        pl.when(p.is_null()).then(pl.lit("UNSURE")).when(le(p, cutoff)).then(pl.lit("YES")).otherwise(pl.lit("NO"))
    )
    return lf.with_columns(verdict.alias(out))
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def idx(lf: pl.LazyFrame, col: str = "row number") -> pl.LazyFrame:
    """Prepend an index column holding each row's ordinal position."""
    return lf.with_row_index(col)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def csv(p: Path, sep: str) -> pl.LazyFrame:
    """Lazily scan a delimited text source (CSV/TSV) with no header row."""
    return pl.scan_csv(
        source=p,
        separator=sep,
        has_header=False,
        infer_schema_length=None,
        truncate_ragged_lines=True,
    )
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def excel(p: Path, sheet: str, engine: str = "calamine") -> pl.LazyFrame:
    """Read one Excel sheet eagerly (no header), then hand back a LazyFrame."""
    frame: pl.DataFrame = pl.read_excel(
        source=p,
        sheet_name=sheet,
        engine=engine,  # pyright: ignore
        has_header=False,
        infer_schema_length=None,
    )
    return frame.lazy()
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def crop(lf: pl.LazyFrame, row_slice: list[Union[NonNegativeInt, Literal[Tokens.AUTO]]]) -> pl.LazyFrame:
    """Keep only the rows inside ``row_slice`` ([start, stop]; AUTO = open end).

    # ! Collection Point: the frame height must be known to resolve AUTO.
    """
    eager: pl.DataFrame = lf.collect()
    height: int = eager.select(pl.len()).item()
    start, stop = row_slice[0], row_slice[1]
    offset: int = 0 if eq(start, Tokens.AUTO) else start  # pyright: ignore
    length: int = height if eq(stop, Tokens.AUTO) else (stop - offset)  # pyright: ignore
    return eager.slice(offset=offset, length=length).lazy()
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def pick(lf: pl.LazyFrame, rows: list[int]) -> pl.LazyFrame:
    """Select an explicit list of row indices from the frame.

    # ! Collection Point: take() is eager-only; re-lazy afterwards.
    """
    taken: pl.DataFrame = lf.collect().select(pl.all().take(indices=rows))  # pyright: ignore
    return taken.lazy()
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def reindex(df: pl.LazyFrame, col: str, op: Callable, comp: Union[str, int, float], cast: bool = True) -> pl.LazyFrame:
    """Keep only rows where ``op(col, comp)`` holds; casts to float unless told not to."""
    target: pl.Expr = pl.col(col)
    if cast:
        target = target.cast(pl.Float64)
    return df.filter(op(target, comp))
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def idxname(col: Any) -> str:
    """Convert an Excel-style column label (e.g. "A", "AB") to a polars name.

    Letters are read base-26 ("A" -> 1 ... "Z" -> 26, "AA" -> 27) and the
    result is rendered as "column_<index>". Labels are upper-cased first so
    a lowercase config value ("ab") resolves to the same column as "AB"
    instead of silently producing a bogus index via its raw code point.
    """
    label: str = str(col).upper()
    index: int = 0
    for char in label:
        index = index * 26 + (ord(char) - ord("A") + 1)
    return f"column_{index}"
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def trim(lf: pl.LazyFrame, regex: str = r"^column_\d+$") -> pl.LazyFrame:
    """Drop every column still carrying the auto-generated "column_N" name."""
    return lf.select(pl.exclude(regex))
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def to_store(lf: pl.LazyFrame, p: Path, config_name: str) -> Path:
    """Materialise the section frame as a parquet at *p*, warning when empty."""
    frame: pl.DataFrame = lf.collect()

    if not frame.height:
        # * Still written: an empty parquet marks the section as processed.
        logger.warning(f"EMPTY SUBGRAPH | STORE: {p.stem} | CONFIG: {config_name}")
    frame.write_parquet(p)

    return p
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def with_mesh(lf: pl.LazyFrame, pubmed_db: Path, curie: str) -> pl.LazyFrame:
    # ? Adds PubMedDB Related MeSH Annotations To LazyFrame
    # ! Collection Point: SQLite Query Then Per-Row Literal Assignment
    """Annotate every row of the frame with MeSH / bibliographic metadata
    for the publication identified by *curie*, looked up in the PubMed
    SQLite database. Columns are only added when a value was found.
    """
    df: pl.DataFrame = lf.collect()
    db: object = Database(pubmed_db)
    # NOTE(review): LIMIT 1 means at most one joined row feeds all the
    # annotations below — confirm one row per publication is intended.
    query: str = """
        SELECT
            mesh.mesh_major,
            mesh.mesh,
            info.firstauthor,
            info.journal,
            info.title,
            info.year
        FROM ids
        INNER JOIN mesh ON ids.pmid = mesh.pmid
        INNER JOIN info ON ids.pmid = info.pmid
        WHERE ids.alt = :curie OR ids.pmid = :curie
        LIMIT 1
    """
    rows: list[dict[str, str]] = list(db.query(query, {"curie": curie})) or []
    # * Major-topic MeSH terms become "domain"; the rest stay plain "mesh".
    all_ids: list[str] = [add("MESH:", x["mesh"]) for x in rows if x]
    is_major: list[bool] = [eq(x["mesh_major"], "Y") for x in rows]
    domain: list[str] = [x for x, y in zip(all_ids, is_major) if y]
    mesh: list[str] = [x for x in all_ids if x not in domain]

    # * Bibliographic fields come from the first (only) matched row.
    row: dict[str, str] = rows[0] if rows else {}
    first_author: Optional[str] = row.get("firstauthor")
    journal: Optional[str] = row.get("journal")
    title: Optional[str] = row.get("title")
    year: Optional[str] = row.get("year")

    # * Each annotation column is only created when a value exists.
    if domain:
        df = df.with_columns(pl.lit(domain).alias("domain"))
    if mesh:
        df = df.with_columns(pl.lit(mesh).alias("mesh"))
    if first_author:
        df = df.with_columns(pl.lit(first_author).alias("first author"))
    if journal:
        df = df.with_columns(pl.lit(journal).alias("journal"))
    if title:
        df = df.with_columns(pl.lit(title).alias("title"))
    if year:
        df = df.with_columns(pl.lit(year).alias("year published"))

    return df.lazy()
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def with_captions(lf: pl.LazyFrame, pmc_db: Path, curie: str, url: str) -> pl.LazyFrame:
    """Annotate the frame with the PMC caption matching this source file, if any.

    # ! Collection Point: SQLite lookup followed by literal assignment.
    """
    frame: pl.DataFrame = lf.collect()
    db: object = Database(pmc_db)
    # * The table file's basename is the heuristic join key against PMC.
    filename: str = basename(url)
    query: str = """
        SELECT caption
        FROM captions
        WHERE pmc = :curie AND file = :filename
        LIMIT 1
    """
    matches: list[dict[str, str]] = list(db.query(query, {"curie": curie, "filename": filename})) or []
    first: dict[str, str] = matches[0] if matches else {}

    caption: Optional[str] = first.get("caption")
    if caption:
        frame = frame.with_columns(pl.lit(caption).alias("file caption"))

    return frame.lazy()
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
class Tcode(Section):
    # ? Extends Section To Compile A KG
    """Section subclass that emits "tcode": an ordered list of
    (function, arguments) steps which, when reduced by compile_subgraph(),
    produce this section's subgraph parquet.
    """

    # * 1-based section number, source config path, and target parquet path.
    number: PositiveInt = Field(...)
    config: Path = Field(...)
    store: Path = Field(...)

    def encoding(self: Self, x: Encoding, col: str) -> list[Any]:
        # ? Collect Helper For Encoding Classes
        """Build the tcode steps that materialise one encoded column.

        Entries may be None or nested lists when a feature is unused;
        clean() flattens and drops them afterwards. Order is significant.
        """
        return [
            (value, (col, x.encoding)) if eq(x.method, EncodingMethods.VALUE) else None,
            (column, (col, idxname(x.encoding))) if eq(x.method, EncodingMethods.COLUMN) else None,
            (fill, (col, x.fill)) if x.fill else None,
            (explode, (col, x.explode_by)) if x.explode_by else None,
            [(regex, (col, r.pattern, r.replacement)) for r in x.regex] if x.regex else None,
            [(regex, (col, r)) for r in x.remove] if x.remove else None,
            (prefix, (col, x.prefix)) if x.prefix else None,
            (suffix, (col, x.suffix)) if x.suffix else None,
            [(math_op, (col, t.function, t.arguments)) for t in x.transformations] if x.transformations else None,
        ]

    def node(self: Self, x: NodeEncoding, col: str, conn: object) -> list[Any]:
        # ? Collect Helper For NodeEncoding Classes
        """Extend encoding() with node-specific steps: keep the original
        text, normalise it, map it with version4, and audit the mapping.
        """
        encoding: list[Any] = self.encoding(x, col)
        node: list[Any] = [
            (column, (add("original ", col), col)),
            (zero, (col,)),
            (one, (col,)),
            (version4, (col, conn, x.taxon, x.prioritize, x.avoid, self.store.stem, self.config.name)),
            (fullmap_audit, (col, self.store.stem, self.config.name)),
        ]
        return add(encoding, node)

    def clean(self: Self, tcode: list[tuple[Callable, Any]]) -> list[tuple[Callable, tuple[Any]]]:
        # ? Cleans Tcode So It Can Be Used With reduce From functools
        """Recursively flatten nested step lists and drop the None
        placeholders left by unused features.
        """
        result: list[tuple[Callable, tuple[Any]]] = []
        for x in tcode:
            if not x:
                continue
            elif isinstance(x, list):
                result.extend(self.clean(x))
            else:
                result.append(x)
        return result

    def collect(
        self: Self, conn: object, pubmed_db: Optional[Path], pmc_db: Optional[Path]
    ) -> Union[list[tuple[Callable, tuple[Any]]], Path]:
        # ? Code That Tells Tablassert What Actions To While Transforming Data
        """Return the finished store path when it already exists, otherwise
        the flattened tcode program that will produce it.
        """

        if self.store.is_file():
            # * Quick Exit If Subgraph Already Exists
            return self.store

        else:
            # * Returns A List Of: (Function, (Arguments))
            # * Pipeline order: download -> parse -> shape -> encode nodes
            # * -> provenance literals -> enrichments -> write parquet.
            tcode: Optional[list[Any]] = [
                (from_url, (str(self.source.url), self.source.local)),
                (csv, (self.source.delimiter,)) if eq(self.source.kind, Files.TEXT) else None,  # pyright: ignore
                (excel, (self.source.sheet,)) if eq(self.source.kind, Files.EXCEL) else None,  # pyright: ignore
                (idx, ()),
                (crop, (self.source.row_slice,)) if self.source.row_slice else None,
                (pick, (self.source.rows,)) if self.source.rows else None,
                [
                    # * "ne"/"eq" comparisons skip the float cast (cast=False).
                    (reindex, (idxname(x.column), getattr(operator, x.comparison), x.comparator))
                    if x.comparison not in ["ne", "eq"]
                    else (reindex, (idxname(x.column), getattr(operator, x.comparison), x.comparator, False))
                    for x in self.source.reindex
                ]
                if self.source.reindex
                else None,
                [op for x in self.annotations for op in self.encoding(x, x.annotation)] if self.annotations else None,
                self.node(self.statement.subject, "subject", conn),
                self.node(self.statement.object, "object", conn),
                (value, ("predicate", self.statement.predicate)),
                [op for x in self.statement.qualifiers for op in self.node(x, x.qualifier, conn)]
                if self.statement.qualifiers
                else None,
                (value, ("syntax", self.syntax)),
                (value, ("configuration file", self.config.name)),
                (value, ("section number", self.number)),
                (value, ("status", self.status)),
                (value, ("repository", self.provenance.repo)),
                (value, ("publication", (self.provenance.repo + ":" + self.provenance.publication))),
                (contributor_values, ("contributors", self.provenance.contributors)),
                (value, ("url", str(self.source.url))),
                (value, ("section hash", self.store.stem)),
                (with_mesh, (pubmed_db, self.provenance.publication)) if pubmed_db else None,
                (with_captions, (pmc_db, self.provenance.publication, str(self.source.url))) if pmc_db else None,
                (sig, ()),
                (trim, ()),
                (to_store, (self.store, self.config.name)),
            ]
            return self.clean(tcode)
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def compile_subgraph(tcode: list[tuple[Callable, tuple[Any]]]) -> Path:
    """Run a tcode program: feed each step's output into the next step.

    The first step receives only its own arguments; every later step gets
    the running accumulator as its leading argument. Returns the final
    step's result (the parquet path written by to_store).
    """
    acc: Any = None
    for func, arguments in tcode:
        acc = func(acc, *arguments) if acc is not None else func(*arguments)
    return acc  # pyright: ignore
|
|
390
|
+
|
|
391
|
+
|
|
392
|
+
def normalize(
    edges: pl.LazyFrame, col: str, names: list[str] = ["id", "name", "category", "taxon", "source", "source version"]
) -> tuple[pl.LazyFrame, pl.LazyFrame]:
    """Split one node's annotation columns out of the edge frame.

    Returns (unique node rows renamed to the unified schema in *names*,
    edge frame with the per-node annotation columns removed — the id
    column itself stays on the edges).
    """
    suffixes: list[str] = ["", " name", " category", " taxon", " source", " source version"]
    cols: list[str] = [add(col, s) for s in suffixes]
    mapping: dict[str, str] = dict(zip(cols, names))
    nodes: pl.LazyFrame = edges.select(cols).unique().rename(mapping)
    return nodes, edges.drop(cols[1:])
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def publications(
    edges: pl.LazyFrame, names: list[str] = ["id", "name", "first author", "journal", "year published"]
) -> tuple[pl.LazyFrame, pl.LazyFrame]:
    """Split publication metadata columns out of the edge frame as nodes.

    Returns (publication node frame tagged biolink:Publication, edge frame
    without the metadata columns — "publication" itself stays on edges).

    Fix: the old-name -> new-name mapping is paired positionally BEFORE
    filtering to the columns actually present. Previously a missing column
    (e.g. no "title") shifted the zip and mislabeled every later column
    ("first author" would be renamed to "name", and so on).
    """
    cols: list[str] = ["publication", "title", "first author", "journal", "year published"]
    # * Pair names against the full, canonical column list first.
    mapping: dict[str, str] = dict(zip(cols, names))
    cols = [x for x in cols if x in edges.collect_schema().names()]
    nodes: pl.LazyFrame = edges.select(cols).unique().rename({k: mapping[k] for k in cols})
    nodes = nodes.with_columns(pl.lit("biolink:Publication").alias("category"))
    edges_out: pl.LazyFrame = edges.drop(cols[1:])
    return nodes, edges_out
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
def label_edge(r: object, domain: str = "TABLASSERT", out: str = "uuid") -> object:
    """Stamp an edge record with a deterministic UUID in the given namespace."""
    # * Values are read before the new key is written, so the UUID is a
    # * function of the record's pre-existing content only.
    contents = list(r.values())  # pyright: ignore
    r[out] = namespace_uuid(domain, *contents)  # pyright: ignore
    return r
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def strip_nulls(r: object, bad: set[str] = {"na", "nan", "null", "none", ""}) -> dict:
    """Recursively drop keys whose values are null-ish.

    A value survives only when it is truthy and its lowercased string form
    is not in *bad*. Surviving dicts (and dicts inside lists) are cleaned
    recursively; non-dict list items pass through untouched.
    """
    cleaned: dict = {}
    for key, val in r.items():  # pyright: ignore
        if not val or str(val).strip().lower() in bad:
            continue
        if isinstance(val, dict):
            cleaned[key] = strip_nulls(val)
        elif isinstance(val, list):
            cleaned[key] = [strip_nulls(item) if isinstance(item, dict) else item for item in val]
        else:
            cleaned[key] = val
    return cleaned
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def dedup_stream(p_in: Path, is_edges: bool) -> None:
    """Rewrite an NDJSON temp file with null-ish fields removed and exact
    duplicates dropped; edges additionally receive a deterministic UUID.

    The clean copy is written at *p_in* minus its ".temp" suffix, and the
    temp input file is deleted afterwards.
    """
    p_out: Path = p_in.with_suffix("")

    if p_out.is_file():
        p_out.unlink()

    seen: set[bytes] = set()
    with p_in.open("rb") as src, p_out.open("wb") as dst:
        for raw in src:
            record = strip_nulls(orjson.loads(raw))  # pyright: ignore

            if not record:
                continue
            payload: bytes = orjson.dumps(record)
            digest: bytes = xxhash.xxh64(payload).digest()
            if digest in seen:
                continue
            seen.add(digest)

            if is_edges:
                # * UUIDs are assigned after dedup so they never affect identity.
                payload = orjson.dumps(label_edge(record))

            dst.write(payload + b"\n")

    p_in.unlink()
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def compile_graph(subgraphs: list[Path], name: str, version: str, fmt: str = "mixed", precision: int = 4) -> None:
    # ? Aggregates Parquets For NDJSON KGX Export Using Lazy Scan
    """Merge all subgraph parquets into <name>_<version> node/edge NDJSON.

    Writes *.nodes.ndjson.temp and *.edges.ndjson.temp, then streams each
    through dedup_stream(), which strips nulls, deduplicates, labels edges
    with UUIDs, and drops the ".temp" suffix.
    """
    p: Path = Path(f"./{name}_{version}")
    # NOTE(review): with_suffix replaces an existing suffix — if *version*
    # contains dots (e.g. "1.2"), its last segment would be stripped from
    # the output name; confirm version strings are dot-free.

    e: Path = p.with_suffix(".edges.ndjson.temp")  # ! For Labeling
    if e.exists():
        e.unlink()

    n: Path = p.with_suffix(".nodes.ndjson.temp")
    if n.exists():
        n.unlink()

    subnodes: list[pl.LazyFrame] = []
    subedges: list[pl.LazyFrame] = []
    for s in subgraphs:
        lf: pl.LazyFrame = pl.scan_parquet(s)

        # * Every "original X" column marks node column X to be split out.
        node_cols: list[str] = [col.replace("original ", "") for col in lf.collect_schema().names() if "original " in col]
        for col in node_cols:
            partial, lf = normalize(lf, col)
            subnodes.append(partial)

        partial, lf = publications(lf)
        subnodes.append(partial)
        subedges.append(lf)

    # ! Collection Point: Appending To Output Files
    with n.open("a") as f:
        for subnode in subnodes:
            eagernode: pl.DataFrame = subnode.collect().unique()
            eagernode.write_ndjson(f)

    # * Float formatting is pinned so edge output is byte-stable for dedup.
    with e.open("a") as f:
        with pl.Config(set_fmt_float=fmt):  # pyright: ignore
            with pl.Config(float_precision=precision):
                for subedge in subedges:
                    eageredge: pl.DataFrame = subedge.collect().unique()
                    eageredge.write_ndjson(f)

    dedup_stream(e, is_edges=True)
    dedup_stream(n, is_edges=False)
|
|
511
|
+
|
|
512
|
+
|
|
513
|
+
# ? Typer App And Shared Rich Progress Bar Used By Every CLI Command
CLI: typer.Typer = typer.Typer(pretty_exceptions_show_locals=False)
PROGRESS: Progress = Progress(
    SpinnerColumn(),
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    TaskProgressColumn(),
    TimeElapsedColumn(),
)
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def track(task_id: Any, iterable: Any) -> Any:
    """Yield items from *iterable* while advancing the shared progress bar."""
    for element in iter(iterable):
        yield element
        PROGRESS.advance(task_id)
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
@CLI.command()
def build_knowledge_graph(
    graph_configuration_file: Path = typer.Argument(..., help="Knowledge Graph Configuration -- See Docs"),
) -> None:
    """Build A KGX Compliant Knowledge Graph From A Graph Configuration File"""
    # TODO: Make MeSH A Node (Micro Version)
    # TODO: Add FullMap Column Context Flag
    r: object = from_yaml(graph_configuration_file)
    g: Graph = Graph.model_validate(r)

    with PROGRESS:
        # ? Load Tables
        t1: Any = PROGRESS.add_task("Loading Tables...", total=None)
        with Pool() as pool:
            raw: list[object] = pool.map(from_yaml, g.tables)
        PROGRESS.update(t1, total=1, completed=1)

        # ? Extract Sections
        t2: Any = PROGRESS.add_task("Extracting Sections...", total=None)
        with Pool() as pool:
            temp: list[list[dict[str, Any]]] = pool.starmap(to_sections, zip(raw, g.tables))  # pyright: ignore
        PROGRESS.update(t2, total=1, completed=1)
        sections: list[dict[str, Any]] = list(chain.from_iterable(temp))
        n: int = len(sections)

        # ? Build Tcodes
        # * Store path is a content hash, enabling the collect() cache exit.
        t3: Any = PROGRESS.add_task("Building TCode...", total=n)
        tcode: list[Tcode] = [
            Tcode.model_validate({**s, "number": idx, "store": (STORE / f"{mkhash(s)}.parquet")})
            for idx, s in track(t3, enumerate(sections, start=1))
        ]
        # * Subgraph execution needs the live connection (version4 steps
        # * carry conn), so it stays inside this with-block.
        with duckdb.connect(g.dbssert, read_only=True) as conn:
            # ? Collect Instructions
            t4: Any = PROGRESS.add_task("Collecting Instructions...", total=n)
            instructions: list[Union[list[tuple[Callable, tuple[Any, ...]]], Path]] = [
                x.collect(conn, g.pubmed_db, g.pmc_db) for x in track(t4, tcode)
            ]  # pyright: ignore

            # ? Build Subgraphs
            t5: Any = PROGRESS.add_task("Building Subgraphs...", total=n)
            subgraphs: list[Path] = [op if isinstance(op, Path) else compile_subgraph(op) for op in track(t5, instructions)]  # pyright: ignore

        # ? Compile Graph
        t6: Any = PROGRESS.add_task("Compiling Graph...", total=None)
        compile_graph(subgraphs, g.name, g.version)
        PROGRESS.update(t6, total=1, completed=1)

        PROGRESS.add_task("[bold green]Finished!", total=1, completed=1)
|
|
577
|
+
|
|
578
|
+
|
|
579
|
+
@CLI.command()
def verify_table_configuration_syntax(
    table_configuration_file: Path = typer.Argument(..., help="Table Configuration -- See Docs"),
) -> None:
    """Verify The Syntax Of A Declarative Table Configuration File"""
    with PROGRESS:
        # ? Load Tables
        t1: Any = PROGRESS.add_task("Loading Tables...", total=None)
        r: object = from_yaml(table_configuration_file)
        PROGRESS.update(t1, total=1, completed=1)

        # ? Extract Sections
        # NOTE(review): to_sections is called with one argument here but
        # two in build_knowledge_graph — confirm the second is optional.
        t2: Any = PROGRESS.add_task("Extracting Sections...", total=None)
        sections: list[dict[str, Any]] = to_sections(r)  # pyright: ignore
        n: int = len(sections)
        PROGRESS.update(t2, total=1, completed=1)

        # ? Validating Section Syntax
        # * Validation only: sections are parsed by pydantic, never compiled.
        t3: Any = PROGRESS.add_task("Validating Section Syntax...", total=n)
        for s in track(t3, sections):
            Section.model_validate(s)
        PROGRESS.update(t3, total=1, completed=1)

        PROGRESS.add_task("[bold green]Finished!", total=1, completed=1)
|
tablassert/log.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from loguru import logger
from pathlib import Path

# ? Directory Holding All Tablassert Log Files
LOGASSERT: Path = Path("./.logassert")
LOGASSERT.mkdir(parents=True, exist_ok=True)

# ? Replace loguru's Default stderr Sink With A Rotating File Sink
# * mode="w" truncates the log on every fresh run; rotation caps file size.
logger.remove()
logger.add(
    (LOGASSERT / "logassert.log"),
    level="INFO",
    format="{time:YYYY-MM-DD HH:mm:ss} | {level} | {message}",
    rotation="100 MB",
    encoding="utf-8",
    mode="w",
)
|