closurizer 0.1.3__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {closurizer-0.1.3 → closurizer-0.5.1}/PKG-INFO +5 -2
- closurizer-0.5.1/closurizer/cli.py +37 -0
- closurizer-0.5.1/closurizer/closurizer.py +176 -0
- {closurizer-0.1.3 → closurizer-0.5.1}/pyproject.toml +2 -2
- closurizer-0.1.3/closurizer/cli.py +0 -21
- closurizer-0.1.3/closurizer/closurizer.db +0 -0
- closurizer-0.1.3/closurizer/closurizer.py +0 -124
- closurizer-0.1.3/setup.py +0 -34
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: closurizer
|
3
|
-
Version: 0.1.3
|
3
|
+
Version: 0.5.1
|
4
4
|
Summary: Add closure expansion fields to kgx files following the Golr pattern
|
5
5
|
Author: Kevin Schaper
|
6
6
|
Author-email: kevin@tislab.org
|
@@ -8,6 +8,9 @@ Requires-Python: >=3.8,<4.0
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
9
9
|
Classifier: Programming Language :: Python :: 3.8
|
10
10
|
Classifier: Programming Language :: Python :: 3.9
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
11
14
|
Requires-Dist: SQLAlchemy (>=1.4.37,<2.0.0)
|
12
15
|
Requires-Dist: click (>=8,<9)
|
13
|
-
Requires-Dist: petl (>=1.7.10,<2.0.0)
|
16
|
+
Requires-Dist: duckdb (>=0.9.1,<0.10.0)
|
@@ -0,0 +1,37 @@
|
|
1
|
+
import click
|
2
|
+
from typing import List
|
3
|
+
from closurizer.closurizer import add_closure
|
4
|
+
|
5
|
+
|
6
|
+
@click.command()
|
7
|
+
@click.option('--kg', required=True, help='KGX tar.gz archive')
|
8
|
+
@click.option('--closure', required=True, help='TSV file of closure triples')
|
9
|
+
@click.option('--nodes-output', required=True, help='file write nodes kgx file with closure fields added')
|
10
|
+
@click.option('--edges-output', required=True, help='file write edges kgx file with closure fields added')
|
11
|
+
@click.option('--additional-node-constraints', required=False,
|
12
|
+
help='additional where clause constraints to apply to the generation of the denormalized nodes output')
|
13
|
+
@click.option('--edge-fields', multiple=True, help='edge fields to expand with closure IDs, labels, etc')
|
14
|
+
@click.option('--node-fields', multiple=True, help='node fields to expand with closure IDs, labels, etc')
|
15
|
+
@click.option('--grouping-fields', multiple=True, help='fields to populate a single value grouping_key field')
|
16
|
+
@click.option('--dry-run', is_flag=True, help='A dry run will not write the output file, but will print the SQL query')
|
17
|
+
def main(kg: str,
|
18
|
+
closure: str,
|
19
|
+
nodes_output: str,
|
20
|
+
edges_output: str,
|
21
|
+
additional_node_constraints: str = None,
|
22
|
+
dry_run: bool = False,
|
23
|
+
edge_fields: List[str] = None,
|
24
|
+
node_fields: List[str] = None,
|
25
|
+
grouping_fields: List[str] = None):
|
26
|
+
add_closure(kg_archive=kg,
|
27
|
+
closure_file=closure,
|
28
|
+
edge_fields=edge_fields,
|
29
|
+
node_fields=node_fields,
|
30
|
+
edges_output_file=edges_output,
|
31
|
+
nodes_output_file=nodes_output,
|
32
|
+
additional_node_constraints=additional_node_constraints,
|
33
|
+
dry_run=dry_run,
|
34
|
+
grouping_fields=grouping_fields)
|
35
|
+
|
36
|
+
if __name__ == "__main__":
|
37
|
+
main()
|
@@ -0,0 +1,176 @@
|
|
1
|
+
from typing import List
|
2
|
+
|
3
|
+
import os
|
4
|
+
import tarfile
|
5
|
+
import duckdb
|
6
|
+
|
7
|
+
def edge_columns(field):
|
8
|
+
column_text = f"""
|
9
|
+
{field}.name as {field}_label,
|
10
|
+
{field}.category as {field}_category,
|
11
|
+
{field}.namespace as {field}_namespace,
|
12
|
+
list_aggregate({field}_closure.closure, 'string_agg', '|') as {field}_closure,
|
13
|
+
list_aggregate({field}_closure_label.closure_label,'string_agg', '|') as {field}_closure_label,
|
14
|
+
"""
|
15
|
+
if field in ['subject', 'object']:
|
16
|
+
column_text += f"""
|
17
|
+
{field}.in_taxon as {field}_taxon,
|
18
|
+
{field}.in_taxon_label as {field}_taxon_label,
|
19
|
+
"""
|
20
|
+
return column_text
|
21
|
+
|
22
|
+
def edge_joins(field):
|
23
|
+
return f"""
|
24
|
+
left outer join nodes as {field} on edges.{field} = {field}.id
|
25
|
+
left outer join closure_id as {field}_closure on {field}.id = {field}_closure.id
|
26
|
+
left outer join closure_label as {field}_closure_label on {field}.id = {field}_closure_label.id
|
27
|
+
"""
|
28
|
+
|
29
|
+
def evidence_sum(evidence_fields):
|
30
|
+
""" Sum together the length of each field after splitting on | """
|
31
|
+
evidence_count_sum = "+".join([f"ifnull(len(split({field}, '|')),0)" for field in evidence_fields])
|
32
|
+
return f"{evidence_count_sum} as evidence_count,"
|
33
|
+
|
34
|
+
|
35
|
+
def node_columns(predicate):
|
36
|
+
# strip the biolink predicate, if necessary to get the field name
|
37
|
+
field = predicate.replace('biolink:','')
|
38
|
+
|
39
|
+
return f"""
|
40
|
+
string_agg({field}_edges.object, '|') as {field},
|
41
|
+
string_agg({field}_edges.object_label, '|') as {field}_label,
|
42
|
+
count (distinct {field}_edges.object) as {field}_count,
|
43
|
+
list_aggregate(list_distinct(flatten(array_agg({field}_closure.closure))), 'string_agg', '|') as {field}_closure,
|
44
|
+
list_aggregate(list_distinct(flatten(array_agg({field}_closure_label.closure_label))), 'string_agg', '|') as {field}_closure_label,
|
45
|
+
"""
|
46
|
+
|
47
|
+
def node_joins(predicate):
|
48
|
+
# strip the biolink predicate, if necessary to get the field name
|
49
|
+
field = predicate.replace('biolink:','')
|
50
|
+
return f"""
|
51
|
+
left outer join denormalized_edges as {field}_edges
|
52
|
+
on nodes.id = {field}_edges.subject
|
53
|
+
and {field}_edges.predicate = 'biolink:{field}'
|
54
|
+
left outer join closure_id as {field}_closure
|
55
|
+
on {field}_edges.object = {field}_closure.id
|
56
|
+
left outer join closure_label as {field}_closure_label
|
57
|
+
on {field}_edges.object = {field}_closure_label.id
|
58
|
+
"""
|
59
|
+
|
60
|
+
def grouping_key(grouping_fields):
|
61
|
+
fragments = []
|
62
|
+
for field in grouping_fields:
|
63
|
+
if field == 'negated':
|
64
|
+
fragments.append(f"coalesce({field}.replace('True','NOT'), '')")
|
65
|
+
else:
|
66
|
+
fragments.append(field)
|
67
|
+
grouping_key_fragments = ", ".join(fragments)
|
68
|
+
return f"concat_ws('|', {grouping_key_fragments}) as grouping_key"
|
69
|
+
|
70
|
+
def add_closure(kg_archive: str,
|
71
|
+
closure_file: str,
|
72
|
+
nodes_output_file: str,
|
73
|
+
edges_output_file: str,
|
74
|
+
node_fields: List[str] = None,
|
75
|
+
edge_fields: List[str] = ['subject', 'object'],
|
76
|
+
additional_node_constraints: str = None,
|
77
|
+
dry_run: bool = False,
|
78
|
+
evidence_fields: List[str] = None,
|
79
|
+
grouping_fields: List[str] = None
|
80
|
+
):
|
81
|
+
print("Generating closure KG...")
|
82
|
+
print(f"kg_archive: {kg_archive}")
|
83
|
+
print(f"closure_file: {closure_file}")
|
84
|
+
|
85
|
+
db = duckdb.connect(database='monarch-kg.duckdb')
|
86
|
+
|
87
|
+
if edge_fields is None or len(edge_fields) == 0:
|
88
|
+
edge_fields = ['subject', 'object']
|
89
|
+
|
90
|
+
if evidence_fields is None or len(evidence_fields) == 0:
|
91
|
+
evidence_fields = ['has_evidence', 'publications']
|
92
|
+
|
93
|
+
if grouping_fields is None or len(grouping_fields) == 0:
|
94
|
+
grouping_fields = ['subject', 'negated', 'predicate', 'object']
|
95
|
+
|
96
|
+
|
97
|
+
if not dry_run:
|
98
|
+
print(f"fields: {','.join(edge_fields)}")
|
99
|
+
print(f"output_file: {edges_output_file}")
|
100
|
+
|
101
|
+
tar = tarfile.open(f"{kg_archive}")
|
102
|
+
|
103
|
+
print("Loading node table...")
|
104
|
+
node_file_name = [member.name for member in tar.getmembers() if member.name.endswith('_nodes.tsv') ][0]
|
105
|
+
tar.extract(node_file_name,)
|
106
|
+
node_file = f"{node_file_name}"
|
107
|
+
print(f"node_file: {node_file}")
|
108
|
+
|
109
|
+
db.sql(f"""
|
110
|
+
create or replace table nodes as select *, substr(id, 1, instr(id,':') -1) as namespace from read_csv('{node_file_name}', header=True, sep='\t', AUTO_DETECT=TRUE)
|
111
|
+
""")
|
112
|
+
|
113
|
+
edge_file_name = [member.name for member in tar.getmembers() if member.name.endswith('_edges.tsv') ][0]
|
114
|
+
tar.extract(edge_file_name)
|
115
|
+
edge_file = f"{edge_file_name}"
|
116
|
+
print(f"edge_file: {edge_file}")
|
117
|
+
|
118
|
+
db.sql(f"""
|
119
|
+
create or replace table edges as select * from read_csv('{edge_file_name}', header=True, sep='\t', AUTO_DETECT=TRUE)
|
120
|
+
""")
|
121
|
+
|
122
|
+
# Load the relation graph tsv in long format mapping a node to each of it's ancestors
|
123
|
+
db.sql(f"""
|
124
|
+
create or replace table closure as select * from read_csv('{closure_file}', sep='\t', names=['subject_id', 'predicate_id', 'object_id'], AUTO_DETECT=TRUE)
|
125
|
+
""")
|
126
|
+
|
127
|
+
db.sql("""
|
128
|
+
create or replace table closure_id as select subject_id as id, array_agg(object_id) as closure from closure group by subject_id
|
129
|
+
""")
|
130
|
+
|
131
|
+
db.sql("""
|
132
|
+
create or replace table closure_label as select subject_id as id, array_agg(name) as closure_label from closure join nodes on object_id = id
|
133
|
+
group by subject_id
|
134
|
+
""")
|
135
|
+
|
136
|
+
edges_query = f"""
|
137
|
+
create or replace table denormalized_edges as
|
138
|
+
select edges.*,
|
139
|
+
{"".join([edge_columns(field) for field in edge_fields])}
|
140
|
+
{evidence_sum(evidence_fields)}
|
141
|
+
{grouping_key(grouping_fields)}
|
142
|
+
from edges
|
143
|
+
{"".join([edge_joins(field) for field in edge_fields])}
|
144
|
+
"""
|
145
|
+
|
146
|
+
print(edges_query)
|
147
|
+
|
148
|
+
nodes_query = f"""
|
149
|
+
create or replace table denormalized_nodes as
|
150
|
+
select nodes.*,
|
151
|
+
{"".join([node_columns(node_field) for node_field in node_fields])}
|
152
|
+
from nodes
|
153
|
+
{node_joins('has_phenotype')}
|
154
|
+
where {additional_node_constraints}
|
155
|
+
group by nodes.*
|
156
|
+
"""
|
157
|
+
print(nodes_query)
|
158
|
+
|
159
|
+
if not dry_run:
|
160
|
+
db.query(edges_query)
|
161
|
+
db.query(f"""
|
162
|
+
-- write denormalized_edges as tsv
|
163
|
+
copy (select * from denormalized_edges) to '{edges_output_file}' (header, delimiter '\t')
|
164
|
+
""")
|
165
|
+
db.query(nodes_query)
|
166
|
+
db.query(f"""
|
167
|
+
-- write denormalized_nodes as tsv
|
168
|
+
copy (select * from denormalized_nodes) to '{nodes_output_file}' (header, delimiter '\t')
|
169
|
+
""")
|
170
|
+
|
171
|
+
|
172
|
+
# Clean up extracted node & edge files
|
173
|
+
if os.path.exists(f"{node_file}"):
|
174
|
+
os.remove(f"{node_file}")
|
175
|
+
if os.path.exists(f"{edge_file}"):
|
176
|
+
os.remove(f"{edge_file}")
|
@@ -1,14 +1,14 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "closurizer"
|
3
|
-
version = "0.1.3"
|
3
|
+
version = "0.5.1"
|
4
4
|
description = "Add closure expansion fields to kgx files following the Golr pattern"
|
5
5
|
authors = ["Kevin Schaper <kevin@tislab.org>"]
|
6
6
|
|
7
7
|
[tool.poetry.dependencies]
|
8
8
|
python = "^3.8"
|
9
9
|
click = "^8"
|
10
|
-
petl = "^1.7.10"
|
11
10
|
SQLAlchemy = "^1.4.37"
|
11
|
+
duckdb = "^0.9.1"
|
12
12
|
|
13
13
|
[tool.poetry.dev-dependencies]
|
14
14
|
|
@@ -1,21 +0,0 @@
|
|
1
|
-
import click
|
2
|
-
from typing import List
|
3
|
-
from closurizer import add_closure
|
4
|
-
|
5
|
-
|
6
|
-
@click.command()
|
7
|
-
@click.option('--nodes', help='KGX node file to provide closure labels')
|
8
|
-
@click.option('--edges', help='KGX edge file to add closure fields to')
|
9
|
-
@click.option('--closure', help='TSV file of closure triples')
|
10
|
-
@click.option('--output', '-o', help='file write kgx file with closure fields added')
|
11
|
-
@click.option('--fields', multiple=True, help='fields to closurize')
|
12
|
-
def main(nodes: str,
|
13
|
-
edges: str,
|
14
|
-
closure: str,
|
15
|
-
output: str,
|
16
|
-
fields: List[str]):
|
17
|
-
add_closure(node_file=nodes, edge_file=edges, closure_file=closure, fields=fields, output_file=output)
|
18
|
-
|
19
|
-
|
20
|
-
if __name__ == "__main__":
|
21
|
-
main()
|
File without changes
|
@@ -1,124 +0,0 @@
|
|
1
|
-
from typing import List
|
2
|
-
|
3
|
-
import petl as etl
|
4
|
-
import sqlite3
|
5
|
-
import os
|
6
|
-
import tarfile
|
7
|
-
|
8
|
-
|
9
|
-
def _string_agg(key, rows):
|
10
|
-
return [key, "|".join(row[1] for row in rows)]
|
11
|
-
|
12
|
-
|
13
|
-
def add_closure(node_file: str,
|
14
|
-
edge_file: str,
|
15
|
-
kg_archive: str,
|
16
|
-
closure_file: str,
|
17
|
-
path: str,
|
18
|
-
fields: List[str],
|
19
|
-
output_file: str):
|
20
|
-
|
21
|
-
print("Generating closure KG...")
|
22
|
-
print(f"node_file: {node_file}")
|
23
|
-
print(f"edge_file: {edge_file}")
|
24
|
-
print(f"kg_archive: {kg_archive}")
|
25
|
-
print(f"closure_file: {closure_file}")
|
26
|
-
print(f"fields: {','.join(fields)}")
|
27
|
-
print(f"output_file: {output_file}")
|
28
|
-
|
29
|
-
tar = tarfile.open(f"{path}/{kg_archive}")
|
30
|
-
tar.extract(node_file, path=path)
|
31
|
-
tar.extract(edge_file, path=path)
|
32
|
-
|
33
|
-
db = "closurizer.db"
|
34
|
-
|
35
|
-
if os.path.exists(db):
|
36
|
-
os.remove(db)
|
37
|
-
sqlite = sqlite3.connect(db)
|
38
|
-
|
39
|
-
nodes = etl.fromtsv(node_file)
|
40
|
-
etl.todb(nodes, sqlite, "nodes", create=True)
|
41
|
-
|
42
|
-
edges = etl.fromtsv(edge_file)
|
43
|
-
|
44
|
-
for field in fields:
|
45
|
-
edges = etl.addfield(edges, f"{field}_namespace")
|
46
|
-
edges = etl.addfield(edges, f"{field}_category")
|
47
|
-
edges = etl.addfield(edges, f"{field}_closure")
|
48
|
-
edges = etl.addfield(edges, f"{field}_label")
|
49
|
-
edges = etl.addfield(edges, f"{field}_closure_label")
|
50
|
-
|
51
|
-
etl.todb(edges, sqlite, "edges", create=True)
|
52
|
-
|
53
|
-
closure_table = (etl
|
54
|
-
.fromtsv(closure_file)
|
55
|
-
.setheader(['id', 'predicate', 'ancestor'])
|
56
|
-
.cutout('predicate') # assume all predicates for now
|
57
|
-
)
|
58
|
-
|
59
|
-
closure_id_table = etl.rowreduce(closure_table, key='id', reducer=_string_agg, header=['id', 'ancestors'])
|
60
|
-
etl.todb(closure_id_table, sqlite, "closure", create=True)
|
61
|
-
|
62
|
-
closure_label_table = (etl.leftjoin(closure_table,
|
63
|
-
etl.cut(nodes, ["id", "name"]),
|
64
|
-
lkey="ancestor",
|
65
|
-
rkey="id")
|
66
|
-
.cutout("ancestor")
|
67
|
-
.rename("name", "closure_label")
|
68
|
-
.selectnotnone("closure_label")
|
69
|
-
.rowreduce(key='id', reducer=_string_agg, header=['id', 'ancestor_labels']))
|
70
|
-
etl.todb(closure_label_table, sqlite, "closure_label", create=True)
|
71
|
-
|
72
|
-
cur = sqlite.cursor()
|
73
|
-
|
74
|
-
for field in fields:
|
75
|
-
etl.leftjoin(edges, closure_id_table, lkey=f"{field}", rkey="id")
|
76
|
-
|
77
|
-
for field in fields:
|
78
|
-
|
79
|
-
cur.execute(f"""
|
80
|
-
update edges
|
81
|
-
set {field}_namespace = SUBSTR(nodes.id,1,INSTR(nodes.id,':') -1)
|
82
|
-
from nodes
|
83
|
-
where edges.{field} = nodes.id;
|
84
|
-
""")
|
85
|
-
|
86
|
-
cur.execute(f"""
|
87
|
-
update edges
|
88
|
-
set {field}_category = nodes.category
|
89
|
-
from nodes
|
90
|
-
where edges.{field} = nodes.id;
|
91
|
-
""")
|
92
|
-
|
93
|
-
cur.execute(f"""
|
94
|
-
update edges
|
95
|
-
set {field}_closure = ancestors
|
96
|
-
from closure
|
97
|
-
where edges.{field} = closure.id;
|
98
|
-
""")
|
99
|
-
|
100
|
-
cur.execute(f"""
|
101
|
-
update edges
|
102
|
-
set {field}_label = nodes.name
|
103
|
-
from nodes
|
104
|
-
where edges.{field} = nodes.id;
|
105
|
-
""")
|
106
|
-
|
107
|
-
cur.execute(f"""
|
108
|
-
update edges
|
109
|
-
set {field}_closure_label = closure_label.ancestor_labels
|
110
|
-
from closure_label
|
111
|
-
where edges.{field} = closure_label.id;
|
112
|
-
""")
|
113
|
-
|
114
|
-
etl.fromdb(sqlite, 'select * from edges').totsv(f"{path}/{output_file}")
|
115
|
-
|
116
|
-
# Clean up the database
|
117
|
-
if os.path.exists(db):
|
118
|
-
os.remove(db)
|
119
|
-
|
120
|
-
# Clean up extracted node & edge files
|
121
|
-
if os.path.exists(f"{path}/{node_file}"):
|
122
|
-
os.remove(f"{path}/{node_file}")
|
123
|
-
if os.path.exists(f"{path}/{edge_file}"):
|
124
|
-
os.remove(f"{path}/{edge_file}")
|
closurizer-0.1.3/setup.py
DELETED
@@ -1,34 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
from setuptools import setup
|
3
|
-
|
4
|
-
packages = \
|
5
|
-
['closurizer']
|
6
|
-
|
7
|
-
package_data = \
|
8
|
-
{'': ['*']}
|
9
|
-
|
10
|
-
install_requires = \
|
11
|
-
['SQLAlchemy>=1.4.37,<2.0.0', 'click>=8,<9', 'petl>=1.7.10,<2.0.0']
|
12
|
-
|
13
|
-
entry_points = \
|
14
|
-
{'console_scripts': ['closurizer = closurizer.cli:main']}
|
15
|
-
|
16
|
-
setup_kwargs = {
|
17
|
-
'name': 'closurizer',
|
18
|
-
'version': '0.1.3',
|
19
|
-
'description': 'Add closure expansion fields to kgx files following the Golr pattern',
|
20
|
-
'long_description': None,
|
21
|
-
'author': 'Kevin Schaper',
|
22
|
-
'author_email': 'kevin@tislab.org',
|
23
|
-
'maintainer': None,
|
24
|
-
'maintainer_email': None,
|
25
|
-
'url': None,
|
26
|
-
'packages': packages,
|
27
|
-
'package_data': package_data,
|
28
|
-
'install_requires': install_requires,
|
29
|
-
'entry_points': entry_points,
|
30
|
-
'python_requires': '>=3.8,<4.0',
|
31
|
-
}
|
32
|
-
|
33
|
-
|
34
|
-
setup(**setup_kwargs)
|