closurizer 0.6.0__py3-none-any.whl → 0.7.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- closurizer/cli.py +3 -0
- closurizer/closurizer.py +18 -21
- {closurizer-0.6.0.dist-info → closurizer-0.7.1.dist-info}/METADATA +4 -3
- closurizer-0.7.1.dist-info/RECORD +6 -0
- {closurizer-0.6.0.dist-info → closurizer-0.7.1.dist-info}/WHEEL +1 -1
- closurizer-0.6.0.dist-info/RECORD +0 -6
- {closurizer-0.6.0.dist-info → closurizer-0.7.1.dist-info}/entry_points.txt +0 -0
closurizer/cli.py
CHANGED
@@ -11,6 +11,7 @@ from closurizer.closurizer import add_closure
|
|
11
11
|
@click.option('--additional-node-constraints', required=False,
|
12
12
|
help='additional where clause constraints to apply to the generation of the denormalized nodes output')
|
13
13
|
@click.option('--edge-fields', multiple=True, help='edge fields to expand with closure IDs, labels, etc')
|
14
|
+
@click.option('--edge-fields-to-label', multiple=True, help='edge fields to with category, label, etc but not full closure exansion')
|
14
15
|
@click.option('--node-fields', multiple=True, help='node fields to expand with closure IDs, labels, etc')
|
15
16
|
@click.option('--grouping-fields', multiple=True, help='fields to populate a single value grouping_key field')
|
16
17
|
@click.option('--dry-run', is_flag=True, help='A dry run will not write the output file, but will print the SQL query')
|
@@ -21,11 +22,13 @@ def main(kg: str,
|
|
21
22
|
additional_node_constraints: str = None,
|
22
23
|
dry_run: bool = False,
|
23
24
|
edge_fields: List[str] = None,
|
25
|
+
edge_fields_to_label: List[str] = None,
|
24
26
|
node_fields: List[str] = None,
|
25
27
|
grouping_fields: List[str] = None):
|
26
28
|
add_closure(kg_archive=kg,
|
27
29
|
closure_file=closure,
|
28
30
|
edge_fields=edge_fields,
|
31
|
+
edge_fields_to_label=edge_fields_to_label,
|
29
32
|
node_fields=node_fields,
|
30
33
|
edges_output_file=edges_output,
|
31
34
|
nodes_output_file=nodes_output,
|
closurizer/closurizer.py
CHANGED
@@ -1,17 +1,21 @@
|
|
1
|
-
from typing import List
|
1
|
+
from typing import List, Optional
|
2
2
|
|
3
3
|
import os
|
4
4
|
import tarfile
|
5
5
|
import duckdb
|
6
6
|
|
7
|
-
def edge_columns(field):
|
7
|
+
def edge_columns(field: str, include_closure_fields: bool =True):
|
8
8
|
column_text = f"""
|
9
9
|
{field}.name as {field}_label,
|
10
10
|
{field}.category as {field}_category,
|
11
|
-
{field}.namespace as {field}_namespace,
|
12
|
-
list_aggregate({field}_closure.closure, 'string_agg', '|') as {field}_closure,
|
13
|
-
list_aggregate({field}_closure_label.closure_label,'string_agg', '|') as {field}_closure_label,
|
11
|
+
{field}.namespace as {field}_namespace,
|
14
12
|
"""
|
13
|
+
if include_closure_fields:
|
14
|
+
column_text += f"""
|
15
|
+
{field}_closure.closure as {field}_closure,
|
16
|
+
{field}_closure_label.closure_label as {field}_closure_label,
|
17
|
+
"""
|
18
|
+
|
15
19
|
if field in ['subject', 'object']:
|
16
20
|
column_text += f"""
|
17
21
|
{field}.in_taxon as {field}_taxon,
|
@@ -19,14 +23,14 @@ def edge_columns(field):
|
|
19
23
|
"""
|
20
24
|
return column_text
|
21
25
|
|
22
|
-
def edge_joins(field):
|
26
|
+
def edge_joins(field: str, include_closure_joins: bool =True):
|
23
27
|
return f"""
|
24
28
|
left outer join nodes as {field} on edges.{field} = {field}.id
|
25
29
|
left outer join closure_id as {field}_closure on {field}.id = {field}_closure.id
|
26
30
|
left outer join closure_label as {field}_closure_label on {field}.id = {field}_closure_label.id
|
27
31
|
"""
|
28
32
|
|
29
|
-
def evidence_sum(evidence_fields):
|
33
|
+
def evidence_sum(evidence_fields: List[str]):
|
30
34
|
""" Sum together the length of each field after splitting on | """
|
31
35
|
evidence_count_sum = "+".join([f"ifnull(len(split({field}, '|')),0)" for field in evidence_fields])
|
32
36
|
return f"{evidence_count_sum} as evidence_count,"
|
@@ -73,12 +77,13 @@ def add_closure(kg_archive: str,
|
|
73
77
|
closure_file: str,
|
74
78
|
nodes_output_file: str,
|
75
79
|
edges_output_file: str,
|
76
|
-
node_fields: List[str] =
|
80
|
+
node_fields: List[str] = [],
|
77
81
|
edge_fields: List[str] = ['subject', 'object'],
|
78
|
-
|
82
|
+
edge_fields_to_label: List[str] = [],
|
83
|
+
additional_node_constraints: Optional[str] = None,
|
79
84
|
dry_run: bool = False,
|
80
|
-
evidence_fields: List[str] =
|
81
|
-
grouping_fields: List[str] =
|
85
|
+
evidence_fields: List[str] = ['has_evidence', 'publications'],
|
86
|
+
grouping_fields: List[str] = ['subject', 'negated', 'predicate', 'object']
|
82
87
|
):
|
83
88
|
print("Generating closure KG...")
|
84
89
|
print(f"kg_archive: {kg_archive}")
|
@@ -86,16 +91,6 @@ def add_closure(kg_archive: str,
|
|
86
91
|
|
87
92
|
db = duckdb.connect(database='monarch-kg.duckdb')
|
88
93
|
|
89
|
-
if edge_fields is None or len(edge_fields) == 0:
|
90
|
-
edge_fields = ['subject', 'object']
|
91
|
-
|
92
|
-
if evidence_fields is None or len(evidence_fields) == 0:
|
93
|
-
evidence_fields = ['has_evidence', 'publications']
|
94
|
-
|
95
|
-
if grouping_fields is None or len(grouping_fields) == 0:
|
96
|
-
grouping_fields = ['subject', 'negated', 'predicate', 'object']
|
97
|
-
|
98
|
-
|
99
94
|
if not dry_run:
|
100
95
|
print(f"fields: {','.join(edge_fields)}")
|
101
96
|
print(f"output_file: {edges_output_file}")
|
@@ -139,10 +134,12 @@ def add_closure(kg_archive: str,
|
|
139
134
|
create or replace table denormalized_edges as
|
140
135
|
select edges.*,
|
141
136
|
{"".join([edge_columns(field) for field in edge_fields])}
|
137
|
+
{"".join([edge_columns(field, include_closure_fields=False) for field in edge_fields_to_label])}
|
142
138
|
{evidence_sum(evidence_fields)}
|
143
139
|
{grouping_key(grouping_fields)}
|
144
140
|
from edges
|
145
141
|
{"".join([edge_joins(field) for field in edge_fields])}
|
142
|
+
{"".join([edge_joins(field, include_closure_joins=False) for field in edge_fields_to_label])}
|
146
143
|
"""
|
147
144
|
|
148
145
|
print(edges_query)
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.3
|
2
2
|
Name: closurizer
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.7.1
|
4
4
|
Summary: Add closure expansion fields to kgx files following the Golr pattern
|
5
5
|
Author: Kevin Schaper
|
6
6
|
Author-email: kevin@tislab.org
|
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
11
11
|
Classifier: Programming Language :: Python :: 3.10
|
12
12
|
Classifier: Programming Language :: Python :: 3.11
|
13
13
|
Classifier: Programming Language :: Python :: 3.12
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
14
15
|
Requires-Dist: SQLAlchemy (>=1.4.37,<2.0.0)
|
15
16
|
Requires-Dist: click (>=8,<9)
|
16
|
-
Requires-Dist: duckdb
|
17
|
+
Requires-Dist: duckdb
|
@@ -0,0 +1,6 @@
|
|
1
|
+
closurizer/cli.py,sha256=xTFscsxGDnaKoTNhn1FefRPPeldI5tZvvp3DygNai7Y,2069
|
2
|
+
closurizer/closurizer.py,sha256=DY3NnSaCOf3XYF_NFqS_TB3o9WZVnBviXlvI0saXlxg,7000
|
3
|
+
closurizer-0.7.1.dist-info/METADATA,sha256=S5ItMFYNZQZWBfF31M_fP9pS9LgNCFl_1QGbHU5cLKk,661
|
4
|
+
closurizer-0.7.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
5
|
+
closurizer-0.7.1.dist-info/entry_points.txt,sha256=MnAVu1lgP6DqDb3BZGNzVs2AnDMsp4sThi3ccWbONFo,50
|
6
|
+
closurizer-0.7.1.dist-info/RECORD,,
|
@@ -1,6 +0,0 @@
|
|
1
|
-
closurizer/cli.py,sha256=AfK0Dy0lSmngUfzxKsT6VuH_YqjUVA1yU1Ko41Yil1w,1827
|
2
|
-
closurizer/closurizer.py,sha256=MMFeYqmgKB6Pr9Eh2jWeZt4mnDS635oGg8rl0ywJRvE,6902
|
3
|
-
closurizer-0.6.0.dist-info/METADATA,sha256=IALfF2731qv6hyIPRPiwYESfFNPshLmMnQ3PPEVo9eQ,629
|
4
|
-
closurizer-0.6.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
5
|
-
closurizer-0.6.0.dist-info/entry_points.txt,sha256=MnAVu1lgP6DqDb3BZGNzVs2AnDMsp4sThi3ccWbONFo,50
|
6
|
-
closurizer-0.6.0.dist-info/RECORD,,
|
File without changes
|