closurizer 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
closurizer/cli.py CHANGED
@@ -6,17 +6,32 @@ from closurizer.closurizer import add_closure
6
6
  @click.command()
7
7
  @click.option('--kg', required=True, help='KGX tar.gz archive')
8
8
  @click.option('--closure', required=True, help='TSV file of closure triples')
9
- @click.option('--output', '-o', required=True, help='file write kgx file with closure fields added')
10
- @click.option('--fields', multiple=True, help='fields to closurize')
9
+ @click.option('--nodes-output', required=True, help='file write nodes kgx file with closure fields added')
10
+ @click.option('--edges-output', required=True, help='file write edges kgx file with closure fields added')
11
+ @click.option('--additional-node-constraints', required=False,
12
+ help='additional where clause constraints to apply to the generation of the denormalized nodes output')
13
+ @click.option('--edge-fields', multiple=True, help='edge fields to expand with closure IDs, labels, etc')
14
+ @click.option('--node-fields', multiple=True, help='node fields to expand with closure IDs, labels, etc')
11
15
  @click.option('--grouping-fields', multiple=True, help='fields to populate a single value grouping_key field')
12
16
  @click.option('--dry-run', is_flag=True, help='A dry run will not write the output file, but will print the SQL query')
13
17
  def main(kg: str,
14
18
  closure: str,
15
- output: str,
19
+ nodes_output: str,
20
+ edges_output: str,
21
+ additional_node_constraints: str = None,
16
22
  dry_run: bool = False,
17
- fields: List[str] = None,
23
+ edge_fields: List[str] = None,
24
+ node_fields: List[str] = None,
18
25
  grouping_fields: List[str] = None):
19
- add_closure(kg_archive=kg, closure_file=closure, fields=fields, output_file=output, dry_run=dry_run, grouping_fields=grouping_fields)
26
+ add_closure(kg_archive=kg,
27
+ closure_file=closure,
28
+ edge_fields=edge_fields,
29
+ node_fields=node_fields,
30
+ edges_output_file=edges_output,
31
+ nodes_output_file=nodes_output,
32
+ additional_node_constraints=additional_node_constraints,
33
+ dry_run=dry_run,
34
+ grouping_fields=grouping_fields)
20
35
 
21
36
  if __name__ == "__main__":
22
37
  main()
closurizer/closurizer.py CHANGED
@@ -4,13 +4,13 @@ import os
4
4
  import tarfile
5
5
  import duckdb
6
6
 
7
- def columns(field):
7
+ def edge_columns(field):
8
8
  column_text = f"""
9
9
  {field}.name as {field}_label,
10
10
  {field}.category as {field}_category,
11
11
  {field}.namespace as {field}_namespace,
12
- {field}_closure.closure as {field}_closure,
13
- {field}_closure_label.closure_label as {field}_closure_label,
12
+ list_aggregate({field}_closure.closure, 'string_agg', '|') as {field}_closure,
13
+ list_aggregate({field}_closure_label.closure_label,'string_agg', '|') as {field}_closure_label,
14
14
  """
15
15
  if field in ['subject', 'object']:
16
16
  column_text += f"""
@@ -19,7 +19,7 @@ def columns(field):
19
19
  """
20
20
  return column_text
21
21
 
22
- def joins(field):
22
+ def edge_joins(field):
23
23
  return f"""
24
24
  left outer join nodes as {field} on edges.{field} = {field}.id
25
25
  left outer join closure_id as {field}_closure on {field}.id = {field}_closure.id
@@ -31,6 +31,32 @@ def evidence_sum(evidence_fields):
31
31
  evidence_count_sum = "+".join([f"len(split({field}, '|'))" for field in evidence_fields])
32
32
  return f"{evidence_count_sum} as evidence_count,"
33
33
 
34
+
35
+ def node_columns(predicate):
36
+ # strip the biolink predicate, if necessary to get the field name
37
+ field = predicate.replace('biolink:','')
38
+
39
+ return f"""
40
+ string_agg({field}_edges.object, '|') as {field},
41
+ string_agg({field}_edges.object_label, '|') as {field}_label,
42
+ count (distinct {field}_edges.object) as {field}_count,
43
+ list_aggregate(list_distinct(flatten(array_agg({field}_closure.closure))), 'string_agg', '|') as {field}_closure,
44
+ list_aggregate(list_distinct(flatten(array_agg({field}_closure_label.closure_label))), 'string_agg', '|') as {field}_closure_label,
45
+ """
46
+
47
+ def node_joins(predicate):
48
+ # strip the biolink predicate, if necessary to get the field name
49
+ field = predicate.replace('biolink:','')
50
+ return f"""
51
+ left outer join denormalized_edges as {field}_edges
52
+ on nodes.id = {field}_edges.subject
53
+ and {field}_edges.predicate = 'biolink:{field}'
54
+ left outer join closure_id as {field}_closure
55
+ on {field}_edges.object = {field}_closure.id
56
+ left outer join closure_label as {field}_closure_label
57
+ on {field}_edges.object = {field}_closure_label.id
58
+ """
59
+
34
60
  def grouping_key(grouping_fields):
35
61
  fragments = []
36
62
  for field in grouping_fields:
@@ -39,12 +65,15 @@ def grouping_key(grouping_fields):
39
65
  else:
40
66
  fragments.append(field)
41
67
  grouping_key_fragments = ", ".join(fragments)
42
- return f"concat_ws('🍪', {grouping_key_fragments}) as grouping_key"
68
+ return f"concat_ws('|', {grouping_key_fragments}) as grouping_key"
43
69
 
44
70
  def add_closure(kg_archive: str,
45
71
  closure_file: str,
46
- output_file: str,
47
- fields: List[str] = ['subject', 'object'],
72
+ nodes_output_file: str,
73
+ edges_output_file: str,
74
+ node_fields: List[str] = None,
75
+ edge_fields: List[str] = ['subject', 'object'],
76
+ additional_node_constraints: str = None,
48
77
  dry_run: bool = False,
49
78
  evidence_fields: List[str] = None,
50
79
  grouping_fields: List[str] = None
@@ -55,8 +84,8 @@ def add_closure(kg_archive: str,
55
84
 
56
85
  db = duckdb.connect(database='monarch-kg.duckdb')
57
86
 
58
- if fields is None or len(fields) == 0:
59
- fields = ['subject', 'object']
87
+ if edge_fields is None or len(edge_fields) == 0:
88
+ edge_fields = ['subject', 'object']
60
89
 
61
90
  if evidence_fields is None or len(evidence_fields) == 0:
62
91
  evidence_fields = ['has_evidence', 'publications']
@@ -66,8 +95,8 @@ def add_closure(kg_archive: str,
66
95
 
67
96
 
68
97
  if not dry_run:
69
- print(f"fields: {','.join(fields)}")
70
- print(f"output_file: {output_file}")
98
+ print(f"fields: {','.join(edge_fields)}")
99
+ print(f"output_file: {edges_output_file}")
71
100
 
72
101
  tar = tarfile.open(f"{kg_archive}")
73
102
 
@@ -96,32 +125,49 @@ def add_closure(kg_archive: str,
96
125
  """)
97
126
 
98
127
  db.sql("""
99
- create or replace table closure_id as select subject_id as id, string_agg(object_id, '|') as closure from closure group by subject_id
128
+ create or replace table closure_id as select subject_id as id, array_agg(object_id) as closure from closure group by subject_id
100
129
  """)
101
130
 
102
131
  db.sql("""
103
- create or replace table closure_label as select subject_id as id, string_agg(name, '|') as closure_label from closure join nodes on object_id = id
132
+ create or replace table closure_label as select subject_id as id, array_agg(name) as closure_label from closure join nodes on object_id = id
104
133
  group by subject_id
105
134
  """)
106
135
 
107
- query = f"""
136
+ edges_query = f"""
108
137
  create or replace table denormalized_edges as
109
138
  select edges.*,
110
- {"".join([columns(field) for field in fields])}
139
+ {"".join([edge_columns(field) for field in edge_fields])}
111
140
  {evidence_sum(evidence_fields)}
112
141
  {grouping_key(grouping_fields)}
113
142
  from edges
114
- {"".join([joins(field) for field in fields])}
143
+ {"".join([edge_joins(field) for field in edge_fields])}
115
144
  """
116
145
 
117
- print(query)
146
+ print(edges_query)
147
+
148
+ nodes_query = f"""
149
+ create or replace table denormalized_nodes as
150
+ select nodes.*,
151
+ {"".join([node_columns(node_field) for node_field in node_fields])}
152
+ from nodes
153
+ {node_joins('has_phenotype')}
154
+ where {additional_node_constraints}
155
+ group by nodes.*
156
+ """
157
+ print(nodes_query)
118
158
 
119
159
  if not dry_run:
120
- db.query(query)
160
+ db.query(edges_query)
121
161
  db.query(f"""
122
162
  -- write denormalized_edges as tsv
123
- copy (select * from denormalized_edges) to '{output_file}' (header, delimiter '\t')
163
+ copy (select * from denormalized_edges) to '{edges_output_file}' (header, delimiter '\t')
124
164
  """)
165
+ db.query(nodes_query)
166
+ db.query(f"""
167
+ -- write denormalized_nodes as tsv
168
+ copy (select * from denormalized_nodes) to '{nodes_output_file}' (header, delimiter '\t')
169
+ """)
170
+
125
171
 
126
172
  # Clean up extracted node & edge files
127
173
  if os.path.exists(f"{node_file}"):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: closurizer
3
- Version: 0.4.1
3
+ Version: 0.5.0
4
4
  Summary: Add closure expansion fields to kgx files following the Golr pattern
5
5
  Author: Kevin Schaper
6
6
  Author-email: kevin@tislab.org
@@ -0,0 +1,6 @@
1
+ closurizer/cli.py,sha256=AfK0Dy0lSmngUfzxKsT6VuH_YqjUVA1yU1Ko41Yil1w,1827
2
+ closurizer/closurizer.py,sha256=u8Weefop4b6DPZsMc7kVxoASjCP-1pQlZBQWGVhet_8,6767
3
+ closurizer-0.5.0.dist-info/METADATA,sha256=h8kByCLqn7B7KW311FfUAXHBAemimNC-kIKUfSOpCbQ,577
4
+ closurizer-0.5.0.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
5
+ closurizer-0.5.0.dist-info/entry_points.txt,sha256=MnAVu1lgP6DqDb3BZGNzVs2AnDMsp4sThi3ccWbONFo,50
6
+ closurizer-0.5.0.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- closurizer/cli.py,sha256=SdTbdYrl_vNnciYxKmT5__gK1EB_4vPSbEo2tcsKT4E,997
2
- closurizer/closurizer.py,sha256=5S4oOCx5mTLJTyRWyzhyu1t-LhmcsxMb6-KQNTQ2Itc,4787
3
- closurizer-0.4.1.dist-info/METADATA,sha256=UlSwTVtcY5MZT2PvfhkSGh5nvz4WDb4nVD9jNOU_F2U,577
4
- closurizer-0.4.1.dist-info/WHEEL,sha256=d2fvjOD7sXsVzChCqf0Ty0JbHKBaLYwDbGQDwQTnJ50,88
5
- closurizer-0.4.1.dist-info/entry_points.txt,sha256=MnAVu1lgP6DqDb3BZGNzVs2AnDMsp4sThi3ccWbONFo,50
6
- closurizer-0.4.1.dist-info/RECORD,,