nmdc-runtime 1.3.1__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -0
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +30 -4
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +1331 -0
- nmdc_runtime/site/export/ncbi_xml_utils.py +405 -0
- nmdc_runtime/site/export/study_metadata.py +27 -4
- nmdc_runtime/site/graphs.py +294 -45
- nmdc_runtime/site/ops.py +1008 -230
- nmdc_runtime/site/repair/database_updater.py +451 -0
- nmdc_runtime/site/repository.py +368 -133
- nmdc_runtime/site/resources.py +154 -80
- nmdc_runtime/site/translation/gold_translator.py +235 -83
- nmdc_runtime/site/translation/neon_benthic_translator.py +212 -188
- nmdc_runtime/site/translation/neon_soil_translator.py +82 -58
- nmdc_runtime/site/translation/neon_surface_water_translator.py +698 -0
- nmdc_runtime/site/translation/neon_utils.py +24 -7
- nmdc_runtime/site/translation/submission_portal_translator.py +616 -162
- nmdc_runtime/site/translation/translator.py +73 -3
- nmdc_runtime/site/util.py +26 -7
- nmdc_runtime/site/validation/emsl.py +1 -0
- nmdc_runtime/site/validation/gold.py +1 -0
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +236 -192
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -1
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/terminusdb/generate.py +0 -198
- nmdc_runtime/site/terminusdb/ingest.py +0 -44
- nmdc_runtime/site/terminusdb/schema.py +0 -1671
- nmdc_runtime/site/translation/emsl.py +0 -42
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -31
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -42
- nmdc_runtime-1.3.1.dist-info/METADATA +0 -181
- nmdc_runtime-1.3.1.dist-info/RECORD +0 -81
- nmdc_runtime-1.3.1.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- /nmdc_runtime/site/{terminusdb → repair}/__init__.py +0 -0
- {nmdc_runtime-1.3.1.dist-info → nmdc_runtime-2.12.0.dist-info/licenses}/LICENSE +0 -0
|
@@ -0,0 +1,1331 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import datetime
|
|
4
|
+
import xml.etree.ElementTree as ET
|
|
5
|
+
import xml.dom.minidom
|
|
6
|
+
|
|
7
|
+
from typing import Any, List
|
|
8
|
+
from urllib.parse import urlparse
|
|
9
|
+
from unidecode import unidecode
|
|
10
|
+
from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
11
|
+
handle_controlled_identified_term_value,
|
|
12
|
+
handle_controlled_term_value,
|
|
13
|
+
handle_geolocation_value,
|
|
14
|
+
handle_quantity_value,
|
|
15
|
+
handle_text_value,
|
|
16
|
+
handle_timestamp_value,
|
|
17
|
+
handle_float_value,
|
|
18
|
+
handle_string_value,
|
|
19
|
+
load_mappings,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class NCBISubmissionXML:
|
|
24
|
+
def __init__(self, nmdc_study: Any, ncbi_submission_metadata: dict):
    """Initialize an NCBI Submission XML builder for a single NMDC study.

    Captures study-level metadata (title, description, PI contact), the
    study's NCBI BioProject accession, and per-run submission metadata, and
    prepares the <Submission> XML root that subsequent ``set_*`` methods
    append to.

    :param nmdc_study: NMDC Study document (dict-like) providing "id",
        "title", "description", "insdc_bioproject_identifiers", and
        "principal_investigator".
    :param ncbi_submission_metadata: dict providing
        "nmdc_ncbi_attribute_mapping_file_url" plus optional
        "ncbi_submission_metadata" and "ncbi_biosample_metadata" sub-dicts.
    """
    # Root of the submission document; all Actions/Description blocks hang here.
    self.root = ET.Element("Submission")

    self.nmdc_study_id = nmdc_study.get("id")
    self.nmdc_study_title = nmdc_study.get("title")
    self.nmdc_study_description = nmdc_study.get("description")
    # get the first INSDC BioProject ID from the NMDC study
    # NOTE(review): assumes "insdc_bioproject_identifiers" is present and
    # non-empty — a missing key yields None[0] (TypeError). Confirm callers
    # only pass studies that have a BioProject.
    self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")[0]
    # the value asserted in "insdc_bioproject_identifiers" will be a CURIE, so extract
    # everything after the prefix and delimiter (":")
    self.ncbi_bioproject_id = self.ncbi_bioproject_id.split(":")[-1]
    self.nmdc_pi_email = nmdc_study.get("principal_investigator", {}).get("email")
    # NOTE(review): assumes the PI "name" is present — .get("name").split()
    # raises AttributeError on None. Only the first two whitespace-separated
    # tokens are used (first/last); middle names are dropped.
    nmdc_study_pi_name = (
        nmdc_study.get("principal_investigator", {}).get("name").split()
    )
    self.first_name = nmdc_study_pi_name[0]
    self.last_name = nmdc_study_pi_name[1] if len(nmdc_study_pi_name) > 1 else None

    # URL of the NMDC-slot -> NCBI-attribute mapping TSV consumed by load_mappings().
    self.nmdc_ncbi_attribute_mapping_file_url = ncbi_submission_metadata.get(
        "nmdc_ncbi_attribute_mapping_file_url"
    )
    self.ncbi_submission_metadata = ncbi_submission_metadata.get(
        "ncbi_submission_metadata", {}
    )
    self.ncbi_biosample_metadata = ncbi_submission_metadata.get(
        "ncbi_biosample_metadata", {}
    )

    # dispatcher dictionary capturing handlers for NMDC object to NCBI flat Attribute
    # type handlers
    self.type_handlers = {
        "QuantityValue": handle_quantity_value,
        "TextValue": handle_text_value,
        "TimestampValue": handle_timestamp_value,
        "ControlledTermValue": handle_controlled_term_value,
        "ControlledIdentifiedTermValue": handle_controlled_identified_term_value,
        "GeolocationValue": handle_geolocation_value,
        "float": handle_float_value,
        "string": handle_string_value,
    }
|
|
64
|
+
|
|
65
|
+
def set_element(self, tag, text="", attrib=None, children=None):
    """Build and return an ElementTree element.

    :param tag: element tag name.
    :param text: optional text content.
    :param attrib: optional attribute dict (a falsy value means no attributes).
    :param children: optional iterable of child elements to attach, in order.
    :return: the newly created ``xml.etree.ElementTree.Element``.
    """
    node = ET.Element(tag, attrib=attrib or {})
    node.text = text
    if children:
        node.extend(children)
    return node
|
|
73
|
+
|
|
74
|
+
def set_description(self, email, first, last, org, date=None):
    """Append the submission-level <Description> block to the root.

    The block carries a study comment, the owning organization with its
    contact person, and a <Hold> release date (defaults to today).

    :param email: contact email for the submitting organization.
    :param first: contact first name.
    :param last: contact last name.
    :param org: submitting organization name.
    :param date: optional release date string (YYYY-MM-DD); defaults to today.
    """
    release_date = date or datetime.datetime.now().strftime("%Y-%m-%d")

    contact_name = self.set_element(
        "Name",
        children=[
            self.set_element("First", first),
            self.set_element("Last", last),
        ],
    )
    contact = self.set_element(
        "Contact", attrib={"email": email}, children=[contact_name]
    )
    organization = self.set_element(
        "Organization",
        attrib={"role": "owner", "type": "center"},
        children=[self.set_element("Name", org), contact],
    )
    comment = self.set_element("Comment", f"NMDC Submission for {self.nmdc_study_id}")
    hold = self.set_element("Hold", attrib={"release_date": release_date})

    self.root.append(
        self.set_element("Description", children=[comment, organization, hold])
    )
|
|
106
|
+
|
|
107
|
+
def set_descriptor(self, title, description):
    """Return the [<Title>, <Description><p>…</p></Description>] element pair
    used inside a BioProject <Project> block.

    :param title: project title text.
    :param description: project description text (wrapped in a <p> child).
    :return: list of two elements, Title first.
    """
    paragraph = self.set_element("p", description)
    return [
        self.set_element("Title", title),
        self.set_element("Description", children=[paragraph]),
    ]
|
|
117
|
+
|
|
118
|
+
def set_bioproject(self, title, project_id, description, data_type, org):
    """Append an <Action>/<AddData target_db="BioProject"> block for the study.

    Builds the nested Project XML (SPUID, title/description, project type)
    plus the submission-level <Identifier>, and attaches the whole Action to
    the submission root.

    :param title: BioProject title.
    :param project_id: submitter project ID used as the SPUID value.
    :param description: BioProject description text.
    :param data_type: intended data type (e.g. "metagenome").
    :param org: SPUID namespace (submitting organization).
    """
    project_id_element = self.set_element(
        "ProjectID",
        children=[self.set_element("SPUID", project_id, {"spuid_namespace": org})],
    )

    # "sample_scope" is an enumeration field. Docs: https://www.ncbi.nlm.nih.gov/data_specs/schema/other/bioproject/Core.xsd
    # scope is "eEnvironment" when "Content of species in a sample is not known, i.e. microbiome, metagenome, etc."
    project_type = self.set_element(
        "ProjectType",
        children=[
            self.set_element(
                "ProjectTypeSubmission",
                attrib={"sample_scope": "eEnvironment"},
                children=[
                    self.set_element(
                        "IntendedDataTypeSet",
                        children=[self.set_element("DataType", data_type)],
                    )
                ],
            )
        ],
    )

    project = self.set_element(
        "Project",
        attrib={"schema_version": "2.0"},
        children=[project_id_element]
        + self.set_descriptor(title, description)
        + [project_type],
    )

    add_data = self.set_element(
        "AddData",
        attrib={"target_db": "BioProject"},
        children=[
            self.set_element(
                "Data",
                attrib={"content_type": "XML"},
                children=[self.set_element("XmlContent", children=[project])],
            ),
            self.set_element(
                "Identifier",
                children=[
                    self.set_element("SPUID", project_id, {"spuid_namespace": org})
                ],
            ),
        ],
    )

    self.root.append(self.set_element("Action", children=[add_data]))
|
|
159
|
+
|
|
160
|
+
def set_biosample(
    self,
    organism_name,
    org,
    bioproject_id,
    nmdc_biosamples,
    pooled_biosamples_data=None,
):
    """Append one BioSample <Action> block per biosample to the submission.

    Biosamples that belong to a pooling process (per ``pooled_biosamples_data``)
    are grouped and delegated to ``_create_pooled_biosample_action`` (one
    <Action> per pooling process); all remaining biosamples each get their own
    <Action>, with NMDC slot values translated to NCBI attribute names/values
    via the mappings loaded from ``self.nmdc_ncbi_attribute_mapping_file_url``.

    :param organism_name: organism/metagenome name for <OrganismName> and
        fallback titles.
    :param org: SPUID namespace (submitting organization).
    :param bioproject_id: NCBI BioProject accession each sample links to.
    :param nmdc_biosamples: list of NMDC Biosample dicts.
    :param pooled_biosamples_data: optional dict keyed by biosample id with
        pooling metadata (pooling_process_id, processed_sample_id,
        aggregated_* values, pooled_biosample_ids).
    """
    attribute_mappings, slot_range_mappings = load_mappings(
        self.nmdc_ncbi_attribute_mapping_file_url
    )

    # Use provided pooling data or empty dict
    pooling_data = pooled_biosamples_data or {}

    # Group biosamples by pooling process
    pooling_groups = {}
    individual_biosamples = []

    for biosample in nmdc_biosamples:
        pooling_info = pooling_data.get(biosample["id"], {})
        if pooling_info and pooling_info.get("pooling_process_id"):
            pooling_process_id = pooling_info["pooling_process_id"]
            if pooling_process_id not in pooling_groups:
                pooling_groups[pooling_process_id] = {
                    "biosamples": [],
                    "pooling_info": pooling_info,
                }
            pooling_groups[pooling_process_id]["biosamples"].append(biosample)
        else:
            individual_biosamples.append(biosample)

    # Process pooled sample groups - create one <Action> block per pooling process
    for pooling_process_id, group_data in pooling_groups.items():
        self._create_pooled_biosample_action(
            group_data["biosamples"],
            group_data["pooling_info"],
            organism_name,
            org,
            bioproject_id,
            attribute_mappings,
            slot_range_mappings,
        )

    # Process individual biosamples
    for biosample in individual_biosamples:
        attributes = {}          # NCBI attribute name -> formatted value
        sample_id_value = None   # SPUID used for this sample's identifiers
        env_package = None       # NCBI <Package> value, derived from "env_package"

        # Get pooling info for this specific biosample
        pooling_info = pooling_data.get(biosample["id"], {})

        for json_key, value in biosample.items():
            if isinstance(value, list):
                # Multi-valued slot: format each item and join with "|".
                for item in value:
                    if json_key not in attribute_mappings:
                        continue

                    xml_key = attribute_mappings[json_key]
                    value_type = slot_range_mappings.get(json_key, "string")
                    handler = self.type_handlers.get(
                        value_type, handle_string_value
                    )

                    # Special handling for "elev" key
                    # NOTE(review): in this branch ``value`` is a list, so
                    # float(value) would raise TypeError — this path looks
                    # unreachable for a list-valued "elev"; confirm.
                    if json_key == "elev":
                        value = f"{float(value)} m"  # Convert to float if possible
                        attributes[xml_key] = value
                        continue  # Skip applying the handler to this key

                    formatted_value = handler(item)

                    # Combine multiple values with a separator for list elements
                    if xml_key in attributes:
                        attributes[xml_key] += f"| {formatted_value}"
                    else:
                        attributes[xml_key] = formatted_value
                continue

            if json_key == "env_package":
                env_package = f"MIMS.me.{handle_text_value(value)}.6.0"

            # Special handling for NMDC Biosample "id"
            if json_key == "id":
                # Use ProcessedSample ID if this is a pooled sample, otherwise use biosample ID
                if pooling_info and pooling_info.get("processed_sample_id"):
                    sample_id_value = pooling_info["processed_sample_id"]
                else:
                    sample_id_value = value
                continue

            if json_key not in attribute_mappings:
                continue

            xml_key = attribute_mappings[json_key]
            value_type = slot_range_mappings.get(json_key, "string")
            handler = self.type_handlers.get(value_type, handle_string_value)

            # Special handling for "elev" key
            if json_key == "elev":
                value = f"{float(value)} m"  # Convert to float if possible
                attributes[xml_key] = value
                continue  # Skip applying the handler to this key

            # Special handling for "host_taxid": extract the numeric taxon id
            # from the CURIE in term.id (e.g. "NCBITaxon:9606" -> "9606").
            if json_key == "host_taxid" and isinstance(value, dict):
                if "term" in value and "id" in value["term"]:
                    value = re.findall(r"\d+", value["term"]["id"].split(":")[1])[0]
                attributes[xml_key] = value
                continue  # Skip applying the handler to this key

            # Special handling for "geo_loc_name" - convert unicode to closest ASCII characters
            if json_key == "geo_loc_name":
                formatted_value = handler(value)
                formatted_value_ascii = unidecode(formatted_value)
                attributes[xml_key] = formatted_value_ascii
                continue  # Skip applying the handler to this key

            # Default processing for other keys
            formatted_value = handler(value)
            attributes[xml_key] = formatted_value

        # Override with aggregated values for pooled samples
        if pooling_info:
            if pooling_info.get("aggregated_collection_date"):
                # Find the mapping for collection_date
                collection_date_key = attribute_mappings.get(
                    "collection_date", "collection_date"
                )
                attributes[collection_date_key] = pooling_info[
                    "aggregated_collection_date"
                ]

            if pooling_info.get("aggregated_depth"):
                # Find the mapping for depth
                depth_key = attribute_mappings.get("depth", "depth")
                attributes[depth_key] = pooling_info["aggregated_depth"]

            # Add samp_pooling attribute with semicolon-delimited biosample IDs
            if pooling_info.get("pooled_biosample_ids"):
                attributes["samp_pooling"] = ";".join(
                    pooling_info["pooled_biosample_ids"]
                )

        # Assemble the <BioSample> children: SampleId, Descriptor (title +
        # external links), Organism, BioProject link, Package, Attributes.
        biosample_elements = [
            self.set_element(
                "SampleId",
                children=[
                    self.set_element(
                        "SPUID", sample_id_value, {"spuid_namespace": org}
                    )
                ],
            ),
            self.set_element(
                "Descriptor",
                children=[
                    self.set_element(
                        "Title",
                        attributes.get(
                            "name",
                            # fallback title if "name" is not present
                            f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
                        ),
                    ),
                ]
                + (
                    # Add external links for pooled samples
                    [
                        self.set_element(
                            "ExternalLink",
                            attrib={"label": "NMDC Processed Sample"},
                            children=[
                                self.set_element(
                                    "URL",
                                    f"https://bioregistry.io/{pooling_info['processed_sample_id']}",
                                )
                            ],
                        ),
                        self.set_element(
                            "ExternalLink",
                            attrib={"label": "NMDC Pooling Process"},
                            children=[
                                self.set_element(
                                    "URL",
                                    f"https://bioregistry.io/{pooling_info['pooling_process_id']}",
                                )
                            ],
                        ),
                    ]
                    if pooling_info
                    and pooling_info.get("processed_sample_id")
                    and pooling_info.get("pooling_process_id")
                    else [
                        # Add external link for individual biosamples
                        self.set_element(
                            "ExternalLink",
                            attrib={"label": sample_id_value},
                            children=[
                                self.set_element(
                                    "URL",
                                    f"https://bioregistry.io/{sample_id_value}",
                                )
                            ],
                        ),
                    ]
                ),
            ),
            self.set_element(
                "Organism",
                children=[self.set_element("OrganismName", organism_name)],
            ),
            self.set_element(
                "BioProject",
                children=[
                    self.set_element(
                        "PrimaryId", bioproject_id, {"db": "BioProject"}
                    )
                ],
            ),
            self.set_element("Package", env_package),
            self.set_element(
                "Attributes",
                children=[
                    self.set_element(
                        "Attribute", attributes[key], {"attribute_name": key}
                    )
                    for key in sorted(attributes)
                ]
                + [
                    self.set_element(
                        "Attribute",
                        "National Microbiome Data Collaborative",
                        {"attribute_name": "broker name"},
                    )
                ],
            ),
        ]

        # Wrap the BioSample in the standard AddData/Identifier Action shell.
        action = self.set_element(
            "Action",
            children=[
                self.set_element(
                    "AddData",
                    attrib={"target_db": "BioSample"},
                    children=[
                        self.set_element(
                            "Data",
                            attrib={"content_type": "XML"},
                            children=[
                                self.set_element(
                                    "XmlContent",
                                    children=[
                                        self.set_element(
                                            "BioSample",
                                            attrib={"schema_version": "2.0"},
                                            children=biosample_elements,
                                        ),
                                    ],
                                ),
                            ],
                        ),
                        self.set_element(
                            "Identifier",
                            children=[
                                self.set_element(
                                    "SPUID",
                                    sample_id_value,
                                    {"spuid_namespace": org},
                                ),
                            ],
                        ),
                    ],
                ),
            ],
        )
        self.root.append(action)
|
|
437
|
+
|
|
438
|
+
def _create_pooled_biosample_action(
    self,
    biosamples,
    pooling_info,
    organism_name,
    org,
    bioproject_id,
    attribute_mappings,
    slot_range_mappings,
):
    """Append a single BioSample <Action> representing one pooled sample.

    Aggregates attributes across all constituent biosamples (first value wins
    per attribute), applies pooled overrides (aggregated collection date/depth,
    samp_pooling), restricts the emitted attributes to a fixed allow-list, and
    builds a BioSample keyed by the pooling process's ProcessedSample ID.
    Returns early (emitting nothing) if no processed_sample_id is available.

    :param biosamples: constituent NMDC Biosample dicts for this pool.
    :param pooling_info: pooling metadata (processed_sample_id,
        pooling_process_id, aggregated_* values, pooled_biosample_ids).
    :param organism_name: organism/metagenome name for <OrganismName>.
    :param org: SPUID namespace (submitting organization).
    :param bioproject_id: NCBI BioProject accession to link to.
    :param attribute_mappings: NMDC slot name -> NCBI attribute name map.
    :param slot_range_mappings: NMDC slot name -> value-type name map.
    """
    # Use the processed sample ID as the primary identifier
    sample_id_value = pooling_info.get("processed_sample_id")
    if not sample_id_value:
        return

    # Aggregate attributes from all biosamples in the pool
    aggregated_attributes = {}
    env_package = None

    # Get title from the first biosample or use processed sample name
    title = pooling_info.get(
        "processed_sample_name", f"Pooled sample {sample_id_value}"
    )

    # Process each biosample to collect and aggregate attributes
    for biosample in biosamples:
        for json_key, value in biosample.items():
            if json_key == "id":
                continue

            if json_key == "env_package":
                env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
                continue

            if isinstance(value, list):
                for item in value:
                    if json_key not in attribute_mappings:
                        continue

                    xml_key = attribute_mappings[json_key]
                    value_type = slot_range_mappings.get(json_key, "string")
                    handler = self.type_handlers.get(
                        value_type, handle_string_value
                    )

                    # Special handling for "elev" key
                    # NOTE(review): ``value`` is a list here, so float(value)
                    # would raise — likely unreachable for list-valued "elev";
                    # confirm (same pattern as set_biosample).
                    if json_key == "elev":
                        value = f"{float(value)} m"
                        aggregated_attributes[xml_key] = value
                        continue

                    # Special handling for "host_taxid"
                    # NOTE(review): isinstance(value, dict) cannot hold inside
                    # this list branch (value is a list) — checks ``value``
                    # rather than ``item``; verify intent.
                    if json_key == "host_taxid" and isinstance(value, dict):
                        if "term" in value and "id" in value["term"]:
                            value = re.findall(
                                r"\d+", value["term"]["id"].split(":")[1]
                            )[0]
                        aggregated_attributes[xml_key] = value
                        continue

                    formatted_value = handler(item)

                    # For pooled samples, we typically want the first value or aggregate appropriately
                    if xml_key not in aggregated_attributes:
                        aggregated_attributes[xml_key] = formatted_value
                continue

            if json_key not in attribute_mappings:
                continue

            xml_key = attribute_mappings[json_key]
            value_type = slot_range_mappings.get(json_key, "string")
            handler = self.type_handlers.get(value_type, handle_string_value)

            # Special handling for "elev" key
            if json_key == "elev":
                value = f"{float(value)} m"
                aggregated_attributes[xml_key] = value
                continue

            # Special handling for "host_taxid": extract the numeric taxon id
            # from the CURIE in term.id.
            if json_key == "host_taxid" and isinstance(value, dict):
                if "term" in value and "id" in value["term"]:
                    value = re.findall(r"\d+", value["term"]["id"].split(":")[1])[0]
                aggregated_attributes[xml_key] = value
                continue

            formatted_value = handler(value)

            # For pooled samples, we typically want the first value or aggregate appropriately
            if xml_key not in aggregated_attributes:
                aggregated_attributes[xml_key] = formatted_value

    # Override with aggregated values for pooled samples
    if pooling_info.get("aggregated_collection_date"):
        collection_date_key = attribute_mappings.get(
            "collection_date", "collection_date"
        )
        aggregated_attributes[collection_date_key] = pooling_info[
            "aggregated_collection_date"
        ]

    if pooling_info.get("aggregated_depth"):
        depth_key = attribute_mappings.get("depth", "depth")
        aggregated_attributes[depth_key] = pooling_info["aggregated_depth"]

    # Add samp_pooling attribute with semicolon-delimited biosample IDs
    if pooling_info.get("pooled_biosample_ids"):
        aggregated_attributes["samp_pooling"] = ";".join(
            pooling_info["pooled_biosample_ids"]
        )

    # Filter attributes to only include the ones from neon_soil_example.xml for pooled samples
    # NOTE(review): allow-list entries look like NMDC slot names, but
    # aggregated_attributes is keyed by the MAPPED NCBI attribute names
    # (xml_key). If the mapping file renames any of these slots, their values
    # are silently dropped here — verify against the mapping TSV.
    allowed_attributes = {
        "collection_date",
        "depth",
        "elev",
        "geo_loc_name",
        "lat_lon",
        "env_broad_scale",
        "env_local_scale",
        "env_medium",
        "samp_pooling",
    }
    filtered_attributes = {
        k: v for k, v in aggregated_attributes.items() if k in allowed_attributes
    }

    # Assemble the pooled <BioSample> children; Descriptor links back to the
    # processed sample, the pooling process, and every constituent biosample.
    biosample_elements = [
        self.set_element(
            "SampleId",
            children=[
                self.set_element("SPUID", sample_id_value, {"spuid_namespace": org})
            ],
        ),
        self.set_element(
            "Descriptor",
            children=[
                self.set_element("Title", title),
                self.set_element(
                    "ExternalLink",
                    attrib={"label": sample_id_value},
                    children=[
                        self.set_element(
                            "URL",
                            f"https://bioregistry.io/{sample_id_value}",
                        )
                    ],
                ),
                self.set_element(
                    "ExternalLink",
                    attrib={"label": pooling_info["pooling_process_id"]},
                    children=[
                        self.set_element(
                            "URL",
                            f"https://bioregistry.io/{pooling_info['pooling_process_id']}",
                        )
                    ],
                ),
            ]
            + [
                self.set_element(
                    "ExternalLink",
                    attrib={"label": biosample_id},
                    children=[
                        self.set_element(
                            "URL",
                            f"https://bioregistry.io/{biosample_id}",
                        )
                    ],
                )
                for biosample_id in pooling_info.get("pooled_biosample_ids", [])
            ],
        ),
        self.set_element(
            "Organism",
            children=[self.set_element("OrganismName", organism_name)],
        ),
        self.set_element(
            "BioProject",
            children=[
                self.set_element("PrimaryId", bioproject_id, {"db": "BioProject"})
            ],
        ),
        self.set_element("Package", env_package),
        self.set_element(
            "Attributes",
            children=[
                self.set_element(
                    "Attribute", filtered_attributes[key], {"attribute_name": key}
                )
                for key in sorted(filtered_attributes)
            ]
            + [
                self.set_element(
                    "Attribute",
                    "National Microbiome Data Collaborative",
                    {"attribute_name": "broker name"},
                )
            ],
        ),
    ]

    # Wrap the BioSample in the standard AddData/Identifier Action shell.
    action = self.set_element(
        "Action",
        children=[
            self.set_element(
                "AddData",
                attrib={"target_db": "BioSample"},
                children=[
                    self.set_element(
                        "Data",
                        attrib={"content_type": "XML"},
                        children=[
                            self.set_element(
                                "XmlContent",
                                children=[
                                    self.set_element(
                                        "BioSample",
                                        attrib={"schema_version": "2.0"},
                                        children=biosample_elements,
                                    ),
                                ],
                            ),
                        ],
                    ),
                    self.set_element(
                        "Identifier",
                        children=[
                            self.set_element(
                                "SPUID",
                                sample_id_value,
                                {"spuid_namespace": org},
                            ),
                        ],
                    ),
                ],
            ),
        ],
    )
    self.root.append(action)
|
|
679
|
+
|
|
680
|
+
def set_fastq(
    self,
    biosample_data_objects: list,
    bioproject_id: str,
    org: str,
    nmdc_nucleotide_sequencing: list,
    nmdc_biosamples: list,
    nmdc_library_preparation: list,
    all_instruments: dict,
    pooled_biosamples_data=None,
):
    """Append SRA "AddFiles" <Action> elements for sequencing (fastq) files.

    Entries in ``biosample_data_objects`` (a list of ``{biosample_id:
    [data_object, ...]}`` dicts) are split into two groups: entries whose
    biosample belongs to a pooling process (per ``pooled_biosamples_data``)
    and individual entries. Pooled groups are delegated to
    ``_create_pooled_sra_action`` (one <Action> per pooling process); each
    individual entry produces one <Action> per nucleotide sequencing id
    found for its biosamples.

    :param biosample_data_objects: list of dicts mapping biosample id -> data objects
    :param bioproject_id: NCBI BioProject accession referenced by each <Action>
    :param org: SPUID namespace used for <SPUID> identifiers
    :param nmdc_nucleotide_sequencing: list of dicts mapping biosample id -> sequencing records
    :param nmdc_biosamples: NMDC biosample dicts (used for id -> name lookup)
    :param nmdc_library_preparation: list of dicts mapping biosample id -> library prep record
    :param all_instruments: dict mapping instrument id -> {"vendor": ..., "model": ...}
    :param pooled_biosamples_data: optional dict mapping biosample id -> pooling info
    """
    # Map biosample id -> name; the name is emitted as the SRA "library_name".
    bsm_id_name_dict = {
        biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
    }

    # Use provided pooling data or empty dict
    pooling_data = pooled_biosamples_data or {}

    # Group data objects by pooling process
    pooling_groups = {}
    individual_entries = []

    for entry in biosample_data_objects:
        pooling_process_id = None
        # Check if any biosample in this entry belongs to a pooling process
        for biosample_id in entry.keys():
            pooling_info = pooling_data.get(biosample_id, {})
            if pooling_info and pooling_info.get("pooling_process_id"):
                pooling_process_id = pooling_info["pooling_process_id"]
                break

        if pooling_process_id:
            # NOTE: pooling_info below is the loop variable left over from the
            # search above, i.e. the record that matched before "break".
            if pooling_process_id not in pooling_groups:
                pooling_groups[pooling_process_id] = {
                    "entries": [],
                    "processed_sample_id": pooling_info.get("processed_sample_id"),
                    "processed_sample_name": pooling_info.get(
                        "processed_sample_name", ""
                    ),
                }
            pooling_groups[pooling_process_id]["entries"].append(entry)
        else:
            individual_entries.append(entry)

    # Process pooled entries - create one SRA <Action> block per pooling process
    for pooling_process_id, group_data in pooling_groups.items():
        self._create_pooled_sra_action(
            group_data["entries"],
            group_data["processed_sample_id"],
            group_data["processed_sample_name"],
            bioproject_id,
            org,
            nmdc_nucleotide_sequencing,
            nmdc_library_preparation,
            all_instruments,
            bsm_id_name_dict,
        )

    # Process individual entries
    for entry in individual_entries:
        fastq_files = []
        biosample_ids = []
        nucleotide_sequencing_ids = {}
        lib_prep_protocol_names = {}
        analyte_category = ""
        library_name = ""
        instrument_vendor = ""
        instrument_model = ""

        for biosample_id, data_objects in entry.items():
            biosample_ids.append(biosample_id)
            # Collect the basename of every data object URL as a file to submit.
            for data_object in data_objects:
                if "url" in data_object:
                    url = urlparse(data_object["url"])
                    file_path = os.path.basename(url.path)
                    fastq_files.append(file_path)

            for ntseq_dict in nmdc_nucleotide_sequencing:
                if biosample_id in ntseq_dict:
                    for ntseq in ntseq_dict[biosample_id]:
                        # Each sequencing record contributes an id; later
                        # records for the same biosample overwrite earlier ones.
                        nucleotide_sequencing_ids[biosample_id] = ntseq.get(
                            "id", ""
                        )
                        # Currently, we are making the assumption that only one instrument
                        # is used to sequence a Biosample
                        instrument_used: List[str] = ntseq.get(
                            "instrument_used", []
                        )
                        if not instrument_used:
                            instrument_id = None
                        else:
                            instrument_id = instrument_used[0]

                        instrument = all_instruments.get(instrument_id, {})
                        instrument_vendor = instrument.get("vendor", "")
                        instrument_model = instrument.get("model", "")

                        analyte_category = ntseq.get("analyte_category", "")
                        library_name = bsm_id_name_dict.get(biosample_id, "")

            for lib_prep_dict in nmdc_library_preparation:
                if biosample_id in lib_prep_dict:
                    lib_prep_protocol_names[biosample_id] = (
                        lib_prep_dict[biosample_id]
                        .get("protocol_link", {})
                        .get("name", "")
                    )

        if fastq_files:
            # One <File> element per submitted file; fastq files are typed
            # "sra-run-fastq", anything else "generic-data".
            files_elements = [
                self.set_element(
                    "File",
                    "",
                    {"file_path": f},
                    [
                        self.set_element(
                            "DataType",
                            "sra-run-fastq" if ".fastq" in f else "generic-data",
                        )
                    ],
                )
                for f in fastq_files
            ]

            # Reference the BioProject by accession ...
            attribute_elements = [
                self.set_element(
                    "AttributeRefId",
                    attrib={"name": "BioProject"},
                    children=[
                        self.set_element(
                            "RefId",
                            children=[
                                self.set_element(
                                    "PrimaryId",
                                    bioproject_id,
                                    {"db": "BioProject"},
                                )
                            ],
                        )
                    ],
                )
            ]

            # ... and each BioSample (by SPUID) the files belong to.
            for biosample_id in biosample_ids:
                attribute_elements.append(
                    self.set_element(
                        "AttributeRefId",
                        attrib={"name": "BioSample"},
                        children=[
                            self.set_element(
                                "RefId",
                                children=[
                                    self.set_element(
                                        "SPUID",
                                        biosample_id,
                                        {"spuid_namespace": org},
                                    )
                                ],
                            )
                        ],
                    )
                )

            # Platform / instrument-model attributes (Illumina models only).
            sra_attributes = []
            if instrument_vendor == "illumina":
                sra_attributes.append(
                    self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
                )
                if instrument_model == "nextseq_550":
                    sra_attributes.append(
                        self.set_element(
                            "Attribute", "NextSeq 550", {"name": "instrument_model"}
                        )
                    )
                elif instrument_model == "novaseq_6000":
                    sra_attributes.append(
                        self.set_element(
                            "Attribute",
                            "NovaSeq 6000",
                            {"name": "instrument_model"},
                        )
                    )
                elif instrument_model == "hiseq":
                    sra_attributes.append(
                        self.set_element(
                            "Attribute", "HiSeq", {"name": "instrument_model"}
                        )
                    )

            # Library strategy/source/selection derived from analyte category.
            if analyte_category == "metagenome":
                sra_attributes.append(
                    self.set_element(
                        "Attribute", "WGS", {"name": "library_strategy"}
                    )
                )
                sra_attributes.append(
                    self.set_element(
                        "Attribute", "METAGENOMIC", {"name": "library_source"}
                    )
                )
                sra_attributes.append(
                    self.set_element(
                        "Attribute", "RANDOM", {"name": "library_selection"}
                    )
                )
            elif analyte_category == "metatranscriptome":
                sra_attributes.append(
                    self.set_element(
                        "Attribute",
                        "METATRANSCRIPTOMIC",
                        {"name": "library_source"},
                    )
                )

            # NOTE(review): data_objects here is the leftover loop variable
            # from the entry.items() loop above, so only the LAST biosample's
            # data objects are inspected — confirm this is intended when an
            # entry contains more than one biosample.
            has_paired_reads = any(
                data_object.get("data_object_type", "").lower()
                == "metagenome raw reads"
                for data_object in data_objects
            ) or (
                any(
                    data_object.get("data_object_type", "").lower()
                    == "metagenome raw read 1"
                    for data_object in data_objects
                )
                and any(
                    data_object.get("data_object_type", "").lower()
                    == "metagenome raw read 2"
                    for data_object in data_objects
                )
            )

            if has_paired_reads:
                sra_attributes.append(
                    self.set_element(
                        "Attribute", "paired", {"name": "library_layout"}
                    )
                )
            else:
                sra_attributes.append(
                    self.set_element(
                        "Attribute", "single", {"name": "library_layout"}
                    )
                )

            # Add library_name attribute
            if library_name:
                sra_attributes.append(
                    self.set_element(
                        "Attribute", library_name, {"name": "library_name"}
                    )
                )

            for biosample_id, lib_prep_name in lib_prep_protocol_names.items():
                sra_attributes.append(
                    self.set_element(
                        "Attribute",
                        lib_prep_name,
                        {"name": "library_construction_protocol"},
                    )
                )

            # One <Action> per nucleotide sequencing activity; the sequencing
            # record's id is used as the action's SPUID identifier.
            for (
                biosample_id,
                omics_processing_id,
            ) in nucleotide_sequencing_ids.items():
                identifier_element = self.set_element(
                    "Identifier",
                    children=[
                        self.set_element(
                            "SPUID", omics_processing_id, {"spuid_namespace": org}
                        )
                    ],
                )

                action = self.set_element(
                    "Action",
                    children=[
                        self.set_element(
                            "AddFiles",
                            attrib={"target_db": "SRA"},
                            children=files_elements
                            + attribute_elements
                            + sra_attributes
                            + [identifier_element],
                        ),
                    ],
                )

                self.root.append(action)
|
970
|
+
def _create_pooled_sra_action(
    self,
    entries,
    processed_sample_id,
    processed_sample_name,
    bioproject_id,
    org,
    nmdc_nucleotide_sequencing,
    nmdc_library_preparation,
    all_instruments,
    bsm_id_name_dict,
):
    """Append a single SRA "AddFiles" <Action> for one pooling process.

    Fastq file names from every entry in ``entries`` are merged (deduplicated
    via a set) into one <Action> that references the pooled ProcessedSample
    (``processed_sample_id``) instead of the individual biosamples. Instrument,
    analyte-category, and library-preparation metadata are harvested from the
    member biosamples; where multiple biosamples disagree, the last one seen
    wins. The first non-empty nucleotide sequencing id is used as the action's
    SPUID; if none is found, no <Action> is emitted.

    :param entries: list of ``{biosample_id: [data_object, ...]}`` dicts
    :param processed_sample_id: id of the pooled ProcessedSample; no-op if falsy
    :param processed_sample_name: name of the pooled sample (SRA "library_name")
    :param bioproject_id: NCBI BioProject accession to reference
    :param org: SPUID namespace for <SPUID> identifiers
    :param nmdc_nucleotide_sequencing: list of dicts mapping biosample id -> sequencing records
    :param nmdc_library_preparation: list of dicts mapping biosample id -> library prep record
    :param all_instruments: dict mapping instrument id -> {"vendor": ..., "model": ...}
    :param bsm_id_name_dict: currently unused; retained so the call signature
        stays aligned with ``set_fastq``'s invocation
    """
    if not processed_sample_id:
        return

    # Collect all fastq files from all entries
    all_fastq_files = set()
    all_biosample_ids = set()
    nucleotide_sequencing_ids = {}
    lib_prep_protocol_names = {}
    analyte_category = ""
    instrument_vendor = ""
    instrument_model = ""

    for entry in entries:
        for biosample_id, data_objects in entry.items():
            # NOTE(review): all_biosample_ids is collected but not referenced
            # afterwards; the pooled action references the ProcessedSample only.
            all_biosample_ids.add(biosample_id)
            for data_object in data_objects:
                if "url" in data_object:
                    url = urlparse(data_object["url"])
                    file_path = os.path.basename(url.path)
                    all_fastq_files.add(file_path)

            # Get nucleotide sequencing info
            for ntseq_dict in nmdc_nucleotide_sequencing:
                if biosample_id in ntseq_dict:
                    for ntseq in ntseq_dict[biosample_id]:
                        nucleotide_sequencing_ids[biosample_id] = ntseq.get(
                            "id", ""
                        )
                        # Assume at most one instrument per sequencing record.
                        instrument_used = ntseq.get("instrument_used", [])
                        if instrument_used:
                            instrument_id = instrument_used[0]
                            instrument = all_instruments.get(instrument_id, {})
                            instrument_vendor = instrument.get("vendor", "")
                            instrument_model = instrument.get("model", "")
                        analyte_category = ntseq.get("analyte_category", "")

            # Get library preparation info
            for lib_prep_dict in nmdc_library_preparation:
                if biosample_id in lib_prep_dict:
                    lib_prep_protocol_names[biosample_id] = (
                        lib_prep_dict[biosample_id]
                        .get("protocol_link", {})
                        .get("name", "")
                    )

    if all_fastq_files:
        # One <File> element per unique file, in sorted order for stable output.
        files_elements = [
            self.set_element(
                "File",
                "",
                {"file_path": f},
                [
                    self.set_element(
                        "DataType",
                        "sra-run-fastq" if ".fastq" in f else "generic-data",
                    )
                ],
            )
            for f in sorted(all_fastq_files)
        ]

        attribute_elements = [
            self.set_element(
                "AttributeRefId",
                attrib={"name": "BioProject"},
                children=[
                    self.set_element(
                        "RefId",
                        children=[
                            self.set_element(
                                "PrimaryId",
                                bioproject_id,
                                {"db": "BioProject"},
                            )
                        ],
                    )
                ],
            ),
            # Reference the processed sample, not individual biosamples
            self.set_element(
                "AttributeRefId",
                attrib={"name": "BioSample"},
                children=[
                    self.set_element(
                        "RefId",
                        children=[
                            self.set_element(
                                "SPUID",
                                processed_sample_id,
                                {"spuid_namespace": org},
                            )
                        ],
                    )
                ],
            ),
        ]

        # Platform / instrument-model attributes (Illumina models only).
        sra_attributes = []
        if instrument_vendor == "illumina":
            sra_attributes.append(
                self.set_element("Attribute", "ILLUMINA", {"name": "platform"})
            )
            if instrument_model == "nextseq_550":
                sra_attributes.append(
                    self.set_element(
                        "Attribute", "NextSeq 550", {"name": "instrument_model"}
                    )
                )
            elif instrument_model == "novaseq_6000":
                sra_attributes.append(
                    self.set_element(
                        "Attribute",
                        "NovaSeq 6000",
                        {"name": "instrument_model"},
                    )
                )
            elif instrument_model == "hiseq":
                sra_attributes.append(
                    self.set_element(
                        "Attribute", "HiSeq", {"name": "instrument_model"}
                    )
                )

        # Library strategy/source/selection derived from analyte category.
        if analyte_category == "metagenome":
            sra_attributes.append(
                self.set_element("Attribute", "WGS", {"name": "library_strategy"})
            )
            sra_attributes.append(
                self.set_element(
                    "Attribute", "METAGENOMIC", {"name": "library_source"}
                )
            )
            sra_attributes.append(
                self.set_element(
                    "Attribute", "RANDOM", {"name": "library_selection"}
                )
            )
        elif analyte_category == "metatranscriptome":
            sra_attributes.append(
                self.set_element(
                    "Attribute",
                    "METATRANSCRIPTOMIC",
                    {"name": "library_source"},
                )
            )

        # Determine library layout from file naming: a run is paired when an
        # "_R1" file has a matching "_R2" counterpart in the collected set.
        # (The previous expression relied on Python comparison chaining —
        # `a in b in c` parses as `(a in b) and (b in c)` — which reduced to
        # exactly this membership test; written out explicitly here.)
        has_paired_reads = any(
            f.replace("_R1", "_R2") in all_fastq_files
            for f in all_fastq_files
            if "_R1" in f
        )

        if has_paired_reads:
            sra_attributes.append(
                self.set_element("Attribute", "paired", {"name": "library_layout"})
            )
        else:
            sra_attributes.append(
                self.set_element("Attribute", "single", {"name": "library_layout"})
            )

        # Add library_name attribute using ProcessedSample name
        if processed_sample_name:
            sra_attributes.append(
                self.set_element(
                    "Attribute", processed_sample_name, {"name": "library_name"}
                )
            )

        # Add library construction protocol from any of the biosamples
        for biosample_id, lib_prep_name in lib_prep_protocol_names.items():
            if lib_prep_name:
                sra_attributes.append(
                    self.set_element(
                        "Attribute",
                        lib_prep_name,
                        {"name": "library_construction_protocol"},
                    )
                )
                break  # Only add one protocol name

        # Use the first nucleotide sequencing ID as the identifier
        omics_processing_id = None
        for biosample_id, seq_id in nucleotide_sequencing_ids.items():
            if seq_id:
                omics_processing_id = seq_id
                break

        if omics_processing_id:
            identifier_element = self.set_element(
                "Identifier",
                children=[
                    self.set_element(
                        "SPUID", omics_processing_id, {"spuid_namespace": org}
                    )
                ],
            )

            action = self.set_element(
                "Action",
                children=[
                    self.set_element(
                        "AddFiles",
                        attrib={"target_db": "SRA"},
                        children=files_elements
                        + attribute_elements
                        + sra_attributes
                        + [identifier_element],
                    ),
                ],
            )

            self.root.append(action)
|
1197
|
+
def get_submission_xml(
    self,
    biosamples_list: list,
    biosample_nucleotide_sequencing_list: list,
    biosample_data_objects_list: list,
    biosample_library_preparation_list: list,
    instruments_dict: dict,
    pooled_biosamples_data=None,
):
    """Assemble and return the complete NCBI submission XML as a string.

    Biosamples with any sequencing record processed by "JGI" are excluded
    from every input list (JGI submits those itself), data objects are
    restricted to fastq files, then <Description>, <BioSample>, and SRA
    <Action> elements are appended to ``self.root`` and the document is
    serialized with pretty-printing.
    """
    # Ids of biosamples that have at least one JGI-processed sequencing record.
    excluded_biosample_ids = {
        bsm_id
        for bsm_ntseq in biosample_nucleotide_sequencing_list
        for bsm_id, ntseq_list in bsm_ntseq.items()
        if any(
            ntseq.get("processing_institution") == "JGI" for ntseq in ntseq_list
        )
    }

    def _without_excluded(mapping):
        # Drop excluded biosample ids from a {biosample_id: ...} mapping.
        return {
            bsm_id: value
            for bsm_id, value in mapping.items()
            if bsm_id not in excluded_biosample_ids
        }

    # Sequencing records minus JGI-processed biosamples (empty dicts dropped).
    filtered_nucleotide_sequencing_list = [
        kept
        for kept in (
            _without_excluded(bsm_ntseq)
            for bsm_ntseq in biosample_nucleotide_sequencing_list
        )
        if kept
    ]

    # Biosamples minus the JGI-processed ones.
    filtered_biosamples_list = [
        biosample
        for biosample in biosamples_list
        if biosample.get("id") not in excluded_biosample_ids
    ]

    self.set_description(
        email=self.nmdc_pi_email,
        first=self.first_name,
        last=self.last_name,
        org=self.ncbi_submission_metadata.get("organization", ""),
    )

    # NOTE: BioProject <Action> creation is currently disabled; the submission
    # references the existing BioProject id passed to set_biosample/set_fastq.
    self.set_biosample(
        organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
        org=self.ncbi_submission_metadata.get("organization", ""),
        bioproject_id=self.ncbi_bioproject_id,
        nmdc_biosamples=filtered_biosamples_list,
        pooled_biosamples_data=pooled_biosamples_data,
    )

    # Keep only fastq data objects (by URL basename) of non-excluded biosamples.
    filtered_data_objects_list = []
    for entry in biosample_data_objects_list:
        kept_entry = {}
        for biosample_id, data_objects in entry.items():
            if biosample_id in excluded_biosample_ids:
                continue
            kept_objects = [
                data_object
                for data_object in data_objects
                if "url" in data_object
                and os.path.basename(
                    urlparse(data_object["url"]).path
                ).endswith((".fastq.gz", ".fastq"))
            ]
            if kept_objects:
                kept_entry[biosample_id] = kept_objects
        if kept_entry:
            filtered_data_objects_list.append(kept_entry)

    # Library preparation records minus JGI-processed biosamples.
    filtered_library_preparation_list = [
        kept
        for kept in (
            _without_excluded(lib_prep_dict)
            for lib_prep_dict in biosample_library_preparation_list
        )
        if kept
    ]

    self.set_fastq(
        biosample_data_objects=filtered_data_objects_list,
        bioproject_id=self.ncbi_bioproject_id,
        org=self.ncbi_submission_metadata.get("organization", ""),
        nmdc_nucleotide_sequencing=filtered_nucleotide_sequencing_list,
        nmdc_biosamples=filtered_biosamples_list,
        nmdc_library_preparation=filtered_library_preparation_list,
        all_instruments=instruments_dict,
        pooled_biosamples_data=pooled_biosamples_data,
    )

    # Serialize the assembled tree and pretty-print it via minidom.
    rough_string = ET.tostring(self.root, "unicode")
    reparsed = xml.dom.minidom.parseString(rough_string)
    submission_xml = reparsed.toprettyxml(indent=" ", newl="\n")

    # To validate the output against NCBI's XSDs, run validate_xml(submission_xml, url)
    # against each of:
    #   https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/common/submission.xsd?view=co
    #   https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/bioproject/bioproject.xsd?view=co
    #   https://www.ncbi.nlm.nih.gov/viewvc/v1/trunk/submit/public-docs/biosample/biosample.xsd?view=co

    return submission_xml
|