nmdc-runtime 2.6.0__py3-none-any.whl → 2.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nmdc_runtime/Dockerfile +177 -0
- nmdc_runtime/api/analytics.py +90 -0
- nmdc_runtime/api/boot/capabilities.py +9 -0
- nmdc_runtime/api/boot/object_types.py +126 -0
- nmdc_runtime/api/boot/triggers.py +84 -0
- nmdc_runtime/api/boot/workflows.py +116 -0
- nmdc_runtime/api/core/auth.py +212 -0
- nmdc_runtime/api/core/idgen.py +200 -0
- nmdc_runtime/api/core/metadata.py +777 -0
- nmdc_runtime/api/core/util.py +114 -0
- nmdc_runtime/api/db/mongo.py +436 -0
- nmdc_runtime/api/db/s3.py +37 -0
- nmdc_runtime/api/endpoints/capabilities.py +25 -0
- nmdc_runtime/api/endpoints/find.py +634 -0
- nmdc_runtime/api/endpoints/jobs.py +206 -0
- nmdc_runtime/api/endpoints/lib/helpers.py +274 -0
- nmdc_runtime/api/endpoints/lib/linked_instances.py +193 -0
- nmdc_runtime/api/endpoints/lib/path_segments.py +165 -0
- nmdc_runtime/api/endpoints/metadata.py +260 -0
- nmdc_runtime/api/endpoints/nmdcschema.py +515 -0
- nmdc_runtime/api/endpoints/object_types.py +38 -0
- nmdc_runtime/api/endpoints/objects.py +277 -0
- nmdc_runtime/api/endpoints/operations.py +78 -0
- nmdc_runtime/api/endpoints/queries.py +701 -0
- nmdc_runtime/api/endpoints/runs.py +98 -0
- nmdc_runtime/api/endpoints/search.py +38 -0
- nmdc_runtime/api/endpoints/sites.py +205 -0
- nmdc_runtime/api/endpoints/triggers.py +25 -0
- nmdc_runtime/api/endpoints/users.py +214 -0
- nmdc_runtime/api/endpoints/util.py +817 -0
- nmdc_runtime/api/endpoints/wf_file_staging.py +307 -0
- nmdc_runtime/api/endpoints/workflows.py +353 -0
- nmdc_runtime/api/entrypoint.sh +7 -0
- nmdc_runtime/api/main.py +495 -0
- nmdc_runtime/api/middleware.py +43 -0
- nmdc_runtime/api/models/capability.py +14 -0
- nmdc_runtime/api/models/id.py +92 -0
- nmdc_runtime/api/models/job.py +57 -0
- nmdc_runtime/api/models/lib/helpers.py +78 -0
- nmdc_runtime/api/models/metadata.py +11 -0
- nmdc_runtime/api/models/nmdc_schema.py +146 -0
- nmdc_runtime/api/models/object.py +180 -0
- nmdc_runtime/api/models/object_type.py +20 -0
- nmdc_runtime/api/models/operation.py +66 -0
- nmdc_runtime/api/models/query.py +246 -0
- nmdc_runtime/api/models/query_continuation.py +111 -0
- nmdc_runtime/api/models/run.py +161 -0
- nmdc_runtime/api/models/site.py +87 -0
- nmdc_runtime/api/models/trigger.py +13 -0
- nmdc_runtime/api/models/user.py +207 -0
- nmdc_runtime/api/models/util.py +260 -0
- nmdc_runtime/api/models/wfe_file_stages.py +122 -0
- nmdc_runtime/api/models/workflow.py +15 -0
- nmdc_runtime/api/openapi.py +178 -0
- nmdc_runtime/api/swagger_ui/assets/EllipsesButton.js +146 -0
- nmdc_runtime/api/swagger_ui/assets/EndpointSearchWidget.js +369 -0
- nmdc_runtime/api/swagger_ui/assets/script.js +252 -0
- nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
- nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
- nmdc_runtime/config.py +56 -1
- nmdc_runtime/minter/adapters/repository.py +22 -2
- nmdc_runtime/minter/config.py +2 -0
- nmdc_runtime/minter/domain/model.py +55 -1
- nmdc_runtime/minter/entrypoints/fastapi_app.py +1 -1
- nmdc_runtime/mongo_util.py +89 -0
- nmdc_runtime/site/backup/nmdcdb_mongodump.py +1 -1
- nmdc_runtime/site/backup/nmdcdb_mongoexport.py +1 -3
- nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
- nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
- nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
- nmdc_runtime/site/dagster.yaml +53 -0
- nmdc_runtime/site/entrypoint-daemon.sh +29 -0
- nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
- nmdc_runtime/site/entrypoint-dagit.sh +29 -0
- nmdc_runtime/site/export/ncbi_xml.py +731 -40
- nmdc_runtime/site/export/ncbi_xml_utils.py +142 -26
- nmdc_runtime/site/graphs.py +80 -29
- nmdc_runtime/site/ops.py +522 -183
- nmdc_runtime/site/repair/database_updater.py +210 -1
- nmdc_runtime/site/repository.py +108 -117
- nmdc_runtime/site/resources.py +72 -36
- nmdc_runtime/site/translation/gold_translator.py +22 -21
- nmdc_runtime/site/translation/neon_benthic_translator.py +1 -1
- nmdc_runtime/site/translation/neon_soil_translator.py +5 -5
- nmdc_runtime/site/translation/neon_surface_water_translator.py +1 -2
- nmdc_runtime/site/translation/submission_portal_translator.py +216 -69
- nmdc_runtime/site/translation/translator.py +64 -1
- nmdc_runtime/site/util.py +8 -3
- nmdc_runtime/site/validation/util.py +16 -12
- nmdc_runtime/site/workspace.yaml +13 -0
- nmdc_runtime/static/NMDC_logo.svg +1073 -0
- nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
- nmdc_runtime/static/README.md +5 -0
- nmdc_runtime/static/favicon.ico +0 -0
- nmdc_runtime/util.py +175 -348
- nmdc_runtime-2.12.0.dist-info/METADATA +45 -0
- nmdc_runtime-2.12.0.dist-info/RECORD +131 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/WHEEL +1 -2
- nmdc_runtime/containers.py +0 -14
- nmdc_runtime/core/db/Database.py +0 -15
- nmdc_runtime/core/exceptions/__init__.py +0 -23
- nmdc_runtime/core/exceptions/base.py +0 -47
- nmdc_runtime/core/exceptions/token.py +0 -13
- nmdc_runtime/domain/users/queriesInterface.py +0 -18
- nmdc_runtime/domain/users/userSchema.py +0 -37
- nmdc_runtime/domain/users/userService.py +0 -14
- nmdc_runtime/infrastructure/database/db.py +0 -3
- nmdc_runtime/infrastructure/database/models/user.py +0 -10
- nmdc_runtime/lib/__init__.py +0 -1
- nmdc_runtime/lib/extract_nmdc_data.py +0 -41
- nmdc_runtime/lib/load_nmdc_data.py +0 -121
- nmdc_runtime/lib/nmdc_dataframes.py +0 -829
- nmdc_runtime/lib/nmdc_etl_class.py +0 -402
- nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
- nmdc_runtime/site/drsobjects/ingest.py +0 -93
- nmdc_runtime/site/drsobjects/registration.py +0 -131
- nmdc_runtime/site/translation/emsl.py +0 -43
- nmdc_runtime/site/translation/gold.py +0 -53
- nmdc_runtime/site/translation/jgi.py +0 -32
- nmdc_runtime/site/translation/util.py +0 -132
- nmdc_runtime/site/validation/jgi.py +0 -43
- nmdc_runtime-2.6.0.dist-info/METADATA +0 -199
- nmdc_runtime-2.6.0.dist-info/RECORD +0 -83
- nmdc_runtime-2.6.0.dist-info/top_level.txt +0 -1
- /nmdc_runtime/{client → api}/__init__.py +0 -0
- /nmdc_runtime/{core → api/boot}/__init__.py +0 -0
- /nmdc_runtime/{core/db → api/core}/__init__.py +0 -0
- /nmdc_runtime/{domain → api/db}/__init__.py +0 -0
- /nmdc_runtime/{domain/users → api/endpoints}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure → api/endpoints/lib}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database → api/models}/__init__.py +0 -0
- /nmdc_runtime/{infrastructure/database/models → api/models/lib}/__init__.py +0 -0
- /nmdc_runtime/{site/drsobjects/__init__.py → api/models/minter.py} +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/entry_points.txt +0 -0
- {nmdc_runtime-2.6.0.dist-info → nmdc_runtime-2.12.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -4,8 +4,9 @@ import datetime
|
|
|
4
4
|
import xml.etree.ElementTree as ET
|
|
5
5
|
import xml.dom.minidom
|
|
6
6
|
|
|
7
|
-
from typing import Any, List
|
|
7
|
+
from typing import Any, List
|
|
8
8
|
from urllib.parse import urlparse
|
|
9
|
+
from unidecode import unidecode
|
|
9
10
|
from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
10
11
|
handle_controlled_identified_term_value,
|
|
11
12
|
handle_controlled_term_value,
|
|
@@ -16,7 +17,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
|
|
|
16
17
|
handle_float_value,
|
|
17
18
|
handle_string_value,
|
|
18
19
|
load_mappings,
|
|
19
|
-
validate_xml,
|
|
20
20
|
)
|
|
21
21
|
|
|
22
22
|
|
|
@@ -27,7 +27,11 @@ class NCBISubmissionXML:
|
|
|
27
27
|
self.nmdc_study_id = nmdc_study.get("id")
|
|
28
28
|
self.nmdc_study_title = nmdc_study.get("title")
|
|
29
29
|
self.nmdc_study_description = nmdc_study.get("description")
|
|
30
|
-
|
|
30
|
+
# get the first INSDC BioProject ID from the NMDC study
|
|
31
|
+
self.ncbi_bioproject_id = nmdc_study.get("insdc_bioproject_identifiers")[0]
|
|
32
|
+
# the value asserted in "insdc_bioproject_identifiers" will be a CURIE, so extract
|
|
33
|
+
# everything after the prefix and delimiter (":")
|
|
34
|
+
self.ncbi_bioproject_id = self.ncbi_bioproject_id.split(":")[-1]
|
|
31
35
|
self.nmdc_pi_email = nmdc_study.get("principal_investigator", {}).get("email")
|
|
32
36
|
nmdc_study_pi_name = (
|
|
33
37
|
nmdc_study.get("principal_investigator", {}).get("name").split()
|
|
@@ -159,16 +163,53 @@ class NCBISubmissionXML:
|
|
|
159
163
|
org,
|
|
160
164
|
bioproject_id,
|
|
161
165
|
nmdc_biosamples,
|
|
166
|
+
pooled_biosamples_data=None,
|
|
162
167
|
):
|
|
163
168
|
attribute_mappings, slot_range_mappings = load_mappings(
|
|
164
169
|
self.nmdc_ncbi_attribute_mapping_file_url
|
|
165
170
|
)
|
|
166
171
|
|
|
172
|
+
# Use provided pooling data or empty dict
|
|
173
|
+
pooling_data = pooled_biosamples_data or {}
|
|
174
|
+
|
|
175
|
+
# Group biosamples by pooling process
|
|
176
|
+
pooling_groups = {}
|
|
177
|
+
individual_biosamples = []
|
|
178
|
+
|
|
167
179
|
for biosample in nmdc_biosamples:
|
|
180
|
+
pooling_info = pooling_data.get(biosample["id"], {})
|
|
181
|
+
if pooling_info and pooling_info.get("pooling_process_id"):
|
|
182
|
+
pooling_process_id = pooling_info["pooling_process_id"]
|
|
183
|
+
if pooling_process_id not in pooling_groups:
|
|
184
|
+
pooling_groups[pooling_process_id] = {
|
|
185
|
+
"biosamples": [],
|
|
186
|
+
"pooling_info": pooling_info,
|
|
187
|
+
}
|
|
188
|
+
pooling_groups[pooling_process_id]["biosamples"].append(biosample)
|
|
189
|
+
else:
|
|
190
|
+
individual_biosamples.append(biosample)
|
|
191
|
+
|
|
192
|
+
# Process pooled sample groups - create one <Action> block per pooling process
|
|
193
|
+
for pooling_process_id, group_data in pooling_groups.items():
|
|
194
|
+
self._create_pooled_biosample_action(
|
|
195
|
+
group_data["biosamples"],
|
|
196
|
+
group_data["pooling_info"],
|
|
197
|
+
organism_name,
|
|
198
|
+
org,
|
|
199
|
+
bioproject_id,
|
|
200
|
+
attribute_mappings,
|
|
201
|
+
slot_range_mappings,
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
# Process individual biosamples
|
|
205
|
+
for biosample in individual_biosamples:
|
|
168
206
|
attributes = {}
|
|
169
207
|
sample_id_value = None
|
|
170
208
|
env_package = None
|
|
171
209
|
|
|
210
|
+
# Get pooling info for this specific biosample
|
|
211
|
+
pooling_info = pooling_data.get(biosample["id"], {})
|
|
212
|
+
|
|
172
213
|
for json_key, value in biosample.items():
|
|
173
214
|
if isinstance(value, list):
|
|
174
215
|
for item in value:
|
|
@@ -187,15 +228,6 @@ class NCBISubmissionXML:
|
|
|
187
228
|
attributes[xml_key] = value
|
|
188
229
|
continue # Skip applying the handler to this key
|
|
189
230
|
|
|
190
|
-
# Special handling for "host_taxid"
|
|
191
|
-
if json_key == "host_taxid" and isinstance(value, dict):
|
|
192
|
-
if "term" in value and "id" in value["term"]:
|
|
193
|
-
value = re.findall(
|
|
194
|
-
r"\d+", value["term"]["id"].split(":")[1]
|
|
195
|
-
)[0]
|
|
196
|
-
attributes[xml_key] = value
|
|
197
|
-
continue # Skip applying the handler to this key
|
|
198
|
-
|
|
199
231
|
formatted_value = handler(item)
|
|
200
232
|
|
|
201
233
|
# Combine multiple values with a separator for list elements
|
|
@@ -210,7 +242,11 @@ class NCBISubmissionXML:
|
|
|
210
242
|
|
|
211
243
|
# Special handling for NMDC Biosample "id"
|
|
212
244
|
if json_key == "id":
|
|
213
|
-
|
|
245
|
+
# Use ProcessedSample ID if this is a pooled sample, otherwise use biosample ID
|
|
246
|
+
if pooling_info and pooling_info.get("processed_sample_id"):
|
|
247
|
+
sample_id_value = pooling_info["processed_sample_id"]
|
|
248
|
+
else:
|
|
249
|
+
sample_id_value = value
|
|
214
250
|
continue
|
|
215
251
|
|
|
216
252
|
if json_key not in attribute_mappings:
|
|
@@ -233,10 +269,39 @@ class NCBISubmissionXML:
|
|
|
233
269
|
attributes[xml_key] = value
|
|
234
270
|
continue # Skip applying the handler to this key
|
|
235
271
|
|
|
272
|
+
# Special handling for "geo_loc_name" - convert unicode to closest ASCII characters
|
|
273
|
+
if json_key == "geo_loc_name":
|
|
274
|
+
formatted_value = handler(value)
|
|
275
|
+
formatted_value_ascii = unidecode(formatted_value)
|
|
276
|
+
attributes[xml_key] = formatted_value_ascii
|
|
277
|
+
continue # Skip applying the handler to this key
|
|
278
|
+
|
|
236
279
|
# Default processing for other keys
|
|
237
280
|
formatted_value = handler(value)
|
|
238
281
|
attributes[xml_key] = formatted_value
|
|
239
282
|
|
|
283
|
+
# Override with aggregated values for pooled samples
|
|
284
|
+
if pooling_info:
|
|
285
|
+
if pooling_info.get("aggregated_collection_date"):
|
|
286
|
+
# Find the mapping for collection_date
|
|
287
|
+
collection_date_key = attribute_mappings.get(
|
|
288
|
+
"collection_date", "collection_date"
|
|
289
|
+
)
|
|
290
|
+
attributes[collection_date_key] = pooling_info[
|
|
291
|
+
"aggregated_collection_date"
|
|
292
|
+
]
|
|
293
|
+
|
|
294
|
+
if pooling_info.get("aggregated_depth"):
|
|
295
|
+
# Find the mapping for depth
|
|
296
|
+
depth_key = attribute_mappings.get("depth", "depth")
|
|
297
|
+
attributes[depth_key] = pooling_info["aggregated_depth"]
|
|
298
|
+
|
|
299
|
+
# Add samp_pooling attribute with semicolon-delimited biosample IDs
|
|
300
|
+
if pooling_info.get("pooled_biosample_ids"):
|
|
301
|
+
attributes["samp_pooling"] = ";".join(
|
|
302
|
+
pooling_info["pooled_biosample_ids"]
|
|
303
|
+
)
|
|
304
|
+
|
|
240
305
|
biosample_elements = [
|
|
241
306
|
self.set_element(
|
|
242
307
|
"SampleId",
|
|
@@ -251,9 +316,54 @@ class NCBISubmissionXML:
|
|
|
251
316
|
children=[
|
|
252
317
|
self.set_element(
|
|
253
318
|
"Title",
|
|
254
|
-
|
|
319
|
+
attributes.get(
|
|
320
|
+
"name",
|
|
321
|
+
# fallback title if "name" is not present
|
|
322
|
+
f"NMDC Biosample {sample_id_value} from {organism_name}, part of {self.nmdc_study_id} study",
|
|
323
|
+
),
|
|
255
324
|
),
|
|
256
|
-
]
|
|
325
|
+
]
|
|
326
|
+
+ (
|
|
327
|
+
# Add external links for pooled samples
|
|
328
|
+
[
|
|
329
|
+
self.set_element(
|
|
330
|
+
"ExternalLink",
|
|
331
|
+
attrib={"label": "NMDC Processed Sample"},
|
|
332
|
+
children=[
|
|
333
|
+
self.set_element(
|
|
334
|
+
"URL",
|
|
335
|
+
f"https://bioregistry.io/{pooling_info['processed_sample_id']}",
|
|
336
|
+
)
|
|
337
|
+
],
|
|
338
|
+
),
|
|
339
|
+
self.set_element(
|
|
340
|
+
"ExternalLink",
|
|
341
|
+
attrib={"label": "NMDC Pooling Process"},
|
|
342
|
+
children=[
|
|
343
|
+
self.set_element(
|
|
344
|
+
"URL",
|
|
345
|
+
f"https://bioregistry.io/{pooling_info['pooling_process_id']}",
|
|
346
|
+
)
|
|
347
|
+
],
|
|
348
|
+
),
|
|
349
|
+
]
|
|
350
|
+
if pooling_info
|
|
351
|
+
and pooling_info.get("processed_sample_id")
|
|
352
|
+
and pooling_info.get("pooling_process_id")
|
|
353
|
+
else [
|
|
354
|
+
# Add external link for individual biosamples
|
|
355
|
+
self.set_element(
|
|
356
|
+
"ExternalLink",
|
|
357
|
+
attrib={"label": sample_id_value},
|
|
358
|
+
children=[
|
|
359
|
+
self.set_element(
|
|
360
|
+
"URL",
|
|
361
|
+
f"https://bioregistry.io/{sample_id_value}",
|
|
362
|
+
)
|
|
363
|
+
],
|
|
364
|
+
),
|
|
365
|
+
]
|
|
366
|
+
),
|
|
257
367
|
),
|
|
258
368
|
self.set_element(
|
|
259
369
|
"Organism",
|
|
@@ -325,6 +435,248 @@ class NCBISubmissionXML:
|
|
|
325
435
|
)
|
|
326
436
|
self.root.append(action)
|
|
327
437
|
|
|
438
|
+
def _create_pooled_biosample_action(
    self,
    biosamples,
    pooling_info,
    organism_name,
    org,
    bioproject_id,
    attribute_mappings,
    slot_range_mappings,
):
    """Append one BioSample ``<Action>`` for a pooled sample to ``self.root``.

    All biosamples that fed the same pooling process are collapsed into a
    single BioSample record identified by the processed sample ID.

    Args:
        biosamples: Biosample dicts belonging to one pooling process.
        pooling_info: Dict describing the pool. Keys read here:
            ``processed_sample_id`` (required; nothing is emitted without it),
            ``processed_sample_name``, ``pooling_process_id``,
            ``aggregated_collection_date``, ``aggregated_depth`` and
            ``pooled_biosample_ids``.
        organism_name: Text of the ``<OrganismName>`` element.
        org: Value of the ``spuid_namespace`` attribute on ``<SPUID>``.
        bioproject_id: Text of the BioProject ``<PrimaryId>`` element.
        attribute_mappings: Maps biosample keys to NCBI attribute names;
            unmapped keys are ignored.
        slot_range_mappings: Maps biosample keys to a value-type name used
            to pick a formatter from ``self.type_handlers``.

    Returns:
        None. The action element is appended to ``self.root``.
    """
    # Use the processed sample ID as the primary identifier
    sample_id_value = pooling_info.get("processed_sample_id")
    if not sample_id_value:
        return

    pooling_process_id = pooling_info.get("pooling_process_id")

    # Fall back to a generated title when the name is missing *or* empty,
    # so that an empty <Title> is never emitted.
    title = (
        pooling_info.get("processed_sample_name")
        or f"Pooled sample {sample_id_value}"
    )

    aggregated_attributes = {}
    env_package = None

    def record(json_key, raw):
        """Format one raw value and store it under its mapped attribute name."""
        xml_key = attribute_mappings[json_key]

        # "elev" is always rendered as "<float> m"; the latest value wins.
        if json_key == "elev":
            aggregated_attributes[xml_key] = f"{float(raw)} m"
            return

        # "host_taxid": keep only the digits of the local part of the
        # term's CURIE; the latest value wins.
        if json_key == "host_taxid" and isinstance(raw, dict):
            if "term" in raw and "id" in raw["term"]:
                aggregated_attributes[xml_key] = re.findall(
                    r"\d+", raw["term"]["id"].split(":")[1]
                )[0]
                return

        # Every other attribute keeps the first value seen across the pool.
        if xml_key in aggregated_attributes:
            return
        handler = self.type_handlers.get(
            slot_range_mappings.get(json_key, "string"), handle_string_value
        )
        aggregated_attributes[xml_key] = handler(raw)

    # Process each biosample to collect and aggregate attributes
    for biosample in biosamples:
        for json_key, value in biosample.items():
            if json_key == "id":
                continue

            if json_key == "env_package":
                env_package = f"MIMS.me.{handle_text_value(value)}.6.0"
                continue

            if json_key not in attribute_mappings:
                continue

            # Multi-valued slots are handled item by item; the special cases
            # above therefore see each item rather than the whole list.
            for raw in value if isinstance(value, list) else [value]:
                record(json_key, raw)

    # Override with aggregated values for pooled samples
    if pooling_info.get("aggregated_collection_date"):
        collection_date_key = attribute_mappings.get(
            "collection_date", "collection_date"
        )
        aggregated_attributes[collection_date_key] = pooling_info[
            "aggregated_collection_date"
        ]

    if pooling_info.get("aggregated_depth"):
        depth_key = attribute_mappings.get("depth", "depth")
        aggregated_attributes[depth_key] = pooling_info["aggregated_depth"]

    # Add samp_pooling attribute with semicolon-delimited biosample IDs
    if pooling_info.get("pooled_biosample_ids"):
        aggregated_attributes["samp_pooling"] = ";".join(
            pooling_info["pooled_biosample_ids"]
        )

    # Only this fixed set of attributes is emitted for pooled samples.
    allowed_attributes = {
        "collection_date",
        "depth",
        "elev",
        "geo_loc_name",
        "lat_lon",
        "env_broad_scale",
        "env_local_scale",
        "env_medium",
        "samp_pooling",
    }
    filtered_attributes = {
        k: v for k, v in aggregated_attributes.items() if k in allowed_attributes
    }

    def external_link(identifier):
        """Build an <ExternalLink> labelled with and pointing at *identifier*."""
        return self.set_element(
            "ExternalLink",
            attrib={"label": identifier},
            children=[
                self.set_element("URL", f"https://bioregistry.io/{identifier}")
            ],
        )

    # Title, then links to the processed sample, the pooling process (when
    # known) and every biosample that went into the pool.
    descriptor_children = [
        self.set_element("Title", title),
        external_link(sample_id_value),
    ]
    if pooling_process_id:
        descriptor_children.append(external_link(pooling_process_id))
    descriptor_children.extend(
        external_link(biosample_id)
        for biosample_id in pooling_info.get("pooled_biosample_ids", [])
    )

    biosample_elements = [
        self.set_element(
            "SampleId",
            children=[
                self.set_element("SPUID", sample_id_value, {"spuid_namespace": org})
            ],
        ),
        self.set_element("Descriptor", children=descriptor_children),
        self.set_element(
            "Organism",
            children=[self.set_element("OrganismName", organism_name)],
        ),
        self.set_element(
            "BioProject",
            children=[
                self.set_element("PrimaryId", bioproject_id, {"db": "BioProject"})
            ],
        ),
        self.set_element("Package", env_package),
        self.set_element(
            "Attributes",
            children=[
                self.set_element(
                    "Attribute", filtered_attributes[key], {"attribute_name": key}
                )
                for key in sorted(filtered_attributes)
            ]
            + [
                self.set_element(
                    "Attribute",
                    "National Microbiome Data Collaborative",
                    {"attribute_name": "broker name"},
                )
            ],
        ),
    ]

    action = self.set_element(
        "Action",
        children=[
            self.set_element(
                "AddData",
                attrib={"target_db": "BioSample"},
                children=[
                    self.set_element(
                        "Data",
                        attrib={"content_type": "XML"},
                        children=[
                            self.set_element(
                                "XmlContent",
                                children=[
                                    self.set_element(
                                        "BioSample",
                                        attrib={"schema_version": "2.0"},
                                        children=biosample_elements,
                                    ),
                                ],
                            ),
                        ],
                    ),
                    self.set_element(
                        "Identifier",
                        children=[
                            self.set_element(
                                "SPUID",
                                sample_id_value,
                                {"spuid_namespace": org},
                            ),
                        ],
                    ),
                ],
            ),
        ],
    )
    self.root.append(action)
|
|
679
|
+
|
|
328
680
|
def set_fastq(
|
|
329
681
|
self,
|
|
330
682
|
biosample_data_objects: list,
|
|
@@ -334,12 +686,57 @@ class NCBISubmissionXML:
|
|
|
334
686
|
nmdc_biosamples: list,
|
|
335
687
|
nmdc_library_preparation: list,
|
|
336
688
|
all_instruments: dict,
|
|
689
|
+
pooled_biosamples_data=None,
|
|
337
690
|
):
|
|
338
691
|
bsm_id_name_dict = {
|
|
339
692
|
biosample["id"]: biosample["name"] for biosample in nmdc_biosamples
|
|
340
693
|
}
|
|
341
694
|
|
|
695
|
+
# Use provided pooling data or empty dict
|
|
696
|
+
pooling_data = pooled_biosamples_data or {}
|
|
697
|
+
|
|
698
|
+
# Group data objects by pooling process
|
|
699
|
+
pooling_groups = {}
|
|
700
|
+
individual_entries = []
|
|
701
|
+
|
|
342
702
|
for entry in biosample_data_objects:
|
|
703
|
+
pooling_process_id = None
|
|
704
|
+
# Check if any biosample in this entry belongs to a pooling process
|
|
705
|
+
for biosample_id in entry.keys():
|
|
706
|
+
pooling_info = pooling_data.get(biosample_id, {})
|
|
707
|
+
if pooling_info and pooling_info.get("pooling_process_id"):
|
|
708
|
+
pooling_process_id = pooling_info["pooling_process_id"]
|
|
709
|
+
break
|
|
710
|
+
|
|
711
|
+
if pooling_process_id:
|
|
712
|
+
if pooling_process_id not in pooling_groups:
|
|
713
|
+
pooling_groups[pooling_process_id] = {
|
|
714
|
+
"entries": [],
|
|
715
|
+
"processed_sample_id": pooling_info.get("processed_sample_id"),
|
|
716
|
+
"processed_sample_name": pooling_info.get(
|
|
717
|
+
"processed_sample_name", ""
|
|
718
|
+
),
|
|
719
|
+
}
|
|
720
|
+
pooling_groups[pooling_process_id]["entries"].append(entry)
|
|
721
|
+
else:
|
|
722
|
+
individual_entries.append(entry)
|
|
723
|
+
|
|
724
|
+
# Process pooled entries - create one SRA <Action> block per pooling process
|
|
725
|
+
for pooling_process_id, group_data in pooling_groups.items():
|
|
726
|
+
self._create_pooled_sra_action(
|
|
727
|
+
group_data["entries"],
|
|
728
|
+
group_data["processed_sample_id"],
|
|
729
|
+
group_data["processed_sample_name"],
|
|
730
|
+
bioproject_id,
|
|
731
|
+
org,
|
|
732
|
+
nmdc_nucleotide_sequencing,
|
|
733
|
+
nmdc_library_preparation,
|
|
734
|
+
all_instruments,
|
|
735
|
+
bsm_id_name_dict,
|
|
736
|
+
)
|
|
737
|
+
|
|
738
|
+
# Process individual entries
|
|
739
|
+
for entry in individual_entries:
|
|
343
740
|
fastq_files = []
|
|
344
741
|
biosample_ids = []
|
|
345
742
|
nucleotide_sequencing_ids = {}
|
|
@@ -524,6 +921,7 @@ class NCBISubmissionXML:
|
|
|
524
921
|
)
|
|
525
922
|
)
|
|
526
923
|
|
|
924
|
+
# Add library_name attribute
|
|
527
925
|
if library_name:
|
|
528
926
|
sra_attributes.append(
|
|
529
927
|
self.set_element(
|
|
@@ -569,6 +967,233 @@ class NCBISubmissionXML:
|
|
|
569
967
|
|
|
570
968
|
self.root.append(action)
|
|
571
969
|
|
|
970
|
+
def _create_pooled_sra_action(
    self,
    entries,
    processed_sample_id,
    processed_sample_name,
    bioproject_id,
    org,
    nmdc_nucleotide_sequencing,
    nmdc_library_preparation,
    all_instruments,
    bsm_id_name_dict,
):
    """Append one SRA ``<Action>`` covering a whole pooling process to ``self.root``.

    The files of every biosample in the pool are submitted together and the
    record references the processed (pooled) sample rather than the
    individual biosamples.

    Nothing is appended when ``processed_sample_id`` is falsy, when none of
    the data objects carries a ``url``, or when no non-empty sequencing ID
    is found for any biosample in the pool.

    Args:
        entries: List of ``{biosample_id: [data_object, ...]}`` dicts.
        processed_sample_id: SPUID used for the BioSample reference.
        processed_sample_name: Emitted as ``library_name`` when non-empty.
        bioproject_id: Text of the BioProject ``<PrimaryId>`` reference.
        org: Value of the ``spuid_namespace`` attribute on ``<SPUID>``.
        nmdc_nucleotide_sequencing: List of
            ``{biosample_id: [sequencing_record, ...]}`` dicts.
        nmdc_library_preparation: List of
            ``{biosample_id: library_preparation_record}`` dicts.
        all_instruments: Maps instrument ID to a dict with ``vendor`` and
            ``model``.
        bsm_id_name_dict: Unused here; kept so the signature matches the
            caller.

    Returns:
        None. The action element is appended to ``self.root``.
    """
    if not processed_sample_id:
        return

    # Collect file names and sequencing / library-prep metadata from every
    # biosample in the pool.
    all_fastq_files = set()
    nucleotide_sequencing_ids = {}
    lib_prep_protocol_names = {}
    analyte_category = ""
    instrument_vendor = ""
    instrument_model = ""

    for entry in entries:
        for biosample_id, data_objects in entry.items():
            for data_object in data_objects:
                if "url" in data_object:
                    # Only the file name (last URL path segment) is submitted.
                    url = urlparse(data_object["url"])
                    all_fastq_files.add(os.path.basename(url.path))

            # Get nucleotide sequencing info (the last record seen wins)
            for ntseq_dict in nmdc_nucleotide_sequencing:
                if biosample_id not in ntseq_dict:
                    continue
                for ntseq in ntseq_dict[biosample_id]:
                    nucleotide_sequencing_ids[biosample_id] = ntseq.get("id", "")
                    instrument_used = ntseq.get("instrument_used", [])
                    if instrument_used:
                        instrument = all_instruments.get(instrument_used[0], {})
                        instrument_vendor = instrument.get("vendor", "")
                        instrument_model = instrument.get("model", "")
                    analyte_category = ntseq.get("analyte_category", "")

            # Get library preparation info
            for lib_prep_dict in nmdc_library_preparation:
                if biosample_id in lib_prep_dict:
                    lib_prep_protocol_names[biosample_id] = (
                        lib_prep_dict[biosample_id]
                        .get("protocol_link", {})
                        .get("name", "")
                    )

    # Without files there is nothing to submit.
    if not all_fastq_files:
        return

    # Use the first non-empty nucleotide sequencing ID as the identifier.
    # Bail out explicitly when there is none, so the <Identifier> element
    # can never be referenced before it exists.
    omics_processing_id = next(
        (seq_id for seq_id in nucleotide_sequencing_ids.values() if seq_id),
        None,
    )
    if not omics_processing_id:
        return

    files_elements = [
        self.set_element(
            "File",
            "",
            {"file_path": f},
            [
                self.set_element(
                    "DataType",
                    "sra-run-fastq" if ".fastq" in f else "generic-data",
                )
            ],
        )
        for f in sorted(all_fastq_files)
    ]

    attribute_elements = [
        self.set_element(
            "AttributeRefId",
            attrib={"name": "BioProject"},
            children=[
                self.set_element(
                    "RefId",
                    children=[
                        self.set_element(
                            "PrimaryId",
                            bioproject_id,
                            {"db": "BioProject"},
                        )
                    ],
                )
            ],
        ),
        # Reference the processed sample, not individual biosamples
        self.set_element(
            "AttributeRefId",
            attrib={"name": "BioSample"},
            children=[
                self.set_element(
                    "RefId",
                    children=[
                        self.set_element(
                            "SPUID",
                            processed_sample_id,
                            {"spuid_namespace": org},
                        )
                    ],
                )
            ],
        ),
    ]

    def sra_attribute(name, text):
        """Build one SRA <Attribute name=...> element."""
        return self.set_element("Attribute", text, {"name": name})

    sra_attributes = []

    # Platform / model are only emitted for Illumina instruments, and the
    # model only when it is one of the known values below.
    if instrument_vendor == "illumina":
        sra_attributes.append(sra_attribute("platform", "ILLUMINA"))
        model_labels = {
            "nextseq_550": "NextSeq 550",
            "novaseq_6000": "NovaSeq 6000",
            "hiseq": "HiSeq",
        }
        if instrument_model in model_labels:
            sra_attributes.append(
                sra_attribute("instrument_model", model_labels[instrument_model])
            )

    if analyte_category == "metagenome":
        sra_attributes.append(sra_attribute("library_strategy", "WGS"))
        sra_attributes.append(sra_attribute("library_source", "METAGENOMIC"))
        sra_attributes.append(sra_attribute("library_selection", "RANDOM"))
    elif analyte_category == "metatranscriptome":
        sra_attributes.append(sra_attribute("library_source", "METATRANSCRIPTOMIC"))

    # Determine library layout based on file patterns: the pool is "paired"
    # when some "_R1" file has its "_R2" counterpart among the files.
    has_paired_reads = any(
        f.replace("_R1", "_R2") in all_fastq_files
        for f in all_fastq_files
        if "_R1" in f
    )
    sra_attributes.append(
        sra_attribute("library_layout", "paired" if has_paired_reads else "single")
    )

    # Add library_name attribute using ProcessedSample name
    if processed_sample_name:
        sra_attributes.append(sra_attribute("library_name", processed_sample_name))

    # Add library construction protocol from any of the biosamples
    # (only the first non-empty protocol name is used).
    lib_prep_name = next(
        (name for name in lib_prep_protocol_names.values() if name), None
    )
    if lib_prep_name:
        sra_attributes.append(
            sra_attribute("library_construction_protocol", lib_prep_name)
        )

    identifier_element = self.set_element(
        "Identifier",
        children=[
            self.set_element("SPUID", omics_processing_id, {"spuid_namespace": org})
        ],
    )

    action = self.set_element(
        "Action",
        children=[
            self.set_element(
                "AddFiles",
                attrib={"target_db": "SRA"},
                children=files_elements
                + attribute_elements
                + sra_attributes
                + [identifier_element],
            ),
        ],
    )

    self.root.append(action)
|
|
1196
|
+
|
|
572
1197
|
def get_submission_xml(
|
|
573
1198
|
self,
|
|
574
1199
|
biosamples_list: list,
|
|
@@ -576,19 +1201,47 @@ class NCBISubmissionXML:
|
|
|
576
1201
|
biosample_data_objects_list: list,
|
|
577
1202
|
biosample_library_preparation_list: list,
|
|
578
1203
|
instruments_dict: dict,
|
|
1204
|
+
pooled_biosamples_data=None,
|
|
579
1205
|
):
|
|
580
|
-
data_type = None
|
|
581
|
-
|
|
1206
|
+
# data_type = None
|
|
1207
|
+
|
|
1208
|
+
biosamples_to_exclude = set()
|
|
582
1209
|
for bsm_ntseq in biosample_nucleotide_sequencing_list:
|
|
583
|
-
for
|
|
1210
|
+
for bsm_id, ntseq_list in bsm_ntseq.items():
|
|
1211
|
+
# Check if any processing_institution is "JGI"
|
|
584
1212
|
for ntseq in ntseq_list:
|
|
585
|
-
if
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
1213
|
+
if (
|
|
1214
|
+
"processing_institution" in ntseq
|
|
1215
|
+
and ntseq["processing_institution"] == "JGI"
|
|
1216
|
+
):
|
|
1217
|
+
biosamples_to_exclude.add(bsm_id)
|
|
1218
|
+
break
|
|
1219
|
+
|
|
1220
|
+
# Filter biosample_nucleotide_sequencing_list to exclude JGI records
|
|
1221
|
+
filtered_nucleotide_sequencing_list = []
|
|
1222
|
+
for bsm_ntseq in biosample_nucleotide_sequencing_list:
|
|
1223
|
+
filtered_dict = {}
|
|
1224
|
+
for bsm_id, ntseq_list in bsm_ntseq.items():
|
|
1225
|
+
if bsm_id not in biosamples_to_exclude:
|
|
1226
|
+
filtered_dict[bsm_id] = ntseq_list
|
|
1227
|
+
if filtered_dict: # Only add non-empty dictionaries
|
|
1228
|
+
filtered_nucleotide_sequencing_list.append(filtered_dict)
|
|
1229
|
+
|
|
1230
|
+
# Filter biosamples_list to exclude JGI-processed biosamples
|
|
1231
|
+
filtered_biosamples_list = [
|
|
1232
|
+
biosample
|
|
1233
|
+
for biosample in biosamples_list
|
|
1234
|
+
if biosample.get("id") not in biosamples_to_exclude
|
|
1235
|
+
]
|
|
1236
|
+
|
|
1237
|
+
# Get data_type from filtered list
|
|
1238
|
+
# for bsm_ntseq in filtered_nucleotide_sequencing_list:
|
|
1239
|
+
# for _, ntseq_list in bsm_ntseq.items():
|
|
1240
|
+
# for ntseq in ntseq_list:
|
|
1241
|
+
# if "analyte_category" in ntseq:
|
|
1242
|
+
# data_type = handle_string_value(
|
|
1243
|
+
# ntseq["analyte_category"]
|
|
1244
|
+
# ).capitalize()
|
|
592
1245
|
|
|
593
1246
|
self.set_description(
|
|
594
1247
|
email=self.nmdc_pi_email,
|
|
@@ -597,30 +1250,68 @@ class NCBISubmissionXML:
|
|
|
597
1250
|
org=self.ncbi_submission_metadata.get("organization", ""),
|
|
598
1251
|
)
|
|
599
1252
|
|
|
600
|
-
if not
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
1253
|
+
# if not self.ncbi_bioproject_id:
|
|
1254
|
+
# self.set_bioproject(
|
|
1255
|
+
# title=self.nmdc_study_title,
|
|
1256
|
+
# project_id=self.ncbi_bioproject_id,
|
|
1257
|
+
# description=self.nmdc_study_description,
|
|
1258
|
+
# data_type=data_type,
|
|
1259
|
+
# org=self.ncbi_submission_metadata.get("organization", ""),
|
|
1260
|
+
# )
|
|
608
1261
|
|
|
609
1262
|
self.set_biosample(
|
|
610
1263
|
organism_name=self.ncbi_biosample_metadata.get("organism_name", ""),
|
|
611
1264
|
org=self.ncbi_submission_metadata.get("organization", ""),
|
|
612
|
-
bioproject_id=
|
|
613
|
-
nmdc_biosamples=
|
|
1265
|
+
bioproject_id=self.ncbi_bioproject_id,
|
|
1266
|
+
nmdc_biosamples=filtered_biosamples_list,
|
|
1267
|
+
pooled_biosamples_data=pooled_biosamples_data,
|
|
614
1268
|
)
|
|
615
1269
|
|
|
1270
|
+
# Also filter biosample_data_objects_list
|
|
1271
|
+
filtered_data_objects_list = []
|
|
1272
|
+
acceptable_extensions = [".fastq.gz", ".fastq"]
|
|
1273
|
+
|
|
1274
|
+
for entry in biosample_data_objects_list:
|
|
1275
|
+
filtered_entry = {}
|
|
1276
|
+
for biosample_id, data_objects in entry.items():
|
|
1277
|
+
if biosample_id not in biosamples_to_exclude:
|
|
1278
|
+
# filter data_objects based on acceptable/allowed extensions
|
|
1279
|
+
# for "url" key in data_object
|
|
1280
|
+
filtered_objects = []
|
|
1281
|
+
for data_object in data_objects:
|
|
1282
|
+
if "url" in data_object:
|
|
1283
|
+
url = urlparse(data_object["url"])
|
|
1284
|
+
file_path = os.path.basename(url.path)
|
|
1285
|
+
if any(
|
|
1286
|
+
file_path.endswith(ext) for ext in acceptable_extensions
|
|
1287
|
+
):
|
|
1288
|
+
filtered_objects.append(data_object)
|
|
1289
|
+
|
|
1290
|
+
if filtered_objects:
|
|
1291
|
+
filtered_entry[biosample_id] = filtered_objects
|
|
1292
|
+
|
|
1293
|
+
if filtered_entry: # Only add non-empty entries
|
|
1294
|
+
filtered_data_objects_list.append(filtered_entry)
|
|
1295
|
+
|
|
1296
|
+
# Filter library preparation list as well
|
|
1297
|
+
filtered_library_preparation_list = []
|
|
1298
|
+
for lib_prep_dict in biosample_library_preparation_list:
|
|
1299
|
+
filtered_lib_prep = {}
|
|
1300
|
+
for biosample_id, lib_prep in lib_prep_dict.items():
|
|
1301
|
+
if biosample_id not in biosamples_to_exclude:
|
|
1302
|
+
filtered_lib_prep[biosample_id] = lib_prep
|
|
1303
|
+
if filtered_lib_prep: # Only add non-empty entries
|
|
1304
|
+
filtered_library_preparation_list.append(filtered_lib_prep)
|
|
1305
|
+
|
|
616
1306
|
self.set_fastq(
|
|
617
|
-
biosample_data_objects=
|
|
618
|
-
bioproject_id=
|
|
1307
|
+
biosample_data_objects=filtered_data_objects_list,
|
|
1308
|
+
bioproject_id=self.ncbi_bioproject_id,
|
|
619
1309
|
org=self.ncbi_submission_metadata.get("organization", ""),
|
|
620
|
-
nmdc_nucleotide_sequencing=
|
|
621
|
-
nmdc_biosamples=
|
|
622
|
-
nmdc_library_preparation=
|
|
1310
|
+
nmdc_nucleotide_sequencing=filtered_nucleotide_sequencing_list,
|
|
1311
|
+
nmdc_biosamples=filtered_biosamples_list,
|
|
1312
|
+
nmdc_library_preparation=filtered_library_preparation_list,
|
|
623
1313
|
all_instruments=instruments_dict,
|
|
1314
|
+
pooled_biosamples_data=pooled_biosamples_data,
|
|
624
1315
|
)
|
|
625
1316
|
|
|
626
1317
|
rough_string = ET.tostring(self.root, "unicode")
|