nmdc-runtime 2.10.0__py3-none-any.whl → 2.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nmdc-runtime might be problematic.

Files changed (77)
  1. nmdc_runtime/Dockerfile +177 -0
  2. nmdc_runtime/api/analytics.py +22 -2
  3. nmdc_runtime/api/core/idgen.py +36 -6
  4. nmdc_runtime/api/db/mongo.py +0 -12
  5. nmdc_runtime/api/endpoints/find.py +65 -225
  6. nmdc_runtime/api/endpoints/lib/linked_instances.py +180 -0
  7. nmdc_runtime/api/endpoints/nmdcschema.py +65 -144
  8. nmdc_runtime/api/endpoints/objects.py +4 -11
  9. nmdc_runtime/api/endpoints/operations.py +0 -27
  10. nmdc_runtime/api/endpoints/queries.py +22 -0
  11. nmdc_runtime/api/endpoints/sites.py +0 -24
  12. nmdc_runtime/api/endpoints/util.py +57 -35
  13. nmdc_runtime/api/entrypoint.sh +7 -0
  14. nmdc_runtime/api/main.py +84 -60
  15. nmdc_runtime/api/models/util.py +12 -5
  16. nmdc_runtime/api/openapi.py +116 -180
  17. nmdc_runtime/api/swagger_ui/assets/custom-elements.js +522 -0
  18. nmdc_runtime/api/swagger_ui/assets/script.js +247 -0
  19. nmdc_runtime/api/swagger_ui/assets/style.css +155 -0
  20. nmdc_runtime/api/swagger_ui/swagger_ui.py +34 -0
  21. nmdc_runtime/minter/adapters/repository.py +21 -0
  22. nmdc_runtime/minter/domain/model.py +20 -0
  23. nmdc_runtime/site/changesheets/data/OmicsProcessing-to-catted-Biosamples.tsv +1561 -0
  24. nmdc_runtime/site/changesheets/scripts/missing_neon_soils_ecosystem_data.py +311 -0
  25. nmdc_runtime/site/changesheets/scripts/neon_soils_add_ncbi_ids.py +210 -0
  26. nmdc_runtime/site/dagster.yaml +53 -0
  27. nmdc_runtime/site/entrypoint-daemon.sh +26 -0
  28. nmdc_runtime/site/entrypoint-dagit-readonly.sh +26 -0
  29. nmdc_runtime/site/entrypoint-dagit.sh +26 -0
  30. nmdc_runtime/site/export/ncbi_xml.py +632 -11
  31. nmdc_runtime/site/export/ncbi_xml_utils.py +114 -0
  32. nmdc_runtime/site/graphs.py +7 -0
  33. nmdc_runtime/site/ops.py +92 -34
  34. nmdc_runtime/site/repository.py +2 -0
  35. nmdc_runtime/site/resources.py +16 -3
  36. nmdc_runtime/site/translation/submission_portal_translator.py +82 -14
  37. nmdc_runtime/site/workspace.yaml +13 -0
  38. nmdc_runtime/static/NMDC_logo.svg +1073 -0
  39. nmdc_runtime/static/ORCID-iD_icon_vector.svg +4 -0
  40. nmdc_runtime/static/README.md +5 -0
  41. nmdc_runtime/static/favicon.ico +0 -0
  42. nmdc_runtime/util.py +87 -1
  43. nmdc_runtime-2.11.1.dist-info/METADATA +46 -0
  44. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/RECORD +47 -57
  45. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/WHEEL +1 -2
  46. nmdc_runtime/api/endpoints/ids.py +0 -192
  47. nmdc_runtime/client/__init__.py +0 -0
  48. nmdc_runtime/containers.py +0 -14
  49. nmdc_runtime/core/__init__.py +0 -0
  50. nmdc_runtime/core/db/Database.py +0 -13
  51. nmdc_runtime/core/db/__init__.py +0 -0
  52. nmdc_runtime/core/exceptions/__init__.py +0 -23
  53. nmdc_runtime/core/exceptions/base.py +0 -47
  54. nmdc_runtime/core/exceptions/token.py +0 -13
  55. nmdc_runtime/domain/__init__.py +0 -0
  56. nmdc_runtime/domain/users/__init__.py +0 -0
  57. nmdc_runtime/domain/users/queriesInterface.py +0 -18
  58. nmdc_runtime/domain/users/userSchema.py +0 -37
  59. nmdc_runtime/domain/users/userService.py +0 -14
  60. nmdc_runtime/infrastructure/__init__.py +0 -0
  61. nmdc_runtime/infrastructure/database/__init__.py +0 -0
  62. nmdc_runtime/infrastructure/database/db.py +0 -3
  63. nmdc_runtime/infrastructure/database/models/__init__.py +0 -0
  64. nmdc_runtime/infrastructure/database/models/user.py +0 -1
  65. nmdc_runtime/lib/__init__.py +0 -1
  66. nmdc_runtime/lib/extract_nmdc_data.py +0 -33
  67. nmdc_runtime/lib/load_nmdc_data.py +0 -121
  68. nmdc_runtime/lib/nmdc_dataframes.py +0 -825
  69. nmdc_runtime/lib/nmdc_etl_class.py +0 -396
  70. nmdc_runtime/lib/transform_nmdc_data.py +0 -1117
  71. nmdc_runtime/site/drsobjects/__init__.py +0 -0
  72. nmdc_runtime/site/drsobjects/ingest.py +0 -93
  73. nmdc_runtime/site/drsobjects/registration.py +0 -131
  74. nmdc_runtime-2.10.0.dist-info/METADATA +0 -265
  75. nmdc_runtime-2.10.0.dist-info/top_level.txt +0 -1
  76. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/entry_points.txt +0 -0
  77. {nmdc_runtime-2.10.0.dist-info → nmdc_runtime-2.11.1.dist-info}/licenses/LICENSE +0 -0
nmdc_runtime/site/export/ncbi_xml_utils.py CHANGED
@@ -275,6 +275,120 @@ def load_mappings(url):
     return attribute_mappings, slot_range_mappings
 
 
+def check_pooling_for_biosamples(
+    material_processing_set: Collection, biosamples_list: List[Dict[str, Any]]
+) -> Dict[str, Dict[str, Any]]:
+    """Check which biosamples are part of pooling processes and return pooling information.
+
+    A biosample is considered part of a Pooling process when its id is asserted
+    on the `has_input` slot/key of an `nmdc:Pooling` process instance.
+
+    :param material_processing_set: reference to the material_processing_set collection
+    :param biosamples_list: list of all biosamples to check
+    :return: dictionary mapping biosample_id to pooling information (empty dict if not pooled)
+    """
+    result = {}
+    # build a lookup from biosample id to biosample document
+    biosample_lookup = {bs["id"]: bs for bs in biosamples_list}
+
+    # get list of all pooling processes
+    pooling_processes = list(material_processing_set.find({"type": "nmdc:Pooling"}))
+
+    # initialize all biosamples as not pooled
+    for biosample in biosamples_list:
+        result[biosample["id"]] = {}
+
+    # process each pooling process
+    for pooling_process in pooling_processes:
+        pooled_biosample_ids = pooling_process.get("has_input", [])
+
+        # get the processed sample output from the pooling process
+        has_output = pooling_process.get("has_output", [])
+        processed_sample_id = None
+
+        for output_id in has_output:
+            if get_classname_from_typecode(output_id) == "ProcessedSample":
+                processed_sample_id = output_id
+                break
+
+        # aggregate the `collection_date` and `depth` values asserted on each of
+        # the biosamples that are part of a given pooling process, e.g.:
+        #   aggregated `collection_date`: 2017-06-05T16:50Z/2017-06-05T17:47Z
+        #   aggregated `depth`: 0-10 m
+        collection_dates = []
+        depths = []
+
+        for bs_id in pooled_biosample_ids:
+            biosample = biosample_lookup.get(bs_id)
+            if not biosample:
+                continue
+
+            if "collection_date" in biosample:
+                collection_date = biosample["collection_date"]
+                if (
+                    isinstance(collection_date, dict)
+                    and "has_raw_value" in collection_date
+                ):
+                    collection_dates.append(collection_date["has_raw_value"])
+                elif isinstance(collection_date, str):
+                    collection_dates.append(collection_date)
+
+            if "depth" in biosample:
+                depth = biosample["depth"]
+                if isinstance(depth, dict):
+                    if "has_numeric_value" in depth:
+                        depths.append(depth["has_numeric_value"])
+                    elif (
+                        "has_minimum_numeric_value" in depth
+                        and "has_maximum_numeric_value" in depth
+                    ):
+                        depths.extend(
+                            [
+                                depth["has_minimum_numeric_value"],
+                                depth["has_maximum_numeric_value"],
+                            ]
+                        )
+                elif isinstance(depth, (int, float)):
+                    depths.append(depth)
+
+        # create aggregated (forward-slash-separated) value for `collection_date`
+        aggregated_collection_date = None
+        if collection_dates:
+            sorted_dates = sorted(collection_dates)
+            if len(sorted_dates) > 1:
+                aggregated_collection_date = f"{sorted_dates[0]}/{sorted_dates[-1]}"
+            else:
+                aggregated_collection_date = sorted_dates[0]
+
+        # create aggregated (hyphen-separated) value for `depth`
+        aggregated_depth = None
+        if depths:
+            min_depth = min(depths)
+            max_depth = max(depths)
+            if min_depth != max_depth:
+                aggregated_depth = f"{min_depth}-{max_depth} m"
+            else:
+                aggregated_depth = f"{min_depth} m"
+
+        # update all biosamples that are part of this pooling process
+        pooling_info = {
+            "processed_sample_id": processed_sample_id,
+            "pooling_process_id": pooling_process.get("id"),
+            "pooled_biosample_ids": pooled_biosample_ids,
+            "aggregated_collection_date": aggregated_collection_date,
+            "aggregated_depth": aggregated_depth,
+        }
+
+        for bs_id in pooled_biosample_ids:
+            if bs_id in result:
+                result[bs_id] = pooling_info
+
+    return result
+
+
 def validate_xml(xml, xsd_url):
     response = requests.get(xsd_url)
     response.raise_for_status()
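For a concrete sense of the aggregation semantics, here is a minimal sketch of calling the new helper. All ids and values are invented, the pymongo collection is stubbed out, and it is assumed that nmdc-runtime is importable and that `get_classname_from_typecode` resolves the `has_output` id below to "ProcessedSample".

    from nmdc_runtime.site.export.ncbi_xml_utils import check_pooling_for_biosamples

    class FakePoolingCollection:
        # Stand-in for the material_processing_set pymongo collection.
        def find(self, query):
            return [
                {
                    "id": "nmdc:poolp-11-ddd444",
                    "type": "nmdc:Pooling",
                    "has_input": ["nmdc:bsm-11-aaa111", "nmdc:bsm-11-bbb222"],
                    "has_output": ["nmdc:procsm-11-ccc333"],
                }
            ]

    biosamples = [
        {
            "id": "nmdc:bsm-11-aaa111",
            "collection_date": "2017-06-05T16:50Z",
            "depth": {"has_numeric_value": 0},
        },
        {
            "id": "nmdc:bsm-11-bbb222",
            "collection_date": "2017-06-05T17:47Z",
            "depth": {"has_numeric_value": 10},
        },
    ]

    info = check_pooling_for_biosamples(FakePoolingCollection(), biosamples)
    # Both pooled biosample ids map to the same pooling_info, e.g.:
    #   info["nmdc:bsm-11-aaa111"]["aggregated_collection_date"]
    #       == "2017-06-05T16:50Z/2017-06-05T17:47Z"
    #   info["nmdc:bsm-11-aaa111"]["aggregated_depth"] == "0-10 m"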
nmdc_runtime/site/graphs.py CHANGED
@@ -53,6 +53,7 @@ from nmdc_runtime.site.ops import (
     get_data_objects_from_biosamples,
     get_nucleotide_sequencing_from_biosamples,
     get_library_preparation_from_biosamples,
+    get_aggregated_pooled_biosamples,
     get_all_instruments,
     get_ncbi_export_pipeline_inputs,
     ncbi_submission_xml_from_nmdc_study,
@@ -173,6 +174,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
         data_object_mapping_file_url,
         biosample_extras_file_url,
         biosample_extras_slot_mapping_file_url,
+        study_id,
     ) = get_submission_portal_pipeline_inputs()
 
     metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
@@ -193,6 +195,7 @@ def translate_metadata_submission_to_nmdc_schema_database():
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
         instrument_mapping=instrument_mapping,
+        study_id=study_id,
     )
 
     validate_metadata(database)
@@ -213,6 +216,7 @@ def ingest_metadata_submission():
         data_object_mapping_file_url,
         biosample_extras_file_url,
         biosample_extras_slot_mapping_file_url,
+        study_id,
     ) = get_submission_portal_pipeline_inputs()
 
     metadata_submission = fetch_nmdc_portal_submission_by_id(submission_id)
@@ -233,6 +237,7 @@ def ingest_metadata_submission():
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
         instrument_mapping=instrument_mapping,
+        study_id=study_id,
     )
 
     log_database_ids(database)
@@ -472,6 +477,7 @@ def nmdc_study_to_ncbi_submission_export():
     )
     data_object_records = get_data_objects_from_biosamples(biosamples)
     library_preparation_records = get_library_preparation_from_biosamples(biosamples)
+    pooled_biosamples_data = get_aggregated_pooled_biosamples(biosamples)
     all_instruments = get_all_instruments()
     xml_data = ncbi_submission_xml_from_nmdc_study(
         nmdc_study,
@@ -481,6 +487,7 @@ def nmdc_study_to_ncbi_submission_export():
         data_object_records,
         library_preparation_records,
         all_instruments,
+        pooled_biosamples_data,
     )
     ncbi_submission_xml_asset(xml_data)
 
nmdc_runtime/site/ops.py CHANGED
@@ -1,7 +1,6 @@
 import csv
 import json
 import logging
-import mimetypes
 import os
 import subprocess
 from collections import defaultdict
@@ -16,6 +15,7 @@ from ontology_loader.ontology_load_controller import OntologyLoaderController
 import pandas as pd
 import requests
 from refscan.lib.helpers import get_names_of_classes_in_effective_range_of_slot
+from toolz import dissoc
 
 from bson import ObjectId, json_util
 from dagster import (
@@ -73,7 +73,6 @@ from nmdc_runtime.site.export.ncbi_xml_utils import (
     fetch_nucleotide_sequencing_from_biosamples,
     fetch_library_preparation_from_biosamples,
 )
-from nmdc_runtime.site.drsobjects.ingest import mongo_add_docs_result_as_dict
 from nmdc_runtime.site.resources import (
     NmdcPortalApiClient,
     GoldApiClient,
@@ -95,15 +94,12 @@ from nmdc_runtime.site.translation.submission_portal_translator import (
 )
 from nmdc_runtime.site.repair.database_updater import DatabaseUpdater
 from nmdc_runtime.site.util import (
-    run_and_log,
     schema_collection_has_index_on_id,
     nmdc_study_id_to_filename,
     get_instruments_by_id,
 )
 from nmdc_runtime.util import (
-    drs_object_in_for,
     pluralize,
-    put_object,
     specialize_activity_set_docs,
     collection_name_to_class_names,
     nmdc_schema_view,
@@ -112,7 +108,7 @@ from nmdc_runtime.util import (
 from nmdc_schema import nmdc
 from pymongo import InsertOne, UpdateOne
 from pymongo.database import Database as MongoDatabase
-from starlette import status
+from pymongo.collection import Collection as MongoCollection
 from toolz import get_in, valfilter, identity
 
 
@@ -373,6 +369,9 @@ def perform_changesheet_updates(context, sheet_in: ChangesheetIn):
 
 @op(required_resource_keys={"runtime_api_site_client"})
 def get_json_in(context):
+    """
+    TODO: Document this function.
+    """
     object_id = context.op_config.get("object_id")
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     rv = client.get_object_bytes(object_id)
@@ -385,6 +384,9 @@ def get_json_in(context):
 
 @op(required_resource_keys={"runtime_api_site_client", "mongo"})
 def perform_mongo_updates(context, json_in):
+    """
+    TODO: Document this function.
+    """
     mongo = context.resources.mongo
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
     op_id = context.op_config.get("operation_id")
@@ -414,6 +416,9 @@ def perform_mongo_updates(context, json_in):
 def _add_schema_docs_with_or_without_replacement(
     mongo: MongoDBResource, docs: Dict[str, list]
 ):
+    """
+    TODO: Document this function.
+    """
     coll_index_on_id_map = schema_collection_has_index_on_id(mongo.db)
     if all(coll_index_on_id_map[coll] for coll in docs.keys()):
         replace = True
@@ -437,7 +442,13 @@ def _add_schema_docs_with_or_without_replacement(
             f"{colls_not_id_indexed=} ; {colls_id_indexed=}"
         )
     op_result = mongo.add_docs(docs, validate=False, replace=replace)
-    return mongo_add_docs_result_as_dict(op_result)
+
+    # Translate the operation result into a dictionary in which each item's key is a collection name
+    # and each item's value is the corresponding bulk API result (excluding the "upserted" field).
+    return {
+        collection_name: dissoc(bulk_write_result.bulk_api_result, "upserted")
+        for collection_name, bulk_write_result in op_result.items()
+    }
 
 
 @op(required_resource_keys={"mongo"})
@@ -545,27 +556,39 @@ def nmdc_schema_database_from_gold_study(
 
 
 @op(
+    required_resource_keys={"mongo"},
     out={
         "submission_id": Out(),
         "nucleotide_sequencing_mapping_file_url": Out(Optional[str]),
         "data_object_mapping_file_url": Out(Optional[str]),
         "biosample_extras_file_url": Out(Optional[str]),
         "biosample_extras_slot_mapping_file_url": Out(Optional[str]),
+        "study_id": Out(Optional[str]),
     },
 )
 def get_submission_portal_pipeline_inputs(
+    context: OpExecutionContext,
     submission_id: str,
     nucleotide_sequencing_mapping_file_url: Optional[str],
     data_object_mapping_file_url: Optional[str],
    biosample_extras_file_url: Optional[str],
    biosample_extras_slot_mapping_file_url: Optional[str],
-) -> Tuple[str, str | None, str | None, str | None, str | None]:
+    study_id: Optional[str],
+) -> Tuple[str, str | None, str | None, str | None, str | None, str | None]:
+    # query for studies matching the ID to see if it exists
+    if study_id:
+        mdb = context.resources.mongo.db
+        result = mdb.study_set.find_one({"id": study_id})
+        if not result:
+            raise Exception(f"Study id: {study_id} does not exist in Mongo.")
+
     return (
         submission_id,
         nucleotide_sequencing_mapping_file_url,
         data_object_mapping_file_url,
         biosample_extras_file_url,
         biosample_extras_slot_mapping_file_url,
+        study_id,
     )
 
 
@@ -590,6 +613,7 @@ def translate_portal_submission_to_nmdc_schema_database(
     study_pi_image_url: Optional[str],
     biosample_extras: Optional[list[dict]],
     biosample_extras_slot_mapping: Optional[list[dict]],
+    study_id: Optional[str],
 ) -> nmdc.Database:
     client: RuntimeApiSiteClient = context.resources.runtime_api_site_client
 
@@ -607,6 +631,7 @@ def translate_portal_submission_to_nmdc_schema_database(
         biosample_extras=biosample_extras,
         biosample_extras_slot_mapping=biosample_extras_slot_mapping,
         illumina_instrument_mapping=instrument_mapping,
+        study_id=study_id,
     )
     database = translator.get_database()
     return database
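Since `mongo_add_docs_result_as_dict` was removed along with the drsobjects module, the inlined dict comprehension in `_add_schema_docs_with_or_without_replacement` above now performs that translation directly. For reference, `toolz.dissoc` returns a copy of a mapping without the named keys; the sketch below (the result shape is illustrative, not an exact pymongo payload) shows the intended effect of dropping the potentially large "upserted" array:

    from toolz import dissoc

    # pymongo's BulkWriteResult.bulk_api_result carries counters plus an
    # "upserted" list of {"index": ..., "_id": ...} entries.
    bulk_api_result = {
        "nInserted": 0,
        "nUpserted": 2,
        "nMatched": 0,
        "upserted": [
            {"index": 0, "_id": "nmdc:bsm-11-aaa111"},
            {"index": 1, "_id": "nmdc:bsm-11-bbb222"},
        ],
    }
    summary = dissoc(bulk_api_result, "upserted")
    # {"nInserted": 0, "nUpserted": 2, "nMatched": 0}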
@@ -947,7 +972,9 @@ def load_ontology(context: OpExecutionContext):
 
 
 def _add_linked_instances_to_alldocs(
-    temp_collection, context, document_reference_ranged_slots_by_type
+    temp_collection: MongoCollection,
+    context: OpExecutionContext,
+    document_reference_ranged_slots_by_type: dict,
 ) -> None:
     """
     Adds {`_upstream`,`_downstream`} fields to each document in the temporary alldocs collection.
@@ -983,16 +1010,13 @@ def _add_linked_instances_to_alldocs(
         # Store the full type with prefix intact
        doc_type = doc["type"]
         # For looking up reference slots, we still need the type without prefix
-        # FIXME `document_reference_ranged_slots_by_type` should key on `doc_type`
         doc_type_no_prefix = doc_type[5:] if doc_type.startswith("nmdc:") else doc_type
 
         # Record ID to type mapping - preserve the original type with prefix
         id_to_type_map[doc_id] = doc_type
 
         # Find all document references from this document
-        reference_slots = document_reference_ranged_slots_by_type.get(
-            doc_type_no_prefix, []
-        )
+        reference_slots = document_reference_ranged_slots_by_type.get(doc_type, [])
         for slot in reference_slots:
             if slot in doc:
                 # Handle both single-value and array references
@@ -1116,7 +1140,7 @@ def _add_linked_instances_to_alldocs(
 # Reference: https://docs.dagster.io/guides/build/ops/graphs#defining-nothing-dependencies
 #
 @op(required_resource_keys={"mongo"}, ins={"waits_for": In(dagster_type=Nothing)})
-def materialize_alldocs(context) -> int:
+def materialize_alldocs(context: OpExecutionContext) -> int:
     """
     This function (re)builds the `alldocs` collection to reflect the current state of the MongoDB database by:
 
@@ -1167,17 +1191,16 @@ def materialize_alldocs(context) -> int:
         )
     )
 
-    # FIXME rename to `document_reference_ranged_slots_by_type`
-    # FIXME key on CURIE, e.g. `nmdc:Study`
-    # (here, not upstream in `cls_slot_map`/`document_referenceable_ranges`, b/c `schema_view` used directly in those)
-    document_reference_ranged_slots = defaultdict(list)
+    document_reference_ranged_slots_by_type = defaultdict(list)
     for cls_name, slot_map in cls_slot_map.items():
         for slot_name, slot in slot_map.items():
             if (
                 set(get_names_of_classes_in_effective_range_of_slot(schema_view, slot))
                 & document_referenceable_ranges
             ):
-                document_reference_ranged_slots[cls_name].append(slot_name)
+                document_reference_ranged_slots_by_type[f"nmdc:{cls_name}"].append(
+                    slot_name
+                )
 
     # Build `alldocs` to a temporary collection for atomic replacement
     # https://www.mongodb.com/docs/v6.0/reference/method/db.collection.renameCollection/#resource-locking-in-replica-sets
@@ -1194,25 +1217,19 @@ def materialize_alldocs(context) -> int:
                 # Keep the full type with prefix for document
                 doc_type_full = doc["type"]
                 # Remove prefix for slot lookup and ancestor lookup
-                doc_type = (
-                    doc_type_full[5:]
-                    if doc_type_full.startswith("nmdc:")
-                    else doc_type_full
-                )
+                doc_type = doc_type_full.removeprefix("nmdc:")
            except KeyError:
                 raise Exception(
                     f"doc {doc['id']} in collection {coll_name} has no 'type'!"
                 )
-            slots_to_include = ["id", "type"] + document_reference_ranged_slots[
-                doc_type
+            slots_to_include = ["id", "type"] + document_reference_ranged_slots_by_type[
+                doc_type_full
             ]
             new_doc = keyfilter(lambda slot: slot in slots_to_include, doc)
 
-            new_doc["_type_and_ancestors"] = schema_view.class_ancestors(doc_type)
             # Get ancestors without the prefix, but add prefix to each one in the output
-            ancestors = schema_view.class_ancestors(doc_type)
             new_doc["_type_and_ancestors"] = [
-                "nmdc:" + a if not a.startswith("nmdc:") else a for a in ancestors
+                f"nmdc:{a}" for a in schema_view.class_ancestors(doc_type)
             ]
             # InsertOne is a pymongo representation of a mongo command.
             write_operations.append(InsertOne(new_doc))
@@ -1221,7 +1238,7 @@ def materialize_alldocs(context) -> int:
             write_operations.clear()
             documents_processed_counter += BULK_WRITE_BATCH_SIZE
     if len(write_operations) > 0:
-        # here bulk_write is a method on the py-mongo db Client class
+        # here bulk_write is a method on the pymongo db Collection class
         _ = temp_alldocs_collection.bulk_write(write_operations, ordered=False)
         documents_processed_counter += len(write_operations)
     context.log.info(
@@ -1238,15 +1255,18 @@ def materialize_alldocs(context) -> int:
     # so that `temp_alldocs_collection` will be "good to go" on renaming.
     temp_alldocs_collection.create_index("id", unique=True)
     # Add indexes to improve performance of `GET /data_objects/study/{study_id}`:
-    # TODO add indexes on each of `set(document_reference_ranged_slots.values())`.
-    slots_to_index = ["has_input", "has_output", "was_informed_by"]
+    slots_to_index = {"_type_and_ancestors"} | {
+        slot
+        for slots in document_reference_ranged_slots_by_type.values()
+        for slot in slots
+    }
     [temp_alldocs_collection.create_index(slot) for slot in slots_to_index]
-    context.log.info(f"created indexes on id, {slots_to_index}.")
+    context.log.info(f"created indexes on id and on each of {slots_to_index=}.")
 
     # Add related-ids fields to enable efficient relationship traversal
     context.log.info("Adding fields for related ids to documents...")
     _add_linked_instances_to_alldocs(
-        temp_alldocs_collection, context, document_reference_ranged_slots
+        temp_alldocs_collection, context, document_reference_ranged_slots_by_type
     )
     context.log.info("Creating {`_upstream`,`_downstream`} indexes...")
     temp_alldocs_collection.create_index("_upstream.id")
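With every `_type_and_ancestors` entry now a `nmdc:`-prefixed CURIE, and with indexes on that field plus every document-reference-ranged slot, subtype-aware lookups against `alldocs` can stay index-backed. A hypothetical query sketch (the class name and id are assumptions, and `mdb` stands for the pymongo database handle):

    # Find alldocs entries that are (subtypes of) nmdc:PlannedProcess
    # and that reference a given biosample as an input.
    docs = mdb.alldocs.find(
        {
            "_type_and_ancestors": "nmdc:PlannedProcess",
            "has_input": "nmdc:bsm-11-aaa111",
        }
    )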
@@ -1350,6 +1370,42 @@ def get_library_preparation_from_biosamples(
     return biosample_lib_prep
 
 
+@op(required_resource_keys={"mongo"})
+def get_aggregated_pooled_biosamples(context: OpExecutionContext, biosamples: list):
+    from nmdc_runtime.site.export.ncbi_xml_utils import check_pooling_for_biosamples
+
+    mdb = context.resources.mongo.db
+    material_processing_set = mdb["material_processing_set"]
+    pooled_biosamples_data = check_pooling_for_biosamples(
+        material_processing_set, biosamples
+    )
+
+    # Fetch ProcessedSample names from database
+    processed_sample_ids = set()
+    for biosample_id, pooling_info in pooled_biosamples_data.items():
+        if pooling_info and pooling_info.get("processed_sample_id"):
+            processed_sample_ids.add(pooling_info["processed_sample_id"])
+
+    # Query database for ProcessedSample names
+    if processed_sample_ids:
+        processed_sample_set = mdb["processed_sample_set"]
+        cursor = processed_sample_set.find(
+            {"id": {"$in": list(processed_sample_ids)}}, {"id": 1, "name": 1}
+        )
+        processed_samples = {doc["id"]: doc.get("name", "") for doc in cursor}
+
+        # Update pooled_biosamples_data with ProcessedSample names
+        for biosample_id, pooling_info in pooled_biosamples_data.items():
+            if pooling_info and pooling_info.get("processed_sample_id"):
+                processed_sample_id = pooling_info["processed_sample_id"]
+                if processed_sample_id in processed_samples:
+                    pooling_info["processed_sample_name"] = processed_samples[
+                        processed_sample_id
+                    ]
+
+    return pooled_biosamples_data
+
+
 @op(required_resource_keys={"mongo"})
 def get_all_instruments(context: OpExecutionContext) -> dict[str, dict]:
     mdb = context.resources.mongo.db
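After the name lookup in `get_aggregated_pooled_biosamples` above, each pooled biosample id maps to its pooling info enriched with a `processed_sample_name`, while biosamples that are not part of any Pooling process keep an empty dict. An illustrative return value (all ids and the name are invented):

    {
        "nmdc:bsm-11-aaa111": {
            "processed_sample_id": "nmdc:procsm-11-ccc333",
            "pooling_process_id": "nmdc:poolp-11-ddd444",
            "pooled_biosample_ids": ["nmdc:bsm-11-aaa111", "nmdc:bsm-11-bbb222"],
            "aggregated_collection_date": "2017-06-05T16:50Z/2017-06-05T17:47Z",
            "aggregated_depth": "0-10 m",
            "processed_sample_name": "Pooled sample ABC",
        },
        "nmdc:bsm-11-zzz999": {},  # not part of any pooling process
    }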
@@ -1383,6 +1439,7 @@ def ncbi_submission_xml_from_nmdc_study(
     data_object_records: list,
     library_preparation_records: list,
     all_instruments: dict,
+    pooled_biosamples_data: dict,
 ) -> str:
     ncbi_exporter = NCBISubmissionXML(nmdc_study, ncbi_exporter_metadata)
     ncbi_xml = ncbi_exporter.get_submission_xml(
@@ -1391,6 +1448,7 @@ def ncbi_submission_xml_from_nmdc_study(
         data_object_records,
         library_preparation_records,
         all_instruments,
+        pooled_biosamples_data,
     )
     return ncbi_xml
 
nmdc_runtime/site/repository.py CHANGED
@@ -502,6 +502,7 @@ def biosample_submission_ingest():
                         "data_object_mapping_file_url": None,
                         "biosample_extras_file_url": None,
                         "biosample_extras_slot_mapping_file_url": None,
+                        "study_id": None,
                     }
                 },
                 "translate_portal_submission_to_nmdc_schema_database": {
@@ -538,6 +539,7 @@ def biosample_submission_ingest():
                         "data_object_mapping_file_url": None,
                         "biosample_extras_file_url": None,
                         "biosample_extras_slot_mapping_file_url": None,
+                        "study_id": None,
                     }
                 },
                 "translate_portal_submission_to_nmdc_schema_database": {
nmdc_runtime/site/resources.py CHANGED
@@ -520,11 +520,24 @@ class MongoDB:
         self.db = self.client[dbname]
 
     def add_docs(self, docs, validate=True, replace=True):
+        """
+        TODO: Document this function.
+        """
         try:
             if validate:
                 nmdc_jsonschema_validator_noidpatterns(docs)
             rv = {}
-            for collection_name, docs in docs.items():
+            for collection_name, collection_docs in docs.items():
+                # If `collection_docs` is empty, abort this iteration.
+                #
+                # Note: We do this because the `bulk_write` method called below will raise
+                # an `InvalidOperation` exception if it is passed 0 operations.
+                #
+                # Reference: https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.bulk_write
+                #
+                if len(collection_docs) == 0:
+                    continue
+
                 rv[collection_name] = self.db[collection_name].bulk_write(
                     [
                         (
@@ -532,7 +545,7 @@ class MongoDB:
                             if replace
                             else InsertOne(d)
                         )
-                        for d in docs
+                        for d in collection_docs
                     ]
                 )
                 now = datetime.now(timezone.utc)
@@ -544,7 +557,7 @@ class MongoDB:
                             "ts": now,
                             # "dtl": {},
                         }
-                        for d in docs
+                        for d in collection_docs
                     ]
                 )
             return rv
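The empty-batch guard matters because pymongo raises when asked to execute zero operations. A minimal demonstration (assumes pymongo is installed; the check happens client side, so a live server should not be strictly required to trigger the exception):

    from pymongo import MongoClient
    from pymongo.errors import InvalidOperation

    coll = MongoClient()["test"]["demo"]
    try:
        coll.bulk_write([])  # zero operations
    except InvalidOperation as exc:
        print(exc)  # e.g. "No operations to execute"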