npm - @sjcrh/proteinpaint-server - Versions diffs - 2.30.1 → 2.30.2 - Mend

@sjcrh/proteinpaint-server 2.30.1 → 2.30.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5)

package/package.json +2 -2
package/routes/gdc.maf.ts +86 -24
package/routes/gdc.mafBuild.ts +105 -0
package/routes/gdc.topVariablyExpressedGenes.ts +1 -1
package/server.js +1 -1

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@sjcrh/proteinpaint-server",
-  "version": "2.30.1",
+  "version": "2.30.2",
   "description": "a genomics visualization tool for exploring a cohort's genotype and phenotype data",
   "main": "server.js",
   "bin": "start.js",
@@ -56,7 +56,7 @@
   },
   "dependencies": {
     "@sjcrh/augen": "2.27.0",
-    "@sjcrh/proteinpaint-rust": "2.29.6",
+    "@sjcrh/proteinpaint-rust": "2.30.2",
     "better-sqlite3": "^7.5.3",
     "body-parser": "^1.15.2",
     "canvas": "~2.9.3",

package/routes/gdc.maf.ts CHANGED

@@ -4,19 +4,22 @@ import path from 'path'
 import got from 'got'
 const apihost = process.env.PP_GDC_HOST || 'https://api.gdc.cancer.gov'
+const maxFileNumber = 1000
+const allowedWorkflowType = 'Aliquot Ensemble Somatic Variant Merging and Masking'
+const allowedExpStrategy = new Set(['WXS', 'Targeted Sequencing'])
 export const api = {
 	endpoint: 'gdc/maf',
 	methods: {
 		get: {
 			init({ genomes }) {
-				// genomes parameter is not used
-				// could be used later to verify hg38/GDC is on this instance and otherwise disable this route..
 				return async (req: any, res: any): Promise<void> => {
 					try {
-						const files = await listMafFiles(req)
-						const payload = { files } as GdcMafResponse
+						const g = genomes.hg38
+						if (!g) throw 'hg38 missing'
+						const ds = g.datasets.GDC
+						if (!ds) throw 'hg38 GDC missing'
+						const payload = await listMafFiles(req)
 						res.send(payload)
 					} catch (e: any) {
 						res.send({ status: 'error', error: e.message || e })
@@ -39,16 +42,25 @@ export const api = {
 /*
 req.query {
 	filter0 // optional gdc GFF cohort filter, invisible and read only
+	experimentalStrategy: WXS/Targeted Sequencing
+}
+ds {
+	__gdc {
+		gdcOpenProjects
+	}
 }
 */
 async function listMafFiles(req: any) {
+	if (!allowedExpStrategy.has(req.query.experimentalStrategy)) throw 'invalid req.query.experimentalStrategy'
 	const filters = {
 		op: 'and',
 		content: [
-			{
-				op: '=',
-				content: { field: 'data_format', value: 'MAF' }
-			}
+			{ op: '=', content: { field: 'data_format', value: 'MAF' } },
+			{ op: '=', content: { field: 'experimental_strategy', value: req.query.experimentalStrategy } },
+			{ op: '=', content: { field: 'analysis.workflow_type', value: allowedWorkflowType } },
+			{ op: '=', content: { field: 'access', value: 'open' } } // delete if later to support controlled files
 		]
 	}
@@ -60,16 +72,14 @@ async function listMafFiles(req: any) {
 	const data = {
 		filters,
-		size: 1000,
+		size: maxFileNumber,
 		fields: [
 			'id',
 			'file_size',
-			'experimental_strategy',
+			'cases.project.project_id', // for display only
 			'cases.submitter_id', // used when listing all cases & files
-			//'associated_entities.entity_submitter_id', // semi human readable
-			//'associated_entities.case_id', // case uuid
-			'cases.samples.sample_type',
-			'analysis.workflow_type' // to drop out those as skip_workflow_type
+			'cases.samples.sample_type'
+			// may add diagnosis and primary site
 		].join(',')
 	}
@@ -81,26 +91,78 @@ async function listMafFiles(req: any) {
 	} catch (e) {
 		throw 'invalid JSON from ' + api.endpoint
 	}
+	if (!Number.isInteger(re.data?.pagination?.total)) throw 're.data.pagination.total is not int'
 	if (!Array.isArray(re.data?.hits)) throw 're.data.hits[] not array'
 	// flatten api return to table row objects
 	// it is possible to set a max size limit to limit the number of files passed to client
 	const files = [] as File[]
 	for (const h of re.data.hits) {
+		/*
+		{
+		  "id": "39768777-fec5-4a79-9515-65712c002b19",
+		  "cases": [
+			{
+			  "submitter_id": "HTMCP-03-06-02104",
+			  "project": {
+			  	"project_id":"xx"
+			  },
+			  "samples": [
+				{
+				  "sample_type": "Blood Derived Normal"
+				},
+				{
+				  "sample_type": "Primary Tumor"
+				}
+			  ]
+			}
+		  ],
+		  "analysis": {
+			"workflow_type": "MuSE Annotation"
+		  },
+		  "experimental_strategy": "Targeted Sequencing",
+		  "file_size": 146038
+		}
+		*/
+		const c = h.cases?.[0]
+		if (!c) throw 'h.cases[0] missing'
+		// only keep files from open access projects for now
+		/*
+		if (c.project?.project_id) {
+			if (ds.__gdc.gdcOpenProjects.has(c.project.project_id)) {
+				// open-access project, keep
+			} else {
+				// not open access
+				continue
+			}
+		} else {
+			throw 'h.cases[0].project.project_id missing'
+		}
+		*/
 		const file = {
 			id: h.id,
-			workflow_type: h.analysis?.workflow_type,
-			experimental_strategy: h.experimental_strategy,
+			project_id: c.project.project_id,
 			file_size: fileSize(h.file_size)
 		} as File
-		const c = h.cases?.[0]
-		if (c) {
-			file.case_submitter_id = c.submitter_id
-			if (c.samples) {
-				file.sample_types = c.samples.map(i => i.sample_type).join(', ')
-			}
+		file.case_submitter_id = c.submitter_id
+		if (c.samples) {
+			file.sample_types = c.samples.map(i => i.sample_type).sort()
+			// sort to show sample type names in consistent alphabetical order
+			// otherwise one file shows 'Blood, Primary' and another shows 'Primary, Blood'
+			// FIXME this includes samples not associated with current maf file
 		}
 		files.push(file)
 	}
-	return files
+	const result = {
+		files,
+		filesTotal: re.data.pagination.total
+	} as GdcMafResponse
+	return result
 }

package/routes/gdc.mafBuild.ts ADDED

@@ -0,0 +1,105 @@
+import got from 'got'
+import path from 'path'
+import fs from 'fs'
+import { run_rust } from '@sjcrh/proteinpaint-rust'
+import serverconfig from '#src/serverconfig.js'
+const apihost = process.env.PP_GDC_HOST || 'https://api.gdc.cancer.gov'
+const maxTotalSizeCompressed = serverconfig.features.gdcMafMaxFileSize || 50000000 // 50Mb
+export const api = {
+	endpoint: 'gdc/mafBuild',
+	methods: {
+		all: {
+			init({ genomes }) {
+				return async (req: any, res: any): Promise<void> => {
+					try {
+						await buildMaf(req, res)
+					} catch (e) {
+						if (e.stack) console.log(e.stack)
+						res.send({ status: 'error', error: e.message || e })
+					}
+				}
+			},
+			request: {
+				typeId: null
+				//valid: default to type checker
+			},
+			response: {
+				typeId: 'GdcMafBuildResponse'
+				// will combine this with type checker
+				//valid: (t) => {}
+			}
+		}
+	}
+}
+/*
+req.query {
+	fileIdLst []  // list of maf file uuids
+}
+res{}
+*/
+async function buildMaf(req: any, res: any) {
+	const fileLst2 = (await getFileLstUnderSizeLimit(req.query.fileIdLst)) as string[]
+	const outFile = path.join(serverconfig.cachedir, 'gdcMaf.' + Math.random().toString()) // should be a gzipped file. does it need to end with '.gz' or it's auto-added?
+	const arg = {
+		fileIdLst: fileLst2,
+		host: path.join(apihost, 'data'), // must use the /data/ endpoint from current host
+		outFile
+	}
+	await run_rust('gdcmaf', JSON.stringify(arg))
+	const data = await fs.promises.readFile(outFile)
+	// by directly returning a blob, it won't tell client how many files are used
+	res.writeHead(200, {
+		'Content-Type': 'application/octet-stream',
+		'Content-Disposition': 'attachment; filename=cohort.maf.gz',
+		'Content-Length': data.length
+	})
+	res.end(Buffer.from(data, 'binary'))
+}
+/*
+query api get size of each input maf file, and only process those files with total size under a set limit,
+excess files are not processed in order not to crash server
+must not rely on file size sent by client, as that can be spoofed and never to be trusted
+it's inexpensive to query api for this
+*/
+async function getFileLstUnderSizeLimit(lst: string[]) {
+	if (lst.length == 0) throw 'fileIdLst[] not array or blank'
+	const data = {
+		filters: {
+			op: 'in',
+			content: { field: 'file_id', value: lst }
+		},
+		size: 1000,
+		fields: 'file_size'
+	}
+	const headers = { 'Content-Type': 'application/json', Accept: 'application/json' }
+	const response = await got.post(path.join(apihost, 'files'), { headers, body: JSON.stringify(data) })
+	let re
+	try {
+		re = JSON.parse(response.body)
+	} catch (e) {
+		throw 'invalid json from getFileLstUnderSizeLimit'
+	}
+	if (!Array.isArray(re.data?.hits)) throw 're.data.hits[] not array'
+	const out = [] as string[]
+	let cumsize = 0
+	for (const h of re.data.hits) {
+		if (cumsize >= maxTotalSizeCompressed) break // maxed out
+		if (!h.id) throw '.id missing'
+		if (!Number.isInteger(h.file_size)) throw '.file_size not integer'
+		cumsize += h.file_size
+		out.push(h.id)
+	}
+	if (out.length == 0) throw 'no file available'
+	return out
+}

package/routes/gdc.topVariablyExpressedGenes.ts CHANGED

@@ -2,7 +2,7 @@ import { GdcTopVariablyExpressedGenesResponse } from '#shared/types/routes/gdc.t
 import { getCasesWithExressionDataFromCohort } from '../src/mds3.gdc.js'
 //import path from 'path'
 import got from 'got'
-import serverconfig from '../src/serverconfig.js'
+import serverconfig from '#src/serverconfig.js'
 // TODO change when api is released to prod
 //const apihost = process.env.PP_GDC_HOST || 'https://api.gdc.cancer.gov'