@sjcrh/proteinpaint-server 2.30.1 → 2.30.2

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the respective public registry.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@sjcrh/proteinpaint-server",
3
- "version": "2.30.1",
3
+ "version": "2.30.2",
4
4
  "description": "a genomics visualization tool for exploring a cohort's genotype and phenotype data",
5
5
  "main": "server.js",
6
6
  "bin": "start.js",
@@ -56,7 +56,7 @@
56
56
  },
57
57
  "dependencies": {
58
58
  "@sjcrh/augen": "2.27.0",
59
- "@sjcrh/proteinpaint-rust": "2.29.6",
59
+ "@sjcrh/proteinpaint-rust": "2.30.2",
60
60
  "better-sqlite3": "^7.5.3",
61
61
  "body-parser": "^1.15.2",
62
62
  "canvas": "~2.9.3",
package/routes/gdc.maf.ts CHANGED
@@ -4,19 +4,22 @@ import path from 'path'
4
4
  import got from 'got'
5
5
 
6
6
  const apihost = process.env.PP_GDC_HOST || 'https://api.gdc.cancer.gov'
7
+ const maxFileNumber = 1000
8
+ const allowedWorkflowType = 'Aliquot Ensemble Somatic Variant Merging and Masking'
9
+ const allowedExpStrategy = new Set(['WXS', 'Targeted Sequencing'])
7
10
 
8
11
  export const api = {
9
12
  endpoint: 'gdc/maf',
10
13
  methods: {
11
14
  get: {
12
15
  init({ genomes }) {
13
- // genomes parameter is not used
14
- // could be used later to verify hg38/GDC is on this instance and otherwise disable this route..
15
-
16
16
  return async (req: any, res: any): Promise<void> => {
17
17
  try {
18
- const files = await listMafFiles(req)
19
- const payload = { files } as GdcMafResponse
18
+ const g = genomes.hg38
19
+ if (!g) throw 'hg38 missing'
20
+ const ds = g.datasets.GDC
21
+ if (!ds) throw 'hg38 GDC missing'
22
+ const payload = await listMafFiles(req)
20
23
  res.send(payload)
21
24
  } catch (e: any) {
22
25
  res.send({ status: 'error', error: e.message || e })
@@ -39,16 +42,25 @@ export const api = {
39
42
  /*
40
43
  req.query {
41
44
  filter0 // optional gdc GFF cohort filter, invisible and read only
45
+ experimentalStrategy: WXS/Targeted Sequencing
46
+ }
47
+
48
+ ds {
49
+ __gdc {
50
+ gdcOpenProjects
51
+ }
42
52
  }
43
53
  */
44
54
  async function listMafFiles(req: any) {
55
+ if (!allowedExpStrategy.has(req.query.experimentalStrategy)) throw 'invalid req.query.experimentalStrategy'
56
+
45
57
  const filters = {
46
58
  op: 'and',
47
59
  content: [
48
- {
49
- op: '=',
50
- content: { field: 'data_format', value: 'MAF' }
51
- }
60
+ { op: '=', content: { field: 'data_format', value: 'MAF' } },
61
+ { op: '=', content: { field: 'experimental_strategy', value: req.query.experimentalStrategy } },
62
+ { op: '=', content: { field: 'analysis.workflow_type', value: allowedWorkflowType } },
63
+ { op: '=', content: { field: 'access', value: 'open' } } // delete if later to support controlled files
52
64
  ]
53
65
  }
54
66
 
@@ -60,16 +72,14 @@ async function listMafFiles(req: any) {
60
72
 
61
73
  const data = {
62
74
  filters,
63
- size: 1000,
75
+ size: maxFileNumber,
64
76
  fields: [
65
77
  'id',
66
78
  'file_size',
67
- 'experimental_strategy',
79
+ 'cases.project.project_id', // for display only
68
80
  'cases.submitter_id', // used when listing all cases & files
69
- //'associated_entities.entity_submitter_id', // semi human readable
70
- //'associated_entities.case_id', // case uuid
71
- 'cases.samples.sample_type',
72
- 'analysis.workflow_type' // to drop out those as skip_workflow_type
81
+ 'cases.samples.sample_type'
82
+ // may add diagnosis and primary site
73
83
  ].join(',')
74
84
  }
75
85
 
@@ -81,26 +91,78 @@ async function listMafFiles(req: any) {
81
91
  } catch (e) {
82
92
  throw 'invalid JSON from ' + api.endpoint
83
93
  }
94
+ if (!Number.isInteger(re.data?.pagination?.total)) throw 're.data.pagination.total is not int'
84
95
  if (!Array.isArray(re.data?.hits)) throw 're.data.hits[] not array'
85
96
 
86
97
  // flatten api return to table row objects
87
98
  // it is possible to set a max size limit to limit the number of files passed to client
88
99
  const files = [] as File[]
100
+
89
101
  for (const h of re.data.hits) {
102
+ /*
103
+ {
104
+ "id": "39768777-fec5-4a79-9515-65712c002b19",
105
+ "cases": [
106
+ {
107
+ "submitter_id": "HTMCP-03-06-02104",
108
+ "project": {
109
+ "project_id":"xx"
110
+ },
111
+ "samples": [
112
+ {
113
+ "sample_type": "Blood Derived Normal"
114
+ },
115
+ {
116
+ "sample_type": "Primary Tumor"
117
+ }
118
+ ]
119
+ }
120
+ ],
121
+ "analysis": {
122
+ "workflow_type": "MuSE Annotation"
123
+ },
124
+ "experimental_strategy": "Targeted Sequencing",
125
+ "file_size": 146038
126
+ }
127
+ */
128
+
129
+ const c = h.cases?.[0]
130
+ if (!c) throw 'h.cases[0] missing'
131
+
132
+ // only keep files from open access projects for now
133
+ /*
134
+ if (c.project?.project_id) {
135
+ if (ds.__gdc.gdcOpenProjects.has(c.project.project_id)) {
136
+ // open-access project, keep
137
+ } else {
138
+ // not open access
139
+ continue
140
+ }
141
+ } else {
142
+ throw 'h.cases[0].project.project_id missing'
143
+ }
144
+ */
145
+
90
146
  const file = {
91
147
  id: h.id,
92
- workflow_type: h.analysis?.workflow_type,
93
- experimental_strategy: h.experimental_strategy,
148
+ project_id: c.project.project_id,
94
149
  file_size: fileSize(h.file_size)
95
150
  } as File
96
- const c = h.cases?.[0]
97
- if (c) {
98
- file.case_submitter_id = c.submitter_id
99
- if (c.samples) {
100
- file.sample_types = c.samples.map(i => i.sample_type).join(', ')
101
- }
151
+
152
+ file.case_submitter_id = c.submitter_id
153
+ if (c.samples) {
154
+ file.sample_types = c.samples.map(i => i.sample_type).sort()
155
+ // sort to show sample type names in consistent alphabetical order
156
+ // otherwise one file shows 'Blood, Primary' and another shows 'Primary, Blood'
157
+ // FIXME this includes samples not associated with current maf file
102
158
  }
103
159
  files.push(file)
104
160
  }
105
- return files
161
+
162
+ const result = {
163
+ files,
164
+ filesTotal: re.data.pagination.total
165
+ } as GdcMafResponse
166
+
167
+ return result
106
168
  }
package/routes/gdc.mafBuild.ts ADDED
@@ -0,0 +1,105 @@
1
+ import got from 'got'
2
+ import path from 'path'
3
+ import fs from 'fs'
4
+ import { run_rust } from '@sjcrh/proteinpaint-rust'
5
+ import serverconfig from '#src/serverconfig.js'
6
+
7
+ const apihost = process.env.PP_GDC_HOST || 'https://api.gdc.cancer.gov'
8
+ const maxTotalSizeCompressed = serverconfig.features.gdcMafMaxFileSize || 50000000 // 50Mb
9
+
10
+ export const api = {
11
+ endpoint: 'gdc/mafBuild',
12
+ methods: {
13
+ all: {
14
+ init({ genomes }) {
15
+ return async (req: any, res: any): Promise<void> => {
16
+ try {
17
+ await buildMaf(req, res)
18
+ } catch (e) {
19
+ if (e.stack) console.log(e.stack)
20
+ res.send({ status: 'error', error: e.message || e })
21
+ }
22
+ }
23
+ },
24
+ request: {
25
+ typeId: null
26
+ //valid: default to type checker
27
+ },
28
+ response: {
29
+ typeId: 'GdcMafBuildResponse'
30
+ // will combine this with type checker
31
+ //valid: (t) => {}
32
+ }
33
+ }
34
+ }
35
+ }
36
+
37
+ /*
38
+ req.query {
39
+ fileIdLst [] // list of maf file uuids
40
+ }
41
+
42
+ res{}
43
+ */
44
+ async function buildMaf(req: any, res: any) {
45
+ const fileLst2 = (await getFileLstUnderSizeLimit(req.query.fileIdLst)) as string[]
46
+
47
+ const outFile = path.join(serverconfig.cachedir, 'gdcMaf.' + Math.random().toString()) // should be a gzipped file. does it need to end with '.gz' or it's auto-added?
48
+
49
+ const arg = {
50
+ fileIdLst: fileLst2,
51
+ host: path.join(apihost, 'data'), // must use the /data/ endpoint from current host
52
+ outFile
53
+ }
54
+
55
+ await run_rust('gdcmaf', JSON.stringify(arg))
56
+
57
+ const data = await fs.promises.readFile(outFile)
58
+
59
+ // by directly returning a blob, it won't tell client how many files are used
60
+
61
+ res.writeHead(200, {
62
+ 'Content-Type': 'application/octet-stream',
63
+ 'Content-Disposition': 'attachment; filename=cohort.maf.gz',
64
+ 'Content-Length': data.length
65
+ })
66
+ res.end(Buffer.from(data, 'binary'))
67
+ }
68
+
69
+ /*
70
+ query api get size of each input maf file, and only process those files with total size under a set limit,
71
+ excess files are not processed in order not to crash server
72
+ must not rely on file size sent by client, as that can be spoofed and never to be trusted
73
+ it's inexpensive to query api for this
74
+ */
75
+ async function getFileLstUnderSizeLimit(lst: string[]) {
76
+ if (lst.length == 0) throw 'fileIdLst[] not array or blank'
77
+ const data = {
78
+ filters: {
79
+ op: 'in',
80
+ content: { field: 'file_id', value: lst }
81
+ },
82
+ size: 1000,
83
+ fields: 'file_size'
84
+ }
85
+ const headers = { 'Content-Type': 'application/json', Accept: 'application/json' }
86
+ const response = await got.post(path.join(apihost, 'files'), { headers, body: JSON.stringify(data) })
87
+ let re
88
+ try {
89
+ re = JSON.parse(response.body)
90
+ } catch (e) {
91
+ throw 'invalid json from getFileLstUnderSizeLimit'
92
+ }
93
+ if (!Array.isArray(re.data?.hits)) throw 're.data.hits[] not array'
94
+ const out = [] as string[]
95
+ let cumsize = 0
96
+ for (const h of re.data.hits) {
97
+ if (cumsize >= maxTotalSizeCompressed) break // maxed out
98
+ if (!h.id) throw '.id missing'
99
+ if (!Number.isInteger(h.file_size)) throw '.file_size not integer'
100
+ cumsize += h.file_size
101
+ out.push(h.id)
102
+ }
103
+ if (out.length == 0) throw 'no file available'
104
+ return out
105
+ }
@@ -2,7 +2,7 @@ import { GdcTopVariablyExpressedGenesResponse } from '#shared/types/routes/gdc.t
2
2
  import { getCasesWithExressionDataFromCohort } from '../src/mds3.gdc.js'
3
3
  //import path from 'path'
4
4
  import got from 'got'
5
- import serverconfig from '../src/serverconfig.js'
5
+ import serverconfig from '#src/serverconfig.js'
6
6
 
7
7
  // TODO change when api is released to prod
8
8
  //const apihost = process.env.PP_GDC_HOST || 'https://api.gdc.cancer.gov'