@jbrowse/text-indexing 2.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,170 @@
1
+ import fs from 'fs'
2
+ import fetch from 'node-fetch'
3
+ import path from 'path'
4
+ import { LocalPathLocation, UriLocation, Track } from '../util'
5
+
6
/**
 * Requests a remote resource and hands back the fetch response.
 *
 * @param urlIn - URL to request
 * @returns the response object; callers read the payload from its `body`
 * @throws Error when the response is not OK; the message carries the URL,
 * status code and status text
 */
export async function createRemoteStream(urlIn: string) {
  const response = await fetch(urlIn)
  if (response.ok) {
    return response
  }
  throw new Error(
    `Failed to fetch ${urlIn} status ${response.status} ${response.statusText}`,
  )
}
18
+
19
/**
 * Tells whether a string parses as an http(s) URL.
 *
 * @param FileName - candidate string, e.g. a file path or a URL
 * @returns true only when the string parses as a URL whose protocol is
 * `http:` or `https:`; unparseable strings and other schemes give false
 */
export function isURL(FileName: string) {
  try {
    const { protocol } = new URL(FileName)
    return ['http:', 'https:'].includes(protocol)
  } catch (_) {
    // new URL() throws on anything it cannot parse, e.g. a relative path
    return false
  }
}
32
+
33
/**
 * Opens `uri` for reading, either over HTTP(S) or from the local filesystem.
 *
 * @param uri - http(s) URL, or a file path
 * @param out - directory that a relative file path is joined onto
 * @returns `stream`, the readable body or file stream, and `totalBytes`, the
 * Content-Length header value (0 when absent) or the file size on disk
 */
export async function getLocalOrRemoteStream(uri: string, out: string) {
  if (isURL(uri)) {
    const result = await createRemoteStream(uri)
    return {
      totalBytes: +(result.headers?.get('Content-Length') || 0),
      stream: result.body,
    }
  }

  // absolute paths are used as-is, anything else is taken relative to `out`
  const filename = path.isAbsolute(uri) ? uri : path.join(out, uri)
  return {
    totalBytes: fs.statSync(filename).size,
    stream: fs.createReadStream(filename),
  }
}
47
+
48
/**
 * Builds a location object for a file reference.
 *
 * @param location - URL (for `'uri'`) or filesystem path (for `'localPath'`)
 * @param protocol - `'uri'` yields a UriLocation holding `location` verbatim;
 * `'localPath'` yields a LocalPathLocation whose path is resolved to an
 * absolute path against the current working directory
 * @returns the location object matching `protocol`
 * @throws Error `invalid protocol <protocol>` for any other value
 */
export function makeLocation(
  location: string,
  protocol: string,
): UriLocation | LocalPathLocation {
  switch (protocol) {
    case 'uri': {
      // typed declaration (not `as`) so the literal is checked against the
      // interface
      const uriLocation: UriLocation = {
        uri: location,
        locationType: 'UriLocation',
      }
      return uriLocation
    }
    case 'localPath': {
      const localPathLocation: LocalPathLocation = {
        localPath: path.resolve(location),
        locationType: 'LocalPathLocation',
      }
      return localPathLocation
    }
    default:
      throw new Error(`invalid protocol ${protocol}`)
  }
}
60
+
61
/**
 * Derives a minimal track config from a file name or URL by matching its
 * extension (case-insensitively).
 *
 * The track's `trackId` and `name` are both the basename of `filePath`, and
 * `assemblyNames` starts out empty.
 *
 * @param filePath - local path or http(s) URL of the data file
 * @returns a track config whose adapter carries the matched adapter type and
 * a location built by `makeLocation`
 * @throws Error `Unsupported file type <filePath>` when no extension matches
 */
export function guessAdapterFromFileName(filePath: string): Track {
  // [extension pattern, adapter type, name of the adapter's location field],
  // tested in this order.
  // NOTE(review): `/\.gtf?$/` also matches a bare `.gt` extension - confirm
  // whether that is intended.
  const adapterTable: [RegExp, string, string][] = [
    [/\.vcf\.b?gz$/i, 'VcfTabixAdapter', 'vcfGzLocation'],
    [/\.gff3?\.b?gz$/i, 'Gff3TabixAdapter', 'gffGzLocation'],
    [/\.gtf?$/i, 'GtfAdapter', 'gtfLocation'],
    [/\.vcf$/i, 'VcfAdapter', 'vcfLocation'],
    [/\.gff3?$/i, 'Gff3Adapter', 'gffLocation'],
  ]

  const protocol = isURL(filePath) ? 'uri' : 'localPath'
  const name = path.basename(filePath)

  const matched = adapterTable.find(([pattern]) => pattern.test(filePath))
  if (!matched) {
    throw new Error(`Unsupported file type ${filePath}`)
  }

  const [, type, locationKey] = matched
  return {
    trackId: name,
    name,
    assemblyNames: [],
    adapter: {
      type,
      [locationKey]: makeLocation(filePath, protocol),
    },
  }
}
119
+
120
/**
 * Writes `<outDir>/trix/<name>_meta.json`, a JSON summary of what was indexed.
 *
 * The file holds the creation timestamp (ISO string), the assembly names, and
 * one entry per track with its trackId, indexed attributes, excluded feature
 * types and adapter config.
 *
 * @param configs - track configs that were indexed
 * @param attributes - default attributes indexed, used when a track has no
 * `textSearching.indexingAttributes`
 * @param outDir - directory that contains (or will contain) the `trix` folder
 * @param name - base name of the meta file (assembly name or trackId)
 * @param exclude - default feature types excluded, used when a track has no
 * `textSearching.indexingFeatureTypesToExclude`
 * @param assemblyNames - assembly names recorded in the meta file
 */
export async function generateMeta({
  configs,
  attributes,
  outDir,
  name,
  exclude,
  assemblyNames,
}: {
  configs: Track[]
  attributes: string[]
  outDir: string
  name: string
  exclude: string[]
  assemblyNames: string[]
}): Promise<void> {
  // per-track settings take precedence over the defaults passed in
  const tracks = configs.map(({ trackId, textSearching, adapter }) => ({
    trackId,
    attributesIndexed: textSearching?.indexingAttributes || attributes,
    excludedTypes: textSearching?.indexingFeatureTypesToExclude || exclude,
    adapterConf: adapter,
  }))

  // make sure the target folder exists so the write cannot fail with ENOENT
  const trixDir = path.join(outDir, 'trix')
  fs.mkdirSync(trixDir, { recursive: true })

  fs.writeFileSync(
    path.join(trixDir, `${name}_meta.json`),
    JSON.stringify(
      {
        dateCreated: new Date().toISOString(),
        tracks,
        assemblyNames,
      },
      null,
      2,
    ),
  )
}
@@ -0,0 +1,75 @@
1
+ import { createGunzip } from 'zlib'
2
+ import readline from 'readline'
3
+ import { Track } from '../util'
4
+ import { getLocalOrRemoteStream } from './common'
5
+ import { checkAbortSignal } from '@jbrowse/core/util'
6
+
7
/**
 * Streams a GFF3 file (local or remote, optionally gzipped) and yields one
 * index line per feature that has at least one of the requested attributes.
 *
 * Each yielded line is a JSON array of URI-encoded
 * `[location, trackId, ...attribute values]` with its commas replaced by `|`,
 * then a space, then the unique attribute values joined by spaces, then a
 * newline. Comment/directive lines (`#`) are skipped and reading stops at the
 * first line starting with `>`.
 *
 * @param config - track config; only `trackId` is read
 * @param attributes - column-9 attribute names whose values are indexed
 * @param inLocation - URL or path of the GFF3 file; a `.gz`/`.bgz` suffix
 * (any case) makes the stream go through gunzip
 * @param outLocation - base directory for a relative `inLocation`
 * @param typesToExclude - feature types (column 3) to skip
 * @param quiet - not used by this function
 * @param statusCallback - receives the percentage of bytes read, as a string
 * @param signal - optional abort signal, checked before each yield
 */
export async function* indexGff3(
  config: Track,
  attributes: string[],
  inLocation: string,
  outLocation: string,
  typesToExclude: string[],
  quiet: boolean,
  statusCallback: (message: string) => void,
  signal?: AbortSignal,
) {
  // Percent-decodes a value; a malformed escape (e.g. a stray '%') makes
  // decodeURIComponent throw, in which case the raw text is used instead of
  // aborting the whole indexing run.
  const decodeValue = (raw: string) => {
    try {
      return decodeURIComponent(raw)
    } catch (_) {
      return raw
    }
  }

  const { trackId } = config
  let receivedBytes = 0
  const { totalBytes, stream } = await getLocalOrRemoteStream(
    inLocation,
    outLocation,
  )
  stream.on('data', chunk => {
    receivedBytes += chunk.length
    // totalBytes is 0 when the size is unknown (no Content-Length header);
    // skip the update rather than report NaN/Infinity
    if (totalBytes > 0) {
      const progress = Math.round((receivedBytes / totalBytes) * 100)
      statusCallback(`${progress}`)
    }
  })
  const rl = readline.createInterface({
    input: /\.b?gz$/i.test(inLocation) ? stream.pipe(createGunzip()) : stream,
  })

  for await (const line of rl) {
    if (line.startsWith('#')) {
      continue
    } else if (line.startsWith('>')) {
      break
    }

    const [seq_id, , type, start, end, , , , col9] = line.split('\t')

    // blank or truncated lines have no attribute column to index
    if (!col9 || typesToExclude.includes(type)) {
      continue
    }

    // turns gff3 attrs into a map, and converts comma separated lists into
    // space separated strings
    const col9attrs = new Map<string, string>()
    for (const field of col9.split(';')) {
      const trimmed = field.trim()
      // split on the first '=' only, so values containing '=' stay intact;
      // entries without '=' carry no value and are not indexed
      const eq = trimmed.indexOf('=')
      if (eq === -1) {
        continue
      }
      col9attrs.set(
        trimmed.slice(0, eq).trim(),
        decodeValue(trimmed.slice(eq + 1))
          .trim()
          .split(',')
          .join(' '),
      )
    }

    const attrs = attributes
      .map(attr => col9attrs.get(attr))
      .filter((f): f is string => !!f)

    if (attrs.length) {
      const locStr = `${seq_id}:${start}..${end}`
      // every element is URI-encoded, so the only commas left in the JSON
      // text are the array separators
      const record = JSON.stringify([
        encodeURIComponent(locStr),
        encodeURIComponent(trackId),
        ...attrs.map(a => encodeURIComponent(a)),
      ]).replace(/,/g, '|')

      // Check abort signal
      checkAbortSignal(signal)
      yield `${record} ${[...new Set(attrs)].join(' ')}\n`
    }
  }
}
@@ -0,0 +1,86 @@
1
+ import { createGunzip } from 'zlib'
2
+ import readline from 'readline'
3
+ import { Track } from '../util'
4
+ import { getLocalOrRemoteStream } from './common'
5
+ import { checkAbortSignal } from '@jbrowse/core/util'
6
+
7
/**
 * Streams a VCF file (local or remote, optionally gzipped) and yields one
 * index line per variant ID.
 *
 * Records whose ID column is `.` or missing are skipped; a comma-separated ID
 * column yields one line per ID. Each yielded line is a JSON array of
 * URI-encoded `[location, trackId, id, ...INFO values]` with its commas
 * replaced by `|`, then a space, the ID, and a newline. The location end is
 * the INFO `END` value when present, otherwise `pos + 1`.
 *
 * @param config - track config; only `trackId` is read
 * @param attributesToIndex - INFO keys whose values are added to the record
 * @param inLocation - URL or path of the VCF file; a `.gz`/`.bgz` suffix
 * (any case) makes the stream go through gunzip
 * @param outLocation - base directory for a relative `inLocation`
 * @param typesToExclude - not used by this function
 * @param quiet - not used by this function
 * @param statusCallback - receives the percentage of bytes read, as a string
 * @param signal - optional abort signal, checked before each yield
 */
export async function* indexVcf(
  config: Track,
  attributesToIndex: string[],
  inLocation: string,
  outLocation: string,
  typesToExclude: string[],
  quiet: boolean,
  statusCallback: (message: string) => void,
  signal?: AbortSignal,
) {
  // Percent-decodes a value; a malformed escape (e.g. "AF=50%") makes
  // decodeURIComponent throw, in which case the raw text is used instead of
  // aborting the whole indexing run.
  const decodeValue = (raw: string) => {
    try {
      return decodeURIComponent(raw)
    } catch (_) {
      return raw
    }
  }

  const { trackId } = config
  let receivedBytes = 0
  const { totalBytes, stream } = await getLocalOrRemoteStream(
    inLocation,
    outLocation,
  )
  stream.on('data', chunk => {
    receivedBytes += chunk.length
    // totalBytes is 0 when the size is unknown (no Content-Length header);
    // skip the update rather than report NaN/Infinity
    if (totalBytes > 0) {
      const progress = Math.round((receivedBytes / totalBytes) * 100)
      statusCallback(`${progress}`)
    }
  })

  const gzStream = /\.b?gz$/i.test(inLocation)
    ? stream.pipe(createGunzip())
    : stream

  const rl = readline.createInterface({
    input: gzStream,
  })

  for await (const line of rl) {
    if (line.startsWith('#')) {
      continue
    }

    const [ref, pos, id, , , , , info] = line.split('\t')

    // nothing to index without an ID; checked first so such records are not
    // parsed at all (this also covers blank and truncated lines)
    if (!id || id === '.') {
      continue
    }

    // turns vcf info attrs into a map, and converts comma separated lists
    // into space separated strings; flag entries without a value are left out
    const fields = new Map<string, string>()
    for (const entry of (info ?? '').split(';')) {
      const trimmed = entry.trim()
      const eq = trimmed.indexOf('=')
      if (eq === -1) {
        continue
      }
      const val = trimmed.slice(eq + 1)
      if (val) {
        fields.set(
          trimmed.slice(0, eq).trim(),
          decodeValue(val).trim().split(',').join(' '),
        )
      }
    }

    const end = fields.get('END')
    const locStr = `${ref}:${pos}..${end || +pos + 1}`

    const infoAttrs = attributesToIndex
      .map(attr => fields.get(attr))
      .filter((f): f is string => !!f)

    for (const variantId of id.split(',')) {
      // every element is URI-encoded, so the only commas left in the JSON
      // text are the array separators
      const record = JSON.stringify([
        encodeURIComponent(locStr),
        encodeURIComponent(trackId),
        encodeURIComponent(variantId || ''),
        ...infoAttrs.map(a => encodeURIComponent(a || '')),
      ]).replace(/,/g, '|')

      // Check abort signal
      checkAbortSignal(signal)
      yield `${record} ${variantId}\n`
    }
  }
}
package/src/util.ts ADDED
@@ -0,0 +1,177 @@
1
/** Location tagged `'UriLocation'` that carries a `uri` string. */
export interface UriLocation {
  locationType: 'UriLocation'
  uri: string
}

/** Location tagged `'LocalPathLocation'` that carries a `localPath` string. */
export interface LocalPathLocation {
  locationType: 'LocalPathLocation'
  localPath: string
}
9
/** Adapter config tagged `'IndexedFastaAdapter'`: a fasta file plus its fai. */
export interface IndexedFastaAdapter {
  type: 'IndexedFastaAdapter'
  fastaLocation: UriLocation
  faiLocation: UriLocation
}

/**
 * Adapter config tagged `'BgzipFastaAdapter'`: a fasta file plus its fai and
 * gzi.
 */
export interface BgzipFastaAdapter {
  type: 'BgzipFastaAdapter'
  fastaLocation: UriLocation
  faiLocation: UriLocation
  gziLocation: UriLocation
}

/** Adapter config tagged `'TwoBitAdapter'`. */
export interface TwoBitAdapter {
  type: 'TwoBitAdapter'
  twoBitLocation: UriLocation
}

/**
 * Adapter config tagged `'ChromSizesAdapter'` (the tag is spelled without
 * the `e` that the interface name has).
 */
export interface ChromeSizesAdapter {
  type: 'ChromSizesAdapter'
  chromSizesLocation: UriLocation
}

/** Any other sequence adapter; only its `type` string is known here. */
export interface CustomSequenceAdapter {
  type: string
}

/** Adapter config tagged `'RefNameAliasAdapter'`. */
export interface RefNameAliasAdapter {
  type: 'RefNameAliasAdapter'
  location: UriLocation
}

/** Any other refName alias adapter; only its `type` string is known here. */
export interface CustomRefNameAliasAdapter {
  type: string
}
44
/** Assembly config: a required name and sequence plus optional extras. */
export interface Assembly {
  name: string
  displayName?: string
  aliases?: string[]
  sequence: Sequence
  refNameAliases?: {
    adapter: RefNameAliasAdapter | CustomRefNameAliasAdapter
  }
  refNameColors?: string[]
}

/** Sequence track config of an assembly, tagged `'ReferenceSequenceTrack'`. */
export interface Sequence {
  type: 'ReferenceSequenceTrack'
  trackId: string
  adapter:
    | IndexedFastaAdapter
    | BgzipFastaAdapter
    | TwoBitAdapter
    | ChromeSizesAdapter
    | CustomSequenceAdapter
}
65
+
66
/** Adapter config tagged `'Gff3TabixAdapter'`. */
export interface Gff3TabixAdapter {
  type: 'Gff3TabixAdapter'
  gffGzLocation: UriLocation | LocalPathLocation
}

/** Adapter config tagged `'Gff3Adapter'`. */
export interface Gff3Adapter {
  type: 'Gff3Adapter'
  gffLocation: UriLocation | LocalPathLocation
}

/** Adapter config tagged `'GtfAdapter'`. */
export interface GtfAdapter {
  type: 'GtfAdapter'
  gtfLocation: UriLocation | LocalPathLocation
}

/** Adapter config tagged `'VcfTabixAdapter'`. */
export interface VcfTabixAdapter {
  type: 'VcfTabixAdapter'
  vcfGzLocation: UriLocation | LocalPathLocation
}

/** Adapter config tagged `'VcfAdapter'`. */
export interface VcfAdapter {
  type: 'VcfAdapter'
  vcfLocation: UriLocation | LocalPathLocation
}
88
/** Track config; deliberately open-ended, any key maps to any value. */
export interface Track {
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  [key: string]: any
}

/** Per-track text search settings. */
export interface TextSearching {
  indexingAttributes?: string[]
  indexingFeatureTypesToExclude?: string[]
  textSearchAdapter: TrixTextSearchAdapter
}

/**
 * Text search adapter config pointing at the ix, ixx and meta files.
 *
 * NOTE(review): createTextSearchConf in this file returns LocalPathLocation
 * objects (and an extra `tracks` field) for this shape - confirm whether the
 * declared types should accept them.
 */
export interface TrixTextSearchAdapter {
  type: string
  textSearchAdapterId: string
  assemblyNames: string[]
  ixFilePath: UriLocation
  ixxFilePath: UriLocation
  metaFilePath: UriLocation
}

/** Top-level config; every section is optional. */
export interface Config {
  assembly?: Assembly
  assemblies?: Assembly[]
  tracks?: Track[]
  aggregateTextSearchAdapters?: TrixTextSearchAdapter[]
  connections?: unknown[]
  configuration?: {}
  defaultSession?: {}
}

/** The two accepted index type names. */
export type indexType = 'aggregate' | 'perTrack'
118
+
119
/**
 * Tells whether the text indexer supports an adapter type.
 * Ensure that this matches the method found in @jbrowse/core/util.
 *
 * @param type - adapter type name, e.g. `'Gff3TabixAdapter'`
 * @returns true for the four GFF3/VCF adapter types listed, false otherwise
 */
export function supportedIndexingAdapters(type: string) {
  switch (type) {
    case 'Gff3TabixAdapter':
    case 'VcfTabixAdapter':
    case 'Gff3Adapter':
    case 'VcfAdapter':
      return true
    default:
      return false
  }
}
129
+
130
/**
 * Builds a TrixTextSearchAdapter config whose index files live under
 * `<locationPath>/trix/`.
 *
 * @param name - adapter id, also the base name of the index files
 * @param trackIds - ids of the tracks covered by the index
 * @param assemblyNames - assemblies covered by the index
 * @param locationPath - directory that contains the `trix` folder
 * @returns the adapter config with local-path locations for the `.ix`, `.ixx`
 * and `_meta.json` files
 */
export function createTextSearchConf(
  name: string,
  trackIds: string[],
  assemblyNames: string[],
  locationPath: string,
) {
  // local-path location of one file inside the trix folder
  const trixFile = (fileName: string) => ({
    localPath: `${locationPath}/trix/${fileName}`,
    locationType: 'LocalPathLocation',
  })

  return {
    type: 'TrixTextSearchAdapter',
    textSearchAdapterId: name,
    ixFilePath: trixFile(`${name}.ix`),
    ixxFilePath: trixFile(`${name}.ixx`),
    // generateMeta writes the meta file as `<name>_meta.json`
    metaFilePath: trixFile(`${name}_meta.json`),
    tracks: trackIds,
    assemblyNames,
  }
}
159
+
160
/**
 * Looks up the configs of the given track ids and keeps the ones that can be
 * text-indexed.
 *
 * @param tracks - all available track configs
 * @param trackIds - ids of the tracks requested for indexing
 * @param assemblyName - when given, only tracks listing this assembly in
 * `assemblyNames` are kept
 * @returns the matching configs, in `trackIds` order, whose adapter type is
 * accepted by `supportedIndexingAdapters`; tracks without `assemblyNames`
 * (when filtering by assembly) or without an adapter are left out
 * @throws Error when a track id has no config in `tracks`
 */
export function findTrackConfigsToIndex(
  tracks: Track[],
  trackIds: string[],
  assemblyName?: string,
) {
  return trackIds
    .map(trackId => {
      const currentTrack = tracks.find(t => trackId === t.trackId)
      if (!currentTrack) {
        throw new Error(`Track not found in session for trackId ${trackId}`)
      }
      return currentTrack
    })
    .filter(track => {
      // Track is loosely typed, so guard against configs that lack these
      // fields instead of failing with a TypeError
      const inAssembly = assemblyName
        ? !!track.assemblyNames?.includes(assemblyName)
        : true
      return inAssembly && supportedIndexingAdapters(track.adapter?.type)
    })
}