@teselagen/bio-parsers 0.3.7 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,330 @@
1
+ # Bio Parsers
2
+
3
+ <!-- TOC -->
4
+
5
+ - [Bio Parsers](#bio-parsers)
6
+ - [About this Repo](#about-this-repo)
7
+ - [[CHANGELOG](CHANGELOG.md)](#changelogchangelogmd)
8
+ - [Exported Functions](#exported-functions)
9
+ - [Format Specification](#format-specification)
10
+ - [Usage](#usage)
11
+ - [install](#install)
12
+ - [jsonToGenbank (same interface as jsonToFasta)](#jsontogenbank-same-interface-as-jsontofasta)
13
+ - [anyToJson (same interface as genbankToJson, fastaToJson, xxxxToJson) (async required)](#anytojson-same-interface-as-genbanktojson-fastatojson-xxxxtojson-async-required)
14
+ - [Options (for anyToJson or xxxxToJson)](#options-for-anytojson-or-xxxxtojson)
15
+ - [ab1ToJson](#ab1tojson)
16
+ - [snapgeneToJson (.dna files)](#snapgenetojson-dna-files)
17
+ - [genbankToJson](#genbanktojson)
18
+ - [Updating this repo](#updating-this-repo)
19
+ - [Outside collaborators](#outside-collaborators)
20
+ - [Thanks/Collaborators](#thankscollaborators)
21
+
22
+ <!-- /TOC -->
23
+
24
+ ## About this Repo
25
+
26
+ This repo contains a set of parsers to convert between datatypes through a generalized JSON format.
27
+
28
+ ## [CHANGELOG](CHANGELOG.md)
29
+
30
+ ## Exported Functions
31
+
32
+ Use the following exports to convert to a generalized JSON format:
33
+
34
+ ```
35
+ fastaToJson //handles fasta files (.fa, .fasta)
36
+ genbankToJson //handles genbank files (.gb, .gbk)
37
+ ab1ToJson //handles .ab1 sequencing read files
38
+ sbolXmlToJson //handles .sbol files
39
+ geneiousXmlToJson //handles .geneious files
40
+ jbeiXmlToJson //handles jbei .seq or .xml files
41
+ snapgeneToJson //handles snapgene (.dna) files
42
+ anyToJson //this handles any of the above file types based on file extension
43
+ ```
44
+
45
+ Use the following exports to convert from a generalized JSON format back to a specific format:
46
+
47
+ ```
48
+ jsonToGenbank
49
+ jsonToFasta
50
+ jsonToBed
51
+ ```
52
+
53
+ ## Format Specification
54
+
55
+ The generalized JSON format looks like:
56
+
57
+ ```js
58
+ const generalizedJsonFormat = {
59
+ size: 25,
60
+ sequence: "asaasdgasdgasdgasdgasgdasgdasdgasdgasgdagasdgasdfasdfdfasdfa",
61
+ circular: true,
62
+ name: "pBbS8c-RFP",
63
+ description: "",
64
+ parts: [
65
+ {
66
+ name: "part 1",
67
+ type: "CDS", //optional for parts
68
+ id: "092j92", //Must be a unique id. If no id is provided, we'll autogenerate one for you
69
+ start: 10, //0-based inclusive index
70
+ end: 30, //0-based inclusive index
71
+ strand: 1,
72
+ notes: {}
73
+ }
74
+ ],
75
+ primers: [
76
+ {
77
+ name: "primer 1",
78
+ id: "092j92", //Must be a unique id. If no id is provided, we'll autogenerate one for you
79
+ start: 10, //0-based inclusive index
80
+ end: 30, //0-based inclusive index
81
+ strand: 1,
82
+ notes: {}
83
+ }
84
+ ],
85
+ features: [
86
+ {
87
+ name: "anonymous feature",
88
+ type: "misc_feature",
89
+ id: "5590c1978979df000a4f02c7", //Must be a unique id. If no id is provided, we'll autogenerate one for you
90
+ start: 1,
91
+ end: 3,
92
+ strand: 1,
93
+ notes: {}
94
+ },
95
+ {
96
+ name: "coding region 1",
97
+ type: "CDS",
98
+ id: "5590c1d88979df000a4f02f5",
99
+ start: 12,
100
+ end: 9,
101
+ strand: -1,
102
+ notes: {}
103
+ }
104
+ ],
105
+ //only if parsing in an ab1 file
106
+ chromatogramData: {
107
+ aTrace: [], //same as cTrace but for a
108
+ tTrace: [], //same as cTrace but for t
109
+ gTrace: [], //same as cTrace but for g
110
+ cTrace: [0, 0, 0, 1, 3, 5, 11, 24, 56, 68, 54, 30, 21, 3, 1, 4, 1, 0, 0, ...etc], //heights of the curve spaced 1 per x position (i.e. if cTrace.length === 1000, then the largest possible basePos is 1000)
111
+ basePos: [33, 46, 55, ...etc], //x position of the bases (can be unevenly spaced)
112
+ baseCalls: ["A", "T", ...etc],
113
+ qualNums: [] //or undefined if no qualNums are detected on the file
114
+ }
115
+ };
116
+ ```
117
+
118
+ ## Usage
119
+
120
+ ### install
121
+
122
+ `npm install -S @teselagen/bio-parsers`
123
+
124
+ or
125
+
126
+ `yarn add @teselagen/bio-parsers`
127
+
128
+ or
129
+
130
+ use it from a script tag:
131
+
132
+ ```html
133
+ <script src="https://unpkg.com/bio-parsers/umd/bio-parsers.js"></script>
134
+ <script>
135
+ async function main() {
136
+ var jsonOutput = await window.bioParsers.genbankToJson(
137
+ `LOCUS kc2 108 bp DNA linear 01-NOV-2016
138
+ COMMENT teselagen_unique_id: 581929a7bc6d3e00ac7394e8
139
+ FEATURES Location/Qualifiers
140
+ CDS 1..108
141
+ /label="GFPuv"
142
+ misc_feature 61..108
143
+ /label="gly_ser_linker"
144
+ bogus_dude 4..60
145
+ /label="ccmN_sig_pep"
146
+ misc_feature 4..60
147
+ /label="ccmN_nterm_sig_pep"
148
+ /pragma="Teselagen_Part"
149
+ /preferred5PrimeOverhangs=""
150
+ /preferred3PrimeOverhangs=""
151
+ ORIGIN
152
+ 1 atgaaggtct acggcaagga acagtttttg cggatgcgcc agagcatgtt ccccgatcgc
153
+ 61 ggtggcagtg gtagcgggag ctcgggtggc tcaggctctg ggg
154
+ //`
155
+ );
156
+ console.log("jsonOutput:", jsonOutput);
157
+ var genbankString = window.bioParsers.jsonToGenbank(jsonOutput[0].parsedSequence);
158
+ console.log(genbankString);
159
+ }
160
+ main();
161
+ </script>
162
+ ```
163
+
164
+ see the `./umd_demo.html` file for a full working example
165
+
166
+ ### jsonToGenbank (same interface as jsonToFasta)
167
+
168
+ ```js
169
+ //To go from json to genbank:
170
+ import { jsonToGenbank } from "bio-parsers"
171
+ //You can pass an optional options object as the second argument. Here are the defaults
172
+ const options = {
173
+ isProtein: false, //by default the sequence will be parsed and validated as type DNA (unless U's instead of T's are found). If isProtein=true the sequence will be parsed and validated as a PROTEIN type (seqData.isProtein === true)
174
+ guessIfProtein: false, //if true the parser will attempt to guess if the sequence is of type DNA or type PROTEIN (this will override the isProtein flag)
175
+ guessIfProteinOptions: {
176
+ threshold: 0.90, //percent of characters that must be DNA letters to be considered of type DNA
177
+ dnaLetters: ['G', 'A', 'T', 'C'] //customizable set of letters to use as DNA
178
+ },
179
+ inclusive1BasedStart: false, //by default feature starts are parsed out as 0-based and inclusive
180
+ inclusive1BasedEnd: false //by default feature ends are parsed out as 0-based and inclusive
181
+ // Example:
182
+ // 0123456
183
+ // ATGAGAG
184
+ // --fff-- (the feature covers GAG)
185
+ // 0-based inclusive start:
186
+ // feature.start = 2
187
+ // 1-based inclusive start:
188
+ // feature.start = 3
189
+ // 0-based inclusive end:
190
+ // feature.end = 4
191
+ // 1-based inclusive end:
192
+ // feature.end = 5
193
+ }
194
+ const genbankString = jsonToGenbank(generalizedJsonFormat, options)
195
+
196
+ ```
197
+
198
+ ### anyToJson (same interface as genbankToJson, fastaToJson, xxxxToJson) (async required)
199
+
200
+ ```js
201
+ import { anyToJson } from "bio-parsers";
202
+
203
+ //note, anyToJson should be called using an await to allow for file parsing to occur (if a file is being passed)
204
+ const results = await anyToJson(
205
+ stringOrFile, //if ab1 files are being passed in you should pass files only, otherwise strings or files are fine as inputs
206
+ options //options.fileName (eg "pBad.ab1" or "pCherry.fasta") is important to pass here in order for the parser to detect the file type from its extension
207
+ );
208
+
209
+ //we always return an array of results because some files may contain multiple sequences
210
+ results[0].success; //either true or false
211
+ results[0].messages; //an array of strings giving any warnings or errors generated during the parsing process
212
+ results[0].parsedSequence; //this will be the generalized json format as specified above :)
213
+ //chromatogram data will be here (ab1 only):
214
+ results[0].parsedSequence.chromatogramData;
215
+ ```
216
+
217
+ ### Options (for anyToJson or xxxxToJson)
218
+
219
+ ```js
220
+ //You can pass an optional options object as the second argument. Here are the defaults
221
+ const options = {
222
+ fileName: "example.gb", //the filename is used if none is found in the genbank
223
+ isProtein: false, //if you know that it is a protein string being parsed you can pass true here
224
+ parseFastaAsCircular: false, //by default fasta files are parsed as linear sequences. You can change this by setting parseFastaAsCircular=true
225
+ //genbankToJson options only
226
+ inclusive1BasedStart: false, //by default feature starts are parsed out as 0-based and inclusive
227
+ inclusive1BasedEnd: false, //by default feature ends are parsed out as 0-based and inclusive
228
+ acceptParts: true, //by default features with a feature.notes.pragma[0] === "Teselagen_Part" are added to the sequenceData.parts array. Setting this to false will keep them as features instead
229
+ // fastaToJson options only
230
+ parseName: true //by default attempt to parse the name and description of sequence from the comment line. Setting this to false will keep the name unchanged with no description
231
+ }
232
+ ```
233
+
234
+ ### ab1ToJson
235
+
236
+ ```js
237
+ import { ab1ToJson } from "bio-parsers";
238
+ const results = await ab1ToJson(
239
+ //this can be either a browser file <input type="file" id="input" multiple onchange="ab1ToJson(this.files[0])">
240
+ // or a node file ab1ToJson(fs.readFileSync(path.join(__dirname, './testData/ab1/example1.ab1')));
241
+ file,
242
+ options //options.fileName (eg "pBad.ab1" or "pCherry.fasta") is important to pass here in order for the parser to!
243
+ );
244
+
245
+ //we always return an array of results because some files may contain multiple sequences
246
+ results[0].success; //either true or false
247
+ results[0].messages; //an array of strings giving any warnings or errors generated during the parsing process
248
+ results[0].parsedSequence; //this will be the generalized json format as specified above :)
249
+ //chromatogram data will be here (ab1 only):
250
+ results[0].parsedSequence.chromatogramData;
251
+ ```
252
+
253
+ ### snapgeneToJson (.dna files)
254
+
255
+ ```js
256
+ import { snapgeneToJson } from "bio-parsers";
257
+ //file can be either a browser file <input type="file" id="input" multiple onchange="snapgeneToJson(this.files[0])">
258
+ // or a node file snapgeneToJson(fs.readFileSync(path.join(__dirname, './testData/ab1/example1.ab1')));
259
+ const results = await snapgeneToJson(file, options);
260
+ ```
261
+
262
+ ### genbankToJson
263
+
264
+ ```js
265
+ import { genbankToJson } from "bio-parsers";
266
+
267
+ const result = genbankToJson(string, options);
268
+
269
+ console.info(result);
270
+ // [
271
+ // {
272
+ // "messages": [
273
+ // "Import Error: Illegal character(s) detected and removed from sequence. Allowed characters are: atgcyrswkmbvdhn",
274
+ // "Invalid feature end: 1384 detected for Homo sapiens and set to 1",
275
+ // ],
276
+ // "success": true,
277
+ // "parsedSequence": {
278
+ // "features": [
279
+ // {
280
+ // "notes": {
281
+ // "organism": [
282
+ // "Homo sapiens"
283
+ // ],
284
+ // "db_xref": [
285
+ // "taxon:9606"
286
+ // ],
287
+ // "chromosome": [
288
+ // "17"
289
+ // ],
290
+ // "map": [
291
+ // "17q21"
292
+ // ]
293
+ // },
294
+ // "type": "source",
295
+ // "strand": 1,
296
+ // "name": "Homo sapiens",
297
+ // "start": 0,
298
+ // "end": 1
299
+ // }
300
+ // ],
301
+ // "name": "NP_003623",
302
+ // "sequence": "gagaggggggttatccccccttcgtcagtcgatcgtaacgtatcagcagcgcgcgagattttctggcgcagtcag",
303
+ // "circular": true,
304
+ // "extraLines": [
305
+ // "DEFINITION contactin-associated protein 1 precursor [Homo sapiens].",
306
+ // "ACCESSION NP_003623",
307
+ // "VERSION NP_003623.1 GI:4505463",
308
+ // "DBSOURCE REFSEQ: accession NM_003632.2",
309
+ // "KEYWORDS RefSeq."
310
+ // ],
311
+ // "type": "DNA",
312
+ // "size": 925
313
+ // }
314
+ // }
315
+ // ]
316
+ ```
317
+
318
+ You can see more examples by looking at the tests.
319
+
320
+ ## Updating this repo
321
+
322
+ ### Outside collaborators
323
+
324
+ fork and pull request please :)
325
+
326
+ ## Thanks/Collaborators
327
+
328
+ - IsaacLuo - https://github.com/IsaacLuo/SnapGeneFileReader (from which the snapgene parser was adapted)
329
+ - Joshua Nixon (original collaborator)
330
+ - Thomas Rich (original collaborator)
package/fastaToJson.d.ts CHANGED
@@ -5,4 +5,4 @@ export default fastaToJson;
5
5
  * @param {[function]} onFileParsed [callback for a parsed sequence]
6
6
  * @author Joshua P Nixon
7
7
  */
8
- declare function fastaToJson(fileString: [string], options: any): any;
8
+ declare function fastaToJson(fileString: [string], options?: {}): any;