npm - @teselagen/bio-parsers - Versions diffs - 0.1.27 → 0.1.28 - Mend

@teselagen/bio-parsers 0.1.27 → 0.1.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/index.js +24219 -39924
package/index.mjs +24238 -39921
package/index.umd.js +32684 -48391
package/package.json +3 -7
package/src/ab1ToJson.js +177 -0
package/src/anyToJson.js +225 -0
package/src/fastaToJson.js +101 -0
package/src/genbankToJson.d.__ts +20 -0
package/src/genbankToJson.js +688 -0
package/src/geneiousXmlToJson.js +147 -0
package/src/gffToJson.js +43 -0
package/src/index.js +23 -0
package/src/jbeiXmlToJson.js +109 -0
package/src/jsonToBed.js +39 -0
package/src/jsonToFasta.js +33 -0
package/src/jsonToGenbank.js +423 -0
package/src/jsonToJsonString.js +26 -0
package/src/sbolXmlToJson.js +135 -0
package/src/snapgeneToJson.js +245 -0
package/src/utils/NameUtils.js +10 -0
package/src/utils/ParserUtil.js +93 -0
package/src/utils/cleanUpTeselagenJsonForExport.js +13 -0
package/src/utils/constants.js +24 -0
package/src/utils/convertOldSequenceDataToNewDataType.js +64 -0
package/src/utils/createInitialSequence.js +14 -0
package/src/utils/extractFileExtension.js +14 -0
package/src/utils/flattenSequenceArray.js +17 -0
package/src/utils/getArrayBufferFromFile.js +32 -0
package/src/utils/isBrowser.js +1 -0
package/src/utils/parseUracilFeatures.js +13 -0
package/src/utils/pragmasAndTypes.js +21 -0
package/src/utils/searchWholeObjByName.js +98 -0
package/src/utils/splitStringIntoLines.js +13 -0
package/src/utils/unmangleUrls.js +34 -0
package/src/utils/validateSequence.js +349 -0
package/src/utils/validateSequenceArray.js +20 -0

package/package.json CHANGED Viewed

@@ -1,21 +1,17 @@
 {
   "name": "@teselagen/bio-parsers",
-  "version": "0.1.27",
+  "version": "0.1.28",
   "type": "commonjs",
   "dependencies": {
-    "@teselagen/sequence-utils": "0.1.22",
-    "@teselagen/range-utils": "0.1.21",
+    "@teselagen/sequence-utils": "0.1.23",
+    "@teselagen/range-utils": "0.1.22",
     "@gmod/gff": "^1.2.1",
-    "bson-objectid": "2.0.4",
     "buffer": "^6.0.3",
     "bufferpack": "^0.0.6",
     "color": "^4.2.3",
-    "escape-string-regexp": "1.0.5",
     "fast-xml-parser": "^4.2.5",
     "fflate": "^0.8.0",
-    "jsondiffpatch-rc": "0.4.2",
     "lodash": "^4.17.21",
-    "shortid": "^2.2.16",
     "string_decoder": "^1.3.0",
     "validate.io-nonnegative-integer-array": "^1.0.1"
   }

package/src/ab1ToJson.js ADDED Viewed

@@ -0,0 +1,177 @@
+import createInitialSequence from "./utils/createInitialSequence";
+import getArrayBufferFromFile from "./utils/getArrayBufferFromFile";
+async function ab1ToJson(fileObj, options = {}) {
+  const arrayBuffer = await getArrayBufferFromFile(fileObj);
+  const dataview = new DataView(arrayBuffer);
+  const converter = new abConverter(dataview);
+  const chromatogramData = converter.getTraceData();
+  const returnVal = createInitialSequence(options);
+  returnVal.parsedSequence = {
+    ...returnVal.parsedSequence,
+    sequence: chromatogramData.baseCalls.join(""),
+    chromatogramData,
+  };
+  return [returnVal];
+}
+export default ab1ToJson;
+/**
+ * Constructor for a reader over the binary contents of an AB1 file.
+ *
+ * Despite its name, `inputArrayBuffer` is used as a DataView (only
+ * getInt8/getInt16/getInt32 are called on it, and ab1ToJson passes a DataView).
+ * All multi-byte reads omit the littleEndian argument, so they use the
+ * DataView default (big-endian).
+ *
+ * @param {DataView} inputArrayBuffer view over the whole file
+ */
+function abConverter(inputArrayBuffer) {
+  // Byte offset of the tag directory, read from offset 26 of the file.
+  const dirLocation = inputArrayBuffer.getInt32(26);
+  // Number of directory entries, read from offset 18 of the file.
+  const numElements = inputArrayBuffer.getInt32(18);
+  // First byte past the directory; every entry is stepped over as 28 bytes.
+  const lastEntry = dirLocation + numElements * 28;
+  // Reads `numEntries` signed 8-bit integers starting at `inOffset`.
+  this.getNumber = (inOffset, numEntries) => {
+    const retArray = [];
+    for (let counter = 0; counter < numEntries; counter += 1) {
+      retArray.push(inputArrayBuffer.getInt8(inOffset + counter));
+    }
+    return retArray;
+  };
+  // Reads `numEntries` bytes starting at `inOffset` and returns them as an
+  // array of one-character strings.
+  this.getChar = (inOffset, numEntries) => {
+    const retArray = [];
+    for (let counter = 0; counter < numEntries; counter += 1) {
+      retArray.push(
+        String.fromCharCode(inputArrayBuffer.getInt8(inOffset + counter))
+      );
+    }
+    return retArray;
+  };
+  // Reads a signed 16-bit integer at every second byte offset within a span
+  // of `numEntries` bytes, i.e. the counter advances in bytes, not in values.
+  this.getShort = (inOffset, numEntries) => {
+    const retArray = [];
+    for (let counter = 0; counter < numEntries; counter += 2) {
+      retArray.push(inputArrayBuffer.getInt16(inOffset + counter));
+    }
+    return retArray;
+  };
+  // Builds a 4-character tag name from the 4 bytes starting at `inOffset`.
+  this.getTagName = (inOffset) => {
+    let name = "";
+    for (let loopOffset = inOffset; loopOffset < inOffset + 4; loopOffset++) {
+      name += String.fromCharCode(inputArrayBuffer.getInt8(loopOffset));
+    }
+    return name;
+  };
+  // Walks the directory looking for entries whose name and number match
+  // `inTag`, and decodes the matching entry with the reader method named by
+  // `inTag.typeToReturn`. The loop does not stop at the first match, so the
+  // last matching entry wins; `undefined` is returned when nothing matches.
+  // Because this is a do/while, one entry is inspected even if the directory
+  // reports zero elements.
+  this.getDataTag = function(inTag) {
+    let output;
+    let curElem = dirLocation;
+    do {
+      const currTagName = this.getTagName(curElem);
+      // Tag number sits 4 bytes into the entry, right after the name.
+      const tagNum = inputArrayBuffer.getInt32(curElem + 4);
+      // eslint-disable-next-line eqeqeq
+      if (currTagName == inTag.tagName && tagNum === inTag.tagNum) {
+        // Value at +16 is handed to the readers as their `numEntries` bound,
+        // value at +20 as the offset where the data starts.
+        const numEntries = inputArrayBuffer.getInt32(curElem + 16);
+        const entryOffset = inputArrayBuffer.getInt32(curElem + 20);
+        output = this[inTag.typeToReturn](entryOffset, numEntries);
+      }
+      curElem += 28;
+    } while (curElem < lastEntry);
+    return output;
+  };
+  // Collects the four colour traces, peak locations, base calls and quality
+  // numbers, then reshapes them into per-base traces.
+  this.getTraceData = function() {
+    const traceData = {};
+    traceData.aTrace = this.getDataTag(tagDict.colorDataA);
+    traceData.tTrace = this.getDataTag(tagDict.colorDataT);
+    traceData.gTrace = this.getDataTag(tagDict.colorDataG);
+    traceData.cTrace = this.getDataTag(tagDict.colorDataC);
+    traceData.basePos = this.getDataTag(tagDict.peakLocations);
+    traceData.baseCalls = this.getDataTag(tagDict.baseCalls2);
+    traceData.qualNums = this.getDataTag(tagDict.qualNums);
+    if (traceData.qualNums ) {
+      //tnr if we're only getting 1's and 0's as qualNums, that means that there weren't actual qual nums attached to the file
+      if (!traceData.qualNums.filter(q => (q!==1 && q!==0)).length)  {
+        delete traceData.qualNums
+      }
+    }
+    return convertBasePosTraceToPerBpTrace(traceData);
+  };
+  // Returns a string listing the name of every directory entry, each prefixed
+  // with " - ". Not called anywhere in the code visible here.
+  this.getFirstEntry = () => {
+    let output = "";
+    for (let curElem = dirLocation; curElem < lastEntry; curElem += 28) {
+      let name = "";
+      for (let offset = curElem; offset < curElem + 4; offset++) {
+        name += String.fromCharCode(inputArrayBuffer.getInt8(offset));
+      }
+      output += ` - ${name}`;
+    }
+    return output;
+  };
+}
+// Lookup descriptors handed to abConverter#getDataTag. `tagName` and `tagNum`
+// are compared against each directory entry; `typeToReturn` is the name of the
+// abConverter reader method used to decode the matching entry.
+// baseCalls1, peakDev and peakOneAmp are not referenced by getTraceData.
+const tagDict = {
+  baseCalls1: { tagName: "PBAS", tagNum: 1, typeToReturn: "getChar" },
+  baseCalls2: { tagName: "PBAS", tagNum: 2, typeToReturn: "getChar" },
+  qualNums: { tagName: "PCON", tagNum: 2, typeToReturn: "getNumber" },
+  peakLocations: { tagName: "PLOC", tagNum: 2, typeToReturn: "getShort" },
+  peakDev: { tagName: "P1RL", tagNum: 1, typeToReturn: "getShort" },
+  peakOneAmp: { tagName: "P1AM", tagNum: 1, typeToReturn: "getShort" },
+  colorDataA: { tagName: "DATA", tagNum: 10, typeToReturn: "getShort" },
+  colorDataT: { tagName: "DATA", tagNum: 11, typeToReturn: "getShort" },
+  colorDataG: { tagName: "DATA", tagNum: 9, typeToReturn: "getShort" },
+  colorDataC: { tagName: "DATA", tagNum: 12, typeToReturn: "getShort" },
+};
+// Number of extra trace samples taken past the computed end of each per-base
+// window in convertBasePosTraceToPerBpTrace; the next window starts after them.
+const correctionAmount = 3
+// tnr: this function takes in chromData which has 4 traces and a basePos (which describes where in the trace the base call lands)
+// It "normalizes" that data into a baseTraces array so that each base has its own set of that data (having a per-base trace makes insertion/deletion/copy/paste actions all easier)
+function convertBasePosTraceToPerBpTrace(chromData) {
+  const { basePos, aTrace } = chromData;
+  const traceLength = aTrace.length;
+  let startPos = 0;
+  let nextBasePos = basePos[1];
+  let endPos;
+  function setEndPos() {
+    if (nextBasePos) {
+      endPos = startPos + Math.ceil((nextBasePos - startPos) / 2);
+    } else {
+      endPos = traceLength;
+    }
+  }
+  setEndPos();
+  const baseTraces = [];
+  for (let i = 0; i < basePos.length; i++) {
+    const tracesForType = {
+      aTrace: [],
+      tTrace: [],
+      gTrace: [],
+      cTrace: []
+    };
+    baseTraces[i] = tracesForType;
+    [
+      "aTrace",
+      "tTrace",
+      "gTrace",
+      "cTrace"
+      // eslint-disable-next-line no-loop-func
+    ].forEach((type) => {
+      const traceForType = tracesForType[type];
+      const traceData = chromData[type];
+      for (let j = startPos; j < endPos + correctionAmount; j++) {
+        traceForType.push(traceData[j] || 0);
+      }
+    });
+    if (i !== basePos.length-1) {
+      startPos = endPos+correctionAmount;
+      nextBasePos = basePos[i + 2];
+      setEndPos();
+    }
+  }
+  return {
+    baseTraces,
+    ...chromData
+  };
+}
+export {
+  convertBasePosTraceToPerBpTrace
+}

package/src/anyToJson.js ADDED Viewed

@@ -0,0 +1,225 @@
+import fastaToJson from "./fastaToJson";
+import genbankToJson from "./genbankToJson";
+import sbolXmlToJson from "./sbolXmlToJson";
+import extractFileExtension from "./utils/extractFileExtension.js";
+import snapgeneToJson from "./snapgeneToJson";
+import ab1ToJson from "./ab1ToJson";
+import gffToJson from "./gffToJson";
+import isBrowser from "./utils/isBrowser";
+import { tidyUpSequenceData } from "@teselagen/sequence-utils";
+import geneiousXmlToJson from "./geneiousXmlToJson";
+import jbeiXmlToJson from "./jbeiXmlToJson";
+import { unzipSync } from "fflate";
+/**
+ * takes in file content string and its file name and determines what parser it needs to be sent to.
+ * The file is parsed to our old JSON schema and after it goes through an intermediate step where we convert that json to our new schema
+ * @param  {string} fileContentString content of the file as a string
+ * @param  {Function} onFileParsed    //tnr: fill this out
+ */
+async function anyToJson(fileContentStringOrFileObj, options) {
+  let fileContentString;
+  options = options || {};
+  let fileName = options.fileName || "";
+  if (!fileName && typeof fileContentStringOrFileObj !== "string") {
+    fileName = fileContentStringOrFileObj.name;
+    options.fileName = fileName;
+  }
+  const ext = extractFileExtension(fileName);
+  if (typeof fileContentStringOrFileObj === "string") {
+    fileContentString = fileContentStringOrFileObj;
+  } else {
+    if (/^(ab1)$/.test(ext)) {
+      // AB1 sequencing read
+      //we will always want to pass the file obj and not the string to ab1
+      return ab1ToJson(fileContentStringOrFileObj, options);
+    } else if (/^(prot)$/.test(ext)) {
+      // fileContentString = await getUtf8StringFromFile(
+      //   fileContentStringOrFileObj,
+      //   options
+      // );
+      // snapgene file (always requires that the full filename be passed in to anyToJson otherwise it won't parse properly)
+      //we will always want to pass the file obj and not the string to the snapgene parser because it expects a binary file
+      return snapgeneToJson(fileContentStringOrFileObj, options);
+    } else if (/^(dna)$/.test(ext)) {
+      // snapgene file (always requires that the full filename be passed in to anyToJson otherwise it won't parse properly)
+      //we will always want to pass the file obj and not the string to the snapgene parser because it expects a binary file
+      return snapgeneToJson(fileContentStringOrFileObj, options);
+    } else if (/^(geneious)$/.test(ext)) {
+      const a = await getUint8ArrayFromFile(fileContentStringOrFileObj);
+      let d;
+      try {
+        d = new TextDecoder().decode(a, { stream: false });
+        if (!d.includes("<geneious")) {
+          throw new Error("not geneious");
+        }
+      } catch (e) {
+        //catch the above error and try to unzip the file and see if it works
+        const b = unzipSync(a);
+        const c = Object.values(b)[0];
+        d = new TextDecoder().decode(c, { stream: false });
+      }
+      return geneiousXmlToJson(d, options);
+    } else {
+      // we want to get the string from the file obj
+      fileContentString = await getUtf8StringFromFile(
+        fileContentStringOrFileObj,
+        options
+      );
+    }
+  }
+  // console.log(`fileContentString.includes("seq:seq"):`,fileContentString.includes("seq:seq"))
+  // console.log(`fileContentString.includes("jbei")):`,fileContentString.includes("jbei"))
+  if (/^(fasta|fas|fa|fna|ffn)$/.test(ext)) {
+    // FASTA
+    return fastaToJson(fileContentString, options);
+  } else if (/^(gb|gbk)$/.test(ext)) {
+    // GENBANK
+    return genbankToJson(fileContentString, options);
+  } else if (
+    /^(seq)$/.test(ext) ||
+    (/^(xml)$/.test(ext) &&
+      fileContentString.includes("seq:seq") &&
+      fileContentString.includes("jbei"))
+  ) {
+    // JBEI
+    return jbeiXmlToJson(fileContentString, options);
+  } else if (/^(json)$/.test(ext)) {
+    // TG JSON Probably
+    const failure = {
+      messages: [`Unable to parse JSON file ${fileName}`],
+      success: false
+    };
+    try {
+      const cleaned = tidyUpSequenceData(
+        JSON.parse(fileContentString),
+        options
+      );
+      if (!cleaned.sequence.length) return [failure];
+      return [{ parsedSequence: cleaned, success: true }];
+    } catch (error) {
+      console.error(`error:`, error);
+      return [failure];
+    }
+  } else if (/^(gp|genpep)$/.test(ext)) {
+    // PROTEIN GENBANK
+    return genbankToJson(fileContentString, { ...options, isProtein: true });
+  } else if (/^(xml|rdf)$/.test(ext)) {
+    // XML/RDF
+    return sbolXmlToJson(
+      fileContentString || fileContentStringOrFileObj,
+      options
+    );
+  } else if (/^(gff|gff3)$/.test(ext)) {
+    // GFF
+    return gffToJson(fileContentStringOrFileObj, options);
+  } else {
+    // console.warn(
+    //   "TNR: No filename passed to anyToJson so we're going through the list of parsers. Make sure you're passing the filename when using anyToJson!"
+    // );
+    let parsersToTry = [
+      {
+        fn: genbankToJson,
+        name: "Genbank Parser"
+      },
+      {
+        fn: fastaToJson,
+        name: "Fasta Parser"
+      }
+    ];
+    const firstChar = fileContentString[fileContentString.search(/\S|$/)];
+    //try to guess the file type based on the first non-whitespace char in the filestring
+    if (firstChar === ">") {
+      parsersToTry = parsersToTry.sort((a) => {
+        if (a.name === "Fasta Parser") return -1;
+        return 1;
+      });
+    } else if (firstChar === "L") {
+      parsersToTry = parsersToTry.sort((a) => {
+        if (a.name === "Genbank Parser") return -1;
+        return 1;
+      });
+    }
+    for (const parser of parsersToTry) {
+      const toReturn = await parser.fn(fileContentString, options);
+      if (successfulParsing(toReturn)) {
+        //continue on to through the normal flow
+        toReturn.forEach(function (result) {
+          result.messages.push("Parsed using " + parser.name + ".");
+        });
+        return toReturn;
+      }
+    }
+    //none of the parsers worked
+    return [
+      {
+        messages: [
+          "Unable to parse file as FASTA, genbank, JBEI, or SBOL formats"
+        ],
+        success: false
+      }
+    ];
+  }
+  //helper function to determine whether or not the parsing was successful or not
+  function successfulParsing(resultArray) {
+    return resultArray.some(function (result) {
+      return result.success;
+    });
+  }
+}
+export default anyToJson;
+function getUtf8StringFromFile(file, { emulateBrowser } = {}) {
+  if (!isBrowser && !emulateBrowser) {
+    //emulate browser is only used for testing purposes
+    //we're in a node context
+    return Buffer.isBuffer(file)
+      ? file.toString("utf-8")
+      : Buffer.isBuffer(file.buffer)
+      ? file.buffer.toString("utf-8")
+      : file;
+  }
+  const reader = new window.FileReader();
+  reader.readAsText(file, "UTF-8");
+  return new Promise((resolve, reject) => {
+    reader.onload = (evt) => {
+      resolve(evt.target.result);
+    };
+    reader.onerror = (err) => {
+      console.error("err:", err);
+      reject(err);
+    };
+  });
+}
+function getUint8ArrayFromFile(file, { emulateBrowser } = {}) {
+  if (!isBrowser && !emulateBrowser) {
+    //emulate browser is only used for testing purposes
+    //we're in a node context
+    return Buffer.isBuffer(file)
+      ? new Uint8Array(file)
+      : Buffer.isBuffer(file.buffer)
+      ? new Uint8Array(file.buffer)
+      : file;
+  }
+  const reader = new window.FileReader();
+  // reader.readAsText(file, "UTF-8");
+  reader.readAsArrayBuffer(file);
+  return new Promise((resolve, reject) => {
+    reader.onload = (evt) => {
+      const arrayBuffer = evt.target.result;
+      const bytes = new Uint8Array(arrayBuffer);
+      resolve(bytes);
+    };
+    reader.onerror = (err) => {
+      console.error("err:", err);
+      reject(err);
+    };
+  });
+}

package/src/fastaToJson.js ADDED Viewed

@@ -0,0 +1,101 @@
+import createInitialSequence from "./utils/createInitialSequence";
+import splitStringIntoLines from "./utils/splitStringIntoLines.js";
+import validateSequenceArray from "./utils/validateSequenceArray";
+/**
+ * parses a fasta file that may or may not contain multiple resultArray
+ * @param  {[string]} fileString   [string respresentation of file contents]
+ * @param  {[function]} onFileParsed [callback for a parsed sequence]
+ * @author Joshua P Nixon
+ */
+function fastaToJson(fileString, options) {
+  let resultArray = [];
+  let result = null;
+  try {
+    const lines = splitStringIntoLines(fileString);
+    for (let i = 0; i < lines.length; i++) {
+      parseLine(lines[i]);
+    }
+    if (result) {
+      resultArray.push(result);
+      result = null;
+    }
+  } catch (e) {
+    console.error("error:", e);
+    console.error("error.stack: ", e.stack);
+    resultArray = [
+      {
+        success: false,
+        messages: ["Import Error: Invalid File"],
+      },
+    ];
+  }
+  return validateSequenceArray(resultArray, options);
+  function parseLine(line) {
+    line = line.trim();
+    if (";" === line[0]) {
+      //first instace is title, afterwards comments are ignored
+      if (result) {
+        return;
+      }
+      result = createInitialSequence(options);
+      parseTitle(line);
+    } else if (">" === line[0]) {
+      //header line
+      if (result) {
+        resultArray.push(result);
+        result = null;
+      }
+      result = createInitialSequence(options);
+      parseTitle(line);
+    } else {
+      //sequence line
+      if (!result) {
+        result = createInitialSequence(options);
+      }
+      if ("*" === line[line.length - 1]) {
+        //some resultArray are ended with an asterisk
+        parseSequenceLine(line.substring(0, line.length - 1));
+        resultArray.push(result);
+        result = null;
+      } else {
+        parseSequenceLine(line);
+      }
+    }
+    if (options && options.parseFastaAsCircular) {
+      result.parsedSequence.circular = true;
+    }
+  }
+  function parseTitle(line) {
+    if (options && 'parseName' in options && !options.parseName){
+      result.parsedSequence.name = line.slice(1)
+      return
+    }
+    const pipeIndex = line.indexOf("|");
+    if (pipeIndex > -1) {
+      result.parsedSequence.name = line.slice(1, pipeIndex);
+      result.parsedSequence.description = line.slice(pipeIndex + 1);
+    } else {
+      result.parsedSequence.name = line.slice(1);
+    }
+  }
+  function parseSequenceLine(line) {
+    // http://www.ncbi.nlm.nih.gov/BLAST/blastcgihelp.shtml says
+    // that the sequence can be interspersed with numbers and/or spaces and - dashes for gaps.
+    // if (options && !options.doNotRemoveDashes && line.match(/[\s0-9-]/)) {
+    //     line = line.replace(/[\s[0-9-]/g, "");
+    //     const msg = "Warning: spaces, numbers and/or dashes were removed from sequence"
+    //     result.messages.indexOf(msg === -1) && result.messages.push(msg);
+    // }
+    result.parsedSequence.sequence += line;
+  }
+}
+export default fastaToJson;

package/src/genbankToJson.d.__ts ADDED Viewed

@@ -0,0 +1,20 @@
+// NOTE(review): the `.d.__ts` extension suggests this declaration file is
+// deliberately disabled - confirm before relying on anything declared here.
+// Declared shape of a parsed sequence: two list fields and a circular flag.
+// NOTE(review): `[]` is the empty-tuple type; an array type was presumably intended.
+interface sequenceObject {
+    features: [],
+    parts: [],
+    circular: boolean,
+}
+// interface parsedResult {
+//     parsedSequence: sequenceObject
+// }
+// NOTE(review): parsedSequence is typed as boolean here, while the
+// commented-out interface above typed it as sequenceObject - confirm which is meant.
+type ParsedResult = {
+    parsedSequence: boolean
+}
+// interface onFileParsedCallback {
+//     (res: [parsedResult]): void;
+// }
+// NOTE(review): the two declarations below do not look like valid TypeScript
+// as written (generic argument in parameter position, assignment to a
+// parameterised default export) - confirm the intended signatures.
+type onFileParsedCallback<ParsedResult> =  (parsedResult: <ParsedResult>): ParsedResult => void
+export default genbankToJson<genbankFileString,onFileParsedCallback > = (genbankFileString: string, onFileParsedCallback: onFileParsedCallback, options) => void