sdf-parser 6.0.1 → 7.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -7
- package/lib/index.js +105 -51
- package/package.json +14 -17
- package/src/MolfileStream.js +33 -0
- package/src/getEntriesBoundaries.js +7 -0
- package/src/index.js +1 -0
- package/src/iterator.js +35 -28
- package/src/parse.js +15 -15
- package/src/util/getMolecule.js +16 -7
- package/src/__tests__/__snapshots__/getEntriesBoundaries.test.js.snap +0 -10
- package/src/__tests__/checkOptions.test.js +0 -57
- package/src/__tests__/checkUndefined.test.js +0 -24
- package/src/__tests__/getEntriesBoundaries.test.js +0 -33
- package/src/__tests__/index.test.js +0 -76
- package/src/__tests__/iterator.test.js +0 -183
- package/src/__tests__/notWellFormatted.test.js +0 -14
- package/src/__tests__/test.sdf +0 -8707
- package/src/__tests__/test.sdf.gz +0 -0
- package/src/__tests__/test1.sdf +0 -38
- package/src/__tests__/test2.sdf +0 -498
- package/src/__tests__/test4.sdf +0 -37
- package/src/iterator.browser.js +0 -3
- package/src/stream.browser.js +0 -3
package/README.md
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
# sdf-parser
|
|
2
2
|
|
|
3
3
|
[![NPM version][npm-image]][npm-url]
|
|
4
|
-
[![build status][
|
|
4
|
+
[![build status][ci-image]][ci-url]
|
|
5
|
+
[![Test coverage][codecov-image]][codecov-url]
|
|
5
6
|
[![npm download][download-image]][download-url]
|
|
6
7
|
|
|
7
8
|
Parses an SDF file and converts it to an array of objects.
|
|
@@ -59,13 +60,19 @@ var result = parse(sdf, {
|
|
|
59
60
|
|
|
60
61
|
## Iterator
|
|
61
62
|
|
|
62
|
-
This API is only available on Node.js.
|
|
63
|
-
|
|
64
63
|
```js
|
|
65
64
|
const { iterator } = require('sdf-parser');
|
|
66
|
-
const
|
|
67
|
-
|
|
65
|
+
const file = await openAsBlob(join(__dirname, 'test.sdf.gz'));
|
|
66
|
+
|
|
67
|
+
const decompressionStream = new DecompressionStream('gzip');
|
|
68
|
+
const textDecoder = new TextDecoderStream();
|
|
69
|
+
|
|
70
|
+
const stream = file
|
|
71
|
+
.stream()
|
|
72
|
+
.pipeThrough(decompressionStream)
|
|
73
|
+
.pipeThrough(textDecoder);
|
|
68
74
|
const results = [];
|
|
75
|
+
|
|
69
76
|
for await (const entry of iterator(stream)) {
|
|
70
77
|
results.push(entry);
|
|
71
78
|
}
|
|
@@ -77,7 +84,9 @@ for await (const entry of iterator(stream)) {
|
|
|
77
84
|
|
|
78
85
|
[npm-image]: https://img.shields.io/npm/v/sdf-parser.svg?style=flat-square
|
|
79
86
|
[npm-url]: https://www.npmjs.com/package/sdf-parser
|
|
80
|
-
[
|
|
81
|
-
[
|
|
87
|
+
[ci-image]: https://github.com/cheminfo/sdf-parser/actions/workflows/nodejs.yml/badge.svg
|
|
88
|
+
[ci-url]: https://github.com/cheminfo/sdf-parser/actions/workflows/nodejs.yml
|
|
89
|
+
[codecov-image]: https://img.shields.io/codecov/c/github/cheminfo/sdf-parser.svg
|
|
90
|
+
[codecov-url]: https://codecov.io/gh/cheminfo/sdf-parser
|
|
82
91
|
[download-image]: https://img.shields.io/npm/dm/sdf-parser.svg?style=flat-square
|
|
83
92
|
[download-url]: https://www.npmjs.com/package/sdf-parser
|
package/lib/index.js
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
Object.defineProperty(exports, '__esModule', { value: true });
|
|
4
|
-
|
|
5
3
|
var ensureString = require('ensure-string');
|
|
6
|
-
var readline = require('readline');
|
|
7
4
|
var dynamicTyping = require('dynamic-typing');
|
|
8
5
|
|
|
6
|
+
/**
|
|
7
|
+
*
|
|
8
|
+
* @param {*} string
|
|
9
|
+
* @param {*} substring
|
|
10
|
+
* @param {*} eol
|
|
11
|
+
* @returns
|
|
12
|
+
*/
|
|
9
13
|
function getEntriesBoundaries(string, substring, eol) {
|
|
10
14
|
const res = [];
|
|
11
15
|
let previous = 0;
|
|
@@ -28,6 +32,14 @@ function getEntriesBoundaries(string, substring, eol) {
|
|
|
28
32
|
return res;
|
|
29
33
|
}
|
|
30
34
|
|
|
35
|
+
/**
|
|
36
|
+
* Parse the molfile and the properties with > < labels >
|
|
37
|
+
* @param {string} sdfPart
|
|
38
|
+
* @param {*} labels
|
|
39
|
+
* @param {*} currentLabels
|
|
40
|
+
* @param {object} options
|
|
41
|
+
* @returns
|
|
42
|
+
*/
|
|
31
43
|
function getMolecule$1(sdfPart, labels, currentLabels, options) {
|
|
32
44
|
let parts = sdfPart.split(`${options.eol}>`);
|
|
33
45
|
if (parts.length === 0 || parts[0].length <= 5) return;
|
|
@@ -37,7 +49,7 @@ function getMolecule$1(sdfPart, labels, currentLabels, options) {
|
|
|
37
49
|
let lines = parts[j].split(options.eol);
|
|
38
50
|
let from = lines[0].indexOf('<');
|
|
39
51
|
let to = lines[0].indexOf('>');
|
|
40
|
-
let label = lines[0].
|
|
52
|
+
let label = lines[0].slice(from + 1, to);
|
|
41
53
|
currentLabels.push(label);
|
|
42
54
|
if (!labels[label]) {
|
|
43
55
|
labels[label] = {
|
|
@@ -46,8 +58,8 @@ function getMolecule$1(sdfPart, labels, currentLabels, options) {
|
|
|
46
58
|
keep: false,
|
|
47
59
|
};
|
|
48
60
|
if (
|
|
49
|
-
(!options.exclude || options.exclude.
|
|
50
|
-
(!options.include || options.include.
|
|
61
|
+
(!options.exclude || !options.exclude.includes(label)) &&
|
|
62
|
+
(!options.include || options.include.includes(label))
|
|
51
63
|
) {
|
|
52
64
|
labels[label].keep = true;
|
|
53
65
|
if (options.modifiers[label]) {
|
|
@@ -74,10 +86,11 @@ function getMolecule$1(sdfPart, labels, currentLabels, options) {
|
|
|
74
86
|
molecule[label] = modifiedValue;
|
|
75
87
|
}
|
|
76
88
|
}
|
|
77
|
-
if (
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
89
|
+
if (
|
|
90
|
+
labels[label].isNumeric &&
|
|
91
|
+
(!Number.isFinite(+molecule[label]) || molecule[label].match(/^0[0-9]/))
|
|
92
|
+
) {
|
|
93
|
+
labels[label].isNumeric = false;
|
|
81
94
|
}
|
|
82
95
|
}
|
|
83
96
|
}
|
|
@@ -86,15 +99,15 @@ function getMolecule$1(sdfPart, labels, currentLabels, options) {
|
|
|
86
99
|
|
|
87
100
|
/**
|
|
88
101
|
* Parse a SDF file
|
|
89
|
-
* @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
|
|
102
|
+
* @param {string|ArrayBuffer|Uint8Array} sdf - SDF file to parse
|
|
90
103
|
* @param {object} [options={}]
|
|
91
|
-
* @param {string[]} [options.include] List of fields to include
|
|
92
|
-
* @param {string[]} [options.exclude] List of fields to exclude
|
|
93
|
-
* @param {Function} [options.filter] Callback allowing to filter the molecules
|
|
94
|
-
* @param {boolean} [options.dynamicTyping] Dynamically type the data
|
|
95
|
-
* @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
|
|
96
|
-
* @param {boolean} [options.mixedEOL=false] Set to true if you know there is a mixture between \r\n and \n
|
|
97
|
-
* @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
|
|
104
|
+
* @param {string[]} [options.include] - List of fields to include
|
|
105
|
+
* @param {string[]} [options.exclude] - List of fields to exclude
|
|
106
|
+
* @param {Function} [options.filter] - Callback allowing to filter the molecules
|
|
107
|
+
* @param {boolean} [options.dynamicTyping] - Dynamically type the data
|
|
108
|
+
* @param {object} [options.modifiers] - Object containing callbacks to apply on some specific fields
|
|
109
|
+
* @param {boolean} [options.mixedEOL=false] - Set to true if you know there is a mixture between \r\n and \n
|
|
110
|
+
* @param {string} [options.eol] - Specify the end of line character. Default will be the one found in the file
|
|
98
111
|
*/
|
|
99
112
|
function parse(sdf, options = {}) {
|
|
100
113
|
options = { ...options };
|
|
@@ -110,14 +123,14 @@ function parse(sdf, options = {}) {
|
|
|
110
123
|
if (options.eol === undefined) {
|
|
111
124
|
options.eol = '\n';
|
|
112
125
|
if (options.mixedEOL) {
|
|
113
|
-
sdf = sdf.
|
|
114
|
-
sdf = sdf.
|
|
126
|
+
sdf = sdf.replaceAll('\r\n', '\n');
|
|
127
|
+
sdf = sdf.replaceAll('\r', '\n');
|
|
115
128
|
} else {
|
|
116
129
|
// we will find the delimiter in order to be much faster and not use regular expression
|
|
117
|
-
let header = sdf.
|
|
118
|
-
if (header.
|
|
130
|
+
let header = new Set(sdf.slice(0, 1000));
|
|
131
|
+
if (header.has('\r\n')) {
|
|
119
132
|
options.eol = '\r\n';
|
|
120
|
-
} else if (header.
|
|
133
|
+
} else if (header.has('\r')) {
|
|
121
134
|
options.eol = '\r';
|
|
122
135
|
}
|
|
123
136
|
}
|
|
@@ -134,7 +147,7 @@ function parse(sdf, options = {}) {
|
|
|
134
147
|
let start = Date.now();
|
|
135
148
|
|
|
136
149
|
for (let i = 0; i < entriesBoundaries.length; i++) {
|
|
137
|
-
let sdfPart = sdf.
|
|
150
|
+
let sdfPart = sdf.slice(...entriesBoundaries[i]);
|
|
138
151
|
|
|
139
152
|
let currentLabels = [];
|
|
140
153
|
const molecule = getMolecule$1(sdfPart, labels, currentLabels, options);
|
|
@@ -155,7 +168,7 @@ function parse(sdf, options = {}) {
|
|
|
155
168
|
currentLabel.maxValue = -Infinity;
|
|
156
169
|
for (let j = 0; j < molecules.length; j++) {
|
|
157
170
|
if (molecules[j][label]) {
|
|
158
|
-
let value = parseFloat(molecules[j][label]);
|
|
171
|
+
let value = Number.parseFloat(molecules[j][label]);
|
|
159
172
|
molecules[j][label] = value;
|
|
160
173
|
if (value > currentLabel.maxValue) {
|
|
161
174
|
currentLabel.maxValue = value;
|
|
@@ -192,57 +205,98 @@ function parse(sdf, options = {}) {
|
|
|
192
205
|
};
|
|
193
206
|
}
|
|
194
207
|
|
|
208
|
+
/**
 * Transform stream that cuts SDF text into one string per record.
 *
 * Incoming string chunks are accumulated in an internal buffer. Every time a
 * `$$$$` delimiter followed by a line feed is found, the text before the
 * delimiter (without its trailing end of line) is enqueued as one entry.
 * Whatever is left in the buffer when the stream closes is enqueued as a last
 * entry. Both `\n` and `\r\n` line endings are supported.
 *
 * Note: `$$$$` is searched anywhere in the buffer, not only at the beginning
 * of a line.
 */
class MolfileStream extends TransformStream {
  #buffer = '';

  constructor() {
    super({
      transform: (chunk, controller) => {
        this.#buffer += chunk;
        let begin = 0;
        let index = 0;
        while ((index = this.#buffer.indexOf('$$$$', index)) !== -1) {
          // The delimiter line is only complete once its '\n' is in the buffer;
          // otherwise keep the pending text and wait for the next chunk.
          const endOfDelimiter = this.#buffer.indexOf('\n', index);
          if (endOfDelimiter === -1) break;

          // Drop the end of line that precedes the delimiter because
          // getMolecule splits on eol + '>'. Never move before `begin`, so
          // that a delimiter at the very start of the buffer cannot turn
          // into a negative (from-the-end) slice index.
          let end = index;
          if (end > begin && this.#buffer[end - 1] === '\n') end--;
          if (end > begin && this.#buffer[end - 1] === '\r') end--;
          controller.enqueue(this.#buffer.slice(begin, end));

          // The next record starts right after the '\n' that terminates the
          // delimiter line, whether the file uses '\n' or '\r\n'.
          index = endOfDelimiter + 1;
          begin = index;
        }
        this.#buffer = this.#buffer.slice(begin);
      },
      flush: (controller) => {
        // Emit a trailing record that was not followed by a delimiter line.
        if (this.#buffer) {
          controller.enqueue(this.#buffer);
        }
      },
    });
  }
}
|
|
241
|
+
|
|
195
242
|
/**
|
|
196
|
-
* Parse a SDF file
|
|
197
|
-
* @param {
|
|
198
|
-
* @param {object} [options={}]
|
|
199
|
-
* @param {Function} [options.filter] Callback allowing to filter the molecules
|
|
200
|
-
* @param {
|
|
243
|
+
* Parse a SDF file as an iterator
|
|
244
|
+
* @param {ReadableStream} readStream - SDF file to parse
|
|
245
|
+
* @param {object} [options={}] - iterator options
|
|
246
|
+
* @param {Function} [options.filter] - Callback allowing to filter the molecules
|
|
247
|
+
* @param {string} [options.eol='\n'] - End of line character
|
|
248
|
+
* @param {boolean} [options.dynamicTyping] - Dynamically type the data
|
|
249
|
+
* @yields {object} - Molecule object
|
|
201
250
|
*/
|
|
202
|
-
|
|
203
251
|
async function* iterator(readStream, options = {}) {
|
|
204
|
-
const
|
|
205
|
-
const currentLines = [];
|
|
206
|
-
options = { ...options };
|
|
207
|
-
if (options.dynamicTyping === undefined) options.dynamicTyping = true;
|
|
252
|
+
const { eol = '\n', dynamicTyping = true } = options;
|
|
208
253
|
|
|
209
|
-
|
|
210
|
-
for await (
|
|
211
|
-
if (
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
currentLines.push(line);
|
|
254
|
+
const moleculeStream = readStream.pipeThrough(new MolfileStream({ eol }));
|
|
255
|
+
for await (const entry of moleculeStream) {
|
|
256
|
+
if (entry.length < 20) continue;
|
|
257
|
+
const molecule = getMolecule(entry, {
|
|
258
|
+
eol,
|
|
259
|
+
dynamicTyping,
|
|
260
|
+
});
|
|
261
|
+
if (!options.filter || options.filter(molecule)) {
|
|
262
|
+
yield molecule;
|
|
219
263
|
}
|
|
220
264
|
}
|
|
221
265
|
}
|
|
222
266
|
|
|
267
|
+
/**
|
|
268
|
+
* Convert a SDF part to an object
|
|
269
|
+
* @param {string} sdfPart - text containing the molfile
|
|
270
|
+
* @param {object} options - options
|
|
271
|
+
* @param {string} options.eol - end of line character
|
|
272
|
+
* @param {boolean} options.dynamicTyping - Dynamically type the data (create numbers and booleans)
|
|
273
|
+
* @returns
|
|
274
|
+
*/
|
|
223
275
|
function getMolecule(sdfPart, options) {
|
|
224
|
-
|
|
276
|
+
const { eol, dynamicTyping: dynamicTyping$1 } = options;
|
|
277
|
+
let parts = sdfPart.split(`${eol}>`);
|
|
225
278
|
if (parts.length === 0 || parts[0].length <= 5) return;
|
|
226
279
|
let molecule = {};
|
|
227
|
-
molecule.molfile = parts[0] +
|
|
280
|
+
molecule.molfile = parts[0] + eol;
|
|
228
281
|
for (let j = 1; j < parts.length; j++) {
|
|
229
|
-
let lines = parts[j].split(
|
|
282
|
+
let lines = parts[j].split(eol);
|
|
230
283
|
let from = lines[0].indexOf('<');
|
|
231
284
|
let to = lines[0].indexOf('>');
|
|
232
|
-
let label = lines[0].
|
|
285
|
+
let label = lines[0].slice(from + 1, to);
|
|
233
286
|
for (let k = 1; k < lines.length - 1; k++) {
|
|
234
287
|
if (molecule[label]) {
|
|
235
|
-
molecule[label] +=
|
|
288
|
+
molecule[label] += eol + lines[k];
|
|
236
289
|
} else {
|
|
237
290
|
molecule[label] = lines[k];
|
|
238
291
|
}
|
|
239
292
|
}
|
|
240
|
-
if (
|
|
293
|
+
if (dynamicTyping$1) {
|
|
241
294
|
molecule[label] = dynamicTyping.parseString(molecule[label]);
|
|
242
295
|
}
|
|
243
296
|
}
|
|
244
297
|
return molecule;
|
|
245
298
|
}
|
|
246
299
|
|
|
300
|
+
exports.MolfileStream = MolfileStream;
|
|
247
301
|
exports.iterator = iterator;
|
|
248
302
|
exports.parse = parse;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "sdf-parser",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "7.0.3",
|
|
4
4
|
"description": "SDF parser",
|
|
5
5
|
"main": "lib/index.js",
|
|
6
6
|
"module": "src/index.js",
|
|
@@ -8,9 +8,6 @@
|
|
|
8
8
|
"lib",
|
|
9
9
|
"src"
|
|
10
10
|
],
|
|
11
|
-
"browser": {
|
|
12
|
-
"./src/iterator.js": "./src/iterator.browser.js"
|
|
13
|
-
},
|
|
14
11
|
"sideEffects": false,
|
|
15
12
|
"scripts": {
|
|
16
13
|
"build": "cheminfo-build --entry src/index.js --root SDFParser",
|
|
@@ -20,9 +17,8 @@
|
|
|
20
17
|
"prepack": "npm run compile",
|
|
21
18
|
"prettier": "prettier --check src",
|
|
22
19
|
"prettier-write": "prettier --write src",
|
|
23
|
-
"test": "npm run test-
|
|
24
|
-
"test-
|
|
25
|
-
"test-only": "jest"
|
|
20
|
+
"test": "npm run test-only && npm run eslint && npm run prettier",
|
|
21
|
+
"test-only": "vitest run --coverage"
|
|
26
22
|
},
|
|
27
23
|
"repository": {
|
|
28
24
|
"type": "git",
|
|
@@ -43,20 +39,21 @@
|
|
|
43
39
|
},
|
|
44
40
|
"homepage": "https://github.com/cheminfo/sdf-parser",
|
|
45
41
|
"devDependencies": {
|
|
46
|
-
"@babel/plugin-transform-modules-commonjs": "^7.
|
|
47
|
-
"@types/
|
|
42
|
+
"@babel/plugin-transform-modules-commonjs": "^7.26.3",
|
|
43
|
+
"@types/node": "^22.13.5",
|
|
44
|
+
"@vitest/coverage-v8": "^3.0.7",
|
|
48
45
|
"babel-eslint": "^10.1.0",
|
|
49
46
|
"callback-stream": "^1.1.0",
|
|
50
|
-
"cheminfo-build": "^1.1
|
|
51
|
-
"eslint": "^
|
|
52
|
-
"eslint-config-cheminfo": "^
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
"
|
|
56
|
-
"
|
|
47
|
+
"cheminfo-build": "^1.2.1",
|
|
48
|
+
"eslint": "^9.21.0",
|
|
49
|
+
"eslint-config-cheminfo": "^13.0.0",
|
|
50
|
+
"file-collection": "^1.0.0",
|
|
51
|
+
"openchemlib": "^8.18.1",
|
|
52
|
+
"prettier": "^3.5.2",
|
|
53
|
+
"vitest": "^3.0.7"
|
|
57
54
|
},
|
|
58
55
|
"dependencies": {
|
|
59
|
-
"dynamic-typing": "^1.0.
|
|
56
|
+
"dynamic-typing": "^1.0.1",
|
|
60
57
|
"ensure-string": "^1.2.0"
|
|
61
58
|
}
|
|
62
59
|
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
 * Transform stream that cuts SDF text into one string per record.
 *
 * Incoming string chunks are accumulated in an internal buffer. Every time a
 * `$$$$` delimiter followed by a line feed is found, the text before the
 * delimiter (without its trailing end of line) is enqueued as one entry.
 * Whatever is left in the buffer when the stream closes is enqueued as a last
 * entry. Both `\n` and `\r\n` line endings are supported.
 *
 * Note: `$$$$` is searched anywhere in the buffer, not only at the beginning
 * of a line.
 */
export class MolfileStream extends TransformStream {
  #buffer = '';

  constructor() {
    super({
      transform: (chunk, controller) => {
        this.#buffer += chunk;
        let begin = 0;
        let index = 0;
        while ((index = this.#buffer.indexOf('$$$$', index)) !== -1) {
          // The delimiter line is only complete once its '\n' is in the buffer;
          // otherwise keep the pending text and wait for the next chunk.
          const endOfDelimiter = this.#buffer.indexOf('\n', index);
          if (endOfDelimiter === -1) break;

          // Drop the end of line that precedes the delimiter because
          // getMolecule splits on eol + '>'. Never move before `begin`, so
          // that a delimiter at the very start of the buffer cannot turn
          // into a negative (from-the-end) slice index.
          let end = index;
          if (end > begin && this.#buffer[end - 1] === '\n') end--;
          if (end > begin && this.#buffer[end - 1] === '\r') end--;
          controller.enqueue(this.#buffer.slice(begin, end));

          // The next record starts right after the '\n' that terminates the
          // delimiter line, whether the file uses '\n' or '\r\n'.
          index = endOfDelimiter + 1;
          begin = index;
        }
        this.#buffer = this.#buffer.slice(begin);
      },
      flush: (controller) => {
        // Emit a trailing record that was not followed by a delimiter line.
        if (this.#buffer) {
          controller.enqueue(this.#buffer);
        }
      },
    });
  }
}
|
package/src/index.js
CHANGED
package/src/iterator.js
CHANGED
|
@@ -1,52 +1,59 @@
|
|
|
1
|
-
import { createInterface } from 'readline';
|
|
2
|
-
|
|
3
1
|
import { parseString } from 'dynamic-typing';
|
|
2
|
+
|
|
3
|
+
import { MolfileStream } from './MolfileStream.js';
|
|
4
|
+
|
|
4
5
|
/**
|
|
5
|
-
* Parse a SDF file
|
|
6
|
-
* @param {
|
|
7
|
-
* @param {object} [options={}]
|
|
8
|
-
* @param {Function} [options.filter] Callback allowing to filter the molecules
|
|
9
|
-
* @param {
|
|
6
|
+
* Parse a SDF file as an iterator
|
|
7
|
+
* @param {ReadableStream} readStream - SDF file to parse
|
|
8
|
+
* @param {object} [options={}] - iterator options
|
|
9
|
+
* @param {Function} [options.filter] - Callback allowing to filter the molecules
|
|
10
|
+
* @param {string} [options.eol='\n'] - End of line character
|
|
11
|
+
* @param {boolean} [options.dynamicTyping] - Dynamically type the data
|
|
12
|
+
* @yields {object} - Molecule object
|
|
10
13
|
*/
|
|
11
|
-
|
|
12
14
|
export async function* iterator(readStream, options = {}) {
|
|
13
|
-
const
|
|
14
|
-
const currentLines = [];
|
|
15
|
-
options = { ...options };
|
|
16
|
-
if (options.dynamicTyping === undefined) options.dynamicTyping = true;
|
|
15
|
+
const { eol = '\n', dynamicTyping = true } = options;
|
|
17
16
|
|
|
18
|
-
|
|
19
|
-
for await (
|
|
20
|
-
if (
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
currentLines.push(line);
|
|
17
|
+
const moleculeStream = readStream.pipeThrough(new MolfileStream({ eol }));
|
|
18
|
+
for await (const entry of moleculeStream) {
|
|
19
|
+
if (entry.length < 20) continue;
|
|
20
|
+
const molecule = getMolecule(entry, {
|
|
21
|
+
eol,
|
|
22
|
+
dynamicTyping,
|
|
23
|
+
});
|
|
24
|
+
if (!options.filter || options.filter(molecule)) {
|
|
25
|
+
yield molecule;
|
|
28
26
|
}
|
|
29
27
|
}
|
|
30
28
|
}
|
|
31
29
|
|
|
30
|
+
/**
|
|
31
|
+
* Convert a SDF part to an object
|
|
32
|
+
* @param {string} sdfPart - text containing the molfile
|
|
33
|
+
* @param {object} options - options
|
|
34
|
+
* @param {string} options.eol - end of line character
|
|
35
|
+
* @param {boolean} options.dynamicTyping - Dynamically type the data (create numbers and booleans)
|
|
36
|
+
* @returns
|
|
37
|
+
*/
|
|
32
38
|
function getMolecule(sdfPart, options) {
|
|
33
|
-
|
|
39
|
+
const { eol, dynamicTyping } = options;
|
|
40
|
+
let parts = sdfPart.split(`${eol}>`);
|
|
34
41
|
if (parts.length === 0 || parts[0].length <= 5) return;
|
|
35
42
|
let molecule = {};
|
|
36
|
-
molecule.molfile = parts[0] +
|
|
43
|
+
molecule.molfile = parts[0] + eol;
|
|
37
44
|
for (let j = 1; j < parts.length; j++) {
|
|
38
|
-
let lines = parts[j].split(
|
|
45
|
+
let lines = parts[j].split(eol);
|
|
39
46
|
let from = lines[0].indexOf('<');
|
|
40
47
|
let to = lines[0].indexOf('>');
|
|
41
|
-
let label = lines[0].
|
|
48
|
+
let label = lines[0].slice(from + 1, to);
|
|
42
49
|
for (let k = 1; k < lines.length - 1; k++) {
|
|
43
50
|
if (molecule[label]) {
|
|
44
|
-
molecule[label] +=
|
|
51
|
+
molecule[label] += eol + lines[k];
|
|
45
52
|
} else {
|
|
46
53
|
molecule[label] = lines[k];
|
|
47
54
|
}
|
|
48
55
|
}
|
|
49
|
-
if (
|
|
56
|
+
if (dynamicTyping) {
|
|
50
57
|
molecule[label] = parseString(molecule[label]);
|
|
51
58
|
}
|
|
52
59
|
}
|
package/src/parse.js
CHANGED
|
@@ -4,15 +4,15 @@ import { getEntriesBoundaries } from './getEntriesBoundaries';
|
|
|
4
4
|
import { getMolecule } from './util/getMolecule';
|
|
5
5
|
/**
|
|
6
6
|
* Parse a SDF file
|
|
7
|
-
* @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
|
|
7
|
+
* @param {string|ArrayBuffer|Uint8Array} sdf - SDF file to parse
|
|
8
8
|
* @param {object} [options={}]
|
|
9
|
-
* @param {string[]} [options.include] List of fields to include
|
|
10
|
-
* @param {string[]} [options.exclude] List of fields to exclude
|
|
11
|
-
* @param {Function} [options.filter] Callback allowing to filter the molecules
|
|
12
|
-
* @param {boolean} [options.dynamicTyping] Dynamically type the data
|
|
13
|
-
* @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
|
|
14
|
-
* @param {boolean} [options.mixedEOL=false] Set to true if you know there is a mixture between \r\n and \n
|
|
15
|
-
* @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
|
|
9
|
+
* @param {string[]} [options.include] - List of fields to include
|
|
10
|
+
* @param {string[]} [options.exclude] - List of fields to exclude
|
|
11
|
+
* @param {Function} [options.filter] - Callback allowing to filter the molecules
|
|
12
|
+
* @param {boolean} [options.dynamicTyping] - Dynamically type the data
|
|
13
|
+
* @param {object} [options.modifiers] - Object containing callbacks to apply on some specific fields
|
|
14
|
+
* @param {boolean} [options.mixedEOL=false] - Set to true if you know there is a mixture between \r\n and \n
|
|
15
|
+
* @param {string} [options.eol] - Specify the end of line character. Default will be the one found in the file
|
|
16
16
|
*/
|
|
17
17
|
export function parse(sdf, options = {}) {
|
|
18
18
|
options = { ...options };
|
|
@@ -28,14 +28,14 @@ export function parse(sdf, options = {}) {
|
|
|
28
28
|
if (options.eol === undefined) {
|
|
29
29
|
options.eol = '\n';
|
|
30
30
|
if (options.mixedEOL) {
|
|
31
|
-
sdf = sdf.
|
|
32
|
-
sdf = sdf.
|
|
31
|
+
sdf = sdf.replaceAll('\r\n', '\n');
|
|
32
|
+
sdf = sdf.replaceAll('\r', '\n');
|
|
33
33
|
} else {
|
|
34
34
|
// we will find the delimiter in order to be much faster and not use regular expression
|
|
35
|
-
let header = sdf.
|
|
36
|
-
if (header.
|
|
35
|
+
let header = new Set(sdf.slice(0, 1000));
|
|
36
|
+
if (header.has('\r\n')) {
|
|
37
37
|
options.eol = '\r\n';
|
|
38
|
-
} else if (header.
|
|
38
|
+
} else if (header.has('\r')) {
|
|
39
39
|
options.eol = '\r';
|
|
40
40
|
}
|
|
41
41
|
}
|
|
@@ -52,7 +52,7 @@ export function parse(sdf, options = {}) {
|
|
|
52
52
|
let start = Date.now();
|
|
53
53
|
|
|
54
54
|
for (let i = 0; i < entriesBoundaries.length; i++) {
|
|
55
|
-
let sdfPart = sdf.
|
|
55
|
+
let sdfPart = sdf.slice(...entriesBoundaries[i]);
|
|
56
56
|
|
|
57
57
|
let currentLabels = [];
|
|
58
58
|
const molecule = getMolecule(sdfPart, labels, currentLabels, options);
|
|
@@ -73,7 +73,7 @@ export function parse(sdf, options = {}) {
|
|
|
73
73
|
currentLabel.maxValue = -Infinity;
|
|
74
74
|
for (let j = 0; j < molecules.length; j++) {
|
|
75
75
|
if (molecules[j][label]) {
|
|
76
|
-
let value = parseFloat(molecules[j][label]);
|
|
76
|
+
let value = Number.parseFloat(molecules[j][label]);
|
|
77
77
|
molecules[j][label] = value;
|
|
78
78
|
if (value > currentLabel.maxValue) {
|
|
79
79
|
currentLabel.maxValue = value;
|
package/src/util/getMolecule.js
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Parse the molfile and the properties with > < labels >
|
|
3
|
+
* @param {string} sdfPart
|
|
4
|
+
* @param {*} labels
|
|
5
|
+
* @param {*} currentLabels
|
|
6
|
+
* @param {object} options
|
|
7
|
+
* @returns
|
|
8
|
+
*/
|
|
1
9
|
export function getMolecule(sdfPart, labels, currentLabels, options) {
|
|
2
10
|
let parts = sdfPart.split(`${options.eol}>`);
|
|
3
11
|
if (parts.length === 0 || parts[0].length <= 5) return;
|
|
@@ -7,7 +15,7 @@ export function getMolecule(sdfPart, labels, currentLabels, options) {
|
|
|
7
15
|
let lines = parts[j].split(options.eol);
|
|
8
16
|
let from = lines[0].indexOf('<');
|
|
9
17
|
let to = lines[0].indexOf('>');
|
|
10
|
-
let label = lines[0].
|
|
18
|
+
let label = lines[0].slice(from + 1, to);
|
|
11
19
|
currentLabels.push(label);
|
|
12
20
|
if (!labels[label]) {
|
|
13
21
|
labels[label] = {
|
|
@@ -16,8 +24,8 @@ export function getMolecule(sdfPart, labels, currentLabels, options) {
|
|
|
16
24
|
keep: false,
|
|
17
25
|
};
|
|
18
26
|
if (
|
|
19
|
-
(!options.exclude || options.exclude.
|
|
20
|
-
(!options.include || options.include.
|
|
27
|
+
(!options.exclude || !options.exclude.includes(label)) &&
|
|
28
|
+
(!options.include || options.include.includes(label))
|
|
21
29
|
) {
|
|
22
30
|
labels[label].keep = true;
|
|
23
31
|
if (options.modifiers[label]) {
|
|
@@ -44,10 +52,11 @@ export function getMolecule(sdfPart, labels, currentLabels, options) {
|
|
|
44
52
|
molecule[label] = modifiedValue;
|
|
45
53
|
}
|
|
46
54
|
}
|
|
47
|
-
if (
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
55
|
+
if (
|
|
56
|
+
labels[label].isNumeric &&
|
|
57
|
+
(!Number.isFinite(+molecule[label]) || molecule[label].match(/^0[0-9]/))
|
|
58
|
+
) {
|
|
59
|
+
labels[label].isNumeric = false;
|
|
51
60
|
}
|
|
52
61
|
}
|
|
53
62
|
}
|