sdf-parser 6.0.1 → 7.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -7
- package/lib/index.js +98 -50
- package/package.json +14 -17
- package/src/MolfileStream.js +27 -0
- package/src/getEntriesBoundaries.js +7 -0
- package/src/index.js +1 -0
- package/src/iterator.js +34 -27
- package/src/parse.js +15 -15
- package/src/util/getMolecule.js +16 -7
- package/src/__tests__/__snapshots__/getEntriesBoundaries.test.js.snap +0 -10
- package/src/__tests__/checkOptions.test.js +0 -57
- package/src/__tests__/checkUndefined.test.js +0 -24
- package/src/__tests__/getEntriesBoundaries.test.js +0 -33
- package/src/__tests__/index.test.js +0 -76
- package/src/__tests__/iterator.test.js +0 -183
- package/src/__tests__/notWellFormatted.test.js +0 -14
- package/src/__tests__/test.sdf +0 -8707
- package/src/__tests__/test.sdf.gz +0 -0
- package/src/__tests__/test1.sdf +0 -38
- package/src/__tests__/test2.sdf +0 -498
- package/src/__tests__/test4.sdf +0 -37
- package/src/iterator.browser.js +0 -3
- package/src/stream.browser.js +0 -3
package/README.md
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
# sdf-parser
|
|
2
2
|
|
|
3
3
|
[![NPM version][npm-image]][npm-url]
|
|
4
|
-
[![build status][
|
|
4
|
+
[![build status][ci-image]][ci-url]
|
|
5
|
+
[![Test coverage][codecov-image]][codecov-url]
|
|
5
6
|
[![npm download][download-image]][download-url]
|
|
6
7
|
|
|
7
8
|
Parses an SDF file and converts it into an array of objects.
|
|
@@ -59,13 +60,19 @@ var result = parse(sdf, {
|
|
|
59
60
|
|
|
60
61
|
## Iterator
|
|
61
62
|
|
|
62
|
-
This API is only available on Node.js.
|
|
63
|
-
|
|
64
63
|
```js
|
|
65
64
|
const { iterator } = require('sdf-parser');
|
|
66
|
-
const
|
|
67
|
-
|
|
65
|
+
const file = await openAsBlob(join(__dirname, 'test.sdf.gz'));
|
|
66
|
+
|
|
67
|
+
const decompressionStream = new DecompressionStream('gzip');
|
|
68
|
+
const textDecoder = new TextDecoderStream();
|
|
69
|
+
|
|
70
|
+
const stream = file
|
|
71
|
+
.stream()
|
|
72
|
+
.pipeThrough(decompressionStream)
|
|
73
|
+
.pipeThrough(textDecoder);
|
|
68
74
|
const results = [];
|
|
75
|
+
|
|
69
76
|
for await (const entry of iterator(stream)) {
|
|
70
77
|
results.push(entry);
|
|
71
78
|
}
|
|
@@ -77,7 +84,9 @@ for await (const entry of iterator(stream)) {
|
|
|
77
84
|
|
|
78
85
|
[npm-image]: https://img.shields.io/npm/v/sdf-parser.svg?style=flat-square
|
|
79
86
|
[npm-url]: https://www.npmjs.com/package/sdf-parser
|
|
80
|
-
[
|
|
81
|
-
[
|
|
87
|
+
[ci-image]: https://github.com/cheminfo/sdf-parser/actions/workflows/nodejs.yml/badge.svg
|
|
88
|
+
[ci-url]: https://github.com/cheminfo/sdf-parser/actions/workflows/nodejs.yml
|
|
89
|
+
[codecov-image]: https://img.shields.io/codecov/c/github/cheminfo/sdf-parser.svg
|
|
90
|
+
[codecov-url]: https://codecov.io/gh/cheminfo/sdf-parser
|
|
82
91
|
[download-image]: https://img.shields.io/npm/dm/sdf-parser.svg?style=flat-square
|
|
83
92
|
[download-url]: https://www.npmjs.com/package/sdf-parser
|
package/lib/index.js
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
|
-
Object.defineProperty(exports, '__esModule', { value: true });
|
|
4
|
-
|
|
5
3
|
var ensureString = require('ensure-string');
|
|
6
|
-
var readline = require('readline');
|
|
7
4
|
var dynamicTyping = require('dynamic-typing');
|
|
8
5
|
|
|
6
|
+
/**
|
|
7
|
+
*
|
|
8
|
+
* @param {*} string
|
|
9
|
+
* @param {*} substring
|
|
10
|
+
* @param {*} eol
|
|
11
|
+
* @returns
|
|
12
|
+
*/
|
|
9
13
|
function getEntriesBoundaries(string, substring, eol) {
|
|
10
14
|
const res = [];
|
|
11
15
|
let previous = 0;
|
|
@@ -28,6 +32,14 @@ function getEntriesBoundaries(string, substring, eol) {
|
|
|
28
32
|
return res;
|
|
29
33
|
}
|
|
30
34
|
|
|
35
|
+
/**
|
|
36
|
+
* Parse the molfile and the properties with > < labels >
|
|
37
|
+
* @param {string} sdfPart
|
|
38
|
+
* @param {*} labels
|
|
39
|
+
* @param {*} currentLabels
|
|
40
|
+
* @param {object} options
|
|
41
|
+
* @returns
|
|
42
|
+
*/
|
|
31
43
|
function getMolecule$1(sdfPart, labels, currentLabels, options) {
|
|
32
44
|
let parts = sdfPart.split(`${options.eol}>`);
|
|
33
45
|
if (parts.length === 0 || parts[0].length <= 5) return;
|
|
@@ -37,7 +49,7 @@ function getMolecule$1(sdfPart, labels, currentLabels, options) {
|
|
|
37
49
|
let lines = parts[j].split(options.eol);
|
|
38
50
|
let from = lines[0].indexOf('<');
|
|
39
51
|
let to = lines[0].indexOf('>');
|
|
40
|
-
let label = lines[0].
|
|
52
|
+
let label = lines[0].slice(from + 1, to);
|
|
41
53
|
currentLabels.push(label);
|
|
42
54
|
if (!labels[label]) {
|
|
43
55
|
labels[label] = {
|
|
@@ -46,8 +58,8 @@ function getMolecule$1(sdfPart, labels, currentLabels, options) {
|
|
|
46
58
|
keep: false,
|
|
47
59
|
};
|
|
48
60
|
if (
|
|
49
|
-
(!options.exclude || options.exclude.
|
|
50
|
-
(!options.include || options.include.
|
|
61
|
+
(!options.exclude || !options.exclude.includes(label)) &&
|
|
62
|
+
(!options.include || options.include.includes(label))
|
|
51
63
|
) {
|
|
52
64
|
labels[label].keep = true;
|
|
53
65
|
if (options.modifiers[label]) {
|
|
@@ -74,10 +86,11 @@ function getMolecule$1(sdfPart, labels, currentLabels, options) {
|
|
|
74
86
|
molecule[label] = modifiedValue;
|
|
75
87
|
}
|
|
76
88
|
}
|
|
77
|
-
if (
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
89
|
+
if (
|
|
90
|
+
labels[label].isNumeric &&
|
|
91
|
+
(!Number.isFinite(+molecule[label]) || molecule[label].match(/^0[0-9]/))
|
|
92
|
+
) {
|
|
93
|
+
labels[label].isNumeric = false;
|
|
81
94
|
}
|
|
82
95
|
}
|
|
83
96
|
}
|
|
@@ -86,15 +99,15 @@ function getMolecule$1(sdfPart, labels, currentLabels, options) {
|
|
|
86
99
|
|
|
87
100
|
/**
|
|
88
101
|
* Parse a SDF file
|
|
89
|
-
* @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
|
|
102
|
+
* @param {string|ArrayBuffer|Uint8Array} sdf - SDF file to parse
|
|
90
103
|
* @param {object} [options={}]
|
|
91
|
-
* @param {string[]} [options.include] List of fields to include
|
|
92
|
-
* @param {string[]} [options.exclude] List of fields to exclude
|
|
93
|
-
* @param {Function} [options.filter] Callback allowing to filter the molecules
|
|
94
|
-
* @param {boolean} [options.dynamicTyping] Dynamically type the data
|
|
95
|
-
* @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
|
|
96
|
-
* @param {boolean} [options.mixedEOL=false] Set to true if you know there is a mixture between \r\n and \n
|
|
97
|
-
* @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
|
|
104
|
+
* @param {string[]} [options.include] - List of fields to include
|
|
105
|
+
* @param {string[]} [options.exclude] - List of fields to exclude
|
|
106
|
+
* @param {Function} [options.filter] - Callback allowing to filter the molecules
|
|
107
|
+
* @param {boolean} [options.dynamicTyping] - Dynamically type the data
|
|
108
|
+
* @param {object} [options.modifiers] - Object containing callbacks to apply on some specific fields
|
|
109
|
+
* @param {boolean} [options.mixedEOL=false] - Set to true if you know there is a mixture between \r\n and \n
|
|
110
|
+
* @param {string} [options.eol] - Specify the end of line character. Default will be the one found in the file
|
|
98
111
|
*/
|
|
99
112
|
function parse(sdf, options = {}) {
|
|
100
113
|
options = { ...options };
|
|
@@ -110,14 +123,14 @@ function parse(sdf, options = {}) {
|
|
|
110
123
|
if (options.eol === undefined) {
|
|
111
124
|
options.eol = '\n';
|
|
112
125
|
if (options.mixedEOL) {
  // Normalise every line ending to '\n': '\r\n' first, then any lone '\r'.
  sdf = sdf.replaceAll('\r\n', '\n');
  sdf = sdf.replaceAll('\r', '\n');
} else {
  // Detect the end of line from the beginning of the file with plain
  // substring searches, in order to be much faster and not use a regular
  // expression. The search has to run on the string itself: a Set built from
  // a string only contains single characters, so it can never hold '\r\n'
  // and CRLF files would be reported as using '\r'.
  const header = sdf.slice(0, 1000);
  if (header.includes('\r\n')) {
    options.eol = '\r\n';
  } else if (header.includes('\r')) {
    options.eol = '\r';
  }
}
|
|
@@ -134,7 +147,7 @@ function parse(sdf, options = {}) {
|
|
|
134
147
|
let start = Date.now();
|
|
135
148
|
|
|
136
149
|
for (let i = 0; i < entriesBoundaries.length; i++) {
|
|
137
|
-
let sdfPart = sdf.
|
|
150
|
+
let sdfPart = sdf.slice(...entriesBoundaries[i]);
|
|
138
151
|
|
|
139
152
|
let currentLabels = [];
|
|
140
153
|
const molecule = getMolecule$1(sdfPart, labels, currentLabels, options);
|
|
@@ -155,7 +168,7 @@ function parse(sdf, options = {}) {
|
|
|
155
168
|
currentLabel.maxValue = -Infinity;
|
|
156
169
|
for (let j = 0; j < molecules.length; j++) {
|
|
157
170
|
if (molecules[j][label]) {
|
|
158
|
-
let value = parseFloat(molecules[j][label]);
|
|
171
|
+
let value = Number.parseFloat(molecules[j][label]);
|
|
159
172
|
molecules[j][label] = value;
|
|
160
173
|
if (value > currentLabel.maxValue) {
|
|
161
174
|
currentLabel.maxValue = value;
|
|
@@ -192,57 +205,92 @@ function parse(sdf, options = {}) {
|
|
|
192
205
|
};
|
|
193
206
|
}
|
|
194
207
|
|
|
208
|
+
/**
 * Transform stream that splits SDF text into one string per record.
 *
 * Incoming string chunks are accumulated and cut at every `\n$$$$`
 * delimiter. A record never contains the delimiter itself, and the end of
 * line that terminates the delimiter line (`\n`, `\r\n` or `\r`) is removed
 * from the beginning of the following record, even when that end of line
 * arrives in a later chunk than the `$$$$` it belongs to. Text remaining
 * after the last delimiter is emitted when the stream closes, if not empty.
 */
class MolfileStream extends TransformStream {
  #buffer = '';

  // True once a delimiter has been consumed: the buffered text then starts
  // with the end of line of that delimiter line.
  #afterDelimiter = false;

  constructor() {
    super({
      transform: (chunk, controller) => {
        this.#buffer += chunk;
        let begin = 0;
        let index = 0;
        while ((index = this.#buffer.indexOf('\n$$$$', index)) !== -1) {
          controller.enqueue(
            this.#trimLeadingEol(this.#buffer.slice(begin, index)),
          );
          this.#afterDelimiter = true;
          // Skip '\n$$$$'; the end of line that follows it is stripped when
          // the next record is emitted, so it may still be missing here.
          index += 5;
          begin = index;
        }
        // Keep the unfinished record (possibly ending with a partial
        // delimiter) for the next chunk.
        this.#buffer = this.#buffer.slice(begin);
      },
      flush: (controller) => {
        const rest = this.#trimLeadingEol(this.#buffer);
        this.#buffer = '';
        if (rest) {
          controller.enqueue(rest);
        }
      },
    });
  }

  /**
   * Remove the end of line left over from the previous delimiter line.
   * @param {string} record - text located between two delimiters
   * @returns {string} the record without the leading end of line
   */
  #trimLeadingEol(record) {
    if (!this.#afterDelimiter) return record;
    return record.replace(/^(?:\r\n|\r|\n)/, '');
  }
}
|
|
235
|
+
|
|
195
236
|
/**
 * Asynchronously iterate over the molecules of a SDF text stream.
 * @param {ReadableStream} readStream - Stream of SDF text (string chunks)
 * @param {object} [options={}] - iterator options
 * @param {Function} [options.filter] - Callback allowing to filter the molecules
 * @param {string} [options.eol='\n'] - End of line character
 * @param {boolean} [options.dynamicTyping=true] - Dynamically type the data
 * @yields {object} - Molecule object
 */
async function* iterator(readStream, options = {}) {
  const { eol = '\n', dynamicTyping = true } = options;

  const moleculeStream = readStream.pipeThrough(new MolfileStream());
  for await (const molfile of moleculeStream) {
    // Ignore fragments that are too short to be an entry.
    if (molfile.length < 20) continue;
    const molecule = getMolecule(molfile, {
      eol,
      dynamicTyping,
    });
    // getMolecule returns undefined when the part before the first property
    // is too short; such entries must neither reach the filter callback nor
    // be yielded.
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      yield molecule;
    }
  }
}
|
|
222
260
|
|
|
261
|
+
/**
|
|
262
|
+
* Convert a SDF part to an object
|
|
263
|
+
* @param {string} sdfPart - text containing the molfile
|
|
264
|
+
* @param {object} options - options
|
|
265
|
+
* @param {string} options.eol - end of line character
|
|
266
|
+
* @param {boolean} options.dynamicTyping - Dynamically type the data (create numbers and booleans)
|
|
267
|
+
* @returns
|
|
268
|
+
*/
|
|
223
269
|
function getMolecule(sdfPart, options) {
|
|
224
|
-
|
|
270
|
+
const { eol, dynamicTyping: dynamicTyping$1 } = options;
|
|
271
|
+
let parts = sdfPart.split(`${eol}>`);
|
|
225
272
|
if (parts.length === 0 || parts[0].length <= 5) return;
|
|
226
273
|
let molecule = {};
|
|
227
|
-
molecule.molfile = parts[0] +
|
|
274
|
+
molecule.molfile = parts[0] + eol;
|
|
228
275
|
for (let j = 1; j < parts.length; j++) {
|
|
229
|
-
let lines = parts[j].split(
|
|
276
|
+
let lines = parts[j].split(eol);
|
|
230
277
|
let from = lines[0].indexOf('<');
|
|
231
278
|
let to = lines[0].indexOf('>');
|
|
232
|
-
let label = lines[0].
|
|
279
|
+
let label = lines[0].slice(from + 1, to);
|
|
233
280
|
for (let k = 1; k < lines.length - 1; k++) {
|
|
234
281
|
if (molecule[label]) {
|
|
235
|
-
molecule[label] +=
|
|
282
|
+
molecule[label] += eol + lines[k];
|
|
236
283
|
} else {
|
|
237
284
|
molecule[label] = lines[k];
|
|
238
285
|
}
|
|
239
286
|
}
|
|
240
|
-
if (
|
|
287
|
+
if (dynamicTyping$1) {
|
|
241
288
|
molecule[label] = dynamicTyping.parseString(molecule[label]);
|
|
242
289
|
}
|
|
243
290
|
}
|
|
244
291
|
return molecule;
|
|
245
292
|
}
|
|
246
293
|
|
|
294
|
+
exports.MolfileStream = MolfileStream;
|
|
247
295
|
exports.iterator = iterator;
|
|
248
296
|
exports.parse = parse;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "sdf-parser",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "7.0.2",
|
|
4
4
|
"description": "SDF parser",
|
|
5
5
|
"main": "lib/index.js",
|
|
6
6
|
"module": "src/index.js",
|
|
@@ -8,9 +8,6 @@
|
|
|
8
8
|
"lib",
|
|
9
9
|
"src"
|
|
10
10
|
],
|
|
11
|
-
"browser": {
|
|
12
|
-
"./src/iterator.js": "./src/iterator.browser.js"
|
|
13
|
-
},
|
|
14
11
|
"sideEffects": false,
|
|
15
12
|
"scripts": {
|
|
16
13
|
"build": "cheminfo-build --entry src/index.js --root SDFParser",
|
|
@@ -20,9 +17,8 @@
|
|
|
20
17
|
"prepack": "npm run compile",
|
|
21
18
|
"prettier": "prettier --check src",
|
|
22
19
|
"prettier-write": "prettier --write src",
|
|
23
|
-
"test": "npm run test-
|
|
24
|
-
"test-
|
|
25
|
-
"test-only": "jest"
|
|
20
|
+
"test": "npm run test-only && npm run eslint && npm run prettier",
|
|
21
|
+
"test-only": "vitest run --coverage"
|
|
26
22
|
},
|
|
27
23
|
"repository": {
|
|
28
24
|
"type": "git",
|
|
@@ -43,20 +39,21 @@
|
|
|
43
39
|
},
|
|
44
40
|
"homepage": "https://github.com/cheminfo/sdf-parser",
|
|
45
41
|
"devDependencies": {
|
|
46
|
-
"@babel/plugin-transform-modules-commonjs": "^7.
|
|
47
|
-
"@types/
|
|
42
|
+
"@babel/plugin-transform-modules-commonjs": "^7.26.3",
|
|
43
|
+
"@types/node": "^22.13.5",
|
|
44
|
+
"@vitest/coverage-v8": "^3.0.7",
|
|
48
45
|
"babel-eslint": "^10.1.0",
|
|
49
46
|
"callback-stream": "^1.1.0",
|
|
50
|
-
"cheminfo-build": "^1.1
|
|
51
|
-
"eslint": "^
|
|
52
|
-
"eslint-config-cheminfo": "^
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
"
|
|
56
|
-
"
|
|
47
|
+
"cheminfo-build": "^1.2.1",
|
|
48
|
+
"eslint": "^9.21.0",
|
|
49
|
+
"eslint-config-cheminfo": "^13.0.0",
|
|
50
|
+
"file-collection": "^1.0.0",
|
|
51
|
+
"openchemlib": "^8.18.1",
|
|
52
|
+
"prettier": "^3.5.2",
|
|
53
|
+
"vitest": "^3.0.7"
|
|
57
54
|
},
|
|
58
55
|
"dependencies": {
|
|
59
|
-
"dynamic-typing": "^1.0.
|
|
56
|
+
"dynamic-typing": "^1.0.1",
|
|
60
57
|
"ensure-string": "^1.2.0"
|
|
61
58
|
}
|
|
62
59
|
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
 * Transform stream that splits SDF text into one string per record.
 *
 * Incoming string chunks are accumulated and cut at every `\n$$$$`
 * delimiter. A record never contains the delimiter itself, and the end of
 * line that terminates the delimiter line (`\n`, `\r\n` or `\r`) is removed
 * from the beginning of the following record, even when that end of line
 * arrives in a later chunk than the `$$$$` it belongs to. Text remaining
 * after the last delimiter is emitted when the stream closes, if not empty.
 */
export class MolfileStream extends TransformStream {
  #buffer = '';

  // True once a delimiter has been consumed: the buffered text then starts
  // with the end of line of that delimiter line.
  #afterDelimiter = false;

  constructor() {
    super({
      transform: (chunk, controller) => {
        this.#buffer += chunk;
        let begin = 0;
        let index = 0;
        while ((index = this.#buffer.indexOf('\n$$$$', index)) !== -1) {
          controller.enqueue(
            this.#trimLeadingEol(this.#buffer.slice(begin, index)),
          );
          this.#afterDelimiter = true;
          // Skip '\n$$$$'; the end of line that follows it is stripped when
          // the next record is emitted, so it may still be missing here.
          index += 5;
          begin = index;
        }
        // Keep the unfinished record (possibly ending with a partial
        // delimiter) for the next chunk.
        this.#buffer = this.#buffer.slice(begin);
      },
      flush: (controller) => {
        const rest = this.#trimLeadingEol(this.#buffer);
        this.#buffer = '';
        if (rest) {
          controller.enqueue(rest);
        }
      },
    });
  }

  /**
   * Remove the end of line left over from the previous delimiter line.
   * @param {string} record - text located between two delimiters
   * @returns {string} the record without the leading end of line
   */
  #trimLeadingEol(record) {
    if (!this.#afterDelimiter) return record;
    return record.replace(/^(?:\r\n|\r|\n)/, '');
  }
}
|
package/src/index.js
CHANGED
package/src/iterator.js
CHANGED
|
@@ -1,52 +1,59 @@
|
|
|
1
|
-
import { createInterface } from 'readline';
|
|
2
|
-
|
|
3
1
|
import { parseString } from 'dynamic-typing';
|
|
2
|
+
|
|
3
|
+
import { MolfileStream } from './MolfileStream.js';
|
|
4
|
+
|
|
4
5
|
/**
 * Asynchronously iterate over the molecules of a SDF text stream.
 * @param {ReadableStream} readStream - Stream of SDF text (string chunks)
 * @param {object} [options={}] - iterator options
 * @param {Function} [options.filter] - Callback allowing to filter the molecules
 * @param {string} [options.eol='\n'] - End of line character
 * @param {boolean} [options.dynamicTyping=true] - Dynamically type the data
 * @yields {object} - Molecule object
 */
export async function* iterator(readStream, options = {}) {
  const { eol = '\n', dynamicTyping = true } = options;

  const moleculeStream = readStream.pipeThrough(new MolfileStream());
  for await (const molfile of moleculeStream) {
    // Ignore fragments that are too short to be an entry.
    if (molfile.length < 20) continue;
    const molecule = getMolecule(molfile, {
      eol,
      dynamicTyping,
    });
    // getMolecule returns undefined when the part before the first property
    // is too short; such entries must neither reach the filter callback nor
    // be yielded.
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      yield molecule;
    }
  }
}
|
|
31
29
|
|
|
30
|
+
/**
|
|
31
|
+
* Convert a SDF part to an object
|
|
32
|
+
* @param {string} sdfPart - text containing the molfile
|
|
33
|
+
* @param {object} options - options
|
|
34
|
+
* @param {string} options.eol - end of line character
|
|
35
|
+
* @param {boolean} options.dynamicTyping - Dynamically type the data (create numbers and booleans)
|
|
36
|
+
* @returns
|
|
37
|
+
*/
|
|
32
38
|
function getMolecule(sdfPart, options) {
|
|
33
|
-
|
|
39
|
+
const { eol, dynamicTyping } = options;
|
|
40
|
+
let parts = sdfPart.split(`${eol}>`);
|
|
34
41
|
if (parts.length === 0 || parts[0].length <= 5) return;
|
|
35
42
|
let molecule = {};
|
|
36
|
-
molecule.molfile = parts[0] +
|
|
43
|
+
molecule.molfile = parts[0] + eol;
|
|
37
44
|
for (let j = 1; j < parts.length; j++) {
|
|
38
|
-
let lines = parts[j].split(
|
|
45
|
+
let lines = parts[j].split(eol);
|
|
39
46
|
let from = lines[0].indexOf('<');
|
|
40
47
|
let to = lines[0].indexOf('>');
|
|
41
|
-
let label = lines[0].
|
|
48
|
+
let label = lines[0].slice(from + 1, to);
|
|
42
49
|
for (let k = 1; k < lines.length - 1; k++) {
|
|
43
50
|
if (molecule[label]) {
|
|
44
|
-
molecule[label] +=
|
|
51
|
+
molecule[label] += eol + lines[k];
|
|
45
52
|
} else {
|
|
46
53
|
molecule[label] = lines[k];
|
|
47
54
|
}
|
|
48
55
|
}
|
|
49
|
-
if (
|
|
56
|
+
if (dynamicTyping) {
|
|
50
57
|
molecule[label] = parseString(molecule[label]);
|
|
51
58
|
}
|
|
52
59
|
}
|
package/src/parse.js
CHANGED
|
@@ -4,15 +4,15 @@ import { getEntriesBoundaries } from './getEntriesBoundaries';
|
|
|
4
4
|
import { getMolecule } from './util/getMolecule';
|
|
5
5
|
/**
|
|
6
6
|
* Parse a SDF file
|
|
7
|
-
* @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
|
|
7
|
+
* @param {string|ArrayBuffer|Uint8Array} sdf - SDF file to parse
|
|
8
8
|
* @param {object} [options={}]
|
|
9
|
-
* @param {string[]} [options.include] List of fields to include
|
|
10
|
-
* @param {string[]} [options.exclude] List of fields to exclude
|
|
11
|
-
* @param {Function} [options.filter] Callback allowing to filter the molecules
|
|
12
|
-
* @param {boolean} [options.dynamicTyping] Dynamically type the data
|
|
13
|
-
* @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
|
|
14
|
-
* @param {boolean} [options.mixedEOL=false] Set to true if you know there is a mixture between \r\n and \n
|
|
15
|
-
* @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
|
|
9
|
+
* @param {string[]} [options.include] - List of fields to include
|
|
10
|
+
* @param {string[]} [options.exclude] - List of fields to exclude
|
|
11
|
+
* @param {Function} [options.filter] - Callback allowing to filter the molecules
|
|
12
|
+
* @param {boolean} [options.dynamicTyping] - Dynamically type the data
|
|
13
|
+
* @param {object} [options.modifiers] - Object containing callbacks to apply on some specific fields
|
|
14
|
+
* @param {boolean} [options.mixedEOL=false] - Set to true if you know there is a mixture between \r\n and \n
|
|
15
|
+
* @param {string} [options.eol] - Specify the end of line character. Default will be the one found in the file
|
|
16
16
|
*/
|
|
17
17
|
export function parse(sdf, options = {}) {
|
|
18
18
|
options = { ...options };
|
|
@@ -28,14 +28,14 @@ export function parse(sdf, options = {}) {
|
|
|
28
28
|
if (options.eol === undefined) {
|
|
29
29
|
options.eol = '\n';
|
|
30
30
|
if (options.mixedEOL) {
  // Normalise every line ending to '\n': '\r\n' first, then any lone '\r'.
  sdf = sdf.replaceAll('\r\n', '\n');
  sdf = sdf.replaceAll('\r', '\n');
} else {
  // Detect the end of line from the beginning of the file with plain
  // substring searches, in order to be much faster and not use a regular
  // expression. The search has to run on the string itself: a Set built from
  // a string only contains single characters, so it can never hold '\r\n'
  // and CRLF files would be reported as using '\r'.
  const header = sdf.slice(0, 1000);
  if (header.includes('\r\n')) {
    options.eol = '\r\n';
  } else if (header.includes('\r')) {
    options.eol = '\r';
  }
}
|
|
@@ -52,7 +52,7 @@ export function parse(sdf, options = {}) {
|
|
|
52
52
|
let start = Date.now();
|
|
53
53
|
|
|
54
54
|
for (let i = 0; i < entriesBoundaries.length; i++) {
|
|
55
|
-
let sdfPart = sdf.
|
|
55
|
+
let sdfPart = sdf.slice(...entriesBoundaries[i]);
|
|
56
56
|
|
|
57
57
|
let currentLabels = [];
|
|
58
58
|
const molecule = getMolecule(sdfPart, labels, currentLabels, options);
|
|
@@ -73,7 +73,7 @@ export function parse(sdf, options = {}) {
|
|
|
73
73
|
currentLabel.maxValue = -Infinity;
|
|
74
74
|
for (let j = 0; j < molecules.length; j++) {
|
|
75
75
|
if (molecules[j][label]) {
|
|
76
|
-
let value = parseFloat(molecules[j][label]);
|
|
76
|
+
let value = Number.parseFloat(molecules[j][label]);
|
|
77
77
|
molecules[j][label] = value;
|
|
78
78
|
if (value > currentLabel.maxValue) {
|
|
79
79
|
currentLabel.maxValue = value;
|
package/src/util/getMolecule.js
CHANGED
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Parse the molfile and the properties with > < labels >
|
|
3
|
+
* @param {string} sdfPart
|
|
4
|
+
* @param {*} labels
|
|
5
|
+
* @param {*} currentLabels
|
|
6
|
+
* @param {object} options
|
|
7
|
+
* @returns
|
|
8
|
+
*/
|
|
1
9
|
export function getMolecule(sdfPart, labels, currentLabels, options) {
|
|
2
10
|
let parts = sdfPart.split(`${options.eol}>`);
|
|
3
11
|
if (parts.length === 0 || parts[0].length <= 5) return;
|
|
@@ -7,7 +15,7 @@ export function getMolecule(sdfPart, labels, currentLabels, options) {
|
|
|
7
15
|
let lines = parts[j].split(options.eol);
|
|
8
16
|
let from = lines[0].indexOf('<');
|
|
9
17
|
let to = lines[0].indexOf('>');
|
|
10
|
-
let label = lines[0].
|
|
18
|
+
let label = lines[0].slice(from + 1, to);
|
|
11
19
|
currentLabels.push(label);
|
|
12
20
|
if (!labels[label]) {
|
|
13
21
|
labels[label] = {
|
|
@@ -16,8 +24,8 @@ export function getMolecule(sdfPart, labels, currentLabels, options) {
|
|
|
16
24
|
keep: false,
|
|
17
25
|
};
|
|
18
26
|
if (
|
|
19
|
-
(!options.exclude || options.exclude.
|
|
20
|
-
(!options.include || options.include.
|
|
27
|
+
(!options.exclude || !options.exclude.includes(label)) &&
|
|
28
|
+
(!options.include || options.include.includes(label))
|
|
21
29
|
) {
|
|
22
30
|
labels[label].keep = true;
|
|
23
31
|
if (options.modifiers[label]) {
|
|
@@ -44,10 +52,11 @@ export function getMolecule(sdfPart, labels, currentLabels, options) {
|
|
|
44
52
|
molecule[label] = modifiedValue;
|
|
45
53
|
}
|
|
46
54
|
}
|
|
47
|
-
if (
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
55
|
+
if (
|
|
56
|
+
labels[label].isNumeric &&
|
|
57
|
+
(!Number.isFinite(+molecule[label]) || molecule[label].match(/^0[0-9]/))
|
|
58
|
+
) {
|
|
59
|
+
labels[label].isNumeric = false;
|
|
51
60
|
}
|
|
52
61
|
}
|
|
53
62
|
}
|
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
2
|
-
|
|
3
|
-
import { parse } from '..';
|
|
4
|
-
|
|
5
|
-
let sdf = fs.readFileSync(`${__dirname}/test.sdf`, 'utf-8');
|
|
6
|
-
|
|
7
|
-
describe('SDF Parser options', () => {
|
|
8
|
-
let result = parse(sdf, {
|
|
9
|
-
exclude: ['Number of H-Donors'],
|
|
10
|
-
include: ['Number of H-Donors', 'CLogP', 'Code'],
|
|
11
|
-
modifiers: {
|
|
12
|
-
CLogP: (field) => {
|
|
13
|
-
return {
|
|
14
|
-
low: field * 1 - 0.2,
|
|
15
|
-
high: field * 1 + 0.2,
|
|
16
|
-
};
|
|
17
|
-
},
|
|
18
|
-
},
|
|
19
|
-
filter: (entry) => {
|
|
20
|
-
return entry.CLogP && entry.CLogP.low > 4;
|
|
21
|
-
},
|
|
22
|
-
});
|
|
23
|
-
|
|
24
|
-
it('Check statistics', () => {
|
|
25
|
-
expect(result.statistics[0].counter).toBe(43);
|
|
26
|
-
expect(result.statistics[0].isNumeric).toBe(false);
|
|
27
|
-
expect(result.statistics[0].label).toBe('Code');
|
|
28
|
-
expect(result.statistics[0].always).toBe(true);
|
|
29
|
-
expect(result.statistics[4].counter).toBe(43);
|
|
30
|
-
expect(result.statistics[4].isNumeric).toBe(false);
|
|
31
|
-
expect(result.statistics[4].label).toBe('CLogP');
|
|
32
|
-
expect(result.statistics[4].always).toBe(true);
|
|
33
|
-
});
|
|
34
|
-
|
|
35
|
-
it('Check molecules', () => {
|
|
36
|
-
expect(result.molecules).toHaveLength(43);
|
|
37
|
-
let molecule = result.molecules[0];
|
|
38
|
-
|
|
39
|
-
expect(Object.keys(molecule)).toHaveLength(3);
|
|
40
|
-
expect(molecule.Code).toBe('0100380851');
|
|
41
|
-
expect(molecule.CLogP.low).toBeCloseTo(4.8, 0.0001);
|
|
42
|
-
expect(molecule.CLogP.high).toBeCloseTo(5.2, 0.0001);
|
|
43
|
-
expect(molecule.molfile.split('\n')).toHaveLength(56);
|
|
44
|
-
});
|
|
45
|
-
|
|
46
|
-
it('should throw with non-string argument', () => {
|
|
47
|
-
expect(() => {
|
|
48
|
-
parse();
|
|
49
|
-
}).toThrow(TypeError);
|
|
50
|
-
expect(() => {
|
|
51
|
-
parse(42);
|
|
52
|
-
}).toThrow(TypeError);
|
|
53
|
-
expect(() => {
|
|
54
|
-
parse({});
|
|
55
|
-
}).toThrow(TypeError);
|
|
56
|
-
});
|
|
57
|
-
});
|