sdf-parser 5.0.0 → 6.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -30
- package/lib/index.js +160 -133
- package/package.json +11 -11
- package/src/__tests__/__snapshots__/getEntriesBoundaries.test.js.snap +10 -0
- package/src/__tests__/getEntriesBoundaries.test.js +7 -0
- package/src/__tests__/iterator.test.js +120 -0
- package/src/__tests__/test.sdf.gz +0 -0
- package/src/__tests__/test4.sdf +37 -0
- package/src/getEntriesBoundaries.js +7 -2
- package/src/index.js +1 -1
- package/src/iterator.js +54 -0
- package/src/parse.js +49 -86
- package/src/util/getMolecule.js +55 -0
- package/src/__tests__/stream.test.js +0 -98
- package/src/stream.js +0 -44
package/README.md
CHANGED
|
@@ -57,39 +57,18 @@ var result = parse(sdf, {
|
|
|
57
57
|
});
|
|
58
58
|
```
|
|
59
59
|
|
|
60
|
-
##
|
|
60
|
+
## Iterator
|
|
61
61
|
|
|
62
62
|
This API is only available on Node.js.
|
|
63
63
|
|
|
64
|
-
### molecules(options)
|
|
65
|
-
|
|
66
|
-
Transform an input text stream to a stream of molecule objects.
|
|
67
|
-
|
|
68
|
-
#### options
|
|
69
|
-
|
|
70
|
-
- `fullResult`: true to emit the full result of `parse` instead of just the molecules.
|
|
71
|
-
- All other options from the `parse` function.
|
|
72
|
-
|
|
73
|
-
```js
|
|
74
|
-
const { stream } = require('sdf-parser');
|
|
75
|
-
fs.createReadStream('test.sdf')
|
|
76
|
-
.pipe(stream.molecules())
|
|
77
|
-
.on('data', (molecule) => {
|
|
78
|
-
console.log(molecule.molfile);
|
|
79
|
-
});
|
|
80
|
-
```
|
|
81
|
-
|
|
82
|
-
### entries()
|
|
83
|
-
|
|
84
|
-
Transform an input text stream to a stream of sdf entries.
|
|
85
|
-
|
|
86
64
|
```js
|
|
87
|
-
const {
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
65
|
+
const { iterator } = require('sdf-parser');
|
|
66
|
+
const readStream = createReadStream(join(__dirname, 'test.sdf.gz'));
|
|
67
|
+
const stream = readStream.pipe(createGunzip());
|
|
68
|
+
const results = [];
|
|
69
|
+
for await (const entry of iterator(stream)) {
|
|
70
|
+
results.push(entry);
|
|
71
|
+
}
|
|
93
72
|
```
|
|
94
73
|
|
|
95
74
|
## License
|
|
@@ -98,7 +77,7 @@ fs.createReadStream('test.sdf')
|
|
|
98
77
|
|
|
99
78
|
[npm-image]: https://img.shields.io/npm/v/sdf-parser.svg?style=flat-square
|
|
100
79
|
[npm-url]: https://www.npmjs.com/package/sdf-parser
|
|
101
|
-
[travis-image]: https://img.shields.io/travis/cheminfo/sdf-parser/
|
|
80
|
+
[travis-image]: https://img.shields.io/travis/cheminfo/sdf-parser/main.svg?style=flat-square
|
|
102
81
|
[travis-url]: https://travis-ci.org/cheminfo/sdf-parser
|
|
103
82
|
[download-image]: https://img.shields.io/npm/dm/sdf-parser.svg?style=flat-square
|
|
104
83
|
[download-url]: https://www.npmjs.com/package/sdf-parser
|
package/lib/index.js
CHANGED
|
@@ -2,17 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
Object.defineProperty(exports, '__esModule', { value: true });
|
|
4
4
|
|
|
5
|
-
var
|
|
6
|
-
var
|
|
7
|
-
var
|
|
8
|
-
var filter = require('through2-filter');
|
|
9
|
-
|
|
10
|
-
function _interopDefaultLegacy (e) { return e && typeof e === 'object' && 'default' in e ? e : { 'default': e }; }
|
|
11
|
-
|
|
12
|
-
var pipeline__default = /*#__PURE__*/_interopDefaultLegacy(pipeline);
|
|
13
|
-
var split2__default = /*#__PURE__*/_interopDefaultLegacy(split2);
|
|
14
|
-
var through2__default = /*#__PURE__*/_interopDefaultLegacy(through2);
|
|
15
|
-
var filter__default = /*#__PURE__*/_interopDefaultLegacy(filter);
|
|
5
|
+
var ensureString = require('ensure-string');
|
|
6
|
+
var readline = require('readline');
|
|
7
|
+
var dynamicTyping = require('dynamic-typing');
|
|
16
8
|
|
|
17
9
|
function getEntriesBoundaries(string, substring, eol) {
|
|
18
10
|
const res = [];
|
|
@@ -22,8 +14,13 @@ function getEntriesBoundaries(string, substring, eol) {
|
|
|
22
14
|
next = string.indexOf(substring, previous);
|
|
23
15
|
if (next !== -1) {
|
|
24
16
|
res.push([previous, next]);
|
|
25
|
-
|
|
26
|
-
|
|
17
|
+
const nextMatch = string.indexOf(eol, next + substring.length);
|
|
18
|
+
if (nextMatch === -1) {
|
|
19
|
+
next = -1;
|
|
20
|
+
} else {
|
|
21
|
+
previous = nextMatch + eol.length;
|
|
22
|
+
next = previous;
|
|
23
|
+
}
|
|
27
24
|
} else {
|
|
28
25
|
res.push([previous, string.length]);
|
|
29
26
|
}
|
|
@@ -31,35 +28,106 @@ function getEntriesBoundaries(string, substring, eol) {
|
|
|
31
28
|
return res;
|
|
32
29
|
}
|
|
33
30
|
|
|
31
|
+
/**
 * Build a molecule object from the text of one SDF entry and record the
 * data-item labels it contains.
 *
 * @param {string} sdfPart Text of a single entry (molfile followed by `>  <Label>` data items).
 * @param {object} labels Registry of labels seen so far; a record is created here the first time a label is met.
 * @param {string[]} currentLabels Receives every label found in this entry, in order of appearance.
 * @param {object} options Normalized options: `eol`, `dynamicTyping`, `modifiers`, `forEach` and, optionally, `include` / `exclude`.
 * @returns {object|undefined} The molecule (`molfile` plus the kept fields), or `undefined` when the molfile part is 5 characters or fewer.
 */
function getMolecule$1(sdfPart, labels, currentLabels, options) {
  const { eol } = options;
  const parts = sdfPart.split(`${eol}>`);
  // parts[0] is the molfile; a part of 5 characters or fewer is not a usable entry.
  if (parts[0].length <= 5) return;

  const molecule = { molfile: parts[0] + eol };
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(eol);
    // The label is the text between the first '<' and the first '>' of the header line.
    const from = lines[0].indexOf('<');
    const to = lines[0].indexOf('>');
    const label = lines[0].substring(from + 1, to);
    currentLabels.push(label);

    if (!labels[label]) {
      labels[label] = {
        counter: 0,
        isNumeric: options.dynamicTyping,
        keep: false,
      };
      if (
        (!options.exclude || options.exclude.indexOf(label) === -1) &&
        (!options.include || options.include.indexOf(label) > -1)
      ) {
        labels[label].keep = true;
        if (options.modifiers[label]) {
          labels[label].modifier = options.modifiers[label];
        }
        if (options.forEach[label]) {
          labels[label].forEach = options.forEach[label];
        }
      }
    }

    const info = labels[label];
    if (!info.keep) continue;

    // Copy the value lines; the last element of `lines` is never copied.
    for (let k = 1; k < lines.length - 1; k++) {
      if (molecule[label]) {
        molecule[label] += eol + lines[k];
      } else {
        molecule[label] = lines[k];
      }
    }

    if (info.modifier) {
      const modifiedValue = info.modifier(molecule[label]);
      if (modifiedValue === undefined || modifiedValue === null) {
        delete molecule[label];
      } else {
        molecule[label] = modifiedValue;
      }
    }

    if (info.isNumeric) {
      // A modifier may have returned something that is not a string, so the
      // leading-zero test must only be applied to strings (calling `.match`
      // on a number would throw).
      const value = molecule[label];
      const isNumericText =
        typeof value === 'string' &&
        isFinite(value) &&
        !/^0[0-9]/.test(value);
      const isFiniteNumber = typeof value === 'number' && isFinite(value);
      if (!isNumericText && !isFiniteNumber) {
        info.isNumeric = false;
      }
    }
  }
  return molecule;
}
|
|
86
|
+
|
|
87
|
+
/**
|
|
88
|
+
* Parse an SDF file
|
|
89
|
+
* @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
|
|
90
|
+
* @param {object} [options={}]
|
|
91
|
+
* @param {string[]} [options.include] List of fields to include
|
|
92
|
+
* @param {string[]} [options.exclude] List of fields to exclude
|
|
93
|
+
* @param {Function} [options.filter] Callback allowing to filter the molecules
|
|
94
|
+
* @param {boolean} [options.dynamicTyping] Dynamically type the data
|
|
95
|
+
* @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
|
|
96
|
+
* @param {boolean} [options.mixedEOL=false] Set to true if you know the file contains a mixture of \r\n and \n line endings
|
|
97
|
+
* @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
|
|
98
|
+
*/
|
|
34
99
|
function parse(sdf, options = {}) {
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
modifiers = {},
|
|
40
|
-
forEach = {},
|
|
41
|
-
dynamicTyping = true,
|
|
42
|
-
} = options;
|
|
100
|
+
options = { ...options };
|
|
101
|
+
if (options.modifiers === undefined) options.modifiers = {};
|
|
102
|
+
if (options.forEach === undefined) options.forEach = {};
|
|
103
|
+
if (options.dynamicTyping === undefined) options.dynamicTyping = true;
|
|
43
104
|
|
|
105
|
+
sdf = ensureString.ensureString(sdf);
|
|
44
106
|
if (typeof sdf !== 'string') {
|
|
45
107
|
throw new TypeError('Parameter "sdf" must be a string');
|
|
46
108
|
}
|
|
47
109
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
110
|
+
if (options.eol === undefined) {
|
|
111
|
+
options.eol = '\n';
|
|
112
|
+
if (options.mixedEOL) {
|
|
113
|
+
sdf = sdf.replace(/\r\n/g, '\n');
|
|
114
|
+
sdf = sdf.replace(/\r/g, '\n');
|
|
115
|
+
} else {
|
|
116
|
+
// we will find the delimiter in order to be much faster and not use regular expression
|
|
117
|
+
let header = sdf.substr(0, 1000);
|
|
118
|
+
if (header.indexOf('\r\n') > -1) {
|
|
119
|
+
options.eol = '\r\n';
|
|
120
|
+
} else if (header.indexOf('\r') > -1) {
|
|
121
|
+
options.eol = '\r';
|
|
122
|
+
}
|
|
59
123
|
}
|
|
60
124
|
}
|
|
61
125
|
|
|
62
|
-
let entriesBoundaries = getEntriesBoundaries(
|
|
126
|
+
let entriesBoundaries = getEntriesBoundaries(
|
|
127
|
+
sdf,
|
|
128
|
+
`${options.eol}$$$$`,
|
|
129
|
+
options.eol,
|
|
130
|
+
);
|
|
63
131
|
let molecules = [];
|
|
64
132
|
let labels = {};
|
|
65
133
|
|
|
@@ -67,72 +135,18 @@ function parse(sdf, options = {}) {
|
|
|
67
135
|
|
|
68
136
|
for (let i = 0; i < entriesBoundaries.length; i++) {
|
|
69
137
|
let sdfPart = sdf.substring(...entriesBoundaries[i]);
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
let label = lines[0].substring(from + 1, to);
|
|
80
|
-
currentLabels.push(label);
|
|
81
|
-
if (!labels[label]) {
|
|
82
|
-
labels[label] = {
|
|
83
|
-
counter: 0,
|
|
84
|
-
isNumeric: dynamicTyping,
|
|
85
|
-
keep: false,
|
|
86
|
-
};
|
|
87
|
-
if (
|
|
88
|
-
(!exclude || exclude.indexOf(label) === -1) &&
|
|
89
|
-
(!include || include.indexOf(label) > -1)
|
|
90
|
-
) {
|
|
91
|
-
labels[label].keep = true;
|
|
92
|
-
if (modifiers[label]) {
|
|
93
|
-
labels[label].modifier = modifiers[label];
|
|
94
|
-
}
|
|
95
|
-
if (forEach[label]) {
|
|
96
|
-
labels[label].forEach = forEach[label];
|
|
97
|
-
}
|
|
98
|
-
}
|
|
99
|
-
}
|
|
100
|
-
if (labels[label].keep) {
|
|
101
|
-
for (let k = 1; k < lines.length - 1; k++) {
|
|
102
|
-
if (molecule[label]) {
|
|
103
|
-
molecule[label] += eol + lines[k];
|
|
104
|
-
} else {
|
|
105
|
-
molecule[label] = lines[k];
|
|
106
|
-
}
|
|
107
|
-
}
|
|
108
|
-
if (labels[label].modifier) {
|
|
109
|
-
let modifiedValue = labels[label].modifier(molecule[label]);
|
|
110
|
-
if (modifiedValue === undefined || modifiedValue === null) {
|
|
111
|
-
delete molecule[label];
|
|
112
|
-
} else {
|
|
113
|
-
molecule[label] = modifiedValue;
|
|
114
|
-
}
|
|
115
|
-
}
|
|
116
|
-
if (labels[label].isNumeric) {
|
|
117
|
-
if (
|
|
118
|
-
!isFinite(molecule[label]) ||
|
|
119
|
-
molecule[label].match(/^0[0-9]/)
|
|
120
|
-
) {
|
|
121
|
-
labels[label].isNumeric = false;
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
}
|
|
125
|
-
}
|
|
126
|
-
if (!filter || filter(molecule)) {
|
|
127
|
-
molecules.push(molecule);
|
|
128
|
-
// only now we can increase the counter
|
|
129
|
-
for (let j = 0; j < currentLabels.length; j++) {
|
|
130
|
-
labels[currentLabels[j]].counter++;
|
|
131
|
-
}
|
|
138
|
+
|
|
139
|
+
let currentLabels = [];
|
|
140
|
+
const molecule = getMolecule$1(sdfPart, labels, currentLabels, options);
|
|
141
|
+
if (!molecule) continue;
|
|
142
|
+
if (!options.filter || options.filter(molecule)) {
|
|
143
|
+
molecules.push(molecule);
|
|
144
|
+
// only now we can increase the counter
|
|
145
|
+
for (let j = 0; j < currentLabels.length; j++) {
|
|
146
|
+
labels[currentLabels[j]].counter++;
|
|
132
147
|
}
|
|
133
148
|
}
|
|
134
149
|
}
|
|
135
|
-
|
|
136
150
|
// all numeric fields should be converted to numbers
|
|
137
151
|
for (let label in labels) {
|
|
138
152
|
let currentLabel = labels[label];
|
|
@@ -172,50 +186,63 @@ function parse(sdf, options = {}) {
|
|
|
172
186
|
|
|
173
187
|
return {
|
|
174
188
|
time: Date.now() - start,
|
|
175
|
-
molecules
|
|
189
|
+
molecules,
|
|
176
190
|
labels: Object.keys(labels),
|
|
177
|
-
statistics
|
|
191
|
+
statistics,
|
|
178
192
|
};
|
|
179
193
|
}
|
|
180
194
|
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
}
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
195
|
+
/**
 * Asynchronously iterate over the molecules contained in an SDF text stream.
 *
 * The stream is read line by line; every line starting with `$$$$` closes the
 * current entry, which is then converted with `getMolecule`.
 *
 * @param {import('stream').Readable} readStream Readable stream of SDF text, used as readline input
 * @param {object} [options={}]
 * @param {Function} [options.filter] Callback allowing to filter the molecules
 * @param {boolean} [options.dynamicTyping=true] Dynamically type the data
 * @yields {object} One molecule object per accepted entry
 */

async function* iterator(readStream, options = {}) {
  // crlfDelay: Infinity makes readline always treat \r\n as one line break.
  const lines = readline.createInterface({
    input: readStream,
    crlfDelay: Infinity,
  });
  const currentLines = [];
  options = { ...options };
  if (options.dynamicTyping === undefined) options.dynamicTyping = true;

  // Lines are re-joined with '\n' below, so this is the only EOL getMolecule sees.
  options.eol = '\n';
  for await (const line of lines) {
    if (!line.startsWith('$$$$')) {
      currentLines.push(line);
      continue;
    }
    const molecule = getMolecule(currentLines.join(options.eol), options);
    currentLines.length = 0;
    // getMolecule returns undefined for entries without a usable molfile:
    // skip them instead of yielding undefined or handing it to the filter.
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      yield molecule;
    }
  }
}
|
|
197
222
|
|
|
198
|
-
function
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
callback(e);
|
|
223
|
+
/**
 * Build a molecule object from the text of one SDF entry.
 *
 * @param {string} sdfPart Text of a single entry, lines joined with `options.eol`.
 * @param {object} options
 * @param {string} options.eol Line separator used inside `sdfPart`.
 * @param {boolean} [options.dynamicTyping] When truthy, string values go through `parseString`.
 * @returns {object|undefined} Object with `molfile` and one property per data item, or `undefined` when the molfile part is 5 characters or fewer.
 */
function getMolecule(sdfPart, options) {
  const { eol } = options;
  const parts = sdfPart.split(`${eol}>`);
  // parts[0] is the molfile; a part of 5 characters or fewer is not a usable entry.
  if (parts[0].length <= 5) return;

  const molecule = { molfile: parts[0] + eol };
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(eol);
    // The label is the text between the first '<' and the first '>' of the header line.
    const from = lines[0].indexOf('<');
    const to = lines[0].indexOf('>');
    const label = lines[0].substring(from + 1, to);

    // Copy the value lines; the last element of `lines` is never copied.
    for (let k = 1; k < lines.length - 1; k++) {
      if (molecule[label]) {
        molecule[label] += eol + lines[k];
      } else {
        molecule[label] = lines[k];
      }
    }

    // A data item without value lines leaves molecule[label] unset; only
    // strings are handed to parseString so that such items do not throw.
    if (options.dynamicTyping && typeof molecule[label] === 'string') {
      molecule[label] = dynamicTyping.parseString(molecule[label]);
    }
  }
  return molecule;
}
|
|
218
246
|
|
|
219
|
-
exports.
|
|
220
|
-
exports.molecules = molecules;
|
|
247
|
+
exports.iterator = iterator;
|
|
221
248
|
exports.parse = parse;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "sdf-parser",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "6.0.0",
|
|
4
4
|
"description": "SDF parser",
|
|
5
5
|
"main": "lib/index.js",
|
|
6
6
|
"module": "src/index.js",
|
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
"lib",
|
|
9
9
|
"src"
|
|
10
10
|
],
|
|
11
|
+
"sideEffects": false,
|
|
11
12
|
"scripts": {
|
|
12
13
|
"build": "npm run compile && cheminfo-build --root SDFParser",
|
|
13
14
|
"compile": "rollup -c",
|
|
@@ -42,20 +43,19 @@
|
|
|
42
43
|
},
|
|
43
44
|
"homepage": "https://github.com/cheminfo/sdf-parser",
|
|
44
45
|
"devDependencies": {
|
|
45
|
-
"@babel/plugin-transform-modules-commonjs": "^7.
|
|
46
|
+
"@babel/plugin-transform-modules-commonjs": "^7.18.6",
|
|
46
47
|
"babel-eslint": "^10.1.0",
|
|
47
48
|
"callback-stream": "^1.1.0",
|
|
48
49
|
"cheminfo-build": "^1.1.11",
|
|
49
|
-
"eslint": "^
|
|
50
|
-
"eslint-config-cheminfo": "^
|
|
51
|
-
"
|
|
52
|
-
"
|
|
53
|
-
"
|
|
50
|
+
"eslint": "^8.22.0",
|
|
51
|
+
"eslint-config-cheminfo": "^8.0.2",
|
|
52
|
+
"filelist-utils": "^0.6.0",
|
|
53
|
+
"jest": "^28.1.3",
|
|
54
|
+
"openchemlib": "^8.0.1",
|
|
55
|
+
"prettier": "^2.7.1"
|
|
54
56
|
},
|
|
55
57
|
"dependencies": {
|
|
56
|
-
"
|
|
57
|
-
"
|
|
58
|
-
"through2": "^4.0.2",
|
|
59
|
-
"through2-filter": "^3.0.0"
|
|
58
|
+
"dynamic-typing": "^1.0.0",
|
|
59
|
+
"ensure-string": "^1.2.0"
|
|
60
60
|
}
|
|
61
61
|
}
|
|
@@ -24,3 +24,10 @@ let sdf2 = fs.readFileSync(`${__dirname}/test2.sdf`, 'utf-8');
|
|
|
24
24
|
);
|
|
25
25
|
});
|
|
26
26
|
});
|
|
27
|
+
|
|
28
|
+
test('should parse sdf files without EOL in the EOF', () => {
|
|
29
|
+
const eol = '\n';
|
|
30
|
+
const sdf = fs.readFileSync(`${__dirname}/test4.sdf`, 'utf-8');
|
|
31
|
+
|
|
32
|
+
expect(getEntriesBoundaries(sdf, `${eol}$$$$`, eol)).toMatchSnapshot();
|
|
33
|
+
});
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import { createReadStream } from 'fs';
|
|
2
|
+
import { join } from 'path';
|
|
3
|
+
import { createGunzip } from 'zlib';
|
|
4
|
+
|
|
5
|
+
import { fileListFromPath } from 'filelist-utils';
|
|
6
|
+
|
|
7
|
+
import { iterator } from '../iterator';
|
|
8
|
+
|
|
9
|
+
test('iterator', async () => {
|
|
10
|
+
const fileList = (await fileListFromPath(join(__dirname, '.'))).filter(
|
|
11
|
+
(file) => file.name === 'test.sdf',
|
|
12
|
+
);
|
|
13
|
+
const results = [];
|
|
14
|
+
for await (const entry of iterator(fileList[0].stream())) {
|
|
15
|
+
results.push(entry);
|
|
16
|
+
}
|
|
17
|
+
expect(results).toHaveLength(128);
|
|
18
|
+
expect(results[0]).toMatchInlineSnapshot(`
|
|
19
|
+
Object {
|
|
20
|
+
"CLogP": 2.7,
|
|
21
|
+
"Code": 100380824,
|
|
22
|
+
"Number of H-Acceptors": 3,
|
|
23
|
+
"Number of H-Donors": 1,
|
|
24
|
+
"Number of Rotatable bonds": 1,
|
|
25
|
+
"molfile": "
|
|
26
|
+
-ISIS- 04231216572D
|
|
27
|
+
|
|
28
|
+
15 16 0 0 0 0 0 0 0 0999 V2000
|
|
29
|
+
2.4792 1.7000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
|
|
30
|
+
2.4292 0.3500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
31
|
+
0.4042 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
32
|
+
1.2167 2.1833 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
33
|
+
1.1542 -0.0000 0.0000 S 0 0 0 0 0 0 0 0 0 0 0 0
|
|
34
|
+
-0.9208 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
35
|
+
3.4792 -0.4500 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
|
|
36
|
+
0.8792 3.4458 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
|
|
37
|
+
-1.6000 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
38
|
+
-0.9625 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
39
|
+
-1.6208 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
40
|
+
-0.9125 -3.4375 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
|
|
41
|
+
-3.5958 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
42
|
+
-2.9208 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
43
|
+
-3.0333 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
44
|
+
2 1 1 0 0 0 0
|
|
45
|
+
3 4 1 0 0 0 0
|
|
46
|
+
4 1 1 0 0 0 0
|
|
47
|
+
5 2 1 0 0 0 0
|
|
48
|
+
6 3 2 0 0 0 0
|
|
49
|
+
7 2 2 0 0 0 0
|
|
50
|
+
8 4 2 0 0 0 0
|
|
51
|
+
9 6 1 0 0 0 0
|
|
52
|
+
10 9 2 0 0 0 0
|
|
53
|
+
11 10 1 0 0 0 0
|
|
54
|
+
12 11 1 0 0 0 0
|
|
55
|
+
13 14 2 0 0 0 0
|
|
56
|
+
14 9 1 0 0 0 0
|
|
57
|
+
15 13 1 0 0 0 0
|
|
58
|
+
3 5 1 0 0 0 0
|
|
59
|
+
15 11 2 0 0 0 0
|
|
60
|
+
M END
|
|
61
|
+
",
|
|
62
|
+
}
|
|
63
|
+
`);
|
|
64
|
+
});
|
|
65
|
+
|
|
66
|
+
test('iterator on stream', async () => {
|
|
67
|
+
const readStream = createReadStream(join(__dirname, 'test.sdf.gz'));
|
|
68
|
+
const stream = readStream.pipe(createGunzip());
|
|
69
|
+
const results = [];
|
|
70
|
+
for await (const entry of iterator(stream)) {
|
|
71
|
+
results.push(entry);
|
|
72
|
+
}
|
|
73
|
+
expect(results).toHaveLength(128);
|
|
74
|
+
expect(results[0]).toMatchInlineSnapshot(`
|
|
75
|
+
Object {
|
|
76
|
+
"CLogP": 2.7,
|
|
77
|
+
"Code": 100380824,
|
|
78
|
+
"Number of H-Acceptors": 3,
|
|
79
|
+
"Number of H-Donors": 1,
|
|
80
|
+
"Number of Rotatable bonds": 1,
|
|
81
|
+
"molfile": "
|
|
82
|
+
-ISIS- 04231216572D
|
|
83
|
+
|
|
84
|
+
15 16 0 0 0 0 0 0 0 0999 V2000
|
|
85
|
+
2.4792 1.7000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
|
|
86
|
+
2.4292 0.3500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
87
|
+
0.4042 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
88
|
+
1.2167 2.1833 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
89
|
+
1.1542 -0.0000 0.0000 S 0 0 0 0 0 0 0 0 0 0 0 0
|
|
90
|
+
-0.9208 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
91
|
+
3.4792 -0.4500 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
|
|
92
|
+
0.8792 3.4458 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
|
|
93
|
+
-1.6000 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
94
|
+
-0.9625 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
95
|
+
-1.6208 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
96
|
+
-0.9125 -3.4375 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
|
|
97
|
+
-3.5958 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
98
|
+
-2.9208 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
99
|
+
-3.0333 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
100
|
+
2 1 1 0 0 0 0
|
|
101
|
+
3 4 1 0 0 0 0
|
|
102
|
+
4 1 1 0 0 0 0
|
|
103
|
+
5 2 1 0 0 0 0
|
|
104
|
+
6 3 2 0 0 0 0
|
|
105
|
+
7 2 2 0 0 0 0
|
|
106
|
+
8 4 2 0 0 0 0
|
|
107
|
+
9 6 1 0 0 0 0
|
|
108
|
+
10 9 2 0 0 0 0
|
|
109
|
+
11 10 1 0 0 0 0
|
|
110
|
+
12 11 1 0 0 0 0
|
|
111
|
+
13 14 2 0 0 0 0
|
|
112
|
+
14 9 1 0 0 0 0
|
|
113
|
+
15 13 1 0 0 0 0
|
|
114
|
+
3 5 1 0 0 0 0
|
|
115
|
+
15 11 2 0 0 0 0
|
|
116
|
+
M END
|
|
117
|
+
",
|
|
118
|
+
}
|
|
119
|
+
`);
|
|
120
|
+
});
|
|
Binary file
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
|
|
2
|
+
-ISIS- 04231216572D
|
|
3
|
+
|
|
4
|
+
15 16 0 0 0 0 0 0 0 0999 V2000
|
|
5
|
+
2.4792 1.7000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
|
|
6
|
+
2.4292 0.3500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
7
|
+
0.4042 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
8
|
+
1.2167 2.1833 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
9
|
+
1.1542 -0.0000 0.0000 S 0 0 0 0 0 0 0 0 0 0 0 0
|
|
10
|
+
-0.9208 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
11
|
+
3.4792 -0.4500 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
|
|
12
|
+
0.8792 3.4458 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
|
|
13
|
+
-1.6000 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
14
|
+
-0.9625 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
15
|
+
-1.6208 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
16
|
+
-0.9125 -3.4375 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
|
|
17
|
+
-3.5958 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
18
|
+
-2.9208 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
19
|
+
-3.0333 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
|
|
20
|
+
2 1 1 0 0 0 0
|
|
21
|
+
3 4 1 0 0 0 0
|
|
22
|
+
4 1 1 0 0 0 0
|
|
23
|
+
5 2 1 0 0 0 0
|
|
24
|
+
6 3 2 0 0 0 0
|
|
25
|
+
7 2 2 0 0 0 0
|
|
26
|
+
8 4 2 0 0 0 0
|
|
27
|
+
9 6 1 0 0 0 0
|
|
28
|
+
10 9 2 0 0 0 0
|
|
29
|
+
11 10 1 0 0 0 0
|
|
30
|
+
12 11 1 0 0 0 0
|
|
31
|
+
13 14 2 0 0 0 0
|
|
32
|
+
14 9 1 0 0 0 0
|
|
33
|
+
15 13 1 0 0 0 0
|
|
34
|
+
3 5 1 0 0 0 0
|
|
35
|
+
15 11 2 0 0 0 0
|
|
36
|
+
M END
|
|
37
|
+
$$$$
|
|
@@ -6,8 +6,13 @@ export function getEntriesBoundaries(string, substring, eol) {
|
|
|
6
6
|
next = string.indexOf(substring, previous);
|
|
7
7
|
if (next !== -1) {
|
|
8
8
|
res.push([previous, next]);
|
|
9
|
-
|
|
10
|
-
|
|
9
|
+
const nextMatch = string.indexOf(eol, next + substring.length);
|
|
10
|
+
if (nextMatch === -1) {
|
|
11
|
+
next = -1;
|
|
12
|
+
} else {
|
|
13
|
+
previous = nextMatch + eol.length;
|
|
14
|
+
next = previous;
|
|
15
|
+
}
|
|
11
16
|
} else {
|
|
12
17
|
res.push([previous, string.length]);
|
|
13
18
|
}
|
package/src/index.js
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
export * from './parse';
|
|
2
|
-
export * from './
|
|
2
|
+
export * from './iterator';
|
package/src/iterator.js
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { createInterface } from 'readline';
|
|
2
|
+
|
|
3
|
+
import { parseString } from 'dynamic-typing';
|
|
4
|
+
/**
 * Asynchronously iterate over the molecules contained in an SDF text stream.
 *
 * The stream is read line by line; every line starting with `$$$$` closes the
 * current entry, which is then converted with `getMolecule`.
 *
 * @param {import('stream').Readable} readStream Readable stream of SDF text, used as readline input
 * @param {object} [options={}]
 * @param {Function} [options.filter] Callback allowing to filter the molecules
 * @param {boolean} [options.dynamicTyping=true] Dynamically type the data
 * @yields {object} One molecule object per accepted entry
 */

export async function* iterator(readStream, options = {}) {
  // crlfDelay: Infinity makes readline always treat \r\n as one line break.
  const lines = createInterface({ input: readStream, crlfDelay: Infinity });
  const currentLines = [];
  options = { ...options };
  if (options.dynamicTyping === undefined) options.dynamicTyping = true;

  // Lines are re-joined with '\n' below, so this is the only EOL getMolecule sees.
  options.eol = '\n';
  for await (const line of lines) {
    if (!line.startsWith('$$$$')) {
      currentLines.push(line);
      continue;
    }
    const molecule = getMolecule(currentLines.join(options.eol), options);
    currentLines.length = 0;
    // getMolecule returns undefined for entries without a usable molfile:
    // skip them instead of yielding undefined or handing it to the filter.
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      yield molecule;
    }
  }
}
|
|
31
|
+
|
|
32
|
+
/**
 * Build a molecule object from the text of one SDF entry.
 *
 * @param {string} sdfPart Text of a single entry, lines joined with `options.eol`.
 * @param {object} options
 * @param {string} options.eol Line separator used inside `sdfPart`.
 * @param {boolean} [options.dynamicTyping] When truthy, string values go through `parseString`.
 * @returns {object|undefined} Object with `molfile` and one property per data item, or `undefined` when the molfile part is 5 characters or fewer.
 */
function getMolecule(sdfPart, options) {
  const { eol } = options;
  const parts = sdfPart.split(`${eol}>`);
  // parts[0] is the molfile; a part of 5 characters or fewer is not a usable entry.
  if (parts[0].length <= 5) return;

  const molecule = { molfile: parts[0] + eol };
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(eol);
    // The label is the text between the first '<' and the first '>' of the header line.
    const from = lines[0].indexOf('<');
    const to = lines[0].indexOf('>');
    const label = lines[0].substring(from + 1, to);

    // Copy the value lines; the last element of `lines` is never copied.
    for (let k = 1; k < lines.length - 1; k++) {
      if (molecule[label]) {
        molecule[label] += eol + lines[k];
      } else {
        molecule[label] = lines[k];
      }
    }

    // A data item without value lines leaves molecule[label] unset; only
    // strings are handed to parseString so that such items do not throw.
    if (options.dynamicTyping && typeof molecule[label] === 'string') {
      molecule[label] = parseString(molecule[label]);
    }
  }
  return molecule;
}
|
package/src/parse.js
CHANGED
|
@@ -1,34 +1,51 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { ensureString } from 'ensure-string';
|
|
2
2
|
|
|
3
|
+
import { getEntriesBoundaries } from './getEntriesBoundaries';
|
|
4
|
+
import { getMolecule } from './util/getMolecule';
|
|
5
|
+
/**
|
|
6
|
+
* Parse an SDF file
|
|
7
|
+
* @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
|
|
8
|
+
* @param {object} [options={}]
|
|
9
|
+
* @param {string[]} [options.include] List of fields to include
|
|
10
|
+
* @param {string[]} [options.exclude] List of fields to exclude
|
|
11
|
+
* @param {Function} [options.filter] Callback allowing to filter the molecules
|
|
12
|
+
* @param {boolean} [options.dynamicTyping] Dynamically type the data
|
|
13
|
+
* @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
|
|
14
|
+
* @param {boolean} [options.mixedEOL=false] Set to true if you know the file contains a mixture of \r\n and \n line endings
|
|
15
|
+
* @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
|
|
16
|
+
*/
|
|
3
17
|
export function parse(sdf, options = {}) {
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
modifiers = {},
|
|
9
|
-
forEach = {},
|
|
10
|
-
dynamicTyping = true,
|
|
11
|
-
} = options;
|
|
18
|
+
options = { ...options };
|
|
19
|
+
if (options.modifiers === undefined) options.modifiers = {};
|
|
20
|
+
if (options.forEach === undefined) options.forEach = {};
|
|
21
|
+
if (options.dynamicTyping === undefined) options.dynamicTyping = true;
|
|
12
22
|
|
|
23
|
+
sdf = ensureString(sdf);
|
|
13
24
|
if (typeof sdf !== 'string') {
|
|
14
25
|
throw new TypeError('Parameter "sdf" must be a string');
|
|
15
26
|
}
|
|
16
27
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
+
if (options.eol === undefined) {
|
|
29
|
+
options.eol = '\n';
|
|
30
|
+
if (options.mixedEOL) {
|
|
31
|
+
sdf = sdf.replace(/\r\n/g, '\n');
|
|
32
|
+
sdf = sdf.replace(/\r/g, '\n');
|
|
33
|
+
} else {
|
|
34
|
+
// we will find the delimiter in order to be much faster and not use regular expression
|
|
35
|
+
let header = sdf.substr(0, 1000);
|
|
36
|
+
if (header.indexOf('\r\n') > -1) {
|
|
37
|
+
options.eol = '\r\n';
|
|
38
|
+
} else if (header.indexOf('\r') > -1) {
|
|
39
|
+
options.eol = '\r';
|
|
40
|
+
}
|
|
28
41
|
}
|
|
29
42
|
}
|
|
30
43
|
|
|
31
|
-
let entriesBoundaries = getEntriesBoundaries(
|
|
44
|
+
let entriesBoundaries = getEntriesBoundaries(
|
|
45
|
+
sdf,
|
|
46
|
+
`${options.eol}$$$$`,
|
|
47
|
+
options.eol,
|
|
48
|
+
);
|
|
32
49
|
let molecules = [];
|
|
33
50
|
let labels = {};
|
|
34
51
|
|
|
@@ -36,72 +53,18 @@ export function parse(sdf, options = {}) {
|
|
|
36
53
|
|
|
37
54
|
for (let i = 0; i < entriesBoundaries.length; i++) {
|
|
38
55
|
let sdfPart = sdf.substring(...entriesBoundaries[i]);
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
let label = lines[0].substring(from + 1, to);
|
|
49
|
-
currentLabels.push(label);
|
|
50
|
-
if (!labels[label]) {
|
|
51
|
-
labels[label] = {
|
|
52
|
-
counter: 0,
|
|
53
|
-
isNumeric: dynamicTyping,
|
|
54
|
-
keep: false,
|
|
55
|
-
};
|
|
56
|
-
if (
|
|
57
|
-
(!exclude || exclude.indexOf(label) === -1) &&
|
|
58
|
-
(!include || include.indexOf(label) > -1)
|
|
59
|
-
) {
|
|
60
|
-
labels[label].keep = true;
|
|
61
|
-
if (modifiers[label]) {
|
|
62
|
-
labels[label].modifier = modifiers[label];
|
|
63
|
-
}
|
|
64
|
-
if (forEach[label]) {
|
|
65
|
-
labels[label].forEach = forEach[label];
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
}
|
|
69
|
-
if (labels[label].keep) {
|
|
70
|
-
for (let k = 1; k < lines.length - 1; k++) {
|
|
71
|
-
if (molecule[label]) {
|
|
72
|
-
molecule[label] += eol + lines[k];
|
|
73
|
-
} else {
|
|
74
|
-
molecule[label] = lines[k];
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
if (labels[label].modifier) {
|
|
78
|
-
let modifiedValue = labels[label].modifier(molecule[label]);
|
|
79
|
-
if (modifiedValue === undefined || modifiedValue === null) {
|
|
80
|
-
delete molecule[label];
|
|
81
|
-
} else {
|
|
82
|
-
molecule[label] = modifiedValue;
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
if (labels[label].isNumeric) {
|
|
86
|
-
if (
|
|
87
|
-
!isFinite(molecule[label]) ||
|
|
88
|
-
molecule[label].match(/^0[0-9]/)
|
|
89
|
-
) {
|
|
90
|
-
labels[label].isNumeric = false;
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
|
-
}
|
|
94
|
-
}
|
|
95
|
-
if (!filter || filter(molecule)) {
|
|
96
|
-
molecules.push(molecule);
|
|
97
|
-
// only now we can increase the counter
|
|
98
|
-
for (let j = 0; j < currentLabels.length; j++) {
|
|
99
|
-
labels[currentLabels[j]].counter++;
|
|
100
|
-
}
|
|
56
|
+
|
|
57
|
+
let currentLabels = [];
|
|
58
|
+
const molecule = getMolecule(sdfPart, labels, currentLabels, options);
|
|
59
|
+
if (!molecule) continue;
|
|
60
|
+
if (!options.filter || options.filter(molecule)) {
|
|
61
|
+
molecules.push(molecule);
|
|
62
|
+
// only now we can increase the counter
|
|
63
|
+
for (let j = 0; j < currentLabels.length; j++) {
|
|
64
|
+
labels[currentLabels[j]].counter++;
|
|
101
65
|
}
|
|
102
66
|
}
|
|
103
67
|
}
|
|
104
|
-
|
|
105
68
|
// all numeric fields should be converted to numbers
|
|
106
69
|
for (let label in labels) {
|
|
107
70
|
let currentLabel = labels[label];
|
|
@@ -141,8 +104,8 @@ export function parse(sdf, options = {}) {
|
|
|
141
104
|
|
|
142
105
|
return {
|
|
143
106
|
time: Date.now() - start,
|
|
144
|
-
molecules
|
|
107
|
+
molecules,
|
|
145
108
|
labels: Object.keys(labels),
|
|
146
|
-
statistics
|
|
109
|
+
statistics,
|
|
147
110
|
};
|
|
148
111
|
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
 * Build a molecule object from the text of one SDF entry and record the
 * data-item labels it contains.
 *
 * @param {string} sdfPart Text of a single entry (molfile followed by `>  <Label>` data items).
 * @param {object} labels Registry of labels seen so far; a record is created here the first time a label is met.
 * @param {string[]} currentLabels Receives every label found in this entry, in order of appearance.
 * @param {object} options Normalized options: `eol`, `dynamicTyping`, `modifiers`, `forEach` and, optionally, `include` / `exclude`.
 * @returns {object|undefined} The molecule (`molfile` plus the kept fields), or `undefined` when the molfile part is 5 characters or fewer.
 */
export function getMolecule(sdfPart, labels, currentLabels, options) {
  const { eol } = options;
  const parts = sdfPart.split(`${eol}>`);
  // parts[0] is the molfile; a part of 5 characters or fewer is not a usable entry.
  if (parts[0].length <= 5) return;

  const molecule = { molfile: parts[0] + eol };
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(eol);
    // The label is the text between the first '<' and the first '>' of the header line.
    const from = lines[0].indexOf('<');
    const to = lines[0].indexOf('>');
    const label = lines[0].substring(from + 1, to);
    currentLabels.push(label);

    if (!labels[label]) {
      labels[label] = {
        counter: 0,
        isNumeric: options.dynamicTyping,
        keep: false,
      };
      if (
        (!options.exclude || options.exclude.indexOf(label) === -1) &&
        (!options.include || options.include.indexOf(label) > -1)
      ) {
        labels[label].keep = true;
        if (options.modifiers[label]) {
          labels[label].modifier = options.modifiers[label];
        }
        if (options.forEach[label]) {
          labels[label].forEach = options.forEach[label];
        }
      }
    }

    const info = labels[label];
    if (!info.keep) continue;

    // Copy the value lines; the last element of `lines` is never copied.
    for (let k = 1; k < lines.length - 1; k++) {
      if (molecule[label]) {
        molecule[label] += eol + lines[k];
      } else {
        molecule[label] = lines[k];
      }
    }

    if (info.modifier) {
      const modifiedValue = info.modifier(molecule[label]);
      if (modifiedValue === undefined || modifiedValue === null) {
        delete molecule[label];
      } else {
        molecule[label] = modifiedValue;
      }
    }

    if (info.isNumeric) {
      // A modifier may have returned something that is not a string, so the
      // leading-zero test must only be applied to strings (calling `.match`
      // on a number would throw).
      const value = molecule[label];
      const isNumericText =
        typeof value === 'string' &&
        isFinite(value) &&
        !/^0[0-9]/.test(value);
      const isFiniteNumber = typeof value === 'number' && isFinite(value);
      if (!isNumericText && !isFiniteNumber) {
        info.isNumeric = false;
      }
    }
  }
  return molecule;
}
|
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
2
|
-
|
|
3
|
-
import callbackStream from 'callback-stream';
|
|
4
|
-
import OCL from 'openchemlib/minimal';
|
|
5
|
-
|
|
6
|
-
import { entries, molecules } from '..';
|
|
7
|
-
|
|
8
|
-
const cbStream = callbackStream.bind(null, { objectMode: true });
|
|
9
|
-
|
|
10
|
-
// Exercises the stream API (`entries` and `molecules`) against the test.sdf
// fixture located next to this file.
describe('stream', () => {
  const sdfPath = `${__dirname}/test.sdf`;

  it('entries', () => {
    return new Promise((resolve) => {
      const onCollected = (err, data) => {
        expect(err).toBeNull();
        expect(data).toHaveLength(128);
        expect(data[0]).toContain('-ISIS- 04231216572D');
        // The sixth entry must be readable as a molfile by OpenChemLib.
        const mol = OCL.Molecule.fromMolfile(data[5]);
        expect(mol.toMolfile()).toContain(
          '17 18 0 0 0 0 0 0 0 0999 V2000',
        );
        resolve();
      };
      fs.createReadStream(sdfPath).pipe(entries()).pipe(cbStream(onCollected));
    });
  });

  it('molecules', () => {
    return new Promise((resolve) => {
      const onCollected = (err, data) => {
        expect(err).toBeNull();
        expect(data).toHaveLength(128);
        expect(data[0]).toMatchObject({
          Code: '0100380824',
          CLogP: 2.7,
        });
        expect(data[0].molfile).toContain('-ISIS- 04231216572D');
        resolve();
      };
      fs.createReadStream(sdfPath)
        .pipe(molecules())
        .pipe(cbStream(onCollected));
    });
  });

  it('molecules - full result', () => {
    return new Promise((resolve) => {
      const expectedLabels = [
        'Code',
        'Number of H-Donors',
        'Number of H-Acceptors',
        'Number of Rotatable bonds',
        'CLogP',
      ];
      const onCollected = (err, data) => {
        expect(err).toBeNull();
        expect(data).toHaveLength(128);
        expect(data[0]).toMatchObject({ labels: expectedLabels });
        expect(data[0].molecules).toHaveLength(1);
        resolve();
      };
      fs.createReadStream(sdfPath)
        .pipe(molecules({ fullResult: true }))
        .pipe(cbStream(onCollected));
    });
  });

  it('molecules with filter', () => {
    return new Promise((resolve) => {
      const onlyOneCode = (entry) => entry.Code === '0100380869';
      const onCollected = (err, data) => {
        expect(err).toBeNull();
        expect(data).toHaveLength(1);
        resolve();
      };
      fs.createReadStream(sdfPath)
        .pipe(molecules({ filter: onlyOneCode }))
        .pipe(cbStream(onCollected));
    });
  });

  it('async iteration', async () => {
    const moleculeStream = fs.createReadStream(sdfPath).pipe(molecules());
    let seen = 0;
    for await (const entry of moleculeStream) {
      seen += 1;
      expect(entry.molfile.toString()).toContain('0999 V2000');
    }
    expect(seen).toBe(128);
  });
});
|
package/src/stream.js
DELETED
|
@@ -1,44 +0,0 @@
|
|
|
1
|
-
import pipeline from 'pumpify';
|
|
2
|
-
import split2 from 'split2';
|
|
3
|
-
import through2 from 'through2';
|
|
4
|
-
import filter from 'through2-filter';
|
|
5
|
-
|
|
6
|
-
import { parse } from './parse';
|
|
7
|
-
|
|
8
|
-
const filterStream = filter.bind(null, { objectMode: true });
|
|
9
|
-
/**
 * Predicate used to drop chunks that carry no content: a chunk is kept only
 * when it is longer than one character both as-is and once trimmed.
 */
function filterCb(chunk) {
  if (!(chunk.length > 1)) {
    return false;
  }
  const trimmed = chunk.trim();
  return trimmed.length > 1;
}
|
|
12
|
-
|
|
13
|
-
/**
 * Create the object-mode pipeline that turns SDF text into individual entries.
 *
 * Stages, in order:
 *  1. split the text on a `$$$$` line (line break, four `$`, anything, line break);
 *  2. drop the chunks rejected by `filterCb`;
 *  3. emit each chunk again with an end of line, `$$$$` and another end of
 *     line appended. CRLF is used when the chunk contains a CRLF, LF otherwise.
 */
export function entries() {
  const splitOnDelimiter = split2(/\r?\n\${4}.*\r?\n/);
  const dropEmptyChunks = filterStream(filterCb);
  const restoreDelimiter = through2(
    { objectMode: true },
    function process(value, encoding, callback) {
      const eol = value.includes('\r\n') ? '\r\n' : '\n';
      this.push(`${value + eol}$$$$${eol}`);
      callback();
    },
  );
  return pipeline.obj(splitOnDelimiter, dropEmptyChunks, restoreDelimiter);
}
|
|
24
|
-
|
|
25
|
-
/**
 * Create the object-mode pipeline that turns SDF text into parsed molecules.
 *
 * Every entry produced by `entries()` is given to `parse` with the received
 * options. When exactly one molecule comes out, the stream emits either the
 * whole parse result (`options.fullResult` truthy) or that single molecule.
 * Any error thrown while handling an entry is handed to the stream callback.
 *
 * @param {object} [options] - Options forwarded to `parse`, plus `fullResult`.
 */
export function molecules(options) {
  const parseEntry = through2(
    { objectMode: true },
    function process(value, encoding, callback) {
      try {
        const result = parse(value, options);
        const hasSingleMolecule = result.molecules.length === 1;
        if (hasSingleMolecule) {
          const emitFullResult = options && options.fullResult;
          this.push(emitFullResult ? result : result.molecules[0]);
        }
        callback();
      } catch (error) {
        callback(error);
      }
    },
  );
  return pipeline.obj(entries(), parseEntry);
}
|