sdf-parser 5.0.2 → 6.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -57,39 +57,18 @@ var result = parse(sdf, {
57
57
  });
58
58
  ```
59
59
 
60
- ## Streams
60
+ ## Iterator
61
61
 
62
62
  This API is only available on Node.js.
63
63
 
64
- ### molecules(options)
65
-
66
- Transform an input text stream to a stream of molecule objects.
67
-
68
- #### options
69
-
70
- - `fullResult`: true to emit the full result of `parse` instead of just the molecules.
71
- - All other options from the `parse` function.
72
-
73
- ```js
74
- const { stream } = require('sdf-parser');
75
- fs.createReadStream('test.sdf')
76
- .pipe(stream.molecules())
77
- .on('data', (molecule) => {
78
- console.log(molecule.molfile);
79
- });
80
- ```
81
-
82
- ### entries()
83
-
84
- Transform an input text stream to a stream of sdf entries.
85
-
86
64
  ```js
87
- const { stream } = require('sdf-parser');
88
- fs.createReadStream('test.sdf')
89
- .pipe(stream.entries())
90
- .on('data', (entry) => {
91
- // sdf entry as a string
92
- });
65
+ const { iterator } = require('sdf-parser');
66
+ const readStream = createReadStream(join(__dirname, 'test.sdf.gz'));
67
+ const stream = readStream.pipe(createGunzip());
68
+ const results = [];
69
+ for await (const entry of iterator(stream)) {
70
+ results.push(entry);
71
+ }
93
72
  ```
94
73
 
95
74
  ## License
@@ -98,7 +77,7 @@ fs.createReadStream('test.sdf')
98
77
 
99
78
  [npm-image]: https://img.shields.io/npm/v/sdf-parser.svg?style=flat-square
100
79
  [npm-url]: https://www.npmjs.com/package/sdf-parser
101
- [travis-image]: https://img.shields.io/travis/cheminfo/sdf-parser/master.svg?style=flat-square
80
+ [travis-image]: https://img.shields.io/travis/cheminfo/sdf-parser/main.svg?style=flat-square
102
81
  [travis-url]: https://travis-ci.org/cheminfo/sdf-parser
103
82
  [download-image]: https://img.shields.io/npm/dm/sdf-parser.svg?style=flat-square
104
83
  [download-url]: https://www.npmjs.com/package/sdf-parser
package/lib/index.js CHANGED
@@ -3,17 +3,8 @@
3
3
  Object.defineProperty(exports, '__esModule', { value: true });
4
4
 
5
5
  var ensureString = require('ensure-string');
6
- var pipeline = require('pumpify');
7
- var split2 = require('split2');
8
- var through2 = require('through2');
9
- var filter = require('through2-filter');
10
-
11
- function _interopDefaultLegacy (e) { return e && typeof e === 'object' && 'default' in e ? e : { 'default': e }; }
12
-
13
- var pipeline__default = /*#__PURE__*/_interopDefaultLegacy(pipeline);
14
- var split2__default = /*#__PURE__*/_interopDefaultLegacy(split2);
15
- var through2__default = /*#__PURE__*/_interopDefaultLegacy(through2);
16
- var filter__default = /*#__PURE__*/_interopDefaultLegacy(filter);
6
+ var readline = require('readline');
7
+ var dynamicTyping = require('dynamic-typing');
17
8
 
18
9
  function getEntriesBoundaries(string, substring, eol) {
19
10
  const res = [];
@@ -37,46 +28,106 @@ function getEntriesBoundaries(string, substring, eol) {
37
28
  return res;
38
29
  }
39
30
 
31
/**
 * Build a molecule object from the text of a single SDF entry.
 *
 * The text located before the first `${eol}>` becomes `molecule.molfile`.
 * Every following `> <label>` section is stored on the molecule under its
 * label, unless the label is rejected by `options.include` / `options.exclude`.
 *
 * Side effects: every label met is pushed to `currentLabels`, and `labels`
 * receives one descriptor per new label (`counter`, `isNumeric`, `keep` and,
 * when configured, `modifier` / `forEach`).
 *
 * @param {string} sdfPart Text of one SDF entry
 * @param {object} labels Label descriptors shared between entries (mutated)
 * @param {string[]} currentLabels Receives the labels found in this entry (mutated)
 * @param {object} options
 * @param {string} options.eol End of line sequence used in `sdfPart`
 * @param {boolean} options.dynamicTyping Initial value of `isNumeric` for a new label
 * @param {object} options.modifiers Callbacks applied to the value of specific labels
 * @param {object} options.forEach Callbacks copied on the descriptor of specific labels
 * @param {string[]} [options.include] Only these labels are kept
 * @param {string[]} [options.exclude] These labels are dropped
 * @returns {object|undefined} The molecule, or `undefined` when the text
 *   located before the first field is 5 characters long or shorter
 */
function getMolecule$1(sdfPart, labels, currentLabels, options) {
  const parts = sdfPart.split(`${options.eol}>`);
  // Too short to hold a molfile: report "no molecule"
  if (parts.length === 0 || parts[0].length <= 5) return;
  const molecule = {};
  molecule.molfile = parts[0] + options.eol;
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(options.eol);
    // The label is the text between the first '<' and the first '>' of the header line
    const from = lines[0].indexOf('<');
    const to = lines[0].indexOf('>');
    const label = lines[0].substring(from + 1, to);
    currentLabels.push(label);
    if (!labels[label]) {
      labels[label] = {
        counter: 0,
        isNumeric: options.dynamicTyping,
        keep: false,
      };
      if (
        (!options.exclude || options.exclude.indexOf(label) === -1) &&
        (!options.include || options.include.indexOf(label) > -1)
      ) {
        labels[label].keep = true;
        if (options.modifiers[label]) {
          labels[label].modifier = options.modifiers[label];
        }
        if (options.forEach[label]) {
          labels[label].forEach = options.forEach[label];
        }
      }
    }
    if (labels[label].keep) {
      // The last line of the section is not part of the value
      for (let k = 1; k < lines.length - 1; k++) {
        if (molecule[label]) {
          molecule[label] += options.eol + lines[k];
        } else {
          molecule[label] = lines[k];
        }
      }
      if (labels[label].modifier) {
        const modifiedValue = labels[label].modifier(molecule[label]);
        if (modifiedValue === undefined || modifiedValue === null) {
          delete molecule[label];
        } else {
          molecule[label] = modifiedValue;
        }
      }
      if (labels[label].isNumeric) {
        const value = molecule[label];
        // A modifier may return a non-string value (number, boolean, array...).
        // Such values have no `match` method, so they are tested first and
        // simply mark the label as not numeric instead of throwing.
        // The coercing global isFinite is intended: `value` is a string here.
        if (
          typeof value !== 'string' ||
          !isFinite(value) ||
          /^0[0-9]/.test(value)
        ) {
          labels[label].isNumeric = false;
        }
      }
    }
  }
  return molecule;
}
86
+
40
87
  /**
41
88
  * Parse a SDF file
42
89
  * @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
43
- * @param {any} [options={}]
44
- * @param {array<string>} [options.include] List of fields to include
45
- * @param {array<string>} [options.exclude] List of fields to exclude
90
+ * @param {object} [options={}]
91
+ * @param {string[]} [options.include] List of fields to include
92
+ * @param {string[]} [options.exclude] List of fields to exclude
93
+ * @param {Function} [options.filter] Callback allowing to filter the molecules
46
94
  * @param {boolean} [options.dynamicTyping] Dynamically type the data
47
95
  * @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
48
96
  * @param {boolean} [options.mixedEOL=false] Set to true if you know there is a mixture between \r\n and \n
97
+ * @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
49
98
  */
50
99
  function parse(sdf, options = {}) {
51
- const {
52
- include,
53
- exclude,
54
- filter,
55
- modifiers = {},
56
- forEach = {},
57
- dynamicTyping = true,
58
- } = options;
100
+ options = { ...options };
101
+ if (options.modifiers === undefined) options.modifiers = {};
102
+ if (options.forEach === undefined) options.forEach = {};
103
+ if (options.dynamicTyping === undefined) options.dynamicTyping = true;
59
104
 
60
105
  sdf = ensureString.ensureString(sdf);
61
106
  if (typeof sdf !== 'string') {
62
107
  throw new TypeError('Parameter "sdf" must be a string');
63
108
  }
64
109
 
65
- let eol = '\n';
66
- if (options.mixedEOL) {
67
- sdf = sdf.replace(/\r\n/g, '\n');
68
- sdf = sdf.replace(/\r/g, '\n');
69
- } else {
70
- // we will find the delimiter in order to be much faster and not use regular expression
71
- let header = sdf.substr(0, 1000);
72
- if (header.indexOf('\r\n') > -1) {
73
- eol = '\r\n';
74
- } else if (header.indexOf('\r') > -1) {
75
- eol = '\r';
110
+ if (options.eol === undefined) {
111
+ options.eol = '\n';
112
+ if (options.mixedEOL) {
113
+ sdf = sdf.replace(/\r\n/g, '\n');
114
+ sdf = sdf.replace(/\r/g, '\n');
115
+ } else {
116
+ // we will find the delimiter in order to be much faster and not use regular expression
117
+ let header = sdf.substr(0, 1000);
118
+ if (header.indexOf('\r\n') > -1) {
119
+ options.eol = '\r\n';
120
+ } else if (header.indexOf('\r') > -1) {
121
+ options.eol = '\r';
122
+ }
76
123
  }
77
124
  }
78
125
 
79
- let entriesBoundaries = getEntriesBoundaries(sdf, `${eol}$$$$`, eol);
126
+ let entriesBoundaries = getEntriesBoundaries(
127
+ sdf,
128
+ `${options.eol}$$$$`,
129
+ options.eol,
130
+ );
80
131
  let molecules = [];
81
132
  let labels = {};
82
133
 
@@ -84,72 +135,18 @@ function parse(sdf, options = {}) {
84
135
 
85
136
  for (let i = 0; i < entriesBoundaries.length; i++) {
86
137
  let sdfPart = sdf.substring(...entriesBoundaries[i]);
87
- let parts = sdfPart.split(`${eol}>`);
88
- if (parts.length > 0 && parts[0].length > 5) {
89
- let molecule = {};
90
- let currentLabels = [];
91
- molecule.molfile = parts[0] + eol;
92
- for (let j = 1; j < parts.length; j++) {
93
- let lines = parts[j].split(eol);
94
- let from = lines[0].indexOf('<');
95
- let to = lines[0].indexOf('>');
96
- let label = lines[0].substring(from + 1, to);
97
- currentLabels.push(label);
98
- if (!labels[label]) {
99
- labels[label] = {
100
- counter: 0,
101
- isNumeric: dynamicTyping,
102
- keep: false,
103
- };
104
- if (
105
- (!exclude || exclude.indexOf(label) === -1) &&
106
- (!include || include.indexOf(label) > -1)
107
- ) {
108
- labels[label].keep = true;
109
- if (modifiers[label]) {
110
- labels[label].modifier = modifiers[label];
111
- }
112
- if (forEach[label]) {
113
- labels[label].forEach = forEach[label];
114
- }
115
- }
116
- }
117
- if (labels[label].keep) {
118
- for (let k = 1; k < lines.length - 1; k++) {
119
- if (molecule[label]) {
120
- molecule[label] += eol + lines[k];
121
- } else {
122
- molecule[label] = lines[k];
123
- }
124
- }
125
- if (labels[label].modifier) {
126
- let modifiedValue = labels[label].modifier(molecule[label]);
127
- if (modifiedValue === undefined || modifiedValue === null) {
128
- delete molecule[label];
129
- } else {
130
- molecule[label] = modifiedValue;
131
- }
132
- }
133
- if (labels[label].isNumeric) {
134
- if (
135
- !isFinite(molecule[label]) ||
136
- molecule[label].match(/^0[0-9]/)
137
- ) {
138
- labels[label].isNumeric = false;
139
- }
140
- }
141
- }
142
- }
143
- if (!filter || filter(molecule)) {
144
- molecules.push(molecule);
145
- // only now we can increase the counter
146
- for (let j = 0; j < currentLabels.length; j++) {
147
- labels[currentLabels[j]].counter++;
148
- }
138
+
139
+ let currentLabels = [];
140
+ const molecule = getMolecule$1(sdfPart, labels, currentLabels, options);
141
+ if (!molecule) continue;
142
+ if (!options.filter || options.filter(molecule)) {
143
+ molecules.push(molecule);
144
+ // only now we can increase the counter
145
+ for (let j = 0; j < currentLabels.length; j++) {
146
+ labels[currentLabels[j]].counter++;
149
147
  }
150
148
  }
151
149
  }
152
-
153
150
  // all numeric fields should be converted to numbers
154
151
  for (let label in labels) {
155
152
  let currentLabel = labels[label];
@@ -195,45 +192,57 @@ function parse(sdf, options = {}) {
195
192
  };
196
193
  }
197
194
 
198
- const filterStream = filter__default["default"].bind(null, { objectMode: true });
199
-
200
- function filterCb(chunk) {
201
- return chunk.length > 1 && chunk.trim().length > 1;
202
- }
195
/**
 * Iterate asynchronously over the molecules of a SDF stream.
 *
 * The stream is read line by line. Each time a line starting with `$$$$` is
 * met, the lines accumulated since the previous delimiter are converted to a
 * molecule object. Lines located after the last `$$$$` delimiter are not
 * yielded.
 *
 * @param {NodeJS.ReadableStream} readStream SDF content to parse
 * @param {object} [options={}]
 * @param {Function} [options.filter] Callback receiving each molecule; only
 *   the molecules for which it returns a truthy value are yielded
 * @param {boolean} [options.dynamicTyping=true] Dynamically type the data
 * @yields {object} A molecule: `molfile` plus one property per data field
 */
async function* iterator(readStream, options = {}) {
  // crlfDelay: Infinity makes readline always treat \r\n as a single line
  // break, even when both characters do not arrive in the same chunk.
  const lines = readline.createInterface({
    input: readStream,
    crlfDelay: Infinity,
  });
  const currentLines = [];
  // Work on a copy: the caller's options object is never modified
  options = { ...options };
  if (options.dynamicTyping === undefined) options.dynamicTyping = true;

  // readline removes the line terminators, entries are rebuilt with \n
  options.eol = '\n';
  for await (const line of lines) {
    if (!line.startsWith('$$$$')) {
      currentLines.push(line);
      continue;
    }
    const molecule = getMolecule(currentLines.join(options.eol), options);
    currentLines.length = 0;
    // getMolecule returns undefined when the entry holds no usable molfile:
    // such entries are neither filtered nor yielded.
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      yield molecule;
    }
  }
}
215
222
 
216
- function molecules(options) {
217
- return pipeline__default["default"].obj(
218
- entries(),
219
- through2__default["default"]({ objectMode: true }, function process(value, encoding, callback) {
220
- try {
221
- const parsed = parse(value, options);
222
- if (parsed.molecules.length === 1) {
223
- if (options && options.fullResult) {
224
- this.push(parsed);
225
- } else {
226
- this.push(parsed.molecules[0]);
227
- }
228
- }
229
- callback();
230
- } catch (e) {
231
- callback(e);
223
/**
 * Convert the text of one SDF entry to a molecule object.
 *
 * The text located before the first `${eol}>` becomes `molecule.molfile` and
 * every following `> <label>` section becomes a property named after its
 * label.
 *
 * @param {string} sdfPart Text of one SDF entry
 * @param {object} options
 * @param {string} options.eol End of line sequence used in `sdfPart`
 * @param {boolean} [options.dynamicTyping] Pass string values through
 *   `dynamicTyping.parseString`
 * @returns {object|undefined} The molecule, or `undefined` when the text
 *   located before the first field is 5 characters long or shorter
 */
function getMolecule(sdfPart, options) {
  const parts = sdfPart.split(`${options.eol}>`);
  // Too short to hold a molfile: report "no molecule"
  if (parts.length === 0 || parts[0].length <= 5) return;
  const molecule = {};
  molecule.molfile = parts[0] + options.eol;
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(options.eol);
    // The label is the text between the first '<' and the first '>' of the header line
    const from = lines[0].indexOf('<');
    const to = lines[0].indexOf('>');
    const label = lines[0].substring(from + 1, to);
    // The last line of the section is not part of the value
    for (let k = 1; k < lines.length - 1; k++) {
      if (molecule[label]) {
        molecule[label] += options.eol + lines[k];
      } else {
        molecule[label] = lines[k];
      }
    }
    // A section without any value line leaves molecule[label] undefined, and a
    // repeated label may already hold a converted value: only strings are
    // handed to parseString.
    if (options.dynamicTyping && typeof molecule[label] === 'string') {
      molecule[label] = dynamicTyping.parseString(molecule[label]);
    }
  }
  return molecule;
}
236
246
 
237
- exports.entries = entries;
238
- exports.molecules = molecules;
247
+ exports.iterator = iterator;
239
248
  exports.parse = parse;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "sdf-parser",
3
- "version": "5.0.2",
3
+ "version": "6.0.1",
4
4
  "description": "SDF parser",
5
5
  "main": "lib/index.js",
6
6
  "module": "src/index.js",
@@ -8,9 +8,12 @@
8
8
  "lib",
9
9
  "src"
10
10
  ],
11
+ "browser": {
12
+ "./src/iterator.js": "./src/iterator.browser.js"
13
+ },
11
14
  "sideEffects": false,
12
15
  "scripts": {
13
- "build": "npm run compile && cheminfo-build --root SDFParser",
16
+ "build": "cheminfo-build --entry src/index.js --root SDFParser",
14
17
  "compile": "rollup -c",
15
18
  "eslint": "eslint src",
16
19
  "eslint-fix": "npm run eslint -- --fix",
@@ -21,9 +24,6 @@
21
24
  "test-coverage": "jest --coverage",
22
25
  "test-only": "jest"
23
26
  },
24
- "browser": {
25
- "./src/stream.js": "./src/stream.browser.js"
26
- },
27
27
  "repository": {
28
28
  "type": "git",
29
29
  "url": "https://github.com/cheminfo/sdf-parser.git"
@@ -44,20 +44,19 @@
44
44
  "homepage": "https://github.com/cheminfo/sdf-parser",
45
45
  "devDependencies": {
46
46
  "@babel/plugin-transform-modules-commonjs": "^7.18.6",
47
+ "@types/jest": "^29.1.2",
47
48
  "babel-eslint": "^10.1.0",
48
49
  "callback-stream": "^1.1.0",
49
50
  "cheminfo-build": "^1.1.11",
50
- "eslint": "^8.22.0",
51
+ "eslint": "^8.25.0",
51
52
  "eslint-config-cheminfo": "^8.0.2",
52
- "jest": "^28.1.3",
53
+ "filelist-utils": "^1.0.1",
54
+ "jest": "^29.2.0",
53
55
  "openchemlib": "^8.0.1",
54
56
  "prettier": "^2.7.1"
55
57
  },
56
58
  "dependencies": {
57
- "ensure-string": "^1.2.0",
58
- "pumpify": "^2.0.1",
59
- "split2": "^4.1.0",
60
- "through2": "^4.0.2",
61
- "through2-filter": "^3.0.0"
59
+ "dynamic-typing": "^1.0.0",
60
+ "ensure-string": "^1.2.0"
62
61
  }
63
62
  }
@@ -1,8 +1,8 @@
1
1
  // Jest Snapshot v1, https://goo.gl/fbAQLP
2
2
 
3
3
  exports[`should parse sdf files without EOL in the EOF 1`] = `
4
- Array [
5
- Array [
4
+ [
5
+ [
6
6
  0,
7
7
  1473,
8
8
  ],
@@ -0,0 +1,183 @@
1
+ import { createReadStream, ReadStream } from 'fs';
2
+ import { join } from 'path';
3
+ import { createGunzip } from 'zlib';
4
+
5
+ import { fileCollectionFromPath } from 'filelist-utils';
6
+
7
+ import { iterator } from '../iterator';
8
+
9
// The web-stream code path is only exercised on Node.js >= 18. On older
// versions the test is reported as skipped instead of passing without having
// run any assertion. The radix is explicit so the major version is always
// parsed as a decimal number.
(Number.parseInt(process.versions.node, 10) >= 18 ? test : test.skip)(
  'iterator',
  async () => {
    const files = (
      await fileCollectionFromPath(join(__dirname, '.'))
    ).files.filter((file) => file.name === 'test.sdf');
    const results = [];

    for await (const entry of iterator(ReadStream.fromWeb(files[0].stream()))) {
      results.push(entry);
    }
    expect(results).toHaveLength(128);
    // NOTE(review): the snapshot text is reproduced as it appears in this
    // view; confirm the column alignment of the molfile against the fixture.
    expect(results[0]).toMatchInlineSnapshot(`
{
"CLogP": 2.7,
"Code": 100380824,
"Number of H-Acceptors": 3,
"Number of H-Donors": 1,
"Number of Rotatable bonds": 1,
"molfile": "
-ISIS- 04231216572D

15 16 0 0 0 0 0 0 0 0999 V2000
2.4792 1.7000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
2.4292 0.3500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.4042 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1.2167 2.1833 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1.1542 -0.0000 0.0000 S 0 0 0 0 0 0 0 0 0 0 0 0
-0.9208 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
3.4792 -0.4500 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
0.8792 3.4458 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
-1.6000 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.9625 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.6208 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.9125 -3.4375 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
-3.5958 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.9208 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.0333 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
2 1 1 0 0 0 0
3 4 1 0 0 0 0
4 1 1 0 0 0 0
5 2 1 0 0 0 0
6 3 2 0 0 0 0
7 2 2 0 0 0 0
8 4 2 0 0 0 0
9 6 1 0 0 0 0
10 9 2 0 0 0 0
11 10 1 0 0 0 0
12 11 1 0 0 0 0
13 14 2 0 0 0 0
14 9 1 0 0 0 0
15 13 1 0 0 0 0
3 5 1 0 0 0 0
15 11 2 0 0 0 0
M END
",
}
`);
  },
);
68
+
69
// Reads the gzipped fixture through a gunzip stream and checks both the
// number of molecules and the content of the first one.
test('iterator on stream', async () => {
  const gunzipped = createReadStream(join(__dirname, 'test.sdf.gz')).pipe(
    createGunzip(),
  );
  const molecules = [];
  for await (const molecule of iterator(gunzipped)) {
    molecules.push(molecule);
  }
  expect(molecules).toHaveLength(128);
  const [firstMolecule] = molecules;
  expect(firstMolecule).toMatchInlineSnapshot(`
{
"CLogP": 2.7,
"Code": 100380824,
"Number of H-Acceptors": 3,
"Number of H-Donors": 1,
"Number of Rotatable bonds": 1,
"molfile": "
-ISIS- 04231216572D

15 16 0 0 0 0 0 0 0 0999 V2000
2.4792 1.7000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
2.4292 0.3500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.4042 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1.2167 2.1833 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1.1542 -0.0000 0.0000 S 0 0 0 0 0 0 0 0 0 0 0 0
-0.9208 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
3.4792 -0.4500 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
0.8792 3.4458 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
-1.6000 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.9625 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.6208 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.9125 -3.4375 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
-3.5958 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.9208 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.0333 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
2 1 1 0 0 0 0
3 4 1 0 0 0 0
4 1 1 0 0 0 0
5 2 1 0 0 0 0
6 3 2 0 0 0 0
7 2 2 0 0 0 0
8 4 2 0 0 0 0
9 6 1 0 0 0 0
10 9 2 0 0 0 0
11 10 1 0 0 0 0
12 11 1 0 0 0 0
13 14 2 0 0 0 0
14 9 1 0 0 0 0
15 13 1 0 0 0 0
3 5 1 0 0 0 0
15 11 2 0 0 0 0
M END
",
}
`);
});
124
+
125
// The web-stream code path is only exercised on Node.js >= 18. On older
// versions the test is reported as skipped instead of passing without having
// run any assertion. The radix is explicit so the major version is always
// parsed as a decimal number.
(Number.parseInt(process.versions.node, 10) >= 18 ? test : test.skip)(
  'iterator on fileCollection stream',
  async () => {
    // The fixture is selected by its size in bytes
    const file = (await fileCollectionFromPath(join(__dirname, '.'))).filter(
      (file) => file.size === 32233,
    ).files[0];
    const results = [];

    for await (const entry of iterator(ReadStream.fromWeb(file.stream()))) {
      results.push(entry);
    }
    expect(results).toHaveLength(128);
    // NOTE(review): the snapshot text is reproduced as it appears in this
    // view; confirm the column alignment of the molfile against the fixture.
    expect(results[0]).toMatchInlineSnapshot(`
{
"CLogP": 2.7,
"Code": 100380824,
"Number of H-Acceptors": 3,
"Number of H-Donors": 1,
"Number of Rotatable bonds": 1,
"molfile": "
-ISIS- 04231216572D

15 16 0 0 0 0 0 0 0 0999 V2000
2.4792 1.7000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
2.4292 0.3500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
0.4042 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1.2167 2.1833 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
1.1542 -0.0000 0.0000 S 0 0 0 0 0 0 0 0 0 0 0 0
-0.9208 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
3.4792 -0.4500 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
0.8792 3.4458 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
-1.6000 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.9625 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-1.6208 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-0.9125 -3.4375 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
-3.5958 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-2.9208 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
-3.0333 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
2 1 1 0 0 0 0
3 4 1 0 0 0 0
4 1 1 0 0 0 0
5 2 1 0 0 0 0
6 3 2 0 0 0 0
7 2 2 0 0 0 0
8 4 2 0 0 0 0
9 6 1 0 0 0 0
10 9 2 0 0 0 0
11 10 1 0 0 0 0
12 11 1 0 0 0 0
13 14 2 0 0 0 0
14 9 1 0 0 0 0
15 13 1 0 0 0 0
3 5 1 0 0 0 0
15 11 2 0 0 0 0
M END
",
}
`);
  },
);
Binary file
package/src/index.js CHANGED
@@ -1,2 +1,2 @@
1
1
  export * from './parse';
2
- export * from './stream';
2
+ export * from './iterator';
@@ -0,0 +1,3 @@
1
/**
 * Browser replacement of the iterator: it always throws.
 * @throws {Error} On every call
 */
export function iterator() {
  const message = 'Iterator not implemented in the browser';
  throw new Error(message);
}
@@ -0,0 +1,54 @@
1
+ import { createInterface } from 'readline';
2
+
3
+ import { parseString } from 'dynamic-typing';
4
/**
 * Iterate asynchronously over the molecules of a SDF stream.
 *
 * The stream is read line by line. Each time a line starting with `$$$$` is
 * met, the lines accumulated since the previous delimiter are converted to a
 * molecule object. Lines located after the last `$$$$` delimiter are not
 * yielded.
 *
 * @param {NodeJS.ReadableStream} readStream SDF content to parse
 * @param {object} [options={}]
 * @param {Function} [options.filter] Callback receiving each molecule; only
 *   the molecules for which it returns a truthy value are yielded
 * @param {boolean} [options.dynamicTyping=true] Dynamically type the data
 * @yields {object} A molecule: `molfile` plus one property per data field
 */
export async function* iterator(readStream, options = {}) {
  // crlfDelay: Infinity makes readline always treat \r\n as a single line
  // break, even when both characters do not arrive in the same chunk.
  const lines = createInterface({ input: readStream, crlfDelay: Infinity });
  const currentLines = [];
  // Work on a copy: the caller's options object is never modified
  options = { ...options };
  if (options.dynamicTyping === undefined) options.dynamicTyping = true;

  // readline removes the line terminators, entries are rebuilt with \n
  options.eol = '\n';
  for await (const line of lines) {
    if (!line.startsWith('$$$$')) {
      currentLines.push(line);
      continue;
    }
    const molecule = getMolecule(currentLines.join(options.eol), options);
    currentLines.length = 0;
    // getMolecule returns undefined when the entry holds no usable molfile:
    // such entries are neither filtered nor yielded.
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      yield molecule;
    }
  }
}
31
+
32
/**
 * Convert the text of one SDF entry to a molecule object.
 *
 * The text located before the first `${eol}>` becomes `molecule.molfile` and
 * every following `> <label>` section becomes a property named after its
 * label.
 *
 * @param {string} sdfPart Text of one SDF entry
 * @param {object} options
 * @param {string} options.eol End of line sequence used in `sdfPart`
 * @param {boolean} [options.dynamicTyping] Pass string values through
 *   `parseString`
 * @returns {object|undefined} The molecule, or `undefined` when the text
 *   located before the first field is 5 characters long or shorter
 */
function getMolecule(sdfPart, options) {
  const parts = sdfPart.split(`${options.eol}>`);
  // Too short to hold a molfile: report "no molecule"
  if (parts.length === 0 || parts[0].length <= 5) return;
  const molecule = {};
  molecule.molfile = parts[0] + options.eol;
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(options.eol);
    // The label is the text between the first '<' and the first '>' of the header line
    const from = lines[0].indexOf('<');
    const to = lines[0].indexOf('>');
    const label = lines[0].substring(from + 1, to);
    // The last line of the section is not part of the value
    for (let k = 1; k < lines.length - 1; k++) {
      if (molecule[label]) {
        molecule[label] += options.eol + lines[k];
      } else {
        molecule[label] = lines[k];
      }
    }
    // A section without any value line leaves molecule[label] undefined, and a
    // repeated label may already hold a converted value: only strings are
    // handed to parseString.
    if (options.dynamicTyping && typeof molecule[label] === 'string') {
      molecule[label] = parseString(molecule[label]);
    }
  }
  return molecule;
}
package/src/parse.js CHANGED
@@ -1,46 +1,51 @@
1
1
  import { ensureString } from 'ensure-string';
2
2
 
3
3
  import { getEntriesBoundaries } from './getEntriesBoundaries';
4
+ import { getMolecule } from './util/getMolecule';
4
5
  /**
5
6
  * Parse a SDF file
6
7
  * @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
7
- * @param {any} [options={}]
8
- * @param {array<string>} [options.include] List of fields to include
9
- * @param {array<string>} [options.exclude] List of fields to exclude
8
+ * @param {object} [options={}]
9
+ * @param {string[]} [options.include] List of fields to include
10
+ * @param {string[]} [options.exclude] List of fields to exclude
11
+ * @param {Function} [options.filter] Callback allowing to filter the molecules
10
12
  * @param {boolean} [options.dynamicTyping] Dynamically type the data
11
13
  * @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
12
14
  * @param {boolean} [options.mixedEOL=false] Set to true if you know there is a mixture between \r\n and \n
15
+ * @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
13
16
  */
14
17
  export function parse(sdf, options = {}) {
15
- const {
16
- include,
17
- exclude,
18
- filter,
19
- modifiers = {},
20
- forEach = {},
21
- dynamicTyping = true,
22
- } = options;
18
+ options = { ...options };
19
+ if (options.modifiers === undefined) options.modifiers = {};
20
+ if (options.forEach === undefined) options.forEach = {};
21
+ if (options.dynamicTyping === undefined) options.dynamicTyping = true;
23
22
 
24
23
  sdf = ensureString(sdf);
25
24
  if (typeof sdf !== 'string') {
26
25
  throw new TypeError('Parameter "sdf" must be a string');
27
26
  }
28
27
 
29
- let eol = '\n';
30
- if (options.mixedEOL) {
31
- sdf = sdf.replace(/\r\n/g, '\n');
32
- sdf = sdf.replace(/\r/g, '\n');
33
- } else {
34
- // we will find the delimiter in order to be much faster and not use regular expression
35
- let header = sdf.substr(0, 1000);
36
- if (header.indexOf('\r\n') > -1) {
37
- eol = '\r\n';
38
- } else if (header.indexOf('\r') > -1) {
39
- eol = '\r';
28
+ if (options.eol === undefined) {
29
+ options.eol = '\n';
30
+ if (options.mixedEOL) {
31
+ sdf = sdf.replace(/\r\n/g, '\n');
32
+ sdf = sdf.replace(/\r/g, '\n');
33
+ } else {
34
+ // we will find the delimiter in order to be much faster and not use regular expression
35
+ let header = sdf.substr(0, 1000);
36
+ if (header.indexOf('\r\n') > -1) {
37
+ options.eol = '\r\n';
38
+ } else if (header.indexOf('\r') > -1) {
39
+ options.eol = '\r';
40
+ }
40
41
  }
41
42
  }
42
43
 
43
- let entriesBoundaries = getEntriesBoundaries(sdf, `${eol}$$$$`, eol);
44
+ let entriesBoundaries = getEntriesBoundaries(
45
+ sdf,
46
+ `${options.eol}$$$$`,
47
+ options.eol,
48
+ );
44
49
  let molecules = [];
45
50
  let labels = {};
46
51
 
@@ -48,72 +53,18 @@ export function parse(sdf, options = {}) {
48
53
 
49
54
  for (let i = 0; i < entriesBoundaries.length; i++) {
50
55
  let sdfPart = sdf.substring(...entriesBoundaries[i]);
51
- let parts = sdfPart.split(`${eol}>`);
52
- if (parts.length > 0 && parts[0].length > 5) {
53
- let molecule = {};
54
- let currentLabels = [];
55
- molecule.molfile = parts[0] + eol;
56
- for (let j = 1; j < parts.length; j++) {
57
- let lines = parts[j].split(eol);
58
- let from = lines[0].indexOf('<');
59
- let to = lines[0].indexOf('>');
60
- let label = lines[0].substring(from + 1, to);
61
- currentLabels.push(label);
62
- if (!labels[label]) {
63
- labels[label] = {
64
- counter: 0,
65
- isNumeric: dynamicTyping,
66
- keep: false,
67
- };
68
- if (
69
- (!exclude || exclude.indexOf(label) === -1) &&
70
- (!include || include.indexOf(label) > -1)
71
- ) {
72
- labels[label].keep = true;
73
- if (modifiers[label]) {
74
- labels[label].modifier = modifiers[label];
75
- }
76
- if (forEach[label]) {
77
- labels[label].forEach = forEach[label];
78
- }
79
- }
80
- }
81
- if (labels[label].keep) {
82
- for (let k = 1; k < lines.length - 1; k++) {
83
- if (molecule[label]) {
84
- molecule[label] += eol + lines[k];
85
- } else {
86
- molecule[label] = lines[k];
87
- }
88
- }
89
- if (labels[label].modifier) {
90
- let modifiedValue = labels[label].modifier(molecule[label]);
91
- if (modifiedValue === undefined || modifiedValue === null) {
92
- delete molecule[label];
93
- } else {
94
- molecule[label] = modifiedValue;
95
- }
96
- }
97
- if (labels[label].isNumeric) {
98
- if (
99
- !isFinite(molecule[label]) ||
100
- molecule[label].match(/^0[0-9]/)
101
- ) {
102
- labels[label].isNumeric = false;
103
- }
104
- }
105
- }
106
- }
107
- if (!filter || filter(molecule)) {
108
- molecules.push(molecule);
109
- // only now we can increase the counter
110
- for (let j = 0; j < currentLabels.length; j++) {
111
- labels[currentLabels[j]].counter++;
112
- }
56
+
57
+ let currentLabels = [];
58
+ const molecule = getMolecule(sdfPart, labels, currentLabels, options);
59
+ if (!molecule) continue;
60
+ if (!options.filter || options.filter(molecule)) {
61
+ molecules.push(molecule);
62
+ // only now we can increase the counter
63
+ for (let j = 0; j < currentLabels.length; j++) {
64
+ labels[currentLabels[j]].counter++;
113
65
  }
114
66
  }
115
67
  }
116
-
117
68
  // all numeric fields should be converted to numbers
118
69
  for (let label in labels) {
119
70
  let currentLabel = labels[label];
@@ -0,0 +1,55 @@
1
/**
 * Build a molecule object from the text of a single SDF entry.
 *
 * The text located before the first `${eol}>` becomes `molecule.molfile`.
 * Every following `> <label>` section is stored on the molecule under its
 * label, unless the label is rejected by `options.include` / `options.exclude`.
 *
 * Side effects: every label met is pushed to `currentLabels`, and `labels`
 * receives one descriptor per new label (`counter`, `isNumeric`, `keep` and,
 * when configured, `modifier` / `forEach`).
 *
 * @param {string} sdfPart Text of one SDF entry
 * @param {object} labels Label descriptors shared between entries (mutated)
 * @param {string[]} currentLabels Receives the labels found in this entry (mutated)
 * @param {object} options
 * @param {string} options.eol End of line sequence used in `sdfPart`
 * @param {boolean} options.dynamicTyping Initial value of `isNumeric` for a new label
 * @param {object} options.modifiers Callbacks applied to the value of specific labels
 * @param {object} options.forEach Callbacks copied on the descriptor of specific labels
 * @param {string[]} [options.include] Only these labels are kept
 * @param {string[]} [options.exclude] These labels are dropped
 * @returns {object|undefined} The molecule, or `undefined` when the text
 *   located before the first field is 5 characters long or shorter
 */
export function getMolecule(sdfPart, labels, currentLabels, options) {
  const parts = sdfPart.split(`${options.eol}>`);
  // Too short to hold a molfile: report "no molecule"
  if (parts.length === 0 || parts[0].length <= 5) return;
  const molecule = {};
  molecule.molfile = parts[0] + options.eol;
  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(options.eol);
    // The label is the text between the first '<' and the first '>' of the header line
    const from = lines[0].indexOf('<');
    const to = lines[0].indexOf('>');
    const label = lines[0].substring(from + 1, to);
    currentLabels.push(label);
    if (!labels[label]) {
      labels[label] = {
        counter: 0,
        isNumeric: options.dynamicTyping,
        keep: false,
      };
      if (
        (!options.exclude || options.exclude.indexOf(label) === -1) &&
        (!options.include || options.include.indexOf(label) > -1)
      ) {
        labels[label].keep = true;
        if (options.modifiers[label]) {
          labels[label].modifier = options.modifiers[label];
        }
        if (options.forEach[label]) {
          labels[label].forEach = options.forEach[label];
        }
      }
    }
    if (labels[label].keep) {
      // The last line of the section is not part of the value
      for (let k = 1; k < lines.length - 1; k++) {
        if (molecule[label]) {
          molecule[label] += options.eol + lines[k];
        } else {
          molecule[label] = lines[k];
        }
      }
      if (labels[label].modifier) {
        const modifiedValue = labels[label].modifier(molecule[label]);
        if (modifiedValue === undefined || modifiedValue === null) {
          delete molecule[label];
        } else {
          molecule[label] = modifiedValue;
        }
      }
      if (labels[label].isNumeric) {
        const value = molecule[label];
        // A modifier may return a non-string value (number, boolean, array...).
        // Such values have no `match` method, so they are tested first and
        // simply mark the label as not numeric instead of throwing.
        // The coercing global isFinite is intended: `value` is a string here.
        if (
          typeof value !== 'string' ||
          !isFinite(value) ||
          /^0[0-9]/.test(value)
        ) {
          labels[label].isNumeric = false;
        }
      }
    }
  }
  return molecule;
}
@@ -1,98 +0,0 @@
1
- import fs from 'fs';
2
-
3
- import callbackStream from 'callback-stream';
4
- import OCL from 'openchemlib/minimal';
5
-
6
- import { entries, molecules } from '..';
7
-
8
- const cbStream = callbackStream.bind(null, { objectMode: true });
9
-
10
- describe('stream', () => {
11
- it('entries', () =>
12
- new Promise((resolve) => {
13
- fs.createReadStream(`${__dirname}/test.sdf`)
14
- .pipe(entries())
15
- .pipe(
16
- cbStream((err, data) => {
17
- expect(err).toBeNull();
18
- expect(data).toHaveLength(128);
19
- expect(data[0]).toContain('-ISIS- 04231216572D');
20
- const mol = OCL.Molecule.fromMolfile(data[5]);
21
- expect(mol.toMolfile()).toContain(
22
- '17 18 0 0 0 0 0 0 0 0999 V2000',
23
- );
24
- resolve();
25
- }),
26
- );
27
- }));
28
-
29
- it('molecules', () =>
30
- new Promise((resolve) => {
31
- fs.createReadStream(`${__dirname}/test.sdf`)
32
- .pipe(molecules())
33
- .pipe(
34
- cbStream((err, data) => {
35
- expect(err).toBeNull();
36
- expect(data).toHaveLength(128);
37
- expect(data[0]).toMatchObject({
38
- Code: '0100380824',
39
- CLogP: 2.7,
40
- });
41
- expect(data[0].molfile).toContain('-ISIS- 04231216572D');
42
- resolve();
43
- }),
44
- );
45
- }));
46
-
47
- it('molecules - full result', () =>
48
- new Promise((resolve) => {
49
- fs.createReadStream(`${__dirname}/test.sdf`)
50
- .pipe(molecules({ fullResult: true }))
51
- .pipe(
52
- cbStream((err, data) => {
53
- expect(err).toBeNull();
54
- expect(data).toHaveLength(128);
55
- expect(data[0]).toMatchObject({
56
- labels: [
57
- 'Code',
58
- 'Number of H-Donors',
59
- 'Number of H-Acceptors',
60
- 'Number of Rotatable bonds',
61
- 'CLogP',
62
- ],
63
- });
64
- expect(data[0].molecules).toHaveLength(1);
65
- resolve();
66
- }),
67
- );
68
- }));
69
-
70
- it('molecules with filter', () =>
71
- new Promise((resolve) => {
72
- fs.createReadStream(`${__dirname}/test.sdf`)
73
- .pipe(
74
- molecules({
75
- filter: (entry) => entry.Code === '0100380869',
76
- }),
77
- )
78
- .pipe(
79
- cbStream((err, data) => {
80
- expect(err).toBeNull();
81
- expect(data).toHaveLength(1);
82
- resolve();
83
- }),
84
- );
85
- }));
86
-
87
- it('async iteration', async () => {
88
- const stream = fs
89
- .createReadStream(`${__dirname}/test.sdf`)
90
- .pipe(molecules());
91
- let count = 0;
92
- for await (const molecule of stream) {
93
- count++;
94
- expect(molecule.molfile.toString()).toContain('0999 V2000');
95
- }
96
- expect(count).toBe(128);
97
- });
98
- });
package/src/stream.js DELETED
@@ -1,45 +0,0 @@
1
- import pipeline from 'pumpify';
2
- import split2 from 'split2';
3
- import through2 from 'through2';
4
- import filter from 'through2-filter';
5
-
6
- import { parse } from './parse';
7
-
8
- const filterStream = filter.bind(null, { objectMode: true });
9
-
10
- function filterCb(chunk) {
11
- return chunk.length > 1 && chunk.trim().length > 1;
12
- }
13
-
14
- export function entries() {
15
- return pipeline.obj(
16
- split2(/\r?\n\${4}.*\r?\n/),
17
- filterStream(filterCb),
18
- through2({ objectMode: true }, function process(value, encoding, callback) {
19
- const eol = value.includes('\r\n') ? '\r\n' : '\n';
20
- this.push(`${value + eol}$$$$${eol}`);
21
- callback();
22
- }),
23
- );
24
- }
25
-
26
- export function molecules(options) {
27
- return pipeline.obj(
28
- entries(),
29
- through2({ objectMode: true }, function process(value, encoding, callback) {
30
- try {
31
- const parsed = parse(value, options);
32
- if (parsed.molecules.length === 1) {
33
- if (options && options.fullResult) {
34
- this.push(parsed);
35
- } else {
36
- this.push(parsed.molecules[0]);
37
- }
38
- }
39
- callback();
40
- } catch (e) {
41
- callback(e);
42
- }
43
- }),
44
- );
45
- }