sdf-parser 5.0.0 → 6.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -57,39 +57,18 @@ var result = parse(sdf, {
57
57
  });
58
58
  ```
59
59
 
60
- ## Streams
60
+ ## Iterator
61
61
 
62
62
  This API is only available on Node.js.
63
63
 
64
- ### molecules(options)
65
-
66
- Transform an input text stream to a stream of molecule objects.
67
-
68
- #### options
69
-
70
- - `fullResult`: true to emit the full result of `parse` instead of just the molecules.
71
- - All other options from the `parse` function.
72
-
73
- ```js
74
- const { stream } = require('sdf-parser');
75
- fs.createReadStream('test.sdf')
76
- .pipe(stream.molecules())
77
- .on('data', (molecule) => {
78
- console.log(molecule.molfile);
79
- });
80
- ```
81
-
82
- ### entries()
83
-
84
- Transform an input text stream to a stream of sdf entries.
85
-
86
64
  ```js
87
- const { stream } = require('sdf-parser');
88
- fs.createReadStream('test.sdf')
89
- .pipe(stream.entries())
90
- .on('data', (entry) => {
91
- // sdf entry as a string
92
- });
65
+ const { iterator } = require('sdf-parser');
66
+ const readStream = createReadStream(join(__dirname, 'test.sdf.gz'));
67
+ const stream = readStream.pipe(createGunzip());
68
+ const results = [];
69
+ for await (const entry of iterator(stream)) {
70
+ results.push(entry);
71
+ }
93
72
  ```
94
73
 
95
74
  ## License
@@ -98,7 +77,7 @@ fs.createReadStream('test.sdf')
98
77
 
99
78
  [npm-image]: https://img.shields.io/npm/v/sdf-parser.svg?style=flat-square
100
79
  [npm-url]: https://www.npmjs.com/package/sdf-parser
101
- [travis-image]: https://img.shields.io/travis/cheminfo/sdf-parser/master.svg?style=flat-square
80
+ [travis-image]: https://img.shields.io/travis/cheminfo/sdf-parser/main.svg?style=flat-square
102
81
  [travis-url]: https://travis-ci.org/cheminfo/sdf-parser
103
82
  [download-image]: https://img.shields.io/npm/dm/sdf-parser.svg?style=flat-square
104
83
  [download-url]: https://www.npmjs.com/package/sdf-parser
package/lib/index.js CHANGED
@@ -2,17 +2,9 @@
2
2
 
3
3
  Object.defineProperty(exports, '__esModule', { value: true });
4
4
 
5
- var pipeline = require('pumpify');
6
- var split2 = require('split2');
7
- var through2 = require('through2');
8
- var filter = require('through2-filter');
9
-
10
- function _interopDefaultLegacy (e) { return e && typeof e === 'object' && 'default' in e ? e : { 'default': e }; }
11
-
12
- var pipeline__default = /*#__PURE__*/_interopDefaultLegacy(pipeline);
13
- var split2__default = /*#__PURE__*/_interopDefaultLegacy(split2);
14
- var through2__default = /*#__PURE__*/_interopDefaultLegacy(through2);
15
- var filter__default = /*#__PURE__*/_interopDefaultLegacy(filter);
5
+ var ensureString = require('ensure-string');
6
+ var readline = require('readline');
7
+ var dynamicTyping = require('dynamic-typing');
16
8
 
17
9
  function getEntriesBoundaries(string, substring, eol) {
18
10
  const res = [];
@@ -22,8 +14,13 @@ function getEntriesBoundaries(string, substring, eol) {
22
14
  next = string.indexOf(substring, previous);
23
15
  if (next !== -1) {
24
16
  res.push([previous, next]);
25
- previous = next =
26
- string.indexOf(eol, next + substring.length) + eol.length;
17
+ const nextMatch = string.indexOf(eol, next + substring.length);
18
+ if (nextMatch === -1) {
19
+ next = -1;
20
+ } else {
21
+ previous = nextMatch + eol.length;
22
+ next = previous;
23
+ }
27
24
  } else {
28
25
  res.push([previous, string.length]);
29
26
  }
@@ -31,35 +28,106 @@ function getEntriesBoundaries(string, substring, eol) {
31
28
  return res;
32
29
  }
33
30
 
31
+ function getMolecule$1(sdfPart, labels, currentLabels, options) {
32
+ let parts = sdfPart.split(`${options.eol}>`);
33
+ if (parts.length === 0 || parts[0].length <= 5) return;
34
+ let molecule = {};
35
+ molecule.molfile = parts[0] + options.eol;
36
+ for (let j = 1; j < parts.length; j++) {
37
+ let lines = parts[j].split(options.eol);
38
+ let from = lines[0].indexOf('<');
39
+ let to = lines[0].indexOf('>');
40
+ let label = lines[0].substring(from + 1, to);
41
+ currentLabels.push(label);
42
+ if (!labels[label]) {
43
+ labels[label] = {
44
+ counter: 0,
45
+ isNumeric: options.dynamicTyping,
46
+ keep: false,
47
+ };
48
+ if (
49
+ (!options.exclude || options.exclude.indexOf(label) === -1) &&
50
+ (!options.include || options.include.indexOf(label) > -1)
51
+ ) {
52
+ labels[label].keep = true;
53
+ if (options.modifiers[label]) {
54
+ labels[label].modifier = options.modifiers[label];
55
+ }
56
+ if (options.forEach[label]) {
57
+ labels[label].forEach = options.forEach[label];
58
+ }
59
+ }
60
+ }
61
+ if (labels[label].keep) {
62
+ for (let k = 1; k < lines.length - 1; k++) {
63
+ if (molecule[label]) {
64
+ molecule[label] += options.eol + lines[k];
65
+ } else {
66
+ molecule[label] = lines[k];
67
+ }
68
+ }
69
+ if (labels[label].modifier) {
70
+ let modifiedValue = labels[label].modifier(molecule[label]);
71
+ if (modifiedValue === undefined || modifiedValue === null) {
72
+ delete molecule[label];
73
+ } else {
74
+ molecule[label] = modifiedValue;
75
+ }
76
+ }
77
+ if (labels[label].isNumeric) {
78
+ if (!isFinite(molecule[label]) || molecule[label].match(/^0[0-9]/)) {
79
+ labels[label].isNumeric = false;
80
+ }
81
+ }
82
+ }
83
+ }
84
+ return molecule;
85
+ }
86
+
87
+ /**
88
+ * Parse a SDF file
89
+ * @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
90
+ * @param {object} [options={}]
91
+ * @param {string[]} [options.include] List of fields to include
92
+ * @param {string[]} [options.exclude] List of fields to exclude
93
+ * @param {Function} [options.filter] Callback allowing to filter the molecules
94
+ * @param {boolean} [options.dynamicTyping] Dynamically type the data
95
+ * @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
96
+ * @param {boolean} [options.mixedEOL=false] Set to true if you know there is a mixture between \r\n and \n
97
+ * @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
98
+ */
34
99
  function parse(sdf, options = {}) {
35
- const {
36
- include,
37
- exclude,
38
- filter,
39
- modifiers = {},
40
- forEach = {},
41
- dynamicTyping = true,
42
- } = options;
100
+ options = { ...options };
101
+ if (options.modifiers === undefined) options.modifiers = {};
102
+ if (options.forEach === undefined) options.forEach = {};
103
+ if (options.dynamicTyping === undefined) options.dynamicTyping = true;
43
104
 
105
+ sdf = ensureString.ensureString(sdf);
44
106
  if (typeof sdf !== 'string') {
45
107
  throw new TypeError('Parameter "sdf" must be a string');
46
108
  }
47
109
 
48
- let eol = '\n';
49
- if (options.mixedEOL) {
50
- sdf = sdf.replace(/\r\n/g, '\n');
51
- sdf = sdf.replace(/\r/g, '\n');
52
- } else {
53
- // we will find the delimiter in order to be much faster and not use regular expression
54
- let header = sdf.substr(0, 1000);
55
- if (header.indexOf('\r\n') > -1) {
56
- eol = '\r\n';
57
- } else if (header.indexOf('\r') > -1) {
58
- eol = '\r';
110
+ if (options.eol === undefined) {
111
+ options.eol = '\n';
112
+ if (options.mixedEOL) {
113
+ sdf = sdf.replace(/\r\n/g, '\n');
114
+ sdf = sdf.replace(/\r/g, '\n');
115
+ } else {
116
+ // we will find the delimiter in order to be much faster and not use regular expression
117
+ let header = sdf.substr(0, 1000);
118
+ if (header.indexOf('\r\n') > -1) {
119
+ options.eol = '\r\n';
120
+ } else if (header.indexOf('\r') > -1) {
121
+ options.eol = '\r';
122
+ }
59
123
  }
60
124
  }
61
125
 
62
- let entriesBoundaries = getEntriesBoundaries(sdf, `${eol}$$$$`, eol);
126
+ let entriesBoundaries = getEntriesBoundaries(
127
+ sdf,
128
+ `${options.eol}$$$$`,
129
+ options.eol,
130
+ );
63
131
  let molecules = [];
64
132
  let labels = {};
65
133
 
@@ -67,72 +135,18 @@ function parse(sdf, options = {}) {
67
135
 
68
136
  for (let i = 0; i < entriesBoundaries.length; i++) {
69
137
  let sdfPart = sdf.substring(...entriesBoundaries[i]);
70
- let parts = sdfPart.split(`${eol}>`);
71
- if (parts.length > 0 && parts[0].length > 5) {
72
- let molecule = {};
73
- let currentLabels = [];
74
- molecule.molfile = parts[0] + eol;
75
- for (let j = 1; j < parts.length; j++) {
76
- let lines = parts[j].split(eol);
77
- let from = lines[0].indexOf('<');
78
- let to = lines[0].indexOf('>');
79
- let label = lines[0].substring(from + 1, to);
80
- currentLabels.push(label);
81
- if (!labels[label]) {
82
- labels[label] = {
83
- counter: 0,
84
- isNumeric: dynamicTyping,
85
- keep: false,
86
- };
87
- if (
88
- (!exclude || exclude.indexOf(label) === -1) &&
89
- (!include || include.indexOf(label) > -1)
90
- ) {
91
- labels[label].keep = true;
92
- if (modifiers[label]) {
93
- labels[label].modifier = modifiers[label];
94
- }
95
- if (forEach[label]) {
96
- labels[label].forEach = forEach[label];
97
- }
98
- }
99
- }
100
- if (labels[label].keep) {
101
- for (let k = 1; k < lines.length - 1; k++) {
102
- if (molecule[label]) {
103
- molecule[label] += eol + lines[k];
104
- } else {
105
- molecule[label] = lines[k];
106
- }
107
- }
108
- if (labels[label].modifier) {
109
- let modifiedValue = labels[label].modifier(molecule[label]);
110
- if (modifiedValue === undefined || modifiedValue === null) {
111
- delete molecule[label];
112
- } else {
113
- molecule[label] = modifiedValue;
114
- }
115
- }
116
- if (labels[label].isNumeric) {
117
- if (
118
- !isFinite(molecule[label]) ||
119
- molecule[label].match(/^0[0-9]/)
120
- ) {
121
- labels[label].isNumeric = false;
122
- }
123
- }
124
- }
125
- }
126
- if (!filter || filter(molecule)) {
127
- molecules.push(molecule);
128
- // only now we can increase the counter
129
- for (let j = 0; j < currentLabels.length; j++) {
130
- labels[currentLabels[j]].counter++;
131
- }
138
+
139
+ let currentLabels = [];
140
+ const molecule = getMolecule$1(sdfPart, labels, currentLabels, options);
141
+ if (!molecule) continue;
142
+ if (!options.filter || options.filter(molecule)) {
143
+ molecules.push(molecule);
144
+ // only now we can increase the counter
145
+ for (let j = 0; j < currentLabels.length; j++) {
146
+ labels[currentLabels[j]].counter++;
132
147
  }
133
148
  }
134
149
  }
135
-
136
150
  // all numeric fields should be converted to numbers
137
151
  for (let label in labels) {
138
152
  let currentLabel = labels[label];
@@ -172,50 +186,63 @@ function parse(sdf, options = {}) {
172
186
 
173
187
  return {
174
188
  time: Date.now() - start,
175
- molecules: molecules,
189
+ molecules,
176
190
  labels: Object.keys(labels),
177
- statistics: statistics,
191
+ statistics,
178
192
  };
179
193
  }
180
194
 
181
- const filterStream = filter__default["default"].bind(null, { objectMode: true });
182
- function filterCb(chunk) {
183
- return chunk.length > 1 && chunk.trim().length > 1;
184
- }
185
-
186
- function entries() {
187
- return pipeline__default["default"].obj(
188
- split2__default["default"](/\r?\n\${4}.*\r?\n/),
189
- filterStream(filterCb),
190
- through2__default["default"]({ objectMode: true }, function process(value, encoding, callback) {
191
- const eol = value.includes('\r\n') ? '\r\n' : '\n';
192
- this.push(`${value + eol}$$$$${eol}`);
193
- callback();
194
- }),
195
- );
195
+ /**
196
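+ * Asynchronously iterate over an SDF text stream, yielding one parsed molecule object per `$$$$`-terminated entry
197
+ * @param {Readable} readStream Readable stream of SDF text; it is passed to readline.createInterface and consumed line by line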
198
+ * @param {object} [options={}]
199
+ * @param {Function} [options.filter] Callback allowing to filter the molecules
200
+ * @param {boolean} [options.dynamicTyping] Dynamically type the data
201
+ */
202
+
203
+ async function* iterator(readStream, options = {}) {
204
+ const lines = readline.createInterface(readStream);
205
+ const currentLines = [];
206
+ options = { ...options };
207
+ if (options.dynamicTyping === undefined) options.dynamicTyping = true;
208
+
209
+ options.eol = '\n';
210
+ for await (let line of lines) {
211
+ if (line.startsWith('$$$$')) {
212
+ const molecule = getMolecule(currentLines.join(options.eol), options);
213
+ if (!options.filter || options.filter(molecule)) {
214
+ yield molecule;
215
+ }
216
+ currentLines.length = 0;
217
+ } else {
218
+ currentLines.push(line);
219
+ }
220
+ }
196
221
  }
197
222
 
198
- function molecules(options) {
199
- return pipeline__default["default"].obj(
200
- entries(),
201
- through2__default["default"]({ objectMode: true }, function process(value, encoding, callback) {
202
- try {
203
- const parsed = parse(value, options);
204
- if (parsed.molecules.length === 1) {
205
- if (options && options.fullResult) {
206
- this.push(parsed);
207
- } else {
208
- this.push(parsed.molecules[0]);
209
- }
210
- }
211
- callback();
212
- } catch (e) {
213
- callback(e);
223
+ function getMolecule(sdfPart, options) {
224
+ let parts = sdfPart.split(`${options.eol}>`);
225
+ if (parts.length === 0 || parts[0].length <= 5) return;
226
+ let molecule = {};
227
+ molecule.molfile = parts[0] + options.eol;
228
+ for (let j = 1; j < parts.length; j++) {
229
+ let lines = parts[j].split(options.eol);
230
+ let from = lines[0].indexOf('<');
231
+ let to = lines[0].indexOf('>');
232
+ let label = lines[0].substring(from + 1, to);
233
+ for (let k = 1; k < lines.length - 1; k++) {
234
+ if (molecule[label]) {
235
+ molecule[label] += options.eol + lines[k];
236
+ } else {
237
+ molecule[label] = lines[k];
214
238
  }
215
- }),
216
- );
239
+ }
240
+ if (options.dynamicTyping) {
241
+ molecule[label] = dynamicTyping.parseString(molecule[label]);
242
+ }
243
+ }
244
+ return molecule;
217
245
  }
218
246
 
219
- exports.entries = entries;
220
- exports.molecules = molecules;
247
+ exports.iterator = iterator;
221
248
  exports.parse = parse;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "sdf-parser",
3
- "version": "5.0.0",
3
+ "version": "6.0.0",
4
4
  "description": "SDF parser",
5
5
  "main": "lib/index.js",
6
6
  "module": "src/index.js",
@@ -8,6 +8,7 @@
8
8
  "lib",
9
9
  "src"
10
10
  ],
11
+ "sideEffects": false,
11
12
  "scripts": {
12
13
  "build": "npm run compile && cheminfo-build --root SDFParser",
13
14
  "compile": "rollup -c",
@@ -42,20 +43,19 @@
42
43
  },
43
44
  "homepage": "https://github.com/cheminfo/sdf-parser",
44
45
  "devDependencies": {
45
- "@babel/plugin-transform-modules-commonjs": "^7.15.4",
46
+ "@babel/plugin-transform-modules-commonjs": "^7.18.6",
46
47
  "babel-eslint": "^10.1.0",
47
48
  "callback-stream": "^1.1.0",
48
49
  "cheminfo-build": "^1.1.11",
49
- "eslint": "^7.32.0",
50
- "eslint-config-cheminfo": "^6.0.1",
51
- "jest": "^27.2.5",
52
- "openchemlib": "^7.4.3",
53
- "prettier": "^2.4.1"
50
+ "eslint": "^8.22.0",
51
+ "eslint-config-cheminfo": "^8.0.2",
52
+ "filelist-utils": "^0.6.0",
53
+ "jest": "^28.1.3",
54
+ "openchemlib": "^8.0.1",
55
+ "prettier": "^2.7.1"
54
56
  },
55
57
  "dependencies": {
56
- "pumpify": "^2.0.1",
57
- "split2": "^3.2.2",
58
- "through2": "^4.0.2",
59
- "through2-filter": "^3.0.0"
58
+ "dynamic-typing": "^1.0.0",
59
+ "ensure-string": "^1.2.0"
60
60
  }
61
61
  }
@@ -0,0 +1,10 @@
1
+ // Jest Snapshot v1, https://goo.gl/fbAQLP
2
+
3
+ exports[`should parse sdf files without EOL in the EOF 1`] = `
4
+ Array [
5
+ Array [
6
+ 0,
7
+ 1473,
8
+ ],
9
+ ]
10
+ `;
@@ -24,3 +24,10 @@ let sdf2 = fs.readFileSync(`${__dirname}/test2.sdf`, 'utf-8');
24
24
  );
25
25
  });
26
26
  });
27
+
28
+ test('should parse sdf files without EOL in the EOF', () => {
29
+ const eol = '\n';
30
+ const sdf = fs.readFileSync(`${__dirname}/test4.sdf`, 'utf-8');
31
+
32
+ expect(getEntriesBoundaries(sdf, `${eol}$$$$`, eol)).toMatchSnapshot();
33
+ });
@@ -0,0 +1,120 @@
1
+ import { createReadStream } from 'fs';
2
+ import { join } from 'path';
3
+ import { createGunzip } from 'zlib';
4
+
5
+ import { fileListFromPath } from 'filelist-utils';
6
+
7
+ import { iterator } from '../iterator';
8
+
9
+ test('iterator', async () => {
10
+ const fileList = (await fileListFromPath(join(__dirname, '.'))).filter(
11
+ (file) => file.name === 'test.sdf',
12
+ );
13
+ const results = [];
14
+ for await (const entry of iterator(fileList[0].stream())) {
15
+ results.push(entry);
16
+ }
17
+ expect(results).toHaveLength(128);
18
+ expect(results[0]).toMatchInlineSnapshot(`
19
+ Object {
20
+ "CLogP": 2.7,
21
+ "Code": 100380824,
22
+ "Number of H-Acceptors": 3,
23
+ "Number of H-Donors": 1,
24
+ "Number of Rotatable bonds": 1,
25
+ "molfile": "
26
+ -ISIS- 04231216572D
27
+
28
+ 15 16 0 0 0 0 0 0 0 0999 V2000
29
+ 2.4792 1.7000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
30
+ 2.4292 0.3500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
31
+ 0.4042 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
32
+ 1.2167 2.1833 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
33
+ 1.1542 -0.0000 0.0000 S 0 0 0 0 0 0 0 0 0 0 0 0
34
+ -0.9208 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
35
+ 3.4792 -0.4500 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
36
+ 0.8792 3.4458 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
37
+ -1.6000 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
38
+ -0.9625 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
39
+ -1.6208 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
40
+ -0.9125 -3.4375 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
41
+ -3.5958 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
42
+ -2.9208 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
43
+ -3.0333 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
44
+ 2 1 1 0 0 0 0
45
+ 3 4 1 0 0 0 0
46
+ 4 1 1 0 0 0 0
47
+ 5 2 1 0 0 0 0
48
+ 6 3 2 0 0 0 0
49
+ 7 2 2 0 0 0 0
50
+ 8 4 2 0 0 0 0
51
+ 9 6 1 0 0 0 0
52
+ 10 9 2 0 0 0 0
53
+ 11 10 1 0 0 0 0
54
+ 12 11 1 0 0 0 0
55
+ 13 14 2 0 0 0 0
56
+ 14 9 1 0 0 0 0
57
+ 15 13 1 0 0 0 0
58
+ 3 5 1 0 0 0 0
59
+ 15 11 2 0 0 0 0
60
+ M END
61
+ ",
62
+ }
63
+ `);
64
+ });
65
+
66
+ test('iterator on stream', async () => {
67
+ const readStream = createReadStream(join(__dirname, 'test.sdf.gz'));
68
+ const stream = readStream.pipe(createGunzip());
69
+ const results = [];
70
+ for await (const entry of iterator(stream)) {
71
+ results.push(entry);
72
+ }
73
+ expect(results).toHaveLength(128);
74
+ expect(results[0]).toMatchInlineSnapshot(`
75
+ Object {
76
+ "CLogP": 2.7,
77
+ "Code": 100380824,
78
+ "Number of H-Acceptors": 3,
79
+ "Number of H-Donors": 1,
80
+ "Number of Rotatable bonds": 1,
81
+ "molfile": "
82
+ -ISIS- 04231216572D
83
+
84
+ 15 16 0 0 0 0 0 0 0 0999 V2000
85
+ 2.4792 1.7000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
86
+ 2.4292 0.3500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
87
+ 0.4042 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
88
+ 1.2167 2.1833 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
89
+ 1.1542 -0.0000 0.0000 S 0 0 0 0 0 0 0 0 0 0 0 0
90
+ -0.9208 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
91
+ 3.4792 -0.4500 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
92
+ 0.8792 3.4458 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
93
+ -1.6000 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
94
+ -0.9625 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
95
+ -1.6208 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
96
+ -0.9125 -3.4375 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
97
+ -3.5958 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
98
+ -2.9208 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
99
+ -3.0333 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
100
+ 2 1 1 0 0 0 0
101
+ 3 4 1 0 0 0 0
102
+ 4 1 1 0 0 0 0
103
+ 5 2 1 0 0 0 0
104
+ 6 3 2 0 0 0 0
105
+ 7 2 2 0 0 0 0
106
+ 8 4 2 0 0 0 0
107
+ 9 6 1 0 0 0 0
108
+ 10 9 2 0 0 0 0
109
+ 11 10 1 0 0 0 0
110
+ 12 11 1 0 0 0 0
111
+ 13 14 2 0 0 0 0
112
+ 14 9 1 0 0 0 0
113
+ 15 13 1 0 0 0 0
114
+ 3 5 1 0 0 0 0
115
+ 15 11 2 0 0 0 0
116
+ M END
117
+ ",
118
+ }
119
+ `);
120
+ });
Binary file
@@ -0,0 +1,37 @@
1
+
2
+ -ISIS- 04231216572D
3
+
4
+ 15 16 0 0 0 0 0 0 0 0999 V2000
5
+ 2.4792 1.7000 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0
6
+ 2.4292 0.3500 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
7
+ 0.4042 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
8
+ 1.2167 2.1833 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
9
+ 1.1542 -0.0000 0.0000 S 0 0 0 0 0 0 0 0 0 0 0 0
10
+ -0.9208 1.1208 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
11
+ 3.4792 -0.4500 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
12
+ 0.8792 3.4458 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0
13
+ -1.6000 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
14
+ -0.9625 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
15
+ -1.6208 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
16
+ -0.9125 -3.4375 0.0000 Br 0 0 0 0 0 0 0 0 0 0 0 0
17
+ -3.5958 -1.1792 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
18
+ -2.9208 -0.0292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
19
+ -3.0333 -2.3292 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
20
+ 2 1 1 0 0 0 0
21
+ 3 4 1 0 0 0 0
22
+ 4 1 1 0 0 0 0
23
+ 5 2 1 0 0 0 0
24
+ 6 3 2 0 0 0 0
25
+ 7 2 2 0 0 0 0
26
+ 8 4 2 0 0 0 0
27
+ 9 6 1 0 0 0 0
28
+ 10 9 2 0 0 0 0
29
+ 11 10 1 0 0 0 0
30
+ 12 11 1 0 0 0 0
31
+ 13 14 2 0 0 0 0
32
+ 14 9 1 0 0 0 0
33
+ 15 13 1 0 0 0 0
34
+ 3 5 1 0 0 0 0
35
+ 15 11 2 0 0 0 0
36
+ M END
37
+ $$$$
@@ -6,8 +6,13 @@ export function getEntriesBoundaries(string, substring, eol) {
6
6
  next = string.indexOf(substring, previous);
7
7
  if (next !== -1) {
8
8
  res.push([previous, next]);
9
- previous = next =
10
- string.indexOf(eol, next + substring.length) + eol.length;
9
+ const nextMatch = string.indexOf(eol, next + substring.length);
10
+ if (nextMatch === -1) {
11
+ next = -1;
12
+ } else {
13
+ previous = nextMatch + eol.length;
14
+ next = previous;
15
+ }
11
16
  } else {
12
17
  res.push([previous, string.length]);
13
18
  }
package/src/index.js CHANGED
@@ -1,2 +1,2 @@
1
1
  export * from './parse';
2
- export * from './stream';
2
+ export * from './iterator';
@@ -0,0 +1,54 @@
1
+ import { createInterface } from 'readline';
2
+
3
+ import { parseString } from 'dynamic-typing';
4
+ /**
5
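+ * Asynchronously iterate over an SDF text stream, yielding one parsed molecule object per `$$$$`-terminated entry
6
+ * @param {Readable} readStream Readable stream of SDF text; it is passed to readline's createInterface and consumed line by line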
7
+ * @param {object} [options={}]
8
+ * @param {Function} [options.filter] Callback allowing to filter the molecules
9
+ * @param {boolean} [options.dynamicTyping] Dynamically type the data
10
+ */
11
+
12
+ export async function* iterator(readStream, options = {}) {
13
+ const lines = createInterface(readStream);
14
+ const currentLines = [];
15
+ options = { ...options };
16
+ if (options.dynamicTyping === undefined) options.dynamicTyping = true;
17
+
18
+ options.eol = '\n';
19
+ for await (let line of lines) {
20
+ if (line.startsWith('$$$$')) {
21
+ const molecule = getMolecule(currentLines.join(options.eol), options);
22
+ if (!options.filter || options.filter(molecule)) {
23
+ yield molecule;
24
+ }
25
+ currentLines.length = 0;
26
+ } else {
27
+ currentLines.push(line);
28
+ }
29
+ }
30
+ }
31
+
32
+ function getMolecule(sdfPart, options) {
33
+ let parts = sdfPart.split(`${options.eol}>`);
34
+ if (parts.length === 0 || parts[0].length <= 5) return;
35
+ let molecule = {};
36
+ molecule.molfile = parts[0] + options.eol;
37
+ for (let j = 1; j < parts.length; j++) {
38
+ let lines = parts[j].split(options.eol);
39
+ let from = lines[0].indexOf('<');
40
+ let to = lines[0].indexOf('>');
41
+ let label = lines[0].substring(from + 1, to);
42
+ for (let k = 1; k < lines.length - 1; k++) {
43
+ if (molecule[label]) {
44
+ molecule[label] += options.eol + lines[k];
45
+ } else {
46
+ molecule[label] = lines[k];
47
+ }
48
+ }
49
+ if (options.dynamicTyping) {
50
+ molecule[label] = parseString(molecule[label]);
51
+ }
52
+ }
53
+ return molecule;
54
+ }
package/src/parse.js CHANGED
@@ -1,34 +1,51 @@
1
- import { getEntriesBoundaries } from './getEntriesBoundaries';
1
+ import { ensureString } from 'ensure-string';
2
2
 
3
+ import { getEntriesBoundaries } from './getEntriesBoundaries';
4
+ import { getMolecule } from './util/getMolecule';
5
+ /**
6
+ * Parse a SDF file
7
+ * @param {string|ArrayBuffer|Uint8Array} sdf SDF file to parse
8
+ * @param {object} [options={}]
9
+ * @param {string[]} [options.include] List of fields to include
10
+ * @param {string[]} [options.exclude] List of fields to exclude
11
+ * @param {Function} [options.filter] Callback allowing to filter the molecules
12
+ * @param {boolean} [options.dynamicTyping] Dynamically type the data
13
+ * @param {object} [options.modifiers] Object containing callbacks to apply on some specific fields
14
+ * @param {boolean} [options.mixedEOL=false] Set to true if you know there is a mixture between \r\n and \n
15
+ * @param {string} [options.eol] Specify the end of line character. Default will be the one found in the file
16
+ */
3
17
  export function parse(sdf, options = {}) {
4
- const {
5
- include,
6
- exclude,
7
- filter,
8
- modifiers = {},
9
- forEach = {},
10
- dynamicTyping = true,
11
- } = options;
18
+ options = { ...options };
19
+ if (options.modifiers === undefined) options.modifiers = {};
20
+ if (options.forEach === undefined) options.forEach = {};
21
+ if (options.dynamicTyping === undefined) options.dynamicTyping = true;
12
22
 
23
+ sdf = ensureString(sdf);
13
24
  if (typeof sdf !== 'string') {
14
25
  throw new TypeError('Parameter "sdf" must be a string');
15
26
  }
16
27
 
17
- let eol = '\n';
18
- if (options.mixedEOL) {
19
- sdf = sdf.replace(/\r\n/g, '\n');
20
- sdf = sdf.replace(/\r/g, '\n');
21
- } else {
22
- // we will find the delimiter in order to be much faster and not use regular expression
23
- let header = sdf.substr(0, 1000);
24
- if (header.indexOf('\r\n') > -1) {
25
- eol = '\r\n';
26
- } else if (header.indexOf('\r') > -1) {
27
- eol = '\r';
28
+ if (options.eol === undefined) {
29
+ options.eol = '\n';
30
+ if (options.mixedEOL) {
31
+ sdf = sdf.replace(/\r\n/g, '\n');
32
+ sdf = sdf.replace(/\r/g, '\n');
33
+ } else {
34
+ // we will find the delimiter in order to be much faster and not use regular expression
35
+ let header = sdf.substr(0, 1000);
36
+ if (header.indexOf('\r\n') > -1) {
37
+ options.eol = '\r\n';
38
+ } else if (header.indexOf('\r') > -1) {
39
+ options.eol = '\r';
40
+ }
28
41
  }
29
42
  }
30
43
 
31
- let entriesBoundaries = getEntriesBoundaries(sdf, `${eol}$$$$`, eol);
44
+ let entriesBoundaries = getEntriesBoundaries(
45
+ sdf,
46
+ `${options.eol}$$$$`,
47
+ options.eol,
48
+ );
32
49
  let molecules = [];
33
50
  let labels = {};
34
51
 
@@ -36,72 +53,18 @@ export function parse(sdf, options = {}) {
36
53
 
37
54
  for (let i = 0; i < entriesBoundaries.length; i++) {
38
55
  let sdfPart = sdf.substring(...entriesBoundaries[i]);
39
- let parts = sdfPart.split(`${eol}>`);
40
- if (parts.length > 0 && parts[0].length > 5) {
41
- let molecule = {};
42
- let currentLabels = [];
43
- molecule.molfile = parts[0] + eol;
44
- for (let j = 1; j < parts.length; j++) {
45
- let lines = parts[j].split(eol);
46
- let from = lines[0].indexOf('<');
47
- let to = lines[0].indexOf('>');
48
- let label = lines[0].substring(from + 1, to);
49
- currentLabels.push(label);
50
- if (!labels[label]) {
51
- labels[label] = {
52
- counter: 0,
53
- isNumeric: dynamicTyping,
54
- keep: false,
55
- };
56
- if (
57
- (!exclude || exclude.indexOf(label) === -1) &&
58
- (!include || include.indexOf(label) > -1)
59
- ) {
60
- labels[label].keep = true;
61
- if (modifiers[label]) {
62
- labels[label].modifier = modifiers[label];
63
- }
64
- if (forEach[label]) {
65
- labels[label].forEach = forEach[label];
66
- }
67
- }
68
- }
69
- if (labels[label].keep) {
70
- for (let k = 1; k < lines.length - 1; k++) {
71
- if (molecule[label]) {
72
- molecule[label] += eol + lines[k];
73
- } else {
74
- molecule[label] = lines[k];
75
- }
76
- }
77
- if (labels[label].modifier) {
78
- let modifiedValue = labels[label].modifier(molecule[label]);
79
- if (modifiedValue === undefined || modifiedValue === null) {
80
- delete molecule[label];
81
- } else {
82
- molecule[label] = modifiedValue;
83
- }
84
- }
85
- if (labels[label].isNumeric) {
86
- if (
87
- !isFinite(molecule[label]) ||
88
- molecule[label].match(/^0[0-9]/)
89
- ) {
90
- labels[label].isNumeric = false;
91
- }
92
- }
93
- }
94
- }
95
- if (!filter || filter(molecule)) {
96
- molecules.push(molecule);
97
- // only now we can increase the counter
98
- for (let j = 0; j < currentLabels.length; j++) {
99
- labels[currentLabels[j]].counter++;
100
- }
56
+
57
+ let currentLabels = [];
58
+ const molecule = getMolecule(sdfPart, labels, currentLabels, options);
59
+ if (!molecule) continue;
60
+ if (!options.filter || options.filter(molecule)) {
61
+ molecules.push(molecule);
62
+ // only now we can increase the counter
63
+ for (let j = 0; j < currentLabels.length; j++) {
64
+ labels[currentLabels[j]].counter++;
101
65
  }
102
66
  }
103
67
  }
104
-
105
68
  // all numeric fields should be converted to numbers
106
69
  for (let label in labels) {
107
70
  let currentLabel = labels[label];
@@ -141,8 +104,8 @@ export function parse(sdf, options = {}) {
141
104
 
142
105
  return {
143
106
  time: Date.now() - start,
144
- molecules: molecules,
107
+ molecules,
145
108
  labels: Object.keys(labels),
146
- statistics: statistics,
109
+ statistics,
147
110
  };
148
111
  }
@@ -0,0 +1,55 @@
1
+ export function getMolecule(sdfPart, labels, currentLabels, options) {
2
+ let parts = sdfPart.split(`${options.eol}>`);
3
+ if (parts.length === 0 || parts[0].length <= 5) return;
4
+ let molecule = {};
5
+ molecule.molfile = parts[0] + options.eol;
6
+ for (let j = 1; j < parts.length; j++) {
7
+ let lines = parts[j].split(options.eol);
8
+ let from = lines[0].indexOf('<');
9
+ let to = lines[0].indexOf('>');
10
+ let label = lines[0].substring(from + 1, to);
11
+ currentLabels.push(label);
12
+ if (!labels[label]) {
13
+ labels[label] = {
14
+ counter: 0,
15
+ isNumeric: options.dynamicTyping,
16
+ keep: false,
17
+ };
18
+ if (
19
+ (!options.exclude || options.exclude.indexOf(label) === -1) &&
20
+ (!options.include || options.include.indexOf(label) > -1)
21
+ ) {
22
+ labels[label].keep = true;
23
+ if (options.modifiers[label]) {
24
+ labels[label].modifier = options.modifiers[label];
25
+ }
26
+ if (options.forEach[label]) {
27
+ labels[label].forEach = options.forEach[label];
28
+ }
29
+ }
30
+ }
31
+ if (labels[label].keep) {
32
+ for (let k = 1; k < lines.length - 1; k++) {
33
+ if (molecule[label]) {
34
+ molecule[label] += options.eol + lines[k];
35
+ } else {
36
+ molecule[label] = lines[k];
37
+ }
38
+ }
39
+ if (labels[label].modifier) {
40
+ let modifiedValue = labels[label].modifier(molecule[label]);
41
+ if (modifiedValue === undefined || modifiedValue === null) {
42
+ delete molecule[label];
43
+ } else {
44
+ molecule[label] = modifiedValue;
45
+ }
46
+ }
47
+ if (labels[label].isNumeric) {
48
+ if (!isFinite(molecule[label]) || molecule[label].match(/^0[0-9]/)) {
49
+ labels[label].isNumeric = false;
50
+ }
51
+ }
52
+ }
53
+ }
54
+ return molecule;
55
+ }
@@ -1,98 +0,0 @@
1
- import fs from 'fs';
2
-
3
- import callbackStream from 'callback-stream';
4
- import OCL from 'openchemlib/minimal';
5
-
6
- import { entries, molecules } from '..';
7
-
8
- const cbStream = callbackStream.bind(null, { objectMode: true });
9
-
10
- describe('stream', () => {
11
- it('entries', () =>
12
- new Promise((resolve) => {
13
- fs.createReadStream(`${__dirname}/test.sdf`)
14
- .pipe(entries())
15
- .pipe(
16
- cbStream((err, data) => {
17
- expect(err).toBeNull();
18
- expect(data).toHaveLength(128);
19
- expect(data[0]).toContain('-ISIS- 04231216572D');
20
- const mol = OCL.Molecule.fromMolfile(data[5]);
21
- expect(mol.toMolfile()).toContain(
22
- '17 18 0 0 0 0 0 0 0 0999 V2000',
23
- );
24
- resolve();
25
- }),
26
- );
27
- }));
28
-
29
- it('molecules', () =>
30
- new Promise((resolve) => {
31
- fs.createReadStream(`${__dirname}/test.sdf`)
32
- .pipe(molecules())
33
- .pipe(
34
- cbStream((err, data) => {
35
- expect(err).toBeNull();
36
- expect(data).toHaveLength(128);
37
- expect(data[0]).toMatchObject({
38
- Code: '0100380824',
39
- CLogP: 2.7,
40
- });
41
- expect(data[0].molfile).toContain('-ISIS- 04231216572D');
42
- resolve();
43
- }),
44
- );
45
- }));
46
-
47
- it('molecules - full result', () =>
48
- new Promise((resolve) => {
49
- fs.createReadStream(`${__dirname}/test.sdf`)
50
- .pipe(molecules({ fullResult: true }))
51
- .pipe(
52
- cbStream((err, data) => {
53
- expect(err).toBeNull();
54
- expect(data).toHaveLength(128);
55
- expect(data[0]).toMatchObject({
56
- labels: [
57
- 'Code',
58
- 'Number of H-Donors',
59
- 'Number of H-Acceptors',
60
- 'Number of Rotatable bonds',
61
- 'CLogP',
62
- ],
63
- });
64
- expect(data[0].molecules).toHaveLength(1);
65
- resolve();
66
- }),
67
- );
68
- }));
69
-
70
- it('molecules with filter', () =>
71
- new Promise((resolve) => {
72
- fs.createReadStream(`${__dirname}/test.sdf`)
73
- .pipe(
74
- molecules({
75
- filter: (entry) => entry.Code === '0100380869',
76
- }),
77
- )
78
- .pipe(
79
- cbStream((err, data) => {
80
- expect(err).toBeNull();
81
- expect(data).toHaveLength(1);
82
- resolve();
83
- }),
84
- );
85
- }));
86
-
87
- it('async iteration', async () => {
88
- const stream = fs
89
- .createReadStream(`${__dirname}/test.sdf`)
90
- .pipe(molecules());
91
- let count = 0;
92
- for await (const molecule of stream) {
93
- count++;
94
- expect(molecule.molfile.toString()).toContain('0999 V2000');
95
- }
96
- expect(count).toBe(128);
97
- });
98
- });
package/src/stream.js DELETED
@@ -1,44 +0,0 @@
1
- import pipeline from 'pumpify';
2
- import split2 from 'split2';
3
- import through2 from 'through2';
4
- import filter from 'through2-filter';
5
-
6
- import { parse } from './parse';
7
-
8
- const filterStream = filter.bind(null, { objectMode: true });
9
- function filterCb(chunk) {
10
- return chunk.length > 1 && chunk.trim().length > 1;
11
- }
12
-
13
- export function entries() {
14
- return pipeline.obj(
15
- split2(/\r?\n\${4}.*\r?\n/),
16
- filterStream(filterCb),
17
- through2({ objectMode: true }, function process(value, encoding, callback) {
18
- const eol = value.includes('\r\n') ? '\r\n' : '\n';
19
- this.push(`${value + eol}$$$$${eol}`);
20
- callback();
21
- }),
22
- );
23
- }
24
-
25
- export function molecules(options) {
26
- return pipeline.obj(
27
- entries(),
28
- through2({ objectMode: true }, function process(value, encoding, callback) {
29
- try {
30
- const parsed = parse(value, options);
31
- if (parsed.molecules.length === 1) {
32
- if (options && options.fullResult) {
33
- this.push(parsed);
34
- } else {
35
- this.push(parsed.molecules[0]);
36
- }
37
- }
38
- callback();
39
- } catch (e) {
40
- callback(e);
41
- }
42
- }),
43
- );
44
- }