sdf-parser 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,22 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2014 cheminfo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
22
+
package/README.md ADDED
@@ -0,0 +1,104 @@
1
+ # sdf-parser
2
+
3
+ [![NPM version][npm-image]][npm-url]
4
+ [![build status][travis-image]][travis-url]
5
+ [![npm download][download-image]][download-url]
6
+
7
+ Parse an SDF file and convert its entries into an array of molecule objects.
8
+
9
+ ## Use of the package
10
+
11
+ ```bash
12
+ npm install sdf-parser
13
+ ```
14
+
15
+ In node script:
16
+
17
+ ```js
18
+ // parse the file test.sdf located in the current working directory
19
+
20
+ var { parse } = require('sdf-parser');
21
+
22
+ var fs = require('fs');
23
+ var sdf = fs.readFileSync('./test.sdf', 'utf-8');
24
+
25
+ var result = parse(sdf);
26
+ console.log(result);
27
+ ```
28
+
29
+ ## parse(sdf, options)
30
+
31
+ options:
32
+
33
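+ - exclude : array of strings containing the names of the fields to discard
33
+ - include : array of strings containing the names of the fields to keep
34
+ - modifiers : object mapping a field name to a function that receives the field value and returns its replacement (returning `undefined` or `null` removes the field)
35
+ - filter : function called with each parsed entry; the entry is kept only if the function returns a truthy value
36
+ - mixedEOL : if set to true, `\r\n` and `\r` are converted to `\n` before parsing, to deal with mixed End Of Line separators
37
+ - dynamicTyping : convert fields that contain only numbers to numbers (default: true)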
39
+
40
+ ## Advanced example with filtering and modifiers
41
+
42
+ ```js
43
+ var result = parse(sdf, {
44
+ exclude: ['Number of H-Donors'],
45
+ include: ['Number of H-Donors', 'CLogP', 'Code'],
46
+ modifiers: {
47
+ CLogP: function (field) {
48
+ return {
49
+ low: field * 1 - 0.2,
50
+ high: field * 1 + 0.2,
51
+ };
52
+ },
53
+ },
54
+ filter: (entry) => {
55
+ return entry.CLogP && entry.CLogP.low > 4;
56
+ },
57
+ });
58
+ ```
59
+
60
+ ## Streams
61
+
62
+ This API is only available on Node.js.
63
+
64
+ ### molecules(options)
65
+
66
+ Transform an input text stream to a stream of molecule objects.
67
+
68
+ #### options
69
+
70
+ - `fullResult`: true to emit the full result of `parse` instead of just the molecules.
71
+ - All other options from the `parse` function.
72
+
73
+ ```js
74
+ const { molecules } = require('sdf-parser');
75
+ fs.createReadStream('test.sdf')
76
+ .pipe(molecules())
77
+ .on('data', (molecule) => {
78
+ console.log(molecule.molfile);
79
+ });
80
+ ```
81
+
82
+ ### entries()
83
+
84
+ Transform an input text stream to a stream of sdf entries.
85
+
86
+ ```js
87
+ const { entries } = require('sdf-parser');
88
+ fs.createReadStream('test.sdf')
89
+ .pipe(entries())
90
+ .on('data', (entry) => {
91
+ // sdf entry as a string
92
+ });
93
+ ```
94
+
95
+ ## License
96
+
97
+ [MIT](./LICENSE)
98
+
99
+ [npm-image]: https://img.shields.io/npm/v/sdf-parser.svg?style=flat-square
100
+ [npm-url]: https://www.npmjs.com/package/sdf-parser
101
+ [travis-image]: https://img.shields.io/travis/cheminfo/sdf-parser/master.svg?style=flat-square
102
+ [travis-url]: https://travis-ci.org/cheminfo/sdf-parser
103
+ [download-image]: https://img.shields.io/npm/dm/sdf-parser.svg?style=flat-square
104
+ [download-url]: https://www.npmjs.com/package/sdf-parser
package/lib/index.js ADDED
@@ -0,0 +1,221 @@
1
+ 'use strict';
2
+
3
+ Object.defineProperty(exports, '__esModule', { value: true });
4
+
5
+ var pipeline = require('pumpify');
6
+ var split2 = require('split2');
7
+ var through2 = require('through2');
8
+ var filter = require('through2-filter');
9
+
10
/**
 * Normalise a required module so that it can always be read through a
 * `default` property.
 *
 * @param {*} e - Value returned by `require`.
 * @returns {object} `e` itself when it is a truthy object that has a
 *   `default` property, otherwise a new object `{ default: e }`.
 */
function _interopDefaultLegacy(e) {
  const exposesDefault = Boolean(e) && typeof e === 'object' && 'default' in e;
  if (exposesDefault) {
    return e;
  }
  return { default: e };
}
11
+
12
+ var pipeline__default = /*#__PURE__*/_interopDefaultLegacy(pipeline);
13
+ var split2__default = /*#__PURE__*/_interopDefaultLegacy(split2);
14
+ var through2__default = /*#__PURE__*/_interopDefaultLegacy(through2);
15
+ var filter__default = /*#__PURE__*/_interopDefaultLegacy(filter);
16
+
17
/**
 * Locate the entries of an SDF text using plain `indexOf` scans instead of a
 * regular expression.
 *
 * Each entry ends where `substring` (the delimiter, e.g. `"\n$$$$"`) starts.
 * The next entry begins right after the first `eol` found after that
 * delimiter, so anything else on the delimiter line is skipped.
 *
 * @param {string} string - Text to scan.
 * @param {string} substring - Delimiter that terminates an entry.
 * @param {string} eol - End-of-line sequence that closes the delimiter line.
 * @returns {Array<[number, number]>} `[start, end]` index pairs suitable for
 *   `string.substring(start, end)`. When text (possibly empty) remains after
 *   the last complete delimiter line, a final pair covering it is included.
 */
function getEntriesBoundaries(string, substring, eol) {
  const res = [];
  let previous = 0;
  for (;;) {
    const delimiterIndex = string.indexOf(substring, previous);
    if (delimiterIndex === -1) {
      // No further delimiter: the remainder of the text is the last entry.
      res.push([previous, string.length]);
      break;
    }
    res.push([previous, delimiterIndex]);
    const lineEnd = string.indexOf(eol, delimiterIndex + substring.length);
    if (lineEnd === -1) {
      // The delimiter line is the last line and has no trailing EOL. Without
      // this guard `previous` would be reset to the start of the text and the
      // scan would never terminate.
      break;
    }
    previous = lineEnd + eol.length;
  }
  return res;
}
33
+
34
/**
 * Parse the text of an SDF file into molecule objects and per-field statistics.
 *
 * @param {string} sdf - Full text of the SDF file.
 * @param {object} [options={}]
 * @param {string[]} [options.include] - When given, only these fields are kept.
 * @param {string[]} [options.exclude] - Fields to discard; a field listed in
 *   both `include` and `exclude` is discarded.
 * @param {function} [options.filter] - Called with each molecule; the molecule
 *   is kept only when the function returns a truthy value.
 * @param {object} [options.modifiers={}] - Maps a field name to a function that
 *   receives the field value and returns its replacement. Returning
 *   `undefined` or `null` removes the field from the molecule.
 * @param {object} [options.forEach={}] - Per-field values that are copied onto
 *   the matching statistics entry; this function does not use them otherwise.
 * @param {boolean} [options.dynamicTyping=true] - Convert fields whose values
 *   are all numeric into numbers.
 * @param {boolean} [options.mixedEOL] - Normalise `\r\n` and `\r` to `\n`
 *   before parsing instead of detecting a single EOL from the first 1000
 *   characters.
 * @returns {{time: number, molecules: object[], labels: string[], statistics: object[]}}
 *   `time` is the elapsed parsing time in milliseconds, `labels` lists every
 *   field name encountered and `statistics` holds one entry per label.
 * @throws {TypeError} If `sdf` is not a string.
 */
function parse(sdf, options = {}) {
  const {
    include,
    exclude,
    filter,
    modifiers = {},
    forEach = {},
    dynamicTyping = true,
  } = options;

  if (typeof sdf !== 'string') {
    throw new TypeError('Parameter "sdf" must be a string');
  }

  const hasOwn = (object, key) =>
    Object.prototype.hasOwnProperty.call(object, key);

  let eol = '\n';
  let text = sdf;
  if (options.mixedEOL) {
    text = text.replace(/\r\n/g, '\n').replace(/\r/g, '\n');
  } else {
    // Detect the delimiter once from the beginning of the file so that the
    // rest of the parsing can rely on fast plain-string searches.
    const header = text.slice(0, 1000);
    if (header.includes('\r\n')) {
      eol = '\r\n';
    } else if (header.includes('\r')) {
      eol = '\r';
    }
  }

  const entriesBoundaries = getEntriesBoundaries(text, `${eol}$$$$`, eol);
  const molecules = [];
  // No prototype: a field called e.g. "constructor" must not resolve to an
  // inherited Object.prototype member.
  const labels = Object.create(null);

  const start = Date.now();

  for (const [entryStart, entryEnd] of entriesBoundaries) {
    const parts = text.substring(entryStart, entryEnd).split(`${eol}>`);
    if (parts[0].length <= 5) {
      // Too short to hold a molfile (e.g. the empty tail after the last "$$$$").
      continue;
    }
    const molecule = {};
    const currentLabels = [];
    molecule.molfile = parts[0] + eol;

    for (let j = 1; j < parts.length; j++) {
      const lines = parts[j].split(eol);
      const from = lines[0].indexOf('<');
      const to = lines[0].indexOf('>');
      const label = lines[0].substring(from + 1, to);
      currentLabels.push(label);

      if (!labels[label]) {
        labels[label] = {
          counter: 0,
          isNumeric: dynamicTyping,
          keep: false,
        };
        if (
          (!exclude || !exclude.includes(label)) &&
          (!include || include.includes(label))
        ) {
          labels[label].keep = true;
          if (hasOwn(modifiers, label) && modifiers[label]) {
            labels[label].modifier = modifiers[label];
          }
          if (hasOwn(forEach, label) && forEach[label]) {
            labels[label].forEach = forEach[label];
          }
        }
      }

      const stat = labels[label];
      if (!stat.keep) {
        continue;
      }

      // The first line is the field header and the last one is the blank
      // separator; everything in between is the (possibly multi-line) value.
      for (let k = 1; k < lines.length - 1; k++) {
        if (hasOwn(molecule, label) && molecule[label]) {
          molecule[label] += eol + lines[k];
        } else {
          molecule[label] = lines[k];
        }
      }

      if (stat.modifier) {
        const modifiedValue = stat.modifier(molecule[label]);
        if (modifiedValue === undefined || modifiedValue === null) {
          delete molecule[label];
        } else {
          molecule[label] = modifiedValue;
        }
      }

      if (stat.isNumeric && !isNumericFieldValue(molecule[label])) {
        stat.isNumeric = false;
      }
    }

    if (!filter || filter(molecule)) {
      molecules.push(molecule);
      // Counters are only increased for molecules that passed the filter.
      for (const name of currentLabels) {
        labels[name].counter++;
      }
    }
  }

  const labelNames = Object.keys(labels);

  // Every field that stayed numeric for all entries is converted to numbers.
  for (const label of labelNames) {
    const stat = labels[label];
    if (!stat.isNumeric) {
      continue;
    }
    stat.minValue = Infinity;
    stat.maxValue = -Infinity;
    for (const molecule of molecules) {
      if (hasOwn(molecule, label) && molecule[label]) {
        const value = parseFloat(molecule[label]);
        molecule[label] = value;
        if (value > stat.maxValue) {
          stat.maxValue = value;
        }
        if (value < stat.minValue) {
          stat.minValue = value;
        }
      }
    }
  }

  const statistics = labelNames.map((label) => {
    const stat = labels[label];
    // A label is "always" present when it was counted once per kept molecule.
    stat.always = stat.counter === molecules.length;
    stat.label = label;
    return stat;
  });

  return {
    time: Date.now() - start,
    molecules,
    labels: labelNames,
    statistics,
  };
}

/**
 * Tell whether a field value is compatible with numeric conversion.
 *
 * Strings must be accepted by the global `isFinite` and must not start with a
 * zero followed by a digit (codes such as "0100" stay strings). Finite numbers,
 * which a modifier may return, are numeric; any other type is not.
 *
 * @param {*} value - Field value, possibly already transformed by a modifier.
 * @returns {boolean}
 */
function isNumericFieldValue(value) {
  if (typeof value === 'number') {
    return Number.isFinite(value);
  }
  if (typeof value !== 'string') {
    return false;
  }
  return isFinite(value) && !/^0[0-9]/.test(value);
}
180
+
181
+ const filterStream = filter__default["default"].bind(null, { objectMode: true });
182
/**
 * Predicate handed to the filter stream in `entries()`: accepts a chunk only
 * when it is longer than one character both as-is and after trimming.
 *
 * @param {string} chunk - Candidate SDF entry.
 * @returns {boolean} `true` when the chunk should be kept.
 */
function filterCb(chunk) {
  if (!(chunk.length > 1)) {
    return false;
  }
  const trimmed = chunk.trim();
  return trimmed.length > 1;
}
185
+
186
/**
 * Build an object-mode stream pipeline that turns SDF text into one string
 * per entry.
 *
 * The input is split on the `$$$$` delimiter line, chunks rejected by
 * `filterCb` are dropped, and the delimiter line is appended again to every
 * remaining entry.
 *
 * @returns {object} The combined stream created by `pumpify`'s `obj` helper.
 */
function entries() {
  const splitOnDelimiter = split2__default["default"](/\r?\n\${4}.*\r?\n/);
  const dropBlankChunks = filterStream(filterCb);
  const restoreDelimiter = through2__default["default"](
    { objectMode: true },
    function appendDelimiter(value, encoding, callback) {
      // Reuse the EOL style found in the entry itself.
      const eol = value.includes('\r\n') ? '\r\n' : '\n';
      this.push(value + eol + '$$$$' + eol);
      callback();
    },
  );
  return pipeline__default["default"].obj(
    splitOnDelimiter,
    dropBlankChunks,
    restoreDelimiter,
  );
}
197
+
198
/**
 * Build an object-mode stream pipeline that turns SDF text into parsed
 * molecules.
 *
 * Every entry produced by `entries()` is parsed with `parse`. When exactly one
 * molecule comes out, either that molecule or, with `options.fullResult`, the
 * whole result of `parse` is emitted. Errors thrown while handling an entry
 * are forwarded to the stream callback.
 *
 * @param {object} [options] - Options passed to `parse`, plus `fullResult`.
 * @returns {object} The combined stream created by `pumpify`'s `obj` helper.
 */
function molecules(options) {
  const entryStream = entries();
  const parseStream = through2__default["default"](
    { objectMode: true },
    function parseEntry(value, encoding, callback) {
      try {
        const parsed = parse(value, options);
        if (parsed.molecules.length === 1) {
          const emitFullResult = options && options.fullResult;
          this.push(emitFullResult ? parsed : parsed.molecules[0]);
        }
        callback();
      } catch (error) {
        callback(error);
      }
    },
  );
  return pipeline__default["default"].obj(entryStream, parseStream);
}
218
+
219
+ exports.entries = entries;
220
+ exports.molecules = molecules;
221
+ exports.parse = parse;
package/package.json ADDED
@@ -0,0 +1,61 @@
1
+ {
2
+ "name": "sdf-parser",
3
+ "version": "5.0.0",
4
+ "description": "SDF parser",
5
+ "main": "lib/index.js",
6
+ "module": "src/index.js",
7
+ "files": [
8
+ "lib",
9
+ "src"
10
+ ],
11
+ "scripts": {
12
+ "build": "npm run compile && cheminfo-build --root SDFParser",
13
+ "compile": "rollup -c",
14
+ "eslint": "eslint src",
15
+ "eslint-fix": "npm run eslint -- --fix",
16
+ "prepack": "npm run compile",
17
+ "prettier": "prettier --check src",
18
+ "prettier-write": "prettier --write src",
19
+ "test": "npm run test-coverage && npm run eslint",
20
+ "test-coverage": "jest --coverage",
21
+ "test-only": "jest"
22
+ },
23
+ "browser": {
24
+ "./src/stream.js": "./src/stream.browser.js"
25
+ },
26
+ "repository": {
27
+ "type": "git",
28
+ "url": "https://github.com/cheminfo/sdf-parser.git"
29
+ },
30
+ "keywords": [
31
+ "sdf",
32
+ "parser",
33
+ "molfile",
34
+ "v2000",
35
+ "v3000",
36
+ "mdl"
37
+ ],
38
+ "author": "Luc Patiny",
39
+ "license": "MIT",
40
+ "bugs": {
41
+ "url": "https://github.com/cheminfo/sdf-parser/issues"
42
+ },
43
+ "homepage": "https://github.com/cheminfo/sdf-parser",
44
+ "devDependencies": {
45
+ "@babel/plugin-transform-modules-commonjs": "^7.15.4",
46
+ "babel-eslint": "^10.1.0",
47
+ "callback-stream": "^1.1.0",
48
+ "cheminfo-build": "^1.1.11",
49
+ "eslint": "^7.32.0",
50
+ "eslint-config-cheminfo": "^6.0.1",
51
+ "jest": "^27.2.5",
52
+ "openchemlib": "^7.4.3",
53
+ "prettier": "^2.4.1"
54
+ },
55
+ "dependencies": {
56
+ "pumpify": "^2.0.1",
57
+ "split2": "^3.2.2",
58
+ "through2": "^4.0.2",
59
+ "through2-filter": "^3.0.0"
60
+ }
61
+ }
@@ -0,0 +1,57 @@
1
+ import fs from 'fs';
2
+
3
+ import { parse } from '..';
4
+
5
+ let sdf = fs.readFileSync(`${__dirname}/test.sdf`, 'utf-8');
6
+
7
// Exercises `parse` with include, exclude, modifiers and filter combined.
describe('SDF Parser options', () => {
  // 'Number of H-Donors' is listed in both `exclude` and `include`; the
  // three-key expectation below relies on it being discarded.
  const result = parse(sdf, {
    exclude: ['Number of H-Donors'],
    include: ['Number of H-Donors', 'CLogP', 'Code'],
    modifiers: {
      CLogP: (field) => {
        return {
          low: field * 1 - 0.2,
          high: field * 1 + 0.2,
        };
      },
    },
    filter: (entry) => {
      return entry.CLogP && entry.CLogP.low > 4;
    },
  });

  it('Check statistics', () => {
    expect(result.statistics[0].counter).toBe(43);
    expect(result.statistics[0].isNumeric).toBe(false);
    expect(result.statistics[0].label).toBe('Code');
    expect(result.statistics[0].always).toBe(true);
    expect(result.statistics[4].counter).toBe(43);
    expect(result.statistics[4].isNumeric).toBe(false);
    expect(result.statistics[4].label).toBe('CLogP');
    expect(result.statistics[4].always).toBe(true);
  });

  it('Check molecules', () => {
    expect(result.molecules).toHaveLength(43);
    const molecule = result.molecules[0];

    expect(Object.keys(molecule)).toHaveLength(3);
    expect(molecule.Code).toBe('0100380851');
    // The second argument of toBeCloseTo is a number of decimal digits, not a
    // tolerance: 0.0001 accepted any value within roughly 0.5 of the target.
    expect(molecule.CLogP.low).toBeCloseTo(4.8, 4);
    expect(molecule.CLogP.high).toBeCloseTo(5.2, 4);
    expect(molecule.molfile.split('\n')).toHaveLength(56);
  });

  it('should throw with non-string argument', () => {
    expect(() => {
      parse();
    }).toThrow(TypeError);
    expect(() => {
      parse(42);
    }).toThrow(TypeError);
    expect(() => {
      parse({});
    }).toThrow(TypeError);
  });
});
@@ -0,0 +1,24 @@
1
+ import fs from 'fs';
2
+
3
+ import { parse } from '..';
4
+
5
+ let sdf = fs.readFileSync(`${__dirname}/test.sdf`, 'utf-8');
6
+
7
// The CLogP modifier returns undefined, so the filter on CLogP.low can never
// be satisfied and no molecule is expected in the result.
describe('SDF Parser options and undefined', () => {
  const dropField = () => undefined;
  const hasHighCLogP = (entry) => entry.CLogP && entry.CLogP.low > 4;

  const parsed = parse(sdf, {
    exclude: ['Number of H-Donors'],
    include: ['Number of H-Donors', 'CLogP', 'Code'],
    modifiers: { CLogP: dropField },
    filter: hasHighCLogP,
  });

  it('Check molecules', () => {
    expect(parsed.molecules).toHaveLength(0);
  });
});
@@ -0,0 +1,26 @@
1
+ import fs from 'fs';
2
+
3
+ import { getEntriesBoundaries } from '../getEntriesBoundaries';
4
+
5
+ let sdf0 = fs.readFileSync(`${__dirname}/test.sdf`, 'utf-8');
6
+ let sdf1 = fs.readFileSync(`${__dirname}/test1.sdf`, 'utf-8');
7
+ let sdf2 = fs.readFileSync(`${__dirname}/test2.sdf`, 'utf-8');
8
+
9
// For every fixture, the index-based splitter must yield the same parts as a
// regular-expression split on the "$$$$" delimiter line.
for (const sdf of [sdf0, sdf1, sdf2]) {
  const header = sdf.substr(0, 1000);
  let eol;
  if (header.indexOf('\r\n') > -1) {
    eol = '\r\n';
  } else if (header.indexOf('\r') > -1) {
    eol = '\r';
  } else {
    eol = '\n';
  }

  test('Split should match regex behavior', () => {
    const regexParts = sdf.split(new RegExp(`${eol}\\$\\$\\$\\$.*${eol}`));
    const boundaryParts = getEntriesBoundaries(sdf, `${eol}$$$$`, eol).map(
      ([from, to]) => sdf.substring(from, to),
    );
    expect(regexParts).toStrictEqual(boundaryParts);
  });
}
@@ -0,0 +1,76 @@
1
+ import fs from 'fs';
2
+
3
+ import { parse } from '..';
4
+
5
+ let sdf = fs.readFileSync(`${__dirname}/test.sdf`, 'utf-8');
6
+ let sdf1 = fs.readFileSync(`${__dirname}/test1.sdf`, 'utf-8');
7
+
8
// Default options: fields that contain only numbers are converted to numbers.
describe('SDF Parser', () => {
  const parsed = parse(sdf);

  it('Check statistics', () => {
    const { statistics } = parsed;
    const codeStat = statistics[0];
    const donorsStat = statistics[1];

    expect(codeStat.counter).toBe(128);
    expect(codeStat.isNumeric).toBe(false);
    expect(codeStat.label).toBe('Code');
    expect(donorsStat.counter).toBe(128);
    expect(donorsStat.minValue).toBe(0);
    expect(donorsStat.maxValue).toBe(5);
    expect(donorsStat.isNumeric).toBe(true);
    expect(donorsStat.label).toBe('Number of H-Donors');
    expect(codeStat.always).toBe(true);
    expect(statistics[4].always).toBe(false);
  });

  it('Check molecules', () => {
    const [first] = parsed.molecules;
    expect(first.Code).toContain('0100380824');
    expect(first.CLogP).toBe(2.7);
    expect(first.molfile.split('\n')).toHaveLength(37);
  });

  it('should throw with non-string argument', () => {
    for (const invalid of [undefined, 42, {}]) {
      expect(() => {
        parse(invalid);
      }).toThrow(TypeError);
    }
  });
});
43
+
44
// With dynamicTyping disabled every field stays a string and no min/max
// statistics are computed.
describe('SDF Parser no dynamicTyping', () => {
  const parsed = parse(sdf, { dynamicTyping: false });

  it('Check statistics', () => {
    const { statistics } = parsed;
    const codeStat = statistics[0];
    const donorsStat = statistics[1];

    expect(codeStat.counter).toBe(128);
    expect(codeStat.isNumeric).toBe(false);
    expect(codeStat.label).toBe('Code');
    expect(donorsStat.counter).toBe(128);
    expect(donorsStat.minValue).toBeUndefined();
    expect(donorsStat.maxValue).toBeUndefined();
    expect(donorsStat.isNumeric).toBe(false);
    expect(donorsStat.label).toBe('Number of H-Donors');
    expect(codeStat.always).toBe(true);
    expect(statistics[4].always).toBe(false);
  });

  it('Check molecules', () => {
    const [first] = parsed.molecules;
    expect(typeof first.Code).toBe('string');
    expect(typeof first.CLogP).toBe('string');
    expect(first.CLogP).toBe('2.700000000000000e+000');
    expect(first.molfile.split('\n')).toHaveLength(37);
  });
});
70
+
71
// test1.sdf is expected to yield exactly one molecule.
describe('SDF Parser one molecule', () => {
  const { molecules } = parse(sdf1);

  it('Check statistics', () => {
    expect(molecules).toHaveLength(1);
  });
});
@@ -0,0 +1,14 @@
1
+ import fs from 'fs';
2
+
3
+ import { parse } from '..';
4
+
5
// Carriage returns are stripped from the fixture before it is parsed with the
// mixedEOL option enabled.
describe('SDF Parser of non well formatted file', () => {
  const raw = fs.readFileSync(`${__dirname}/test2.sdf`, 'utf-8');
  const withoutCarriageReturns = raw.replace(/\r/g, '');
  const parsed = parse(withoutCarriageReturns, { mixedEOL: true });

  it('Check molecules', () => {
    const { molecules } = parsed;
    expect(molecules).toHaveLength(7);
  });
});