sdf-parser 7.0.5 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/parse.ts ADDED
@@ -0,0 +1,227 @@
1
+ import { ensureString } from 'ensure-string';
2
+
3
+ import { getEntriesBoundaries } from './getEntriesBoundaries.ts';
4
+ import type { LabelInfo } from './util/getMolecule.ts';
5
+ import { getMolecule } from './util/getMolecule.ts';
6
+
7
+ /**
8
+ * A parsed SDF molecule entry. The `molfile` field contains the raw molfile
9
+ * string. Additional fields are populated from the SDF `> <field>` sections.
+ * A field value is the raw string, a number once `parse` has converted a
+ * field detected as numeric, or whatever a modifier returned; this is why
+ * the index signature is typed `any`.
10
+ */
11
+ export interface Molecule {
12
+ /**
+ * The raw V2000/V3000 molfile block: the record text preceding the first
+ * `> <field>` header, with one end-of-line sequence appended.
+ */
13
+ molfile: string;
14
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
15
+ [label: string]: any;
16
+ }
17
+
18
+ /**
19
+ * Options for the {@link parse} function.
20
+ */
21
+ export interface ParseOptions {
22
+ /**
23
+ * Modifier functions, keyed by field name, applied to a field value right
24
+ * after it has been read from the record. The function receives the raw
+ * string value and may return a transformed value. Returning
25
+ * `undefined` or `null` removes the field from the molecule.
+ * Modifiers are only attached to fields that pass `include` / `exclude`.
26
+ */
27
+ modifiers?: Record<string, (value: string) => unknown>;
28
+ /**
29
+ * Callback functions keyed by field name. They are stored on the internal
30
+ * label info of fields that pass `include` / `exclude`.
+ * NOTE(review): nothing in parse.ts or util/getMolecule.ts invokes these
+ * callbacks, and they are not copied into `ParseResult.statistics` —
+ * confirm whether this option is still meant to have an effect.
31
+ */
32
+ forEach?: Record<string, (value: unknown) => void>;
33
+ /**
34
+ * When `true`, a field is converted to numbers if every value seen for it
+ * is numeric; a single non-numeric value keeps the whole field as strings.
35
+ * @default true
36
+ */
37
+ dynamicTyping?: boolean;
38
+ /**
39
+ * End-of-line character. When not set it is guessed from the first 1000
+ * characters of the input, unless `mixedEOL` is `true`, in which case
+ * `'\n'` is used without inspecting the input.
40
+ * @default '\n'
41
+ */
42
+ eol?: string;
43
+ /**
44
+ * When `true`, normalises all `\r\n` and `\r` sequences to `\n` before
45
+ * parsing. Useful for SDF files with mixed or Windows-style line endings.
46
+ * @default false
47
+ */
48
+ mixedEOL?: boolean;
49
+ /**
50
+ * Only include fields whose names appear in this list.
51
+ * When combined with `exclude`, the field must satisfy both constraints.
52
+ */
53
+ include?: string[];
54
+ /**
55
+ * Exclude fields whose names appear in this list.
56
+ * When combined with `include`, the field must satisfy both constraints.
57
+ */
58
+ exclude?: string[];
59
+ /**
60
+ * A predicate function to filter molecules. Only molecules for which this
61
+ * function returns a truthy value are included in the result. It runs
+ * before numeric conversion, so field values are still raw strings (or
+ * modifier results) when the predicate sees them.
62
+ */
63
+ filter?: (molecule: Molecule) => boolean;
64
+ }
65
+
66
+ /**
67
+ * Statistics for a single SDF field label, as returned in
68
+ * {@link ParseResult.statistics}.
69
+ */
70
+ export interface LabelStatistic {
71
+ /** Field label name. */
72
+ label: string;
73
+ /**
+ * Number of molecules in the result that contain this field. Records
+ * rejected by `filter` are not counted.
+ */
74
+ counter: number;
75
+ /**
+ * Whether all inspected values are numeric. Values are inspected for every
+ * record, including records later rejected by `filter`. Values of fields
+ * with `keep === false` are never inspected, so for those this flag simply
+ * mirrors the `dynamicTyping` option.
+ */
76
+ isNumeric: boolean;
77
+ /** Whether this field is included in the output (not excluded). */
78
+ keep: boolean;
79
+ /**
+ * Minimum numeric value, only set when `isNumeric` is `true`. It is
+ * `Infinity` when no molecule in the result holds a value for the field.
+ */
80
+ minValue?: number;
81
+ /**
+ * Maximum numeric value, only set when `isNumeric` is `true`. It is
+ * `-Infinity` when no molecule in the result holds a value for the field.
+ */
82
+ maxValue?: number;
83
+ /** Whether every molecule in the result contains this field. */
84
+ always: boolean;
85
+ }
86
+
87
+ /**
88
+ * Return value of the {@link parse} function.
89
+ */
90
+ export interface ParseResult {
91
+ /** Elapsed parsing time in milliseconds, measured with `Date.now()`. */
92
+ time: number;
93
+ /** Parsed molecule entries (records rejected by `filter` are omitted). */
94
+ molecules: Molecule[];
95
+ /**
+ * All field label names found in the file, including fields dropped by
+ * `include` / `exclude`. The list is the `Object.keys` order of the
+ * internal label map; it is not explicitly sorted.
+ */
96
+ labels: string[];
97
+ /** Per-label statistics, one entry per name in `labels`. */
98
+ statistics: LabelStatistic[];
99
+ }
100
+
/**
 * Guess the end-of-line sequence used by `text` from its first 1000 characters.
 *
 * The check is done on the string itself. A `Set` built from a string holds
 * single characters only, so it can never contain the two-character `'\r\n'`.
 * @param text - The SDF content.
 * @returns `'\r\n'`, `'\r'` or `'\n'` (the fallback when no `\r` is seen).
 */
function detectEol(text: string): string {
  const header = text.slice(0, 1000);
  if (header.includes('\r\n')) return '\r\n';
  if (header.includes('\r')) return '\r';
  return '\n';
}

/**
 * Synchronously parse an SDF file into an array of molecule objects.
 *
 * Records are split on the `$$$$` delimiter line; records shorter than 40
 * characters are ignored. Every record is handed to `getMolecule`, and only
 * molecules accepted by `options.filter` end up in the result and in the
 * per-label counters. Fields whose values are all numeric are then converted
 * to numbers and their min/max are computed.
 * @param sdf - The SDF content as a string, `ArrayBuffer`, or `ArrayBufferView`.
 * @param options - Parsing options. The object is not modified.
 * @returns A {@link ParseResult} containing molecules and statistics.
 * @throws {TypeError} If `sdf` cannot be converted to a string.
 * @example
 * ```ts
 * import { readFileSync } from 'node:fs';
 * import { parse } from 'sdf-parser';
 *
 * const sdf = readFileSync('compounds.sdf', 'utf8');
 * const { molecules, statistics } = parse(sdf);
 * ```
 */
export function parse(sdf: unknown, options: ParseOptions = {}): ParseResult {
  // Started first so that `time` covers conversion and record splitting too.
  const start = Date.now();
  const {
    modifiers = {},
    forEach = {},
    dynamicTyping = true,
    include,
    exclude,
  } = options;

  // ensureString converts ArrayBuffer/ArrayBufferView to string
  const sdfString = ensureString(sdf as Parameters<typeof ensureString>[0]);
  if (typeof sdfString !== 'string') {
    throw new TypeError('Parameter "sdf" must be a string');
  }

  // With mixedEOL every line ending is rewritten to '\n', so detection is
  // pointless; an explicit `eol` always wins.
  const workingSdf = options.mixedEOL
    ? sdfString.replaceAll('\r\n', '\n').replaceAll('\r', '\n')
    : sdfString;
  const eol =
    options.eol ?? (options.mixedEOL ? '\n' : detectEol(sdfString));

  const entriesBoundaries = getEntriesBoundaries(workingSdf, `${eol}$$$$`, eol);
  const molecules: Molecule[] = [];
  // Field names come from the file. A prototype-less map keeps names such as
  // "constructor" or "__proto__" from resolving to inherited members.
  const labels = Object.create(null) as Record<string, LabelInfo>;

  for (const boundary of entriesBoundaries) {
    const sdfPart = workingSdf.slice(...boundary);
    if (sdfPart.length < 40) continue;
    const currentLabels: string[] = [];
    const molecule = getMolecule(sdfPart, labels, currentLabels, {
      eol,
      dynamicTyping,
      modifiers,
      forEach,
      include,
      exclude,
    });
    if (!molecule) continue;
    if (options.filter && !options.filter(molecule)) continue;
    molecules.push(molecule);
    // Counters are only bumped for accepted molecules. A label repeated
    // inside one record still counts as a single molecule.
    for (const label of new Set(currentLabels)) {
      const info = labels[label];
      if (info) info.counter++;
    }
  }

  // Convert all numeric fields and compute min/max
  for (const label in labels) {
    const info = labels[label];
    if (!info.isNumeric) continue;
    let minValue = Infinity;
    let maxValue = -Infinity;
    for (const molecule of molecules) {
      const raw: unknown = molecule[label];
      // Missing and empty values are left untouched; a numeric 0 is a value.
      if (raw === undefined || raw === null || raw === '') continue;
      const value = Number.parseFloat(String(raw));
      molecule[label] = value;
      if (value > maxValue) maxValue = value;
      if (value < minValue) minValue = value;
    }
    info.minValue = minValue;
    info.maxValue = maxValue;
  }

  const statistics: LabelStatistic[] = [];
  for (const key in labels) {
    const info = labels[key];
    info.always = info.counter === molecules.length;
    statistics.push({
      label: key,
      counter: info.counter,
      isNumeric: info.isNumeric,
      keep: info.keep,
      minValue: info.minValue,
      maxValue: info.maxValue,
      always: info.always,
    });
  }

  return {
    time: Date.now() - start,
    molecules,
    labels: Object.keys(labels),
    statistics,
  };
}
@@ -0,0 +1,107 @@
1
+ import type { Molecule } from '../parse.ts';
2
+
3
+ /**
4
+ * Internal per-label tracking information used during parsing.
5
+ */
6
+ export interface LabelInfo {
7
+ /**
+ * Number of molecules that contain this label. `getMolecule` creates the
+ * entry with 0; the caller increments it for molecules it keeps.
+ */
8
+ counter: number;
9
+ /**
10
+ * Whether all seen values for this label are numeric.
11
+ * Starts as `true` when `dynamicTyping` is enabled and is only ever
+ * switched to `false`; values of labels with `keep === false` are not
+ * inspected.
12
+ */
13
+ isNumeric: boolean;
14
+ /** Whether this label is included in molecule output (not excluded). */
15
+ keep: boolean;
16
+ /** Minimum numeric value (set after all molecules are parsed). */
17
+ minValue?: number;
18
+ /** Maximum numeric value (set after all molecules are parsed). */
19
+ maxValue?: number;
20
+ /** Whether every molecule in the result contains this label. Set after parsing. */
21
+ always?: boolean;
22
+ /** Optional modifier function applied to the raw string value (kept labels only). */
23
+ modifier?: (value: string) => unknown;
24
+ /**
+ * Optional callback stored for this label (kept labels only). It is stored
+ * here but not invoked by `getMolecule`.
+ */
25
+ forEach?: (value: unknown) => void;
26
+ }
27
+
28
+ /**
+ * Options consumed by {@link getMolecule} (a resolved subset of ParseOptions).
+ * - `eol`: sequence used to split the record into field sections and lines.
+ * - `dynamicTyping`: initial `isNumeric` value of a newly seen label.
+ * - `include` / `exclude`: field-name filters deciding a label's `keep` flag.
+ * - `modifiers` / `forEach`: per-field functions copied onto the label info
+ *   of kept labels; both maps are required here (no `undefined`).
+ */
29
+ export interface GetMoleculeOptions {
30
+ eol: string;
31
+ dynamicTyping: boolean;
32
+ include?: string[];
33
+ exclude?: string[];
34
+ modifiers: Record<string, (value: string) => unknown>;
35
+ forEach: Record<string, (value: unknown) => void>;
36
+ }
37
+
/**
 * Strings that unary plus accepts but that must stay text: leading-zero
 * identifiers such as "007", and hex/binary/octal literals such as "0x1F",
 * which `Number.parseFloat` would later turn into 0.
 */
const NON_DECIMAL_PREFIX = /^0[0-9xXoObB]/;

/** Read `key` from `record` only if it is an own property (ignores inherited members). */
function ownValue<T>(record: Record<string, T>, key: string): T | undefined {
  return Object.prototype.hasOwnProperty.call(record, key)
    ? record[key]
    : undefined;
}

/** Create or overwrite `key` as a plain own data property, even for names like "__proto__". */
function setOwn(record: object, key: string, value: unknown): void {
  Object.defineProperty(record, key, {
    value,
    writable: true,
    enumerable: true,
    configurable: true,
  });
}

/** Tell whether a field value is compatible with a numeric column. */
function isNumericValue(value: unknown): boolean {
  if (typeof value === 'number') return Number.isFinite(value);
  if (typeof value !== 'string') return false;
  return Number.isFinite(+value) && !NON_DECIMAL_PREFIX.test(value);
}

/**
 * Parse a single SDF entry string into a molecule object.
 *
 * The entry is split on `eol + '>'`. The first part becomes `molfile`, and
 * each following part is one data field whose name is taken between `<` and
 * `>` on its header line. The last line of each field section is treated as
 * the separator and is not part of the value.
 * @param sdfPart - A single SDF record (everything before the `$$$$` line).
 * @param labels - Shared label tracking object, mutated in place.
 * @param currentLabels - Array to collect label names found in this entry
 *   (one push per field section, so a repeated label appears more than once).
 * @param options - Resolved parse options.
 * @returns The molecule object, or `undefined` if the molfile part is 5
 *   characters or shorter.
 */
export function getMolecule(
  sdfPart: string,
  labels: Record<string, LabelInfo>,
  currentLabels: string[],
  options: GetMoleculeOptions,
): Molecule | undefined {
  const { eol, dynamicTyping, include, exclude, modifiers, forEach } = options;
  const parts = sdfPart.split(`${eol}>`);
  if (parts.length === 0 || parts[0].length <= 5) return undefined;
  const molecule: Molecule = { molfile: parts[0] + eol };

  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(eol);
    const header = lines[0];
    const label = header.slice(header.indexOf('<') + 1, header.indexOf('>'));
    currentLabels.push(label);

    // Field names are untrusted input, so only own properties count as
    // "already seen". Otherwise "constructor" would hit Object.prototype.
    let info = ownValue(labels, label);
    if (!info) {
      const keep =
        !exclude?.includes(label) && (!include || include.includes(label));
      info = { counter: 0, isNumeric: dynamicTyping, keep };
      if (keep) {
        const modifier = ownValue(modifiers, label);
        if (modifier) info.modifier = modifier;
        const callback = ownValue(forEach, label);
        if (callback) info.forEach = callback;
      }
      setOwn(labels, label, info);
    }
    if (!info.keep) continue;

    // Start from a value already stored for this label (repeated field) and
    // append each value line. An empty accumulated value is replaced rather
    // than extended.
    let value: unknown = ownValue(molecule, label);
    for (let k = 1; k < lines.length - 1; k++) {
      value = value ? `${value as string}${eol}${lines[k]}` : lines[k];
    }

    if (info.modifier) {
      const modified = info.modifier(value as string);
      value = modified === null ? undefined : modified;
    }

    if (value === undefined) {
      // Empty field, or the modifier asked for removal.
      // eslint-disable-next-line @typescript-eslint/no-dynamic-delete
      delete molecule[label];
    } else {
      setOwn(molecule, label, value);
    }

    // A modifier may return a non-string, so the value's type is checked
    // before any string-only test is applied.
    if (info.isNumeric && !isNumericValue(value)) {
      info.isNumeric = false;
    }
  }

  return molecule;
}
@@ -1,35 +0,0 @@
1
- export class MolfileStream extends TransformStream {
2
- #buffer = '';
3
-
4
- constructor() {
5
- super({
6
- transform: (chunk, controller) => {
7
- this.#buffer += chunk;
8
- let begin = 0;
9
- let index = 0;
10
- while ((index = this.#buffer.indexOf('$$$$', index)) !== -1) {
11
- // we need to check if the delimiter '\n' is in the current buffer
12
- // if it is not we need to wait for the next chunk
13
- const endOfDelimiter = this.#buffer.indexOf('\n', index);
14
- if (endOfDelimiter === -1) {
15
- index = begin;
16
- break;
17
- }
18
- const eolLength = this.#buffer[endOfDelimiter - 1] === '\r' ? 2 : 1;
19
- // need to remove the last eol because we will split on eol+'>' in getMolecule
20
- if (index - eolLength - begin > 40) {
21
- controller.enqueue(this.#buffer.slice(begin, index - eolLength));
22
- }
23
- index = endOfDelimiter + eolLength;
24
- begin = index;
25
- }
26
- this.#buffer = this.#buffer.slice(begin);
27
- },
28
- flush: (controller) => {
29
- if (this.#buffer && this.#buffer.length > 40) {
30
- controller.enqueue(this.#buffer);
31
- }
32
- },
33
- });
34
- }
35
- }
@@ -1,28 +0,0 @@
1
- /**
2
- *
3
- * @param {*} string
4
- * @param {*} substring
5
- * @param {*} eol
6
- * @returns
7
- */
8
- export function getEntriesBoundaries(string, substring, eol) {
9
- const res = [];
10
- let previous = 0;
11
- let next = 0;
12
- while (next !== -1) {
13
- next = string.indexOf(substring, previous);
14
- if (next !== -1) {
15
- res.push([previous, next]);
16
- const nextMatch = string.indexOf(eol, next + substring.length);
17
- if (nextMatch === -1) {
18
- next = -1;
19
- } else {
20
- previous = nextMatch + eol.length;
21
- next = previous;
22
- }
23
- } else {
24
- res.push([previous, string.length]);
25
- }
26
- }
27
- return res;
28
- }
package/src/index.js DELETED
@@ -1,3 +0,0 @@
1
- export * from './parse';
2
- export * from './iterator';
3
- export * from './MolfileStream';
package/src/iterator.js DELETED
@@ -1,59 +0,0 @@
1
- import { parseString } from 'dynamic-typing';
2
-
3
- import { MolfileStream } from './MolfileStream.js';
4
-
5
- /**
6
- * Parse a SDF file as an iterator
7
- * @param {ReadableStream} readStream - SDF file to parse
8
- * @param {object} [options={}] - iterator options
9
- * @param {Function} [options.filter] - Callback allowing to filter the molecules
10
- * @param {string} [options.eol='\n'] - End of line character
11
- * @param {boolean} [options.dynamicTyping] - Dynamically type the data
12
- * @yields {object} - Molecule object
13
- */
14
- export async function* iterator(readStream, options = {}) {
15
- const { eol = '\n', dynamicTyping = true } = options;
16
-
17
- const moleculeStream = readStream.pipeThrough(new MolfileStream({ eol }));
18
- for await (const entry of moleculeStream) {
19
- const molecule = getMolecule(entry, {
20
- eol,
21
- dynamicTyping,
22
- });
23
- if (!options.filter || options.filter(molecule)) {
24
- yield molecule;
25
- }
26
- }
27
- }
28
-
29
- /**
30
- * Convert a SDF part to an object
31
- * @param {string} sdfPart - text containing the molfile
32
- * @param {object} options - options
33
- * @param {string} options.eol - end of line character
34
- * @param {boolean} options.dynamicTyping - Dynamically type the data (create numbers and booleans)
35
- * @returns
36
- */
37
- function getMolecule(sdfPart, options) {
38
- const { eol, dynamicTyping } = options;
39
- let parts = sdfPart.split(`${eol}>`);
40
- if (parts.length === 0 || parts[0].length <= 5) return;
41
- let molecule = { molfile: parts[0] + eol };
42
- for (let j = 1; j < parts.length; j++) {
43
- let lines = parts[j].split(eol);
44
- let from = lines[0].indexOf('<');
45
- let to = lines[0].indexOf('>');
46
- let label = lines[0].slice(from + 1, to);
47
- for (let k = 1; k < lines.length - 1; k++) {
48
- if (molecule[label]) {
49
- molecule[label] += eol + lines[k];
50
- } else {
51
- molecule[label] = lines[k];
52
- }
53
- }
54
- if (dynamicTyping) {
55
- molecule[label] = parseString(molecule[label]);
56
- }
57
- }
58
- return molecule;
59
- }
package/src/parse.js DELETED
@@ -1,112 +0,0 @@
1
- import { ensureString } from 'ensure-string';
2
-
3
- import { getEntriesBoundaries } from './getEntriesBoundaries';
4
- import { getMolecule } from './util/getMolecule';
5
- /**
6
- * Parse a SDF file
7
- * @param {string|ArrayBuffer|Uint8Array} sdf - SDF file to parse
8
- * @param {object} [options={}]
9
- * @param {string[]} [options.include] - List of fields to include
10
- * @param {string[]} [options.exclude] - List of fields to exclude
11
- * @param {Function} [options.filter] - Callback allowing to filter the molecules
12
- * @param {boolean} [options.dynamicTyping] - Dynamically type the data
13
- * @param {object} [options.modifiers] - Object containing callbacks to apply on some specific fields
14
- * @param {boolean} [options.mixedEOL=false] - Set to true if you know there is a mixture between \r\n and \n
15
- * @param {string} [options.eol] - Specify the end of line character. Default will be the one found in the file
16
- * @returns {object} - Object containing the molecules, the labels and the statistics
17
- */
18
- export function parse(sdf, options = {}) {
19
- options = { ...options };
20
- if (options.modifiers === undefined) options.modifiers = {};
21
- if (options.forEach === undefined) options.forEach = {};
22
- if (options.dynamicTyping === undefined) options.dynamicTyping = true;
23
-
24
- sdf = ensureString(sdf);
25
- if (typeof sdf !== 'string') {
26
- throw new TypeError('Parameter "sdf" must be a string');
27
- }
28
-
29
- if (options.eol === undefined) {
30
- options.eol = '\n';
31
- if (options.mixedEOL) {
32
- sdf = sdf.replaceAll('\r\n', '\n');
33
- sdf = sdf.replaceAll('\r', '\n');
34
- } else {
35
- // we will find the delimiter in order to be much faster and not use regular expression
36
- let header = new Set(sdf.slice(0, 1000));
37
- if (header.has('\r\n')) {
38
- options.eol = '\r\n';
39
- } else if (header.has('\r')) {
40
- options.eol = '\r';
41
- }
42
- }
43
- }
44
-
45
- let entriesBoundaries = getEntriesBoundaries(
46
- sdf,
47
- `${options.eol}$$$$`,
48
- options.eol,
49
- );
50
- let molecules = [];
51
- let labels = {};
52
-
53
- let start = Date.now();
54
-
55
- for (let i = 0; i < entriesBoundaries.length; i++) {
56
- let sdfPart = sdf.slice(...entriesBoundaries[i]);
57
- if (sdfPart.length < 40) continue;
58
- let currentLabels = [];
59
- const molecule = getMolecule(sdfPart, labels, currentLabels, options);
60
- if (!molecule) continue;
61
- if (!options.filter || options.filter(molecule)) {
62
- molecules.push(molecule);
63
- // only now we can increase the counter
64
- for (let j = 0; j < currentLabels.length; j++) {
65
- labels[currentLabels[j]].counter++;
66
- }
67
- }
68
- }
69
- // all numeric fields should be converted to numbers
70
- for (let label in labels) {
71
- let currentLabel = labels[label];
72
- if (currentLabel.isNumeric) {
73
- currentLabel.minValue = Infinity;
74
- currentLabel.maxValue = -Infinity;
75
- for (let j = 0; j < molecules.length; j++) {
76
- if (molecules[j][label]) {
77
- let value = Number.parseFloat(molecules[j][label]);
78
- molecules[j][label] = value;
79
- if (value > currentLabel.maxValue) {
80
- currentLabel.maxValue = value;
81
- }
82
- if (value < currentLabel.minValue) {
83
- currentLabel.minValue = value;
84
- }
85
- }
86
- }
87
- }
88
- }
89
-
90
- // we check that a label is in all the records
91
- for (let key in labels) {
92
- if (labels[key].counter === molecules.length) {
93
- labels[key].always = true;
94
- } else {
95
- labels[key].always = false;
96
- }
97
- }
98
-
99
- let statistics = [];
100
- for (let key in labels) {
101
- let statistic = labels[key];
102
- statistic.label = key;
103
- statistics.push(statistic);
104
- }
105
-
106
- return {
107
- time: Date.now() - start,
108
- molecules,
109
- labels: Object.keys(labels),
110
- statistics,
111
- };
112
- }
@@ -1,63 +0,0 @@
1
- /**
2
- * Parse the molfile and the properties with > < labels >
3
- * @param {string} sdfPart
4
- * @param {*} labels
5
- * @param {*} currentLabels
6
- * @param {object} options
7
- * @returns
8
- */
9
- export function getMolecule(sdfPart, labels, currentLabels, options) {
10
- let parts = sdfPart.split(`${options.eol}>`);
11
- if (parts.length === 0 || parts[0].length <= 5) return;
12
- let molecule = { molfile: parts[0] + options.eol };
13
- for (let j = 1; j < parts.length; j++) {
14
- let lines = parts[j].split(options.eol);
15
- let from = lines[0].indexOf('<');
16
- let to = lines[0].indexOf('>');
17
- let label = lines[0].slice(from + 1, to);
18
- currentLabels.push(label);
19
- if (!labels[label]) {
20
- labels[label] = {
21
- counter: 0,
22
- isNumeric: options.dynamicTyping,
23
- keep: false,
24
- };
25
- if (
26
- (!options.exclude || !options.exclude.includes(label)) &&
27
- (!options.include || options.include.includes(label))
28
- ) {
29
- labels[label].keep = true;
30
- if (options.modifiers[label]) {
31
- labels[label].modifier = options.modifiers[label];
32
- }
33
- if (options.forEach[label]) {
34
- labels[label].forEach = options.forEach[label];
35
- }
36
- }
37
- }
38
- if (labels[label].keep) {
39
- for (let k = 1; k < lines.length - 1; k++) {
40
- if (molecule[label]) {
41
- molecule[label] += options.eol + lines[k];
42
- } else {
43
- molecule[label] = lines[k];
44
- }
45
- }
46
- if (labels[label].modifier) {
47
- let modifiedValue = labels[label].modifier(molecule[label]);
48
- if (modifiedValue === undefined || modifiedValue === null) {
49
- delete molecule[label];
50
- } else {
51
- molecule[label] = modifiedValue;
52
- }
53
- }
54
- if (
55
- labels[label].isNumeric &&
56
- (!Number.isFinite(+molecule[label]) || molecule[label].match(/^0[0-9]/))
57
- ) {
58
- labels[label].isNumeric = false;
59
- }
60
- }
61
- }
62
- return molecule;
63
- }