sdf-parser 7.0.5 → 8.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/MolfileStream.d.ts +20 -0
- package/lib/MolfileStream.d.ts.map +1 -0
- package/lib/MolfileStream.js +52 -0
- package/lib/MolfileStream.js.map +1 -0
- package/lib/getEntriesBoundaries.d.ts +11 -0
- package/lib/getEntriesBoundaries.d.ts.map +1 -0
- package/lib/getEntriesBoundaries.js +33 -0
- package/lib/getEntriesBoundaries.js.map +1 -0
- package/lib/index.d.ts +4 -0
- package/lib/index.d.ts.map +1 -0
- package/lib/index.js +4 -302
- package/lib/index.js.map +1 -0
- package/lib/iterator.d.ts +49 -0
- package/lib/iterator.d.ts.map +1 -0
- package/lib/iterator.js +55 -0
- package/lib/iterator.js.map +1 -0
- package/lib/parse.d.ts +105 -0
- package/lib/parse.d.ts.map +1 -0
- package/lib/parse.js +128 -0
- package/lib/parse.js.map +1 -0
- package/lib/util/getMolecule.d.ts +44 -0
- package/lib/util/getMolecule.d.ts.map +1 -0
- package/lib/util/getMolecule.js +63 -0
- package/lib/util/getMolecule.js.map +1 -0
- package/package.json +20 -12
- package/src/MolfileStream.ts +53 -0
- package/src/getEntriesBoundaries.ts +34 -0
- package/src/index.ts +3 -0
- package/src/iterator.ts +104 -0
- package/src/parse.ts +227 -0
- package/src/util/getMolecule.ts +107 -0
- package/src/MolfileStream.js +0 -35
- package/src/getEntriesBoundaries.js +0 -28
- package/src/index.js +0 -3
- package/src/iterator.js +0 -59
- package/src/parse.js +0 -112
- package/src/util/getMolecule.js +0 -63
package/src/parse.ts
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
import { ensureString } from 'ensure-string';
|
|
2
|
+
|
|
3
|
+
import { getEntriesBoundaries } from './getEntriesBoundaries.ts';
|
|
4
|
+
import type { LabelInfo } from './util/getMolecule.ts';
|
|
5
|
+
import { getMolecule } from './util/getMolecule.ts';
|
|
6
|
+
|
|
7
|
+
/**
 * One molecule entry extracted from an SDF file.
 *
 * `molfile` always holds the raw molfile text of the entry. Every other
 * property is keyed by the name of an SDF `> <field>` data section.
 */
export interface Molecule {
  /** The raw V2000/V3000 molfile block. */
  molfile: string;
  // Field values are strings, numbers, or whatever a modifier returned, so the
  // index signature stays deliberately loose.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  [label: string]: any;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Options accepted by the {@link parse} function. Every property is optional.
 */
export interface ParseOptions {
  /**
   * Per-field transformation functions, keyed by field name. A modifier
   * receives the raw string value of the field and its return value replaces
   * it. Returning `undefined` or `null` removes the field from the molecule.
   */
  modifiers?: Record<string, (value: string) => unknown>;
  /**
   * Per-field callbacks, keyed by field name. They are recorded on the
   * internal per-label tracking info.
   * NOTE(review): nothing in the parsing code visible here invokes these
   * callbacks — confirm the intended use before relying on them.
   */
  forEach?: Record<string, (value: unknown) => void>;
  /**
   * When `true`, a field whose values are all numeric strings is converted to
   * numbers.
   * @default true
   */
  dynamicTyping?: boolean;
  /**
   * End-of-line sequence used to split records and fields. When omitted,
   * `parse` inspects the beginning of the content and otherwise falls back
   * to `'\n'`.
   * @default '\n'
   */
  eol?: string;
  /**
   * When `true`, every `\r\n` and `\r` is replaced by `\n` before parsing.
   * Useful for SDF files with mixed or Windows-style line endings.
   * @default false
   */
  mixedEOL?: boolean;
  /**
   * Allow-list of field names. When combined with `exclude`, a field must
   * satisfy both constraints to be kept.
   */
  include?: string[];
  /**
   * Deny-list of field names. When combined with `include`, a field must
   * satisfy both constraints to be kept.
   */
  exclude?: string[];
  /**
   * Predicate deciding whether a parsed molecule is part of the result. Only
   * molecules for which it returns a truthy value are kept.
   */
  filter?: (molecule: Molecule) => boolean;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Summary of one SDF field label, as reported in
 * {@link ParseResult.statistics}.
 */
export interface LabelStatistic {
  /** Name of the field. */
  label: string;
  /** How many of the retained molecules carry this field. */
  counter: number;
  /** `true` when every value seen for this field was numeric. */
  isNumeric: boolean;
  /** `true` when the field passed the `include` / `exclude` selection. */
  keep: boolean;
  /** Smallest numeric value; only set when `isNumeric` is `true`. */
  minValue?: number;
  /** Largest numeric value; only set when `isNumeric` is `true`. */
  maxValue?: number;
  /** `true` when `counter` equals the number of molecules in the result. */
  always: boolean;
}
|
|
86
|
+
|
|
87
|
+
/**
 * Value returned by the {@link parse} function.
 */
export interface ParseResult {
  /** Elapsed time measured with `Date.now()` around the parsing loop, in milliseconds. */
  time: number;
  /** Parsed molecule entries that passed the optional filter. */
  molecules: Molecule[];
  /**
   * Names of all field labels encountered, in the key order of the internal
   * label registry (the list is not explicitly sorted).
   */
  labels: string[];
  /** One statistics record per label. */
  statistics: LabelStatistic[];
}
|
|
100
|
+
|
|
101
|
+
/**
 * Synchronously parse an SDF file into an array of molecule objects.
 *
 * The line ending is resolved as follows:
 * 1. with `mixedEOL`, every `\r\n` / `\r` is rewritten to `\n` and `\n` is
 *    used as separator (an explicit `eol` is ignored, since it could no
 *    longer match the normalised text);
 * 2. otherwise an explicit `options.eol` is used as given;
 * 3. otherwise the first 1000 characters are searched for `\r\n`, then `\r`,
 *    falling back to `\n`.
 *
 * @param sdf - The SDF content as a string, `ArrayBuffer`, or `ArrayBufferView`.
 * @param options - Parsing options.
 * @returns A {@link ParseResult} containing molecules and statistics.
 * @throws {TypeError} When the input cannot be converted to a string.
 * @example
 * ```ts
 * import { readFileSync } from 'node:fs';
 * import { parse } from 'sdf-parser';
 *
 * const sdf = readFileSync('compounds.sdf', 'utf8');
 * const { molecules, statistics } = parse(sdf);
 * ```
 */
export function parse(sdf: unknown, options: ParseOptions = {}): ParseResult {
  const {
    modifiers = {},
    forEach = {},
    dynamicTyping = true,
    mixedEOL = false,
    include,
    exclude,
  } = options;

  // ensureString converts ArrayBuffer/ArrayBufferView to string
  const sdfString = ensureString(sdf as Parameters<typeof ensureString>[0]);
  if (typeof sdfString !== 'string') {
    throw new TypeError('Parameter "sdf" must be a string');
  }

  let workingSdf = sdfString;
  let eol = options.eol ?? '\n';
  if (mixedEOL) {
    workingSdf = workingSdf.replaceAll('\r\n', '\n').replaceAll('\r', '\n');
    // After normalisation only '\n' can occur in the text.
    eol = '\n';
  } else if (options.eol === undefined) {
    // Substring search on purpose: a Set built from a string only holds
    // single characters and could never contain the two-character '\r\n'.
    const header = sdfString.slice(0, 1000);
    if (header.includes('\r\n')) {
      eol = '\r\n';
    } else if (header.includes('\r')) {
      eol = '\r';
    }
  }

  const entriesBoundaries = getEntriesBoundaries(workingSdf, `${eol}$$$$`, eol);
  const molecules: Molecule[] = [];
  // Prototype-less registry: field names come from the file, so names such as
  // `__proto__` or `constructor` must behave like ordinary keys.
  const labels: Record<string, LabelInfo> = Object.create(null);
  const start = Date.now();

  for (const boundary of entriesBoundaries) {
    const sdfPart = workingSdf.slice(...boundary);
    if (sdfPart.length < 40) continue;
    const currentLabels: string[] = [];
    const molecule = getMolecule(sdfPart, labels, currentLabels, {
      eol,
      dynamicTyping,
      modifiers,
      forEach,
      include,
      exclude,
    });
    if (!molecule) continue;
    if (options.filter && !options.filter(molecule)) continue;
    molecules.push(molecule);
    // Counters only reflect molecules that made it into the result.
    for (const label of currentLabels) {
      labels[label].counter++;
    }
  }

  const statistics: LabelStatistic[] = [];
  for (const [label, info] of Object.entries(labels)) {
    if (info.isNumeric) {
      // Convert the values of numeric fields and track their range.
      let minValue = Infinity;
      let maxValue = -Infinity;
      for (const molecule of molecules) {
        const raw: unknown = Object.prototype.hasOwnProperty.call(
          molecule,
          label,
        )
          ? molecule[label]
          : undefined;
        // Skip absent/empty values, but keep a genuine numeric 0.
        if (raw === undefined || raw === null || raw === '') continue;
        const value = Number.parseFloat(String(raw));
        molecule[label] = value;
        if (value > maxValue) maxValue = value;
        if (value < minValue) minValue = value;
      }
      info.minValue = minValue;
      info.maxValue = maxValue;
    }
    statistics.push({
      label,
      counter: info.counter,
      isNumeric: info.isNumeric,
      keep: info.keep,
      minValue: info.minValue,
      maxValue: info.maxValue,
      always: info.counter === molecules.length,
    });
  }

  return {
    time: Date.now() - start,
    molecules,
    labels: Object.keys(labels),
    statistics,
  };
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import type { Molecule } from '../parse.ts';
|
|
2
|
+
|
|
3
|
+
/**
 * Bookkeeping record kept for each field label while an SDF file is parsed.
 */
export interface LabelInfo {
  /** Number of retained molecules in which this label was seen. */
  counter: number;
  /**
   * Whether all values seen so far for this label are numeric. Initialised
   * from the `dynamicTyping` option and cleared on the first non-numeric
   * value.
   */
  isNumeric: boolean;
  /** Whether this label passed the `include` / `exclude` selection. */
  keep: boolean;
  /** Minimum numeric value (set after all molecules are parsed). */
  minValue?: number;
  /** Maximum numeric value (set after all molecules are parsed). */
  maxValue?: number;
  /** Whether every molecule in the result contains this label. Set after parsing. */
  always?: boolean;
  /** Modifier registered for this label; applied to the raw string value. */
  modifier?: (value: string) => unknown;
  /** Callback registered for this label through the `forEach` option. */
  forEach?: (value: unknown) => void;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Fully resolved options handed to {@link getMolecule}. Unlike the public
 * parse options, `eol`, `dynamicTyping`, `modifiers` and `forEach` are
 * mandatory here.
 */
export interface GetMoleculeOptions {
  /** End-of-line sequence used to split the entry into fields and lines. */
  eol: string;
  /** Initial value of the numeric flag for every newly seen label. */
  dynamicTyping: boolean;
  /** Optional allow-list of field names. */
  include?: string[];
  /** Optional deny-list of field names. */
  exclude?: string[];
  /** Modifier functions keyed by field name. */
  modifiers: Record<string, (value: string) => unknown>;
  /** Callbacks keyed by field name; recorded on the label info. */
  forEach: Record<string, (value: unknown) => void>;
}
|
|
37
|
+
|
|
38
|
+
/** `true` when `key` is an own property of `target`; inherited names such as `constructor` are ignored. */
function hasOwnField(target: object, key: string): boolean {
  return Object.prototype.hasOwnProperty.call(target, key);
}

/** Create or overwrite an own, enumerable data property, including for special keys such as `__proto__`. */
function setOwnField(target: object, key: string, value: unknown): void {
  Object.defineProperty(target, key, {
    value,
    writable: true,
    enumerable: true,
    configurable: true,
  });
}

/** Whether a field value still allows its label to be treated as numeric. */
function isNumericValue(value: unknown): boolean {
  if (typeof value === 'number') return Number.isFinite(value);
  // Anything that is neither a number nor a string (missing value, object
  // returned by a modifier, ...) makes the label non-numeric.
  if (typeof value !== 'string') return false;
  // Strings starting with 0 followed by a digit (e.g. "007") are kept as text.
  return Number.isFinite(+value) && !/^0[0-9]/.test(value);
}

/**
 * Parse a single SDF entry string into a molecule object.
 *
 * Field names come straight from the file, so all lookups on `labels`,
 * `modifiers`, `forEach` and the molecule itself are own-property lookups:
 * a field called `constructor` or `__proto__` is handled like any other name.
 *
 * @param sdfPart - A single SDF record (everything before the `$$$$` line).
 * @param labels - Shared label tracking object, mutated in place.
 * @param currentLabels - Array to collect label names found in this entry.
 * @param options - Resolved parse options.
 * @returns The molecule object, or `undefined` if the entry is too short.
 */
export function getMolecule(
  sdfPart: string,
  labels: Record<string, LabelInfo>,
  currentLabels: string[],
  options: GetMoleculeOptions,
): Molecule | undefined {
  const { eol, dynamicTyping, include, exclude, modifiers, forEach } = options;
  const parts = sdfPart.split(`${eol}>`);
  if (parts.length === 0 || parts[0].length <= 5) return undefined;
  const molecule: Molecule = { molfile: parts[0] + eol };

  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(eol);
    const header = lines[0];
    const label = header.slice(header.indexOf('<') + 1, header.indexOf('>'));
    currentLabels.push(label);

    let info: LabelInfo | undefined = hasOwnField(labels, label)
      ? labels[label]
      : undefined;
    if (info === undefined) {
      // First time this label is seen: register it and resolve its selection.
      info = { counter: 0, isNumeric: dynamicTyping, keep: false };
      if (!exclude?.includes(label) && (!include || include.includes(label))) {
        info.keep = true;
        if (hasOwnField(modifiers, label) && modifiers[label]) {
          info.modifier = modifiers[label];
        }
        if (hasOwnField(forEach, label) && forEach[label]) {
          info.forEach = forEach[label];
        }
      }
      setOwnField(labels, label, info);
    }
    if (!info.keep) continue;

    // Value lines sit between the header line and the trailing blank line.
    // A label repeated within one entry continues the previous value.
    let value: unknown = hasOwnField(molecule, label)
      ? molecule[label]
      : undefined;
    for (let k = 1; k < lines.length - 1; k++) {
      value = value ? `${value as string}${eol}${lines[k]}` : lines[k];
    }
    if (lines.length > 2) setOwnField(molecule, label, value);

    if (info.modifier) {
      const modifiedValue = info.modifier(value as string);
      if (modifiedValue === undefined || modifiedValue === null) {
        // eslint-disable-next-line @typescript-eslint/no-dynamic-delete
        delete molecule[label];
        value = undefined;
      } else {
        setOwnField(molecule, label, modifiedValue);
        value = modifiedValue;
      }
    }

    // A modifier may have produced a non-string, so the check must not
    // assume string methods are available on the value.
    if (info.isNumeric && !isNumericValue(value)) {
      info.isNumeric = false;
    }
  }

  return molecule;
}
|
package/src/MolfileStream.js
DELETED
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
/**
 * Transform stream that cuts incoming SDF text into one string per record.
 *
 * Chunks are accumulated in an internal buffer. Each time a `$$$$` delimiter
 * followed by a line feed is found, the text before the delimiter (without
 * the line ending that precedes it) is emitted, provided it is longer than
 * 40 characters. Whatever remains when the stream ends is emitted under the
 * same length condition.
 */
export class MolfileStream extends TransformStream {
  /** Text received so far that has not been emitted as a record yet. */
  #buffer = '';

  constructor() {
    super({
      transform: (chunk, controller) => {
        this.#buffer += chunk;
        let begin = 0;
        let searchFrom = 0;
        for (;;) {
          const delimiter = this.#buffer.indexOf('$$$$', searchFrom);
          if (delimiter === -1) break;
          // The delimiter line must be complete; otherwise wait for the next chunk.
          const lineFeed = this.#buffer.indexOf('\n', delimiter);
          if (lineFeed === -1) break;
          const eolLength = this.#buffer[lineFeed - 1] === '\r' ? 2 : 1;
          // Drop the eol before the delimiter: getMolecule splits on eol + '>'.
          const end = delimiter - eolLength;
          if (end - begin > 40) {
            controller.enqueue(this.#buffer.slice(begin, end));
          }
          // `lineFeed` already points at the last character of the line
          // ending, so the next record starts right after it for both
          // '\n' and '\r\n'.
          begin = lineFeed + 1;
          searchFrom = begin;
        }
        this.#buffer = this.#buffer.slice(begin);
      },
      flush: (controller) => {
        if (this.#buffer && this.#buffer.length > 40) {
          controller.enqueue(this.#buffer);
        }
      },
    });
  }
}
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
/**
 * Locate the `[start, end]` index pairs of the entries in a delimited text.
 *
 * Each entry ends where `substring` begins; the following entry starts after
 * the first `eol` found behind that delimiter. When no further delimiter
 * exists, a last pair running to the end of the text is added. When a
 * delimiter is not followed by any `eol`, scanning stops without adding a
 * trailing pair.
 * @param {string} string - Full text to scan.
 * @param {string} substring - Delimiter marking the end of an entry.
 * @param {string} eol - End-of-line sequence terminating the delimiter line.
 * @returns {Array<[number, number]>} Boundaries usable with `String.prototype.slice`.
 */
export function getEntriesBoundaries(string, substring, eol) {
  const boundaries = [];
  let cursor = 0;
  for (;;) {
    const delimiterAt = string.indexOf(substring, cursor);
    if (delimiterAt === -1) {
      boundaries.push([cursor, string.length]);
      return boundaries;
    }
    boundaries.push([cursor, delimiterAt]);
    const lineEnd = string.indexOf(eol, delimiterAt + substring.length);
    if (lineEnd === -1) return boundaries;
    cursor = lineEnd + eol.length;
  }
}
|
package/src/index.js
DELETED
package/src/iterator.js
DELETED
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
import { parseString } from 'dynamic-typing';
|
|
2
|
-
|
|
3
|
-
import { MolfileStream } from './MolfileStream.js';
|
|
4
|
-
|
|
5
|
-
/**
 * Parse a SDF file as an iterator
 *
 * Records are cut out of the stream by `MolfileStream` and converted one by
 * one. Entries that `getMolecule` rejects as too short (it returns
 * `undefined`) are skipped, so neither the filter nor the consumer ever
 * receives `undefined`.
 * @param {ReadableStream} readStream - SDF file to parse
 * @param {object} [options={}] - iterator options
 * @param {Function} [options.filter] - Callback allowing to filter the molecules
 * @param {string} [options.eol='\n'] - End of line character used to split the fields of each record
 * @param {boolean} [options.dynamicTyping] - Dynamically type the data
 * @yields {object} - Molecule object
 */
export async function* iterator(readStream, options = {}) {
  const { eol = '\n', dynamicTyping = true } = options;

  // MolfileStream's constructor takes no arguments; it detects the line
  // ending around each `$$$$` delimiter by itself.
  const moleculeStream = readStream.pipeThrough(new MolfileStream());
  for await (const entry of moleculeStream) {
    const molecule = getMolecule(entry, { eol, dynamicTyping });
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      yield molecule;
    }
  }
}
|
|
28
|
-
|
|
29
|
-
/**
 * Convert a SDF part to an object
 *
 * The text before the first `eol + '>'` becomes `molfile`; each following
 * part is one data field whose name is read between `<` and `>` on its first
 * line and whose value is made of the lines between that header and the last
 * line of the part.
 * @param {string} sdfPart - text containing the molfile
 * @param {object} options - options
 * @param {string} options.eol - end of line character
 * @param {boolean} options.dynamicTyping - Dynamically type the data (values are passed through `parseString`)
 * @returns {object|undefined} the molecule, or `undefined` when the molfile part has 5 characters or fewer
 */
function getMolecule(sdfPart, options) {
  const { eol, dynamicTyping } = options;
  const [molfile, ...fields] = sdfPart.split(`${eol}>`);
  if (molfile.length <= 5) return;

  const molecule = { molfile: molfile + eol };
  for (const field of fields) {
    const lines = field.split(eol);
    const header = lines[0];
    const label = header.slice(header.indexOf('<') + 1, header.indexOf('>'));
    // Skip the header line and the last line of the part.
    for (const line of lines.slice(1, -1)) {
      molecule[label] = molecule[label] ? molecule[label] + eol + line : line;
    }
    if (dynamicTyping) {
      molecule[label] = parseString(molecule[label]);
    }
  }
  return molecule;
}
|
package/src/parse.js
DELETED
|
@@ -1,112 +0,0 @@
|
|
|
1
|
-
import { ensureString } from 'ensure-string';
|
|
2
|
-
|
|
3
|
-
import { getEntriesBoundaries } from './getEntriesBoundaries';
|
|
4
|
-
import { getMolecule } from './util/getMolecule';
|
|
5
|
-
/**
 * Parse a SDF file
 *
 * When `options.eol` is not given, the line ending is `'\n'` after
 * normalisation if `mixedEOL` is set; otherwise it is detected from the first
 * 1000 characters (`'\r\n'`, then `'\r'`, falling back to `'\n'`).
 * @param {string|ArrayBuffer|Uint8Array} sdf - SDF file to parse
 * @param {object} [options={}]
 * @param {string[]} [options.include] - List of fields to include
 * @param {string[]} [options.exclude] - List of fields to exclude
 * @param {Function} [options.filter] - Callback allowing to filter the molecules
 * @param {boolean} [options.dynamicTyping] - Dynamically type the data
 * @param {object} [options.modifiers] - Object containing callbacks to apply on some specific fields
 * @param {boolean} [options.mixedEOL=false] - Set to true if you know there is a mixture between \r\n and \n
 * @param {string} [options.eol] - Specify the end of line character. Default will be the one found in the file
 * @returns {object} - Object containing the molecules, the labels and the statistics
 * @throws {TypeError} When the input cannot be converted to a string
 */
export function parse(sdf, options = {}) {
  // Work on a shallow copy so the caller's options object is never mutated.
  options = { ...options };
  if (options.modifiers === undefined) options.modifiers = {};
  if (options.forEach === undefined) options.forEach = {};
  if (options.dynamicTyping === undefined) options.dynamicTyping = true;

  sdf = ensureString(sdf);
  if (typeof sdf !== 'string') {
    throw new TypeError('Parameter "sdf" must be a string');
  }

  if (options.eol === undefined) {
    options.eol = '\n';
    if (options.mixedEOL) {
      sdf = sdf.replaceAll('\r\n', '\n').replaceAll('\r', '\n');
    } else {
      // Substring search on purpose: a Set built from a string only holds
      // single characters and could never contain the two-character '\r\n'.
      const header = sdf.slice(0, 1000);
      if (header.includes('\r\n')) {
        options.eol = '\r\n';
      } else if (header.includes('\r')) {
        options.eol = '\r';
      }
    }
  }

  const entriesBoundaries = getEntriesBoundaries(
    sdf,
    `${options.eol}$$$$`,
    options.eol,
  );
  const molecules = [];
  // Prototype-less registry: field names come from the file, so names such as
  // `__proto__` or `constructor` must behave like ordinary keys.
  const labels = Object.create(null);

  const start = Date.now();

  for (const boundary of entriesBoundaries) {
    const sdfPart = sdf.slice(...boundary);
    if (sdfPart.length < 40) continue;
    const currentLabels = [];
    const molecule = getMolecule(sdfPart, labels, currentLabels, options);
    if (!molecule) continue;
    if (!options.filter || options.filter(molecule)) {
      molecules.push(molecule);
      // only now we can increase the counter
      for (const label of currentLabels) {
        labels[label].counter++;
      }
    }
  }

  // all numeric fields should be converted to numbers
  for (const label in labels) {
    const currentLabel = labels[label];
    if (!currentLabel.isNumeric) continue;
    currentLabel.minValue = Infinity;
    currentLabel.maxValue = -Infinity;
    for (const molecule of molecules) {
      if (!molecule[label]) continue;
      const value = Number.parseFloat(molecule[label]);
      molecule[label] = value;
      if (value > currentLabel.maxValue) currentLabel.maxValue = value;
      if (value < currentLabel.minValue) currentLabel.minValue = value;
    }
  }

  // a label is flagged `always` when it is present in all the kept records;
  // the label records themselves, tagged with their name, are the statistics
  const statistics = [];
  for (const key in labels) {
    const statistic = labels[key];
    statistic.always = statistic.counter === molecules.length;
    statistic.label = key;
    statistics.push(statistic);
  }

  return {
    time: Date.now() - start,
    molecules,
    labels: Object.keys(labels),
    statistics,
  };
}
|
package/src/util/getMolecule.js
DELETED
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
/**
 * Parse the molfile and the properties with > < labels >
 *
 * Field names come straight from the file, so every lookup on `labels`,
 * `options.modifiers`, `options.forEach` and the molecule is an own-property
 * lookup: a field called `constructor` or `__proto__` is handled like any
 * other name.
 * @param {string} sdfPart - a single SDF record
 * @param {object} labels - shared per-label tracking object, mutated in place
 * @param {string[]} currentLabels - receives the label names found in this record
 * @param {object} options - resolved parse options (`eol`, `dynamicTyping`, `include`, `exclude`, `modifiers`, `forEach`)
 * @returns {object|undefined} the molecule, or `undefined` when the molfile part has 5 characters or fewer
 */
export function getMolecule(sdfPart, labels, currentLabels, options) {
  const { eol, dynamicTyping, include, exclude, modifiers, forEach } = options;
  const owns = (target, key) =>
    Object.prototype.hasOwnProperty.call(target, key);
  // defineProperty creates an own data property even for keys like `__proto__`.
  const assign = (target, key, value) =>
    Object.defineProperty(target, key, {
      value,
      writable: true,
      enumerable: true,
      configurable: true,
    });

  const parts = sdfPart.split(`${eol}>`);
  if (parts.length === 0 || parts[0].length <= 5) return;
  const molecule = { molfile: parts[0] + eol };

  for (let j = 1; j < parts.length; j++) {
    const lines = parts[j].split(eol);
    const header = lines[0];
    const label = header.slice(header.indexOf('<') + 1, header.indexOf('>'));
    currentLabels.push(label);

    let info = owns(labels, label) ? labels[label] : undefined;
    if (!info) {
      // First time this label is seen: register it and resolve its selection.
      info = { counter: 0, isNumeric: dynamicTyping, keep: false };
      if (
        (!exclude || !exclude.includes(label)) &&
        (!include || include.includes(label))
      ) {
        info.keep = true;
        if (owns(modifiers, label) && modifiers[label]) {
          info.modifier = modifiers[label];
        }
        if (owns(forEach, label) && forEach[label]) {
          info.forEach = forEach[label];
        }
      }
      assign(labels, label, info);
    }
    if (!info.keep) continue;

    // Value lines sit between the header line and the trailing blank line.
    let value = owns(molecule, label) ? molecule[label] : undefined;
    for (let k = 1; k < lines.length - 1; k++) {
      value = value ? value + eol + lines[k] : lines[k];
    }
    if (lines.length > 2) assign(molecule, label, value);

    if (info.modifier) {
      const modifiedValue = info.modifier(value);
      if (modifiedValue === undefined || modifiedValue === null) {
        delete molecule[label];
        value = undefined;
      } else {
        assign(molecule, label, modifiedValue);
        value = modifiedValue;
      }
    }

    if (info.isNumeric) {
      // A modifier may have produced a non-string, so string methods are
      // only used on actual strings.
      let numeric = false;
      if (typeof value === 'number') {
        numeric = Number.isFinite(value);
      } else if (typeof value === 'string') {
        numeric = Number.isFinite(+value) && !/^0[0-9]/.test(value);
      }
      if (!numeric) info.isNumeric = false;
    }
  }
  return molecule;
}
|