@toxplanet/pegasus-sdk 1.1.16 → 1.1.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/chemicals.js CHANGED
@@ -1,1029 +1,1088 @@
1
- const { logError, logInfo } = require('@toxplanet/tphelper/logging');
2
- const { getDrizzle, schema } = require('./db');
3
- const { eq, sql, and, inArray, arrayContains } = require('drizzle-orm');
4
- const { SQSClient, SendMessageCommand } = require('@aws-sdk/client-sqs');
5
-
6
// Relevance boosts attached to the OpenSearch `should` clauses built by the
// name/synonym search methods. "PRIMARY" is the field the caller asked for,
// "SECONDARY" is the fallback field; exact matches outrank prefix matches.
const SEARCH_BOOST_EXACT_PRIMARY = 100;
const SEARCH_BOOST_PREFIX_PRIMARY = 50;
const SEARCH_BOOST_EXACT_SECONDARY = 30;
const SEARCH_BOOST_PREFIX_SECONDARY = 10;

// Identifier keys accepted by getChemicalsByIdentifier(); any other key is
// rejected there before a query is issued.
const ALLOWED_IDENTIFIER_TYPES = new Set([
  'CAS',
  'SMILES',
  'InChI',
  'InChIKey',
  'PubChem',
  'DTXSID',
  'EINECS',
  'EC'
]);
12
-
13
/**
 * Escape the LIKE/ILIKE wildcard characters (`%`, `_`) and the escape
 * character itself (`\`) so the value is matched literally inside a pattern.
 *
 * Non-string values (for example numeric identifiers) are coerced with
 * String() instead of failing on a missing `.replace` method.
 *
 * @param {*} value - Value to embed in a LIKE pattern.
 * @returns {string} The value with `%`, `_` and `\` prefixed by a backslash.
 * @throws {TypeError} If value is null or undefined.
 */
function escapeLikePattern(value) {
  if (value === null || value === undefined) {
    throw new TypeError('escapeLikePattern: value must not be null or undefined');
  }
  return String(value).replace(/[%_\\]/g, '\\$&');
}
16
-
17
- class ChemicalsService {
18
- constructor(connection) {
19
- this.connection = connection;
20
- this.db = null;
21
- this.sqsClient = null;
22
- }
23
-
24
- getDb() {
25
- if (!this.db) {
26
- this.db = getDrizzle(this.connection.pgPool);
27
- }
28
- return this.db;
29
- }
30
-
31
- async sendSqlWriteFailure({ sql, parameters, error, retryCount, failedAt }) {
32
- try {
33
- const region = process.env.AWS_REGION || this.connection.region;
34
- const { awsAccountId, environment } = this.connection.config;
35
- const defaultQueueUrl = awsAccountId
36
- ? `https://sqs.${region}.amazonaws.com/${awsAccountId}/cr-pegasus-failed-items-${environment}`
37
- : null;
38
- const queueUrl = process.env.SQS_FAILED_ITEMS_QUEUE || defaultQueueUrl;
39
-
40
- if (!queueUrl) {
41
- logError('pegasus-sdk', 'sendSqlWriteFailure', 'No SQS queue URL available: set SQS_FAILED_ITEMS_QUEUE or provide awsAccountId in config');
42
- return false;
43
- }
44
-
45
- logInfo('pegasus-sdk', `[sendSqlWriteFailure] Using queue: ${queueUrl}${process.env.SQS_FAILED_ITEMS_QUEUE ? ' (from env)' : ' (default)'}`);
46
-
47
- if (!this.sqsClient) {
48
- this.sqsClient = new SQSClient({ region });
49
- }
50
-
51
- const message = {
52
- MessageType: 'SqlWriteFailure',
53
- SourceService: this.connection.config.sourceService || 'pegasus-sdk',
54
- Timestamp: (failedAt || new Date()).toISOString(),
55
- Sql: sql,
56
- Parameters: parameters,
57
- OriginalError: error.message,
58
- RetryCount: retryCount
59
- };
60
-
61
- const command = new SendMessageCommand({
62
- QueueUrl: queueUrl,
63
- MessageBody: JSON.stringify(message)
64
- });
65
-
66
- const response = await this.sqsClient.send(command);
67
- logInfo('pegasus-sdk', `[sendSqlWriteFailure] SqlWriteFailure posted to SQS: MessageId=${response.MessageId}, RetryCount=${retryCount}`);
68
- return true;
69
- } catch (sqsError) {
70
- logError('pegasus-sdk', 'sendSqlWriteFailure', 'Failed to post SqlWriteFailure to SQS', sqsError);
71
- return false;
72
- }
73
- }
74
-
75
- _buildChemicalUpsertSql(chemical) {
76
- const sql = [
77
- 'INSERT INTO chemicals (source_id, chemical_name, chemical_meta, chemical_identifiers, chemical_synonyms, chemical_categories, created_at, updated_at)',
78
- 'VALUES (@source_id, @chemical_name, @chemical_meta::jsonb, @chemical_identifiers::jsonb, @chemical_synonyms, @chemical_categories, @created_at, @updated_at)',
79
- 'ON CONFLICT (source_id) DO UPDATE SET',
80
- ' chemical_name = @chemical_name,',
81
- ' chemical_meta = @chemical_meta::jsonb,',
82
- ' chemical_identifiers = @chemical_identifiers::jsonb,',
83
- ' chemical_synonyms = @chemical_synonyms,',
84
- ' chemical_categories = @chemical_categories,',
85
- ' updated_at = @updated_at'
86
- ].join('\n');
87
-
88
- const serializeDate = (d) => d instanceof Date ? d.toISOString() : d;
89
-
90
- const parameters = {
91
- '@source_id': chemical.sourceId,
92
- '@chemical_name': chemical.chemicalName,
93
- '@chemical_meta': JSON.stringify(chemical.chemicalMeta ?? {}),
94
- '@chemical_identifiers': JSON.stringify(chemical.chemicalIdentifiers ?? {}),
95
- '@chemical_synonyms': JSON.stringify(chemical.chemicalSynonyms ?? []),
96
- '@chemical_categories': JSON.stringify(chemical.chemicalCategories ?? []),
97
- '@created_at': serializeDate(chemical.createdAt),
98
- '@updated_at': serializeDate(chemical.updatedAt)
99
- };
100
-
101
- return { sql, parameters };
102
- }
103
-
104
- async bulkIndexFielded(documents) {
105
- try {
106
- logInfo('pegasus-sdk', `[bulkIndexFielded] Starting bulk index with ${documents?.length || 0} documents`);
107
-
108
- if (!documents || documents.length === 0) {
109
- logInfo('pegasus-sdk', `[bulkIndexFielded] No documents provided, returning empty result`);
110
- return { indexed: 0, errors: [], results: [] };
111
- }
112
-
113
- const db = this.getDb();
114
- const results = [];
115
- const errors = [];
116
-
117
- logInfo('pegasus-sdk', `[bulkIndexFielded] Database connection established`);
118
-
119
- for (let i = 0; i < documents.length; i++) {
120
- const doc = documents[i];
121
- logInfo('pegasus-sdk', `[bulkIndexFielded] Processing document ${i}: source_id=${doc.source_id}, chemical_name=${doc.chemical_name}`);
122
-
123
- const parseDate = (dateValue) => {
124
- if (!dateValue) return new Date();
125
- if (dateValue instanceof Date) return dateValue;
126
- if (typeof dateValue === 'string') return new Date(dateValue);
127
- return new Date();
128
- };
129
-
130
- const chemical = {
131
- sourceId: doc.source_id || doc._id,
132
- chemicalName: doc.chemical_name || doc.name,
133
- chemicalMeta: doc.chemical_meta || {},
134
- chemicalIdentifiers: doc.chemical_identifiers || {},
135
- chemicalSynonyms: doc.chemical_synonyms || [],
136
- chemicalCategories: doc.chemical_categories || [],
137
- createdAt: parseDate(doc.created_at),
138
- updatedAt: parseDate(doc.updated_at),
139
- ...(doc.imported_at && { importedAt: doc.imported_at }),
140
- ...(doc.chemical_id && { chemicalId: doc.chemical_id })
141
- };
142
-
143
- logInfo('pegasus-sdk', `[bulkIndexFielded] Prepared chemical object: sourceId=${chemical.sourceId}, chemicalName=${chemical.chemicalName}`);
144
-
145
- const isConnectionError = (err) =>
146
- err.message?.toLowerCase().includes('timeout') ||
147
- err.message?.toLowerCase().includes('connection') ||
148
- err.code === 'ECONNREFUSED' ||
149
- err.code === 'ETIMEDOUT';
150
-
151
- const attemptUpsert = () =>
152
- db.insert(schema.chemicals)
153
- .values(chemical)
154
- .onConflictDoUpdate({
155
- target: schema.chemicals.sourceId,
156
- set: {
157
- chemicalName: chemical.chemicalName,
158
- chemicalMeta: chemical.chemicalMeta,
159
- chemicalIdentifiers: chemical.chemicalIdentifiers,
160
- chemicalSynonyms: chemical.chemicalSynonyms,
161
- chemicalCategories: chemical.chemicalCategories,
162
- updatedAt: new Date()
163
- }
164
- })
165
- .returning({
166
- chemicalId: schema.chemicals.chemicalId,
167
- sourceId: schema.chemicals.sourceId
168
- });
169
-
170
- let lastError = null;
171
- let retryCount = 0;
172
- const failedAt = new Date();
173
-
174
- try {
175
- const [result] = await attemptUpsert();
176
- logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully: ${result?.chemicalId || 'no ID returned'}`);
177
- results.push({ index: i, success: true, result });
178
- continue;
179
- } catch (firstErr) {
180
- lastError = firstErr;
181
- if (!isConnectionError(firstErr)) {
182
- logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} first attempt failed (${firstErr.message}), retrying once`);
183
- try {
184
- const [result] = await attemptUpsert();
185
- logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully on retry: ${result?.chemicalId || 'no ID returned'}`);
186
- results.push({ index: i, success: true, result });
187
- continue;
188
- } catch (retryErr) {
189
- lastError = retryErr;
190
- retryCount = 1;
191
- }
192
- }
193
- }
194
-
195
- logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} failed after ${retryCount} local retries (source_id=${chemical.sourceId})`, lastError);
196
-
197
- const { sql: failureSql, parameters: failureParams } = this._buildChemicalUpsertSql(chemical);
198
- const queued = await this.sendSqlWriteFailure({
199
- sql: failureSql,
200
- parameters: failureParams,
201
- error: lastError,
202
- retryCount,
203
- failedAt
204
- });
205
-
206
- if (queued) {
207
- logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} (source_id=${chemical.sourceId}) queued for repair via SQS`);
208
- } else {
209
- logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} (source_id=${chemical.sourceId}) failed and could not be queued — data loss risk`, lastError);
210
- }
211
-
212
- results.push({ index: i, success: false, error: lastError.message, queued });
213
- errors.push({ document: doc, error: lastError.message, queued });
214
- }
215
-
216
- const successCount = results.filter(r => r.success).length;
217
- const queuedCount = results.filter(r => !r.success && r.queued).length;
218
- logInfo('pegasus-sdk', `[bulkIndexFielded] Bulk index complete: ${successCount}/${documents.length} succeeded, ${queuedCount} queued for repair, ${errors.length - queuedCount} unhandled errors`);
219
-
220
- return { indexed: successCount, errors, results };
221
- } catch (error) {
222
- logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFielded', error);
223
- throw error;
224
- }
225
- }
226
-
227
- async bulkIndexFulltext(documents) {
228
- try {
229
- return { acknowledged: true, count: documents?.length || 0 };
230
- } catch (error) {
231
- logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFulltext', error);
232
- throw error;
233
- }
234
- }
235
-
236
- async bulkIndexSubstances(substances) {
237
- try {
238
- const documents = substances.map(substance => ({
239
- source_id: substance.substance_id || substance.id,
240
- chemical_name: substance.name || substance.substance_name,
241
- chemical_meta: substance.meta || {},
242
- chemical_identifiers: substance.identifiers || {},
243
- chemical_synonyms: substance.synonyms || [],
244
- chemical_categories: substance.categories || substance.substance_types || [],
245
- created_at: substance.created_at,
246
- updated_at: substance.updated_at,
247
- imported_at: substance.imported_at
248
- }));
249
-
250
- return await this.bulkIndexFielded(documents);
251
- } catch (error) {
252
- logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexSubstances', error);
253
- throw error;
254
- }
255
- }
256
-
257
- async createChemical(chemical) {
258
- try {
259
- const db = this.getDb();
260
-
261
- const [result] = await db
262
- .insert(schema.chemicals)
263
- .values({
264
- sourceId: chemical.source_id,
265
- chemicalName: chemical.chemical_name,
266
- chemicalMeta: chemical.chemical_meta,
267
- chemicalIdentifiers: chemical.chemical_identifiers,
268
- chemicalSynonyms: chemical.chemical_synonyms,
269
- chemicalCategories: chemical.chemical_categories,
270
- createdAt: chemical.created_at || new Date(),
271
- updatedAt: chemical.updated_at || new Date(),
272
- ...(chemical.imported_at && { importedAt: chemical.imported_at }),
273
- ...(chemical.chemical_id && { chemicalId: chemical.chemical_id })
274
- })
275
- .returning();
276
-
277
- return result;
278
- } catch (error) {
279
- logError('pegasus-sdk', 'ChemicalsService', 'createChemical', error);
280
- throw error;
281
- }
282
- }
283
-
284
- async updateChemical(chemicalId, updates) {
285
- try {
286
- const db = this.getDb();
287
-
288
- const updateData = {};
289
- if (updates.chemical_name) updateData.chemicalName = updates.chemical_name;
290
- if (updates.chemical_meta) updateData.chemicalMeta = updates.chemical_meta;
291
- if (updates.chemical_identifiers) updateData.chemicalIdentifiers = updates.chemical_identifiers;
292
- if (updates.chemical_synonyms) updateData.chemicalSynonyms = updates.chemical_synonyms;
293
- if (updates.chemical_categories) updateData.chemicalCategories = updates.chemical_categories;
294
- updateData.updatedAt = new Date();
295
-
296
- const [result] = await db
297
- .update(schema.chemicals)
298
- .set(updateData)
299
- .where(eq(schema.chemicals.chemicalId, chemicalId))
300
- .returning();
301
-
302
- return result || null;
303
- } catch (error) {
304
- logError('pegasus-sdk', 'ChemicalsService', 'updateChemical', error);
305
- throw error;
306
- }
307
- }
308
-
309
- async deleteChemical(chemicalId) {
310
- try {
311
- const db = this.getDb();
312
-
313
- const [deleted] = await db
314
- .delete(schema.chemicals)
315
- .where(eq(schema.chemicals.chemicalId, chemicalId))
316
- .returning();
317
-
318
- return deleted || null;
319
- } catch (error) {
320
- logError('pegasus-sdk', 'ChemicalsService', 'deleteChemical', error);
321
- throw error;
322
- }
323
- }
324
-
325
- async deleteBySourceId(sourceId) {
326
- try {
327
- const db = this.getDb();
328
-
329
- const [deleted] = await db
330
- .delete(schema.chemicals)
331
- .where(eq(schema.chemicals.sourceId, sourceId))
332
- .returning();
333
-
334
- return deleted || null;
335
- } catch (error) {
336
- logError('pegasus-sdk', 'ChemicalsService', 'deleteBySourceId', error);
337
- throw error;
338
- }
339
- }
340
-
341
- async deleteCollection(collectionName) {
342
- try {
343
- const db = this.getDb();
344
-
345
- const deleted = await db
346
- .delete(schema.chemicals)
347
- .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]))
348
- .returning();
349
-
350
- return { deletedCount: deleted.length, deleted };
351
- } catch (error) {
352
- logError('pegasus-sdk', 'ChemicalsService', 'deleteCollection', error);
353
- throw error;
354
- }
355
- }
356
-
357
- async updateCollectionProperty(collectionName, propertyPath, newValue) {
358
- try {
359
- const db = this.getDb();
360
- const pathArray = propertyPath.split('.');
361
- const valueJson = JSON.stringify(newValue);
362
-
363
- const results = await db
364
- .update(schema.chemicals)
365
- .set({
366
- chemicalMeta: sql`jsonb_set(${schema.chemicals.chemicalMeta}, ${pathArray}::text[], ${valueJson}::jsonb)`,
367
- updatedAt: new Date()
368
- })
369
- .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]))
370
- .returning();
371
-
372
- return { updatedCount: results.length, updated: results };
373
- } catch (error) {
374
- logError('pegasus-sdk', 'ChemicalsService', 'updateCollectionProperty', error);
375
- throw error;
376
- }
377
- }
378
-
379
- async bulkUpdateProperty(filter, propertyPath, newValue) {
380
- try {
381
- const db = this.getDb();
382
-
383
- let whereCondition = sql`1=1`;
384
-
385
- if (filter.chemicalIds && filter.chemicalIds.length > 0) {
386
- whereCondition = inArray(schema.chemicals.chemicalId, filter.chemicalIds);
387
- } else if (filter.sourceIds && filter.sourceIds.length > 0) {
388
- whereCondition = inArray(schema.chemicals.sourceId, filter.sourceIds);
389
- } else if (filter.category) {
390
- whereCondition = arrayContains(schema.chemicals.chemicalCategories, [filter.category]);
391
- }
392
-
393
- const pathArray = propertyPath.split('.');
394
- const valueJson = JSON.stringify(newValue);
395
-
396
- const results = await db
397
- .update(schema.chemicals)
398
- .set({
399
- chemicalMeta: sql`jsonb_set(COALESCE(${schema.chemicals.chemicalMeta}, '{}'), ${pathArray}::text[], ${valueJson}::jsonb)`,
400
- updatedAt: new Date()
401
- })
402
- .where(whereCondition)
403
- .returning();
404
-
405
- return { updatedCount: results.length, updated: results };
406
- } catch (error) {
407
- logError('pegasus-sdk', 'ChemicalsService', 'bulkUpdateProperty', error);
408
- throw error;
409
- }
410
- }
411
-
412
- async getChemicalById(chemicalId) {
413
- try {
414
- const db = this.getDb();
415
-
416
- const [result] = await db
417
- .select()
418
- .from(schema.chemicals)
419
- .where(eq(schema.chemicals.chemicalId, chemicalId))
420
- .limit(1);
421
-
422
- return result || null;
423
- } catch (error) {
424
- logError('pegasus-sdk', 'ChemicalsService', 'getChemicalById', error);
425
- throw error;
426
- }
427
- }
428
-
429
- async getChemicalBySourceId(sourceId) {
430
- try {
431
- const db = this.getDb();
432
-
433
- const [result] = await db
434
- .select()
435
- .from(schema.chemicals)
436
- .where(eq(schema.chemicals.sourceId, sourceId))
437
- .limit(1);
438
-
439
- return result || null;
440
- } catch (error) {
441
- logError('pegasus-sdk', 'ChemicalsService', 'getChemicalBySourceId', error);
442
- throw error;
443
- }
444
- }
445
-
446
- async getChemicalsByCAS(casNumber) {
447
- try {
448
- const db = this.getDb();
449
-
450
- const results = await db
451
- .select()
452
- .from(schema.chemicals)
453
- .where(sql`${schema.chemicals.chemicalIdentifiers}->>'CAS' = ${casNumber} OR ${schema.chemicals.chemicalIdentifiers}->'CAS' ? ${casNumber}`);
454
-
455
- return results;
456
- } catch (error) {
457
- logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByCAS', error);
458
- throw error;
459
- }
460
- }
461
-
462
- async getChemicalsByIdentifier(identifierType, identifierValue) {
463
- try {
464
- if (!ALLOWED_IDENTIFIER_TYPES.has(identifierType)) {
465
- throw new Error(`Invalid identifier type: ${identifierType}`);
466
- }
467
-
468
- const db = this.getDb();
469
-
470
- const results = await db
471
- .select()
472
- .from(schema.chemicals)
473
- .where(sql`${schema.chemicals.chemicalIdentifiers}->>${identifierType} = ${identifierValue} OR ${schema.chemicals.chemicalIdentifiers}->${identifierType} ? ${identifierValue}`);
474
-
475
- return results;
476
- } catch (error) {
477
- logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByIdentifier', error);
478
- throw error;
479
- }
480
- }
481
-
482
- async countByCollection(collectionName) {
483
- try {
484
- const db = this.getDb();
485
-
486
- const result = await db
487
- .select({ count: sql`count(*)::int` })
488
- .from(schema.chemicals)
489
- .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]));
490
-
491
- return { count: result[0].count };
492
- } catch (error) {
493
- logError('pegasus-sdk', 'ChemicalsService', 'countByCollection', error);
494
- throw error;
495
- }
496
- }
497
-
498
- async countByIdentifier(identifierValue) {
499
- try {
500
- const db = this.getDb();
501
-
502
- const searchPattern = `%${escapeLikePattern(identifierValue)}%`;
503
- const result = await db
504
- .select({ count: sql`count(*)::int` })
505
- .from(schema.chemicals)
506
- .where(sql`${schema.chemicals.chemicalIdentifiers}::text LIKE ${searchPattern}`);
507
-
508
- return { count: result[0].count };
509
- } catch (error) {
510
- logError('pegasus-sdk', 'ChemicalsService', 'countByIdentifier', error);
511
- throw error;
512
- }
513
- }
514
-
515
- async countByCAS(casNumber) {
516
- try {
517
- const db = this.getDb();
518
-
519
- const result = await db
520
- .select({ count: sql`count(*)::int` })
521
- .from(schema.chemicals)
522
- .where(sql`${schema.chemicals.chemicalIdentifiers}->>'CAS' = ${casNumber} OR ${schema.chemicals.chemicalIdentifiers}->'CAS' ? ${casNumber}`);
523
-
524
- return { count: result[0].count };
525
- } catch (error) {
526
- logError('pegasus-sdk', 'ChemicalsService', 'countByCAS', error);
527
- throw error;
528
- }
529
- }
530
-
531
- async getTotalSynonymCount() {
532
- try {
533
- const db = this.getDb();
534
-
535
- const result = await db
536
- .select({ count: sql`sum(array_length(${schema.chemicals.chemicalSynonyms}, 1))::int` })
537
- .from(schema.chemicals);
538
-
539
- return { count: result[0].count || 0 };
540
- } catch (error) {
541
- logError('pegasus-sdk', 'ChemicalsService', 'getTotalSynonymCount', error);
542
- throw error;
543
- }
544
- }
545
-
546
- async getSynonymCount(synonymTerm) {
547
- try {
548
- const db = this.getDb();
549
-
550
- const result = await db
551
- .select({ count: sql`count(*)::int` })
552
- .from(schema.chemicals)
553
- .where(arrayContains(schema.chemicals.chemicalSynonyms, [synonymTerm]));
554
-
555
- return { count: result[0].count };
556
- } catch (error) {
557
- logError('pegasus-sdk', 'ChemicalsService', 'getSynonymCount', error);
558
- throw error;
559
- }
560
- }
561
-
562
- async convertIdentifier(fromIdentifier, toIdentifierType) {
563
- try {
564
- const db = this.getDb();
565
-
566
- const searchPattern = `%${escapeLikePattern(fromIdentifier)}%`;
567
- const chemicals = await db
568
- .select()
569
- .from(schema.chemicals)
570
- .where(sql`${schema.chemicals.chemicalIdentifiers}::text LIKE ${searchPattern}`);
571
-
572
- if (chemicals.length === 0) {
573
- return null;
574
- }
575
-
576
- const chemical = chemicals[0];
577
- const identifiers = chemical.chemicalIdentifiers || {};
578
- const toIdentifier = identifiers[toIdentifierType];
579
-
580
- return {
581
- fromIdentifier,
582
- toIdentifierType,
583
- toIdentifier,
584
- chemicalId: chemical.chemicalId,
585
- chemicalName: chemical.chemicalName
586
- };
587
- } catch (error) {
588
- logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifier', error);
589
- throw error;
590
- }
591
- }
592
-
593
- async convertIdentifiersBatch(fromIdentifiers, toIdentifierType) {
594
- try {
595
- const conversions = await Promise.all(
596
- fromIdentifiers.map(fromIdentifier =>
597
- this.convertIdentifier(fromIdentifier, toIdentifierType)
598
- )
599
- );
600
-
601
- return conversions.filter(conversion => conversion !== null);
602
- } catch (error) {
603
- logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifiersBatch', error);
604
- throw error;
605
- }
606
- }
607
-
608
- /**
609
- * Search for chemicals by name using OpenSearch
610
- * @param {string} searchTerm - Name to search for
611
- * @param {number} limit - Maximum number of results (default: 10)
612
- * @returns {Promise<Object>} Search results
613
- */
614
- async searchByName(searchTerm, limit = 10) {
615
- if (!searchTerm) {
616
- return { results: [] };
617
- }
618
-
619
- try {
620
- const opensearchClient = this.connection.getOpenSearchClient();
621
- const indexName = this.connection.getOpenSearchIndex();
622
-
623
- const response = await opensearchClient.search({
624
- index: indexName,
625
- body: {
626
- size: limit,
627
- query: {
628
- bool: {
629
- should: [
630
- { term: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
631
- { prefix: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
632
- { term: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
633
- { prefix: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } }
634
- ],
635
- minimum_should_match: 1
636
- }
637
- },
638
- _source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
639
- }
640
- });
641
-
642
- const hits = response.body?.hits?.hits || [];
643
- const results = hits.map((hit) => ({
644
- id: hit._source.postgres_id,
645
- name: hit._source.chemical_name,
646
- cas: hit._source.cas_numbers || [],
647
- identifiers: hit._source.identifier_values || [],
648
- synonyms: hit._source.synonyms || [],
649
- score: hit._score
650
- }));
651
-
652
- return { results };
653
- } catch (error) {
654
- logError('pegasus-sdk', 'ChemicalsService', 'searchByName', error);
655
- throw error;
656
- }
657
- }
658
-
659
- /**
660
- * Search for chemicals by synonym using OpenSearch
661
- * @param {string} synonymTerm - Synonym to search for
662
- * @param {number} limit - Maximum number of results (default: 10)
663
- * @returns {Promise<Object>} Search results
664
- */
665
- async searchBySynonym(synonymTerm, limit = 10) {
666
- if (!synonymTerm) {
667
- return { results: [] };
668
- }
669
-
670
- try {
671
- const opensearchClient = this.connection.getOpenSearchClient();
672
- const indexName = this.connection.getOpenSearchIndex();
673
-
674
- const response = await opensearchClient.search({
675
- index: indexName,
676
- body: {
677
- size: limit,
678
- query: {
679
- bool: {
680
- should: [
681
- { term: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
682
- { prefix: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
683
- { term: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
684
- { prefix: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } }
685
- ],
686
- minimum_should_match: 1
687
- }
688
- },
689
- _source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
690
- }
691
- });
692
-
693
- const hits = response.body?.hits?.hits || [];
694
- const results = hits.map((hit) => ({
695
- id: hit._source.postgres_id,
696
- name: hit._source.chemical_name,
697
- cas: hit._source.cas_numbers || [],
698
- identifiers: hit._source.identifier_values || [],
699
- synonyms: hit._source.synonyms || [],
700
- score: hit._score
701
- }));
702
-
703
- return { results };
704
- } catch (error) {
705
- logError('pegasus-sdk', 'ChemicalsService', 'searchBySynonym', error);
706
- throw error;
707
- }
708
- }
709
-
710
- async countAll() {
711
- try {
712
- const db = this.getDb();
713
- const result = await db
714
- .select({ count: sql`count(*)::int` })
715
- .from(schema.chemicals);
716
- return { count: result[0].count };
717
- } catch (error) {
718
- logError('pegasus-sdk', 'ChemicalsService', 'countAll', error);
719
- throw error;
720
- }
721
- }
722
-
723
- async findChemicalsWithoutDocuments(collectionName, searchTerm, pageSize = 100) {
724
- try {
725
- const db = this.getDb();
726
-
727
- let whereConditions = [];
728
-
729
- if (collectionName) {
730
- whereConditions.push(arrayContains(schema.chemicals.chemicalCategories, [collectionName]));
731
- }
732
-
733
- if (searchTerm) {
734
- const searchPattern = `%${escapeLikePattern(searchTerm)}%`;
735
- whereConditions.push(sql`${schema.chemicals.chemicalName} ILIKE ${searchPattern}`);
736
- }
737
-
738
- const whereClause = whereConditions.length > 0 ? and(...whereConditions) : undefined;
739
-
740
- const results = await db
741
- .select()
742
- .from(schema.chemicals)
743
- .where(whereClause)
744
- .limit(pageSize);
745
-
746
- return results;
747
- } catch (error) {
748
- logError('pegasus-sdk', 'ChemicalsService', 'findChemicalsWithoutDocuments', error);
749
- throw error;
750
- }
751
- }
752
-
753
- async countChemicalsWithoutDocuments(collectionName) {
754
- try {
755
- const db = this.getDb();
756
-
757
- const whereClause = collectionName
758
- ? arrayContains(schema.chemicals.chemicalCategories, [collectionName])
759
- : undefined;
760
-
761
- const result = await db
762
- .select({ count: sql`count(*)::int` })
763
- .from(schema.chemicals)
764
- .where(whereClause);
765
-
766
- return { count: result[0].count };
767
- } catch (error) {
768
- logError('pegasus-sdk', 'ChemicalsService', 'countChemicalsWithoutDocuments', error);
769
- throw error;
770
- }
771
- }
772
-
773
- _buildEsHandlers() {
774
- return {
775
- index: async (params) => {
776
- const chemical = params.body;
777
- const result = await this.createChemical(chemical);
778
-
779
- return {
780
- _index: params.index,
781
- _id: result.chemicalId,
782
- _version: 1,
783
- result: 'created',
784
- _source: result
785
- };
786
- },
787
-
788
- bulk: async (params) => {
789
- const operations = params.body || params.operations;
790
-
791
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Starting bulk operation with ${operations?.length || 0} total operations`);
792
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Params index: ${params.index}`);
793
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Operations array type: ${Array.isArray(operations) ? 'array' : typeof operations}`);
794
-
795
- const cdiDocuments = [];
796
- let cdiOpCount = 0;
797
- let otherOpCount = 0;
798
-
799
- for (let i = 0; i < operations.length; i++) {
800
- const op = operations[i];
801
- const isIndexOp = !!(op.index || op.create);
802
- const indexName = op.index?._index || op.create?._index || op.delete?._index || op.update?._index;
803
-
804
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Op[${i}]: action=${Object.keys(op)[0] || 'unknown'}, index=${indexName}`);
805
-
806
- if ((op.index || op.create) &&
807
- (op.index?._index === 'chemical_data_index' || op.create?._index === 'chemical_data_index')) {
808
- const doc = operations[i + 1];
809
- const sourceId = op.index?._id || op.create?._id;
810
-
811
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Found CDI entry: sourceId=${sourceId}, hasDoc=${!!doc}`);
812
-
813
- if (doc && sourceId) {
814
- const cdiDoc = {
815
- source_id: sourceId,
816
- chemical_name: doc.chemical_primary_name || (doc.chemical_names && doc.chemical_names[0]) || null,
817
- chemical_meta: doc.chemical_meta || {},
818
- chemical_identifiers: doc.chemical_identifiers || {},
819
- chemical_synonyms: doc.chemical_synonyms || [],
820
- chemical_categories: doc.chemical_categories || [],
821
- created_at: doc.chemical_created_at ? (typeof doc.chemical_created_at === 'string' ? new Date(doc.chemical_created_at) : doc.chemical_created_at) : new Date(),
822
- updated_at: doc.chemical_updated_at ? (typeof doc.chemical_updated_at === 'string' ? new Date(doc.chemical_updated_at) : doc.chemical_updated_at) : new Date()
823
- };
824
- cdiDocuments.push(cdiDoc);
825
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Extracted CDI doc: ${JSON.stringify({ source_id: cdiDoc.source_id, chemical_name: cdiDoc.chemical_name })}`);
826
- i++;
827
- cdiOpCount++;
828
- } else {
829
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] CDI entry incomplete: sourceId=${sourceId}, doc=${!!doc}`);
830
- }
831
- } else {
832
- otherOpCount++;
833
- }
834
- }
835
-
836
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Scan complete: ${cdiOpCount} CDI docs found, ${otherOpCount} other operations skipped`);
837
-
838
- if (cdiDocuments.length === 0) {
839
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] No CDI documents to index, returning empty no-op response`);
840
- return { took: 0, errors: false, items: [] };
841
- }
842
-
843
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] Calling bulkIndexFielded with ${cdiDocuments.length} CDI documents`);
844
-
845
- try {
846
- const result = await this.bulkIndexFielded(cdiDocuments);
847
- logInfo('pegasus-sdk', `[ChemicalsService.bulk] bulkIndexFielded returned: indexed=${result.indexed}, errors=${result.errors.length}`);
848
-
849
- if (result.errors.length > 0) {
850
- logError('pegasus-sdk', 'ChemicalsService.bulk', 'Errors during bulk indexing', result.errors);
851
- }
852
-
853
- return {
854
- took: 1,
855
- errors: result.errors.length > 0,
856
- items: result.results.map((res, idx) => ({
857
- index: {
858
- _index: 'chemical_data_index',
859
- _id: cdiDocuments[idx].source_id,
860
- status: res.success ? 200 : 400,
861
- result: res.success ? 'created' : 'error',
862
- ...(res.success ? {} : { error: { type: 'mapper_parsing_exception', reason: res.error } })
863
- }
864
- }))
865
- };
866
- } catch (error) {
867
- logError('pegasus-sdk', 'ChemicalsService.bulk', 'Fatal error during bulk indexing', error);
868
- throw error;
869
- }
870
- },
871
-
872
- get: async (params) => {
873
- const result = await this.getChemicalBySourceId(params.id);
874
-
875
- if (!result) {
876
- return {
877
- _index: params.index,
878
- _id: params.id,
879
- found: false
880
- };
881
- }
882
-
883
- return {
884
- _index: params.index,
885
- _id: params.id,
886
- _version: 1,
887
- found: true,
888
- _source: result
889
- };
890
- },
891
-
892
- update: async (params) => {
893
- const result = await this.updateChemical(params.id, params.body);
894
-
895
- return {
896
- _index: params.index,
897
- _id: params.id,
898
- _version: 2,
899
- result: result ? 'updated' : 'noop',
900
- _source: result
901
- };
902
- },
903
-
904
- delete: async (params) => {
905
- if (params.index === 'synonym_lookup_index') {
906
- return { _index: params.index, _id: params.id, result: 'not_found' };
907
- }
908
- const result = await this.deleteBySourceId(params.id);
909
-
910
- return {
911
- _index: params.index,
912
- _id: params.id,
913
- result: result ? 'deleted' : 'not_found'
914
- };
915
- },
916
-
917
- deleteByQuery: async (params) => {
918
- const sourceId = params.body?.query?.term?.chemical_set_identifier
919
- || params.body?.query?.term?.source_id;
920
- if (!sourceId) {
921
- return { deleted: 0, failures: [] };
922
- }
923
- const result = await this.deleteBySourceId(sourceId);
924
- return {
925
- deleted: result ? 1 : 0,
926
- failures: []
927
- };
928
- },
929
-
930
- search: async (params) => {
931
- let searchTerm = '';
932
- let limit = params.body?.size || 10;
933
-
934
- if (params.index === 'synonym_lookup_index') {
935
- const query = params.body?.query;
936
- searchTerm = query?.match?.chemical_name ||
937
- query?.term?.chemical_name ||
938
- query?.query_string?.query || '';
939
- const searchResults = await this.searchBySynonym(searchTerm, limit);
940
-
941
- return {
942
- took: 1,
943
- timed_out: false,
944
- _shards: {
945
- total: 1,
946
- successful: 1,
947
- skipped: 0,
948
- failed: 0
949
- },
950
- hits: {
951
- total: {
952
- value: searchResults.results.length,
953
- relation: 'eq'
954
- },
955
- max_score: searchResults.results[0]?.score || 0,
956
- hits: searchResults.results.map(result => ({
957
- _index: params.index,
958
- _id: result.id,
959
- _score: result.score,
960
- _source: {
961
- postgres_id: result.id,
962
- chemical_name: result.name,
963
- cas_numbers: result.cas,
964
- identifier_values: result.identifiers,
965
- synonyms: result.synonyms
966
- }
967
- }))
968
- }
969
- };
970
- } else {
971
- const query = params.body?.query;
972
- searchTerm = query?.match?.chemical_name ||
973
- query?.term?.chemical_name ||
974
- query?.query_string?.query || '';
975
- const searchResults = await this.searchByName(searchTerm, limit);
976
-
977
- return {
978
- took: 1,
979
- timed_out: false,
980
- _shards: {
981
- total: 1,
982
- successful: 1,
983
- skipped: 0,
984
- failed: 0
985
- },
986
- hits: {
987
- total: {
988
- value: searchResults.results.length,
989
- relation: 'eq'
990
- },
991
- max_score: searchResults.results[0]?.score || 0,
992
- hits: searchResults.results.map(result => ({
993
- _index: params.index,
994
- _id: result.id,
995
- _score: result.score,
996
- _source: {
997
- postgres_id: result.id,
998
- chemical_name: result.name,
999
- cas_numbers: result.cas,
1000
- identifier_values: result.identifiers,
1001
- synonyms: result.synonyms
1002
- }
1003
- }))
1004
- }
1005
- };
1006
- }
1007
- },
1008
-
1009
- count: async (params) => {
1010
- if (params.index === 'synonym_lookup_index') {
1011
- return await this.getTotalSynonymCount();
1012
- }
1013
- return await this.countAll();
1014
- }
1015
- };
1016
- }
1017
-
1018
- registerElasticsearchHandlers(elasticsearchService) {
1019
- const configurablePatterns = this.connection.config.indexRoutes?.chemicals || ['chemicals*'];
1020
- const legacyPatterns = ['synonym_lookup_index', 'chemical_data_index', 'chemical_converter_index'];
1021
- const allPatterns = [...new Set([...configurablePatterns, ...legacyPatterns])];
1022
- const handlers = this._buildEsHandlers();
1023
- allPatterns.forEach(pattern => {
1024
- elasticsearchService.registerIndexRoute(pattern, handlers);
1025
- });
1026
- }
1027
- }
1028
-
1
+ const { logError, logInfo } = require('@toxplanet/tphelper/logging');
2
+ const { getDrizzle, schema } = require('./db');
3
+ const { eq, sql, and, inArray, arrayContains } = require('drizzle-orm');
4
+ const { SQSClient, SendMessageCommand } = require('@aws-sdk/client-sqs');
5
+
6
// Relevance boosts for the OpenSearch `should` clauses built by searchByName
// and searchBySynonym: exact matches outrank prefix matches, and the primary
// field outranks the secondary one.
const SEARCH_BOOST_EXACT_PRIMARY = 100;
const SEARCH_BOOST_PREFIX_PRIMARY = 50;
const SEARCH_BOOST_EXACT_SECONDARY = 30;
const SEARCH_BOOST_PREFIX_SECONDARY = 10;

// Identifier keys accepted by getChemicalsByIdentifier; any other key is rejected.
const ALLOWED_IDENTIFIER_TYPES = new Set([
  'CAS',
  'SMILES',
  'InChI',
  'InChIKey',
  'PubChem',
  'DTXSID',
  'EINECS',
  'EC'
]);
12
+
13
/**
 * Backslash-escapes the LIKE/ILIKE wildcards (`%`, `_`) and the backslash
 * itself so the value matches literally when embedded in a pattern.
 *
 * @param {string} value - Raw text to embed in a LIKE pattern.
 * @returns {string} The text with each special character prefixed by `\`.
 */
function escapeLikePattern(value) {
  return value.replace(/[\\%_]/g, (special) => `\\${special}`);
}
16
+
17
+ class ChemicalsService {
18
+ constructor(connection) {
19
+ this.connection = connection;
20
+ this.db = null;
21
+ this.sqsClient = null;
22
+ }
23
+
24
+ getDb() {
25
+ if (!this.db) {
26
+ this.db = getDrizzle(this.connection.pgPool);
27
+ }
28
+ return this.db;
29
+ }
30
+
31
+ async sendSqlWriteFailure({ sql, parameters, error, retryCount, failedAt }) {
32
+ try {
33
+ const region = process.env.AWS_REGION || this.connection.region;
34
+ const { awsAccountId, environment } = this.connection.config;
35
+ const defaultQueueUrl = awsAccountId
36
+ ? `https://sqs.${region}.amazonaws.com/${awsAccountId}/cr-pegasus-failed-items-${environment}`
37
+ : null;
38
+ const queueUrl = process.env.SQS_FAILED_ITEMS_QUEUE || defaultQueueUrl;
39
+
40
+ if (!queueUrl) {
41
+ logError('pegasus-sdk', 'sendSqlWriteFailure', 'No SQS queue URL available: set SQS_FAILED_ITEMS_QUEUE or provide awsAccountId in config');
42
+ return false;
43
+ }
44
+
45
+ logInfo('pegasus-sdk', `[sendSqlWriteFailure] Using queue: ${queueUrl}${process.env.SQS_FAILED_ITEMS_QUEUE ? ' (from env)' : ' (default)'}`);
46
+
47
+ if (!this.sqsClient) {
48
+ this.sqsClient = new SQSClient({ region });
49
+ }
50
+
51
+ const message = {
52
+ MessageType: 'SqlWriteFailure',
53
+ SourceService: this.connection.config.sourceService || 'pegasus-sdk',
54
+ Timestamp: (failedAt || new Date()).toISOString(),
55
+ Sql: sql,
56
+ Parameters: parameters,
57
+ OriginalError: error.message,
58
+ RetryCount: retryCount
59
+ };
60
+
61
+ const command = new SendMessageCommand({
62
+ QueueUrl: queueUrl,
63
+ MessageBody: JSON.stringify(message)
64
+ });
65
+
66
+ const response = await this.sqsClient.send(command);
67
+ logInfo('pegasus-sdk', `[sendSqlWriteFailure] SqlWriteFailure posted to SQS: MessageId=${response.MessageId}, RetryCount=${retryCount}`);
68
+ return true;
69
+ } catch (sqsError) {
70
+ logError('pegasus-sdk', 'sendSqlWriteFailure', 'Failed to post SqlWriteFailure to SQS', sqsError);
71
+ return false;
72
+ }
73
+ }
74
+
75
+ _buildChemicalUpsertSql(chemical) {
76
+ const sql = [
77
+ 'INSERT INTO chemicals (source_id, chemical_name, chemical_meta, chemical_identifiers, chemical_synonyms, chemical_categories, created_at, updated_at)',
78
+ 'VALUES (@source_id, @chemical_name, @chemical_meta::jsonb, @chemical_identifiers::jsonb, @chemical_synonyms, @chemical_categories, @created_at, @updated_at)',
79
+ 'ON CONFLICT (source_id) DO UPDATE SET',
80
+ ' chemical_name = @chemical_name,',
81
+ ' chemical_meta = @chemical_meta::jsonb,',
82
+ ' chemical_identifiers = @chemical_identifiers::jsonb,',
83
+ ' chemical_synonyms = @chemical_synonyms,',
84
+ ' chemical_categories = @chemical_categories,',
85
+ ' updated_at = @updated_at'
86
+ ].join('\n');
87
+
88
+ const serializeDate = (d) => d instanceof Date ? d.toISOString() : d;
89
+
90
+ const parameters = {
91
+ '@source_id': chemical.sourceId,
92
+ '@chemical_name': chemical.chemicalName,
93
+ '@chemical_meta': JSON.stringify(chemical.chemicalMeta ?? {}),
94
+ '@chemical_identifiers': JSON.stringify(chemical.chemicalIdentifiers ?? {}),
95
+ '@chemical_synonyms': JSON.stringify(chemical.chemicalSynonyms ?? []),
96
+ '@chemical_categories': JSON.stringify(chemical.chemicalCategories ?? []),
97
+ '@created_at': serializeDate(chemical.createdAt),
98
+ '@updated_at': serializeDate(chemical.updatedAt)
99
+ };
100
+
101
+ return { sql, parameters };
102
+ }
103
+
104
+ _buildDebugSql(chemical) {
105
+ const esc = (s) => `'${String(s ?? '').replace(/'/g, "''")}'`;
106
+ const escJson = (v) => `'${JSON.stringify(v ?? {}).replace(/'/g, "''")}'`;
107
+ const escArr = (arr) => {
108
+ if (!Array.isArray(arr) || arr.length === 0) return `ARRAY[]::text[]`;
109
+ return `ARRAY[${arr.map(s => esc(s)).join(', ')}]`;
110
+ };
111
+ const escDate = (d) => esc(d instanceof Date ? d.toISOString() : (d ?? new Date().toISOString()));
112
+
113
+ return [
114
+ `INSERT INTO chemicals`,
115
+ ` (source_id, chemical_name, chemical_meta, chemical_identifiers, chemical_synonyms, chemical_categories, created_at, updated_at)`,
116
+ `VALUES (`,
117
+ ` ${esc(chemical.sourceId)},`,
118
+ ` ${esc(chemical.chemicalName)},`,
119
+ ` ${escJson(chemical.chemicalMeta)}::jsonb,`,
120
+ ` ${escJson(chemical.chemicalIdentifiers)}::jsonb,`,
121
+ ` ${escArr(chemical.chemicalSynonyms)},`,
122
+ ` ${escArr(chemical.chemicalCategories)},`,
123
+ ` ${escDate(chemical.createdAt)},`,
124
+ ` ${escDate(chemical.updatedAt)}`,
125
+ `)`,
126
+ `ON CONFLICT (source_id) DO UPDATE SET`,
127
+ ` chemical_name = ${esc(chemical.chemicalName)},`,
128
+ ` chemical_meta = ${escJson(chemical.chemicalMeta)}::jsonb,`,
129
+ ` chemical_identifiers = ${escJson(chemical.chemicalIdentifiers)}::jsonb,`,
130
+ ` chemical_synonyms = ${escArr(chemical.chemicalSynonyms)},`,
131
+ ` chemical_categories = ${escArr(chemical.chemicalCategories)},`,
132
+ ` updated_at = NOW();`
133
+ ].join('\n');
134
+ }
135
+
136
+ async bulkIndexFielded(documents) {
137
+ try {
138
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Starting bulk index with ${documents?.length || 0} documents`);
139
+
140
+ if (!documents || documents.length === 0) {
141
+ logInfo('pegasus-sdk', `[bulkIndexFielded] No documents provided, returning empty result`);
142
+ return { indexed: 0, errors: [], results: [] };
143
+ }
144
+
145
+ // Proactively validate the connection before any real query fires.
146
+ // If idle too long, this reconnects first so the real query never faces
147
+ // the full connectionTimeoutMillis wait on a stale pool.
148
+ const reconnected = await this.connection.ensureConnected();
149
+ if (reconnected) {
150
+ this.db = null; // force getDb() to bind to the fresh pool
151
+ }
152
+
153
+ const db = this.getDb();
154
+ const results = [];
155
+ const errors = [];
156
+
157
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Database connection established`);
158
+
159
+ for (let i = 0; i < documents.length; i++) {
160
+ const doc = documents[i];
161
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Processing document ${i}: source_id=${doc.source_id}, chemical_name=${doc.chemical_name}`);
162
+
163
+ const parseDate = (dateValue) => {
164
+ if (!dateValue) return new Date();
165
+ if (dateValue instanceof Date) return dateValue;
166
+ if (typeof dateValue === 'string') return new Date(dateValue);
167
+ return new Date();
168
+ };
169
+
170
+ const chemical = {
171
+ sourceId: doc.source_id || doc._id,
172
+ chemicalName: doc.chemical_name || doc.name,
173
+ chemicalMeta: doc.chemical_meta || {},
174
+ chemicalIdentifiers: doc.chemical_identifiers || {},
175
+ chemicalSynonyms: doc.chemical_synonyms || [],
176
+ chemicalCategories: doc.chemical_categories || [],
177
+ createdAt: parseDate(doc.created_at),
178
+ updatedAt: parseDate(doc.updated_at),
179
+ ...(doc.imported_at && { importedAt: doc.imported_at }),
180
+ ...(doc.chemical_id && { chemicalId: doc.chemical_id })
181
+ };
182
+
183
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Prepared chemical object: sourceId=${chemical.sourceId}, chemicalName=${chemical.chemicalName}`);
184
+ logInfo('pegasus-sdk', `[bulkIndexFielded] DEBUG SQL for document ${i}:\n${this._buildDebugSql(chemical)}`);
185
+
186
+ const isConnectionError = (err) =>
187
+ err.message?.toLowerCase().includes('timeout') ||
188
+ err.message?.toLowerCase().includes('connection') ||
189
+ err.code === 'ECONNREFUSED' ||
190
+ err.code === 'ETIMEDOUT';
191
+
192
+ const attemptUpsert = () =>
193
+ db.insert(schema.chemicals)
194
+ .values(chemical)
195
+ .onConflictDoUpdate({
196
+ target: schema.chemicals.sourceId,
197
+ set: {
198
+ chemicalName: chemical.chemicalName,
199
+ chemicalMeta: chemical.chemicalMeta,
200
+ chemicalIdentifiers: chemical.chemicalIdentifiers,
201
+ chemicalSynonyms: chemical.chemicalSynonyms,
202
+ chemicalCategories: chemical.chemicalCategories,
203
+ updatedAt: new Date()
204
+ }
205
+ })
206
+ .returning({
207
+ chemicalId: schema.chemicals.chemicalId,
208
+ sourceId: schema.chemicals.sourceId
209
+ });
210
+
211
+ let lastError = null;
212
+ let retryCount = 0;
213
+ const failedAt = new Date();
214
+
215
+ try {
216
+ const [result] = await attemptUpsert();
217
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully: ${result?.chemicalId || 'no ID returned'}`);
218
+ this.connection.recordActivity();
219
+ results.push({ index: i, success: true, result });
220
+ continue;
221
+ } catch (firstErr) {
222
+ lastError = firstErr;
223
+
224
+ if (isConnectionError(firstErr)) {
225
+ // Stale pool — rebuild the connection and try once more before queuing
226
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} connection error (${firstErr.message}), reconnecting pool and retrying`);
227
+ try {
228
+ await this.connection.reconnect();
229
+ this.db = null; // force getDb() to bind to the new pool
230
+ const [result] = await attemptUpsert();
231
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully after reconnect: ${result?.chemicalId || 'no ID returned'}`);
232
+ this.connection.recordActivity();
233
+ results.push({ index: i, success: true, result });
234
+ continue;
235
+ } catch (reconnectErr) {
236
+ lastError = reconnectErr;
237
+ retryCount = 1;
238
+ }
239
+ } else {
240
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} first attempt failed (${firstErr.message}), retrying once`);
241
+ try {
242
+ const [result] = await attemptUpsert();
243
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} indexed successfully on retry: ${result?.chemicalId || 'no ID returned'}`);
244
+ this.connection.recordActivity();
245
+ results.push({ index: i, success: true, result });
246
+ continue;
247
+ } catch (retryErr) {
248
+ lastError = retryErr;
249
+ retryCount = 1;
250
+ }
251
+ }
252
+ }
253
+
254
+ logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} failed after ${retryCount} local retries (source_id=${chemical.sourceId})`, lastError);
255
+
256
+ const { sql: failureSql, parameters: failureParams } = this._buildChemicalUpsertSql(chemical);
257
+ const queued = await this.sendSqlWriteFailure({
258
+ sql: failureSql,
259
+ parameters: failureParams,
260
+ error: lastError,
261
+ retryCount,
262
+ failedAt
263
+ });
264
+
265
+ if (queued) {
266
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Document ${i} (source_id=${chemical.sourceId}) queued for repair via SQS`);
267
+ } else {
268
+ logError('pegasus-sdk', 'bulkIndexFielded', `Document ${i} (source_id=${chemical.sourceId}) failed and could not be queued — data loss risk`, lastError);
269
+ }
270
+
271
+ results.push({ index: i, success: false, error: lastError.message, queued });
272
+ errors.push({ document: doc, error: lastError.message, queued });
273
+ }
274
+
275
+ const successCount = results.filter(r => r.success).length;
276
+ const queuedCount = results.filter(r => !r.success && r.queued).length;
277
+ logInfo('pegasus-sdk', `[bulkIndexFielded] Bulk index complete: ${successCount}/${documents.length} succeeded, ${queuedCount} queued for repair, ${errors.length - queuedCount} unhandled errors`);
278
+
279
+ return { indexed: successCount, errors, results };
280
+ } catch (error) {
281
+ logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFielded', error);
282
+ throw error;
283
+ }
284
+ }
285
+
286
+ async bulkIndexFulltext(documents) {
287
+ try {
288
+ return { acknowledged: true, count: documents?.length || 0 };
289
+ } catch (error) {
290
+ logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexFulltext', error);
291
+ throw error;
292
+ }
293
+ }
294
+
295
+ async bulkIndexSubstances(substances) {
296
+ try {
297
+ const documents = substances.map(substance => ({
298
+ source_id: substance.substance_id || substance.id,
299
+ chemical_name: substance.name || substance.substance_name,
300
+ chemical_meta: substance.meta || {},
301
+ chemical_identifiers: substance.identifiers || {},
302
+ chemical_synonyms: substance.synonyms || [],
303
+ chemical_categories: substance.categories || substance.substance_types || [],
304
+ created_at: substance.created_at,
305
+ updated_at: substance.updated_at,
306
+ imported_at: substance.imported_at
307
+ }));
308
+
309
+ return await this.bulkIndexFielded(documents);
310
+ } catch (error) {
311
+ logError('pegasus-sdk', 'ChemicalsService', 'bulkIndexSubstances', error);
312
+ throw error;
313
+ }
314
+ }
315
+
316
+ async createChemical(chemical) {
317
+ try {
318
+ const db = this.getDb();
319
+
320
+ const [result] = await db
321
+ .insert(schema.chemicals)
322
+ .values({
323
+ sourceId: chemical.source_id,
324
+ chemicalName: chemical.chemical_name,
325
+ chemicalMeta: chemical.chemical_meta,
326
+ chemicalIdentifiers: chemical.chemical_identifiers,
327
+ chemicalSynonyms: chemical.chemical_synonyms,
328
+ chemicalCategories: chemical.chemical_categories,
329
+ createdAt: chemical.created_at || new Date(),
330
+ updatedAt: chemical.updated_at || new Date(),
331
+ ...(chemical.imported_at && { importedAt: chemical.imported_at }),
332
+ ...(chemical.chemical_id && { chemicalId: chemical.chemical_id })
333
+ })
334
+ .returning();
335
+
336
+ return result;
337
+ } catch (error) {
338
+ logError('pegasus-sdk', 'ChemicalsService', 'createChemical', error);
339
+ throw error;
340
+ }
341
+ }
342
+
343
+ async updateChemical(chemicalId, updates) {
344
+ try {
345
+ const db = this.getDb();
346
+
347
+ const updateData = {};
348
+ if (updates.chemical_name) updateData.chemicalName = updates.chemical_name;
349
+ if (updates.chemical_meta) updateData.chemicalMeta = updates.chemical_meta;
350
+ if (updates.chemical_identifiers) updateData.chemicalIdentifiers = updates.chemical_identifiers;
351
+ if (updates.chemical_synonyms) updateData.chemicalSynonyms = updates.chemical_synonyms;
352
+ if (updates.chemical_categories) updateData.chemicalCategories = updates.chemical_categories;
353
+ updateData.updatedAt = new Date();
354
+
355
+ const [result] = await db
356
+ .update(schema.chemicals)
357
+ .set(updateData)
358
+ .where(eq(schema.chemicals.chemicalId, chemicalId))
359
+ .returning();
360
+
361
+ return result || null;
362
+ } catch (error) {
363
+ logError('pegasus-sdk', 'ChemicalsService', 'updateChemical', error);
364
+ throw error;
365
+ }
366
+ }
367
+
368
+ async deleteChemical(chemicalId) {
369
+ try {
370
+ const db = this.getDb();
371
+
372
+ const [deleted] = await db
373
+ .delete(schema.chemicals)
374
+ .where(eq(schema.chemicals.chemicalId, chemicalId))
375
+ .returning();
376
+
377
+ return deleted || null;
378
+ } catch (error) {
379
+ logError('pegasus-sdk', 'ChemicalsService', 'deleteChemical', error);
380
+ throw error;
381
+ }
382
+ }
383
+
384
+ async deleteBySourceId(sourceId) {
385
+ try {
386
+ const db = this.getDb();
387
+
388
+ const [deleted] = await db
389
+ .delete(schema.chemicals)
390
+ .where(eq(schema.chemicals.sourceId, sourceId))
391
+ .returning();
392
+
393
+ return deleted || null;
394
+ } catch (error) {
395
+ logError('pegasus-sdk', 'ChemicalsService', 'deleteBySourceId', error);
396
+ throw error;
397
+ }
398
+ }
399
+
400
+ async deleteCollection(collectionName) {
401
+ try {
402
+ const db = this.getDb();
403
+
404
+ const deleted = await db
405
+ .delete(schema.chemicals)
406
+ .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]))
407
+ .returning();
408
+
409
+ return { deletedCount: deleted.length, deleted };
410
+ } catch (error) {
411
+ logError('pegasus-sdk', 'ChemicalsService', 'deleteCollection', error);
412
+ throw error;
413
+ }
414
+ }
415
+
416
+ async updateCollectionProperty(collectionName, propertyPath, newValue) {
417
+ try {
418
+ const db = this.getDb();
419
+ const pathArray = propertyPath.split('.');
420
+ const valueJson = JSON.stringify(newValue);
421
+
422
+ const results = await db
423
+ .update(schema.chemicals)
424
+ .set({
425
+ chemicalMeta: sql`jsonb_set(${schema.chemicals.chemicalMeta}, ${pathArray}::text[], ${valueJson}::jsonb)`,
426
+ updatedAt: new Date()
427
+ })
428
+ .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]))
429
+ .returning();
430
+
431
+ return { updatedCount: results.length, updated: results };
432
+ } catch (error) {
433
+ logError('pegasus-sdk', 'ChemicalsService', 'updateCollectionProperty', error);
434
+ throw error;
435
+ }
436
+ }
437
+
438
+ async bulkUpdateProperty(filter, propertyPath, newValue) {
439
+ try {
440
+ const db = this.getDb();
441
+
442
+ let whereCondition = sql`1=1`;
443
+
444
+ if (filter.chemicalIds && filter.chemicalIds.length > 0) {
445
+ whereCondition = inArray(schema.chemicals.chemicalId, filter.chemicalIds);
446
+ } else if (filter.sourceIds && filter.sourceIds.length > 0) {
447
+ whereCondition = inArray(schema.chemicals.sourceId, filter.sourceIds);
448
+ } else if (filter.category) {
449
+ whereCondition = arrayContains(schema.chemicals.chemicalCategories, [filter.category]);
450
+ }
451
+
452
+ const pathArray = propertyPath.split('.');
453
+ const valueJson = JSON.stringify(newValue);
454
+
455
+ const results = await db
456
+ .update(schema.chemicals)
457
+ .set({
458
+ chemicalMeta: sql`jsonb_set(COALESCE(${schema.chemicals.chemicalMeta}, '{}'), ${pathArray}::text[], ${valueJson}::jsonb)`,
459
+ updatedAt: new Date()
460
+ })
461
+ .where(whereCondition)
462
+ .returning();
463
+
464
+ return { updatedCount: results.length, updated: results };
465
+ } catch (error) {
466
+ logError('pegasus-sdk', 'ChemicalsService', 'bulkUpdateProperty', error);
467
+ throw error;
468
+ }
469
+ }
470
+
471
+ async getChemicalById(chemicalId) {
472
+ try {
473
+ const db = this.getDb();
474
+
475
+ const [result] = await db
476
+ .select()
477
+ .from(schema.chemicals)
478
+ .where(eq(schema.chemicals.chemicalId, chemicalId))
479
+ .limit(1);
480
+
481
+ return result || null;
482
+ } catch (error) {
483
+ logError('pegasus-sdk', 'ChemicalsService', 'getChemicalById', error);
484
+ throw error;
485
+ }
486
+ }
487
+
488
+ async getChemicalBySourceId(sourceId) {
489
+ try {
490
+ const db = this.getDb();
491
+
492
+ const [result] = await db
493
+ .select()
494
+ .from(schema.chemicals)
495
+ .where(eq(schema.chemicals.sourceId, sourceId))
496
+ .limit(1);
497
+
498
+ return result || null;
499
+ } catch (error) {
500
+ logError('pegasus-sdk', 'ChemicalsService', 'getChemicalBySourceId', error);
501
+ throw error;
502
+ }
503
+ }
504
+
505
+ async getChemicalsByCAS(casNumber) {
506
+ try {
507
+ const db = this.getDb();
508
+
509
+ const results = await db
510
+ .select()
511
+ .from(schema.chemicals)
512
+ .where(sql`${schema.chemicals.chemicalIdentifiers}->>'CAS' = ${casNumber} OR ${schema.chemicals.chemicalIdentifiers}->'CAS' ? ${casNumber}`);
513
+
514
+ return results;
515
+ } catch (error) {
516
+ logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByCAS', error);
517
+ throw error;
518
+ }
519
+ }
520
+
521
+ async getChemicalsByIdentifier(identifierType, identifierValue) {
522
+ try {
523
+ if (!ALLOWED_IDENTIFIER_TYPES.has(identifierType)) {
524
+ throw new Error(`Invalid identifier type: ${identifierType}`);
525
+ }
526
+
527
+ const db = this.getDb();
528
+
529
+ const results = await db
530
+ .select()
531
+ .from(schema.chemicals)
532
+ .where(sql`${schema.chemicals.chemicalIdentifiers}->>${identifierType} = ${identifierValue} OR ${schema.chemicals.chemicalIdentifiers}->${identifierType} ? ${identifierValue}`);
533
+
534
+ return results;
535
+ } catch (error) {
536
+ logError('pegasus-sdk', 'ChemicalsService', 'getChemicalsByIdentifier', error);
537
+ throw error;
538
+ }
539
+ }
540
+
541
+ async countByCollection(collectionName) {
542
+ try {
543
+ const db = this.getDb();
544
+
545
+ const result = await db
546
+ .select({ count: sql`count(*)::int` })
547
+ .from(schema.chemicals)
548
+ .where(arrayContains(schema.chemicals.chemicalCategories, [collectionName]));
549
+
550
+ return { count: result[0].count };
551
+ } catch (error) {
552
+ logError('pegasus-sdk', 'ChemicalsService', 'countByCollection', error);
553
+ throw error;
554
+ }
555
+ }
556
+
557
+ async countByIdentifier(identifierValue) {
558
+ try {
559
+ const db = this.getDb();
560
+
561
+ const searchPattern = `%${escapeLikePattern(identifierValue)}%`;
562
+ const result = await db
563
+ .select({ count: sql`count(*)::int` })
564
+ .from(schema.chemicals)
565
+ .where(sql`${schema.chemicals.chemicalIdentifiers}::text LIKE ${searchPattern}`);
566
+
567
+ return { count: result[0].count };
568
+ } catch (error) {
569
+ logError('pegasus-sdk', 'ChemicalsService', 'countByIdentifier', error);
570
+ throw error;
571
+ }
572
+ }
573
+
574
+ async countByCAS(casNumber) {
575
+ try {
576
+ const db = this.getDb();
577
+
578
+ const result = await db
579
+ .select({ count: sql`count(*)::int` })
580
+ .from(schema.chemicals)
581
+ .where(sql`${schema.chemicals.chemicalIdentifiers}->>'CAS' = ${casNumber} OR ${schema.chemicals.chemicalIdentifiers}->'CAS' ? ${casNumber}`);
582
+
583
+ return { count: result[0].count };
584
+ } catch (error) {
585
+ logError('pegasus-sdk', 'ChemicalsService', 'countByCAS', error);
586
+ throw error;
587
+ }
588
+ }
589
+
590
+ async getTotalSynonymCount() {
591
+ try {
592
+ const db = this.getDb();
593
+
594
+ const result = await db
595
+ .select({ count: sql`sum(array_length(${schema.chemicals.chemicalSynonyms}, 1))::int` })
596
+ .from(schema.chemicals);
597
+
598
+ return { count: result[0].count || 0 };
599
+ } catch (error) {
600
+ logError('pegasus-sdk', 'ChemicalsService', 'getTotalSynonymCount', error);
601
+ throw error;
602
+ }
603
+ }
604
+
605
+ async getSynonymCount(synonymTerm) {
606
+ try {
607
+ const db = this.getDb();
608
+
609
+ const result = await db
610
+ .select({ count: sql`count(*)::int` })
611
+ .from(schema.chemicals)
612
+ .where(arrayContains(schema.chemicals.chemicalSynonyms, [synonymTerm]));
613
+
614
+ return { count: result[0].count };
615
+ } catch (error) {
616
+ logError('pegasus-sdk', 'ChemicalsService', 'getSynonymCount', error);
617
+ throw error;
618
+ }
619
+ }
620
+
621
+ async convertIdentifier(fromIdentifier, toIdentifierType) {
622
+ try {
623
+ const db = this.getDb();
624
+
625
+ const searchPattern = `%${escapeLikePattern(fromIdentifier)}%`;
626
+ const chemicals = await db
627
+ .select()
628
+ .from(schema.chemicals)
629
+ .where(sql`${schema.chemicals.chemicalIdentifiers}::text LIKE ${searchPattern}`);
630
+
631
+ if (chemicals.length === 0) {
632
+ return null;
633
+ }
634
+
635
+ const chemical = chemicals[0];
636
+ const identifiers = chemical.chemicalIdentifiers || {};
637
+ const toIdentifier = identifiers[toIdentifierType];
638
+
639
+ return {
640
+ fromIdentifier,
641
+ toIdentifierType,
642
+ toIdentifier,
643
+ chemicalId: chemical.chemicalId,
644
+ chemicalName: chemical.chemicalName
645
+ };
646
+ } catch (error) {
647
+ logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifier', error);
648
+ throw error;
649
+ }
650
+ }
651
+
652
+ async convertIdentifiersBatch(fromIdentifiers, toIdentifierType) {
653
+ try {
654
+ const conversions = await Promise.all(
655
+ fromIdentifiers.map(fromIdentifier =>
656
+ this.convertIdentifier(fromIdentifier, toIdentifierType)
657
+ )
658
+ );
659
+
660
+ return conversions.filter(conversion => conversion !== null);
661
+ } catch (error) {
662
+ logError('pegasus-sdk', 'ChemicalsService', 'convertIdentifiersBatch', error);
663
+ throw error;
664
+ }
665
+ }
666
+
667
+ /**
668
+ * Search for chemicals by name using OpenSearch
669
+ * @param {string} searchTerm - Name to search for
670
+ * @param {number} limit - Maximum number of results (default: 10)
671
+ * @returns {Promise<Object>} Search results
672
+ */
673
+ async searchByName(searchTerm, limit = 10) {
674
+ if (!searchTerm) {
675
+ return { results: [] };
676
+ }
677
+
678
+ try {
679
+ const opensearchClient = this.connection.getOpenSearchClient();
680
+ const indexName = this.connection.getOpenSearchIndex();
681
+
682
+ const response = await opensearchClient.search({
683
+ index: indexName,
684
+ body: {
685
+ size: limit,
686
+ query: {
687
+ bool: {
688
+ should: [
689
+ { term: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
690
+ { prefix: { 'chemical_name.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
691
+ { term: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
692
+ { prefix: { 'synonyms.keyword': { value: searchTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } }
693
+ ],
694
+ minimum_should_match: 1
695
+ }
696
+ },
697
+ _source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
698
+ }
699
+ });
700
+
701
+ const hits = response.body?.hits?.hits || [];
702
+ const results = hits.map((hit) => ({
703
+ id: hit._source.postgres_id,
704
+ name: hit._source.chemical_name,
705
+ cas: hit._source.cas_numbers || [],
706
+ identifiers: hit._source.identifier_values || [],
707
+ synonyms: hit._source.synonyms || [],
708
+ score: hit._score
709
+ }));
710
+
711
+ return { results };
712
+ } catch (error) {
713
+ logError('pegasus-sdk', 'ChemicalsService', 'searchByName', error);
714
+ throw error;
715
+ }
716
+ }
717
+
718
+ /**
719
+ * Search for chemicals by synonym using OpenSearch
720
+ * @param {string} synonymTerm - Synonym to search for
721
+ * @param {number} limit - Maximum number of results (default: 10)
722
+ * @returns {Promise<Object>} Search results
723
+ */
724
+ async searchBySynonym(synonymTerm, limit = 10) {
725
+ if (!synonymTerm) {
726
+ return { results: [] };
727
+ }
728
+
729
+ try {
730
+ const opensearchClient = this.connection.getOpenSearchClient();
731
+ const indexName = this.connection.getOpenSearchIndex();
732
+
733
+ const response = await opensearchClient.search({
734
+ index: indexName,
735
+ body: {
736
+ size: limit,
737
+ query: {
738
+ bool: {
739
+ should: [
740
+ { term: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_PRIMARY, case_insensitive: true } } },
741
+ { prefix: { 'synonyms.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_PRIMARY, case_insensitive: true } } },
742
+ { term: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_EXACT_SECONDARY, case_insensitive: true } } },
743
+ { prefix: { 'chemical_name.keyword': { value: synonymTerm, boost: SEARCH_BOOST_PREFIX_SECONDARY, case_insensitive: true } } }
744
+ ],
745
+ minimum_should_match: 1
746
+ }
747
+ },
748
+ _source: ['postgres_id', 'chemical_name', 'cas_numbers', 'identifier_values', 'synonyms']
749
+ }
750
+ });
751
+
752
+ const hits = response.body?.hits?.hits || [];
753
+ const results = hits.map((hit) => ({
754
+ id: hit._source.postgres_id,
755
+ name: hit._source.chemical_name,
756
+ cas: hit._source.cas_numbers || [],
757
+ identifiers: hit._source.identifier_values || [],
758
+ synonyms: hit._source.synonyms || [],
759
+ score: hit._score
760
+ }));
761
+
762
+ return { results };
763
+ } catch (error) {
764
+ logError('pegasus-sdk', 'ChemicalsService', 'searchBySynonym', error);
765
+ throw error;
766
+ }
767
+ }
768
+
769
+ async countAll() {
770
+ try {
771
+ const db = this.getDb();
772
+ const result = await db
773
+ .select({ count: sql`count(*)::int` })
774
+ .from(schema.chemicals);
775
+ return { count: result[0].count };
776
+ } catch (error) {
777
+ logError('pegasus-sdk', 'ChemicalsService', 'countAll', error);
778
+ throw error;
779
+ }
780
+ }
781
+
782
+ async findChemicalsWithoutDocuments(collectionName, searchTerm, pageSize = 100) {
783
+ try {
784
+ const db = this.getDb();
785
+
786
+ let whereConditions = [];
787
+
788
+ if (collectionName) {
789
+ whereConditions.push(arrayContains(schema.chemicals.chemicalCategories, [collectionName]));
790
+ }
791
+
792
+ if (searchTerm) {
793
+ const searchPattern = `%${escapeLikePattern(searchTerm)}%`;
794
+ whereConditions.push(sql`${schema.chemicals.chemicalName} ILIKE ${searchPattern}`);
795
+ }
796
+
797
+ const whereClause = whereConditions.length > 0 ? and(...whereConditions) : undefined;
798
+
799
+ const results = await db
800
+ .select()
801
+ .from(schema.chemicals)
802
+ .where(whereClause)
803
+ .limit(pageSize);
804
+
805
+ return results;
806
+ } catch (error) {
807
+ logError('pegasus-sdk', 'ChemicalsService', 'findChemicalsWithoutDocuments', error);
808
+ throw error;
809
+ }
810
+ }
811
+
812
+ async countChemicalsWithoutDocuments(collectionName) {
813
+ try {
814
+ const db = this.getDb();
815
+
816
+ const whereClause = collectionName
817
+ ? arrayContains(schema.chemicals.chemicalCategories, [collectionName])
818
+ : undefined;
819
+
820
+ const result = await db
821
+ .select({ count: sql`count(*)::int` })
822
+ .from(schema.chemicals)
823
+ .where(whereClause);
824
+
825
+ return { count: result[0].count };
826
+ } catch (error) {
827
+ logError('pegasus-sdk', 'ChemicalsService', 'countChemicalsWithoutDocuments', error);
828
+ throw error;
829
+ }
830
+ }
831
+
832
+ _buildEsHandlers() {
833
+ return {
834
+ index: async (params) => {
835
+ const chemical = params.body;
836
+ const result = await this.createChemical(chemical);
837
+
838
+ return {
839
+ _index: params.index,
840
+ _id: result.chemicalId,
841
+ _version: 1,
842
+ result: 'created',
843
+ _source: result
844
+ };
845
+ },
846
+
847
+ bulk: async (params) => {
848
+ const operations = params.body || params.operations;
849
+
850
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Starting bulk operation with ${operations?.length || 0} total operations`);
851
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Params index: ${params.index}`);
852
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Operations array type: ${Array.isArray(operations) ? 'array' : typeof operations}`);
853
+
854
+ const cdiDocuments = [];
855
+ let cdiOpCount = 0;
856
+ let otherOpCount = 0;
857
+
858
+ for (let i = 0; i < operations.length; i++) {
859
+ const op = operations[i];
860
+ const isIndexOp = !!(op.index || op.create);
861
+ const indexName = op.index?._index || op.create?._index || op.delete?._index || op.update?._index;
862
+
863
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Op[${i}]: action=${Object.keys(op)[0] || 'unknown'}, index=${indexName}`);
864
+
865
+ if ((op.index || op.create) &&
866
+ (op.index?._index === 'chemical_data_index' || op.create?._index === 'chemical_data_index')) {
867
+ const doc = operations[i + 1];
868
+ const sourceId = op.index?._id || op.create?._id;
869
+
870
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Found CDI entry: sourceId=${sourceId}, hasDoc=${!!doc}`);
871
+
872
+ if (doc && sourceId) {
873
+ const cdiDoc = {
874
+ source_id: sourceId,
875
+ chemical_name: doc.chemical_primary_name || (doc.chemical_names && doc.chemical_names[0]) || null,
876
+ chemical_meta: doc.chemical_meta || {},
877
+ chemical_identifiers: doc.chemical_identifiers || {},
878
+ chemical_synonyms: doc.chemical_synonyms || [],
879
+ chemical_categories: doc.chemical_categories || [],
880
+ created_at: doc.chemical_created_at ? (typeof doc.chemical_created_at === 'string' ? new Date(doc.chemical_created_at) : doc.chemical_created_at) : new Date(),
881
+ updated_at: doc.chemical_updated_at ? (typeof doc.chemical_updated_at === 'string' ? new Date(doc.chemical_updated_at) : doc.chemical_updated_at) : new Date()
882
+ };
883
+ cdiDocuments.push(cdiDoc);
884
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Extracted CDI doc: ${JSON.stringify({ source_id: cdiDoc.source_id, chemical_name: cdiDoc.chemical_name })}`);
885
+ i++;
886
+ cdiOpCount++;
887
+ } else {
888
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] CDI entry incomplete: sourceId=${sourceId}, doc=${!!doc}`);
889
+ }
890
+ } else {
891
+ otherOpCount++;
892
+ }
893
+ }
894
+
895
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Scan complete: ${cdiOpCount} CDI docs found, ${otherOpCount} other operations skipped`);
896
+
897
+ if (cdiDocuments.length === 0) {
898
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] No CDI documents to index, returning empty no-op response`);
899
+ return { took: 0, errors: false, items: [] };
900
+ }
901
+
902
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] Calling bulkIndexFielded with ${cdiDocuments.length} CDI documents`);
903
+
904
+ try {
905
+ const result = await this.bulkIndexFielded(cdiDocuments);
906
+ logInfo('pegasus-sdk', `[ChemicalsService.bulk] bulkIndexFielded returned: indexed=${result.indexed}, errors=${result.errors.length}`);
907
+
908
+ if (result.errors.length > 0) {
909
+ logError('pegasus-sdk', 'ChemicalsService.bulk', 'Errors during bulk indexing', result.errors);
910
+ }
911
+
912
+ return {
913
+ took: 1,
914
+ errors: result.errors.length > 0,
915
+ items: result.results.map((res, idx) => ({
916
+ index: {
917
+ _index: 'chemical_data_index',
918
+ _id: cdiDocuments[idx].source_id,
919
+ status: res.success ? 200 : 400,
920
+ result: res.success ? 'created' : 'error',
921
+ ...(res.success ? {} : { error: { type: 'mapper_parsing_exception', reason: res.error } })
922
+ }
923
+ }))
924
+ };
925
+ } catch (error) {
926
+ logError('pegasus-sdk', 'ChemicalsService.bulk', 'Fatal error during bulk indexing', error);
927
+ throw error;
928
+ }
929
+ },
930
+
931
+ get: async (params) => {
932
+ const result = await this.getChemicalBySourceId(params.id);
933
+
934
+ if (!result) {
935
+ return {
936
+ _index: params.index,
937
+ _id: params.id,
938
+ found: false
939
+ };
940
+ }
941
+
942
+ return {
943
+ _index: params.index,
944
+ _id: params.id,
945
+ _version: 1,
946
+ found: true,
947
+ _source: result
948
+ };
949
+ },
950
+
951
+ update: async (params) => {
952
+ const result = await this.updateChemical(params.id, params.body);
953
+
954
+ return {
955
+ _index: params.index,
956
+ _id: params.id,
957
+ _version: 2,
958
+ result: result ? 'updated' : 'noop',
959
+ _source: result
960
+ };
961
+ },
962
+
963
+ delete: async (params) => {
964
+ if (params.index === 'synonym_lookup_index') {
965
+ return { _index: params.index, _id: params.id, result: 'not_found' };
966
+ }
967
+ const result = await this.deleteBySourceId(params.id);
968
+
969
+ return {
970
+ _index: params.index,
971
+ _id: params.id,
972
+ result: result ? 'deleted' : 'not_found'
973
+ };
974
+ },
975
+
976
+ deleteByQuery: async (params) => {
977
+ const sourceId = params.body?.query?.term?.chemical_set_identifier
978
+ || params.body?.query?.term?.source_id;
979
+ if (!sourceId) {
980
+ return { deleted: 0, failures: [] };
981
+ }
982
+ const result = await this.deleteBySourceId(sourceId);
983
+ return {
984
+ deleted: result ? 1 : 0,
985
+ failures: []
986
+ };
987
+ },
988
+
989
+ search: async (params) => {
990
+ let searchTerm = '';
991
+ let limit = params.body?.size || 10;
992
+
993
+ if (params.index === 'synonym_lookup_index') {
994
+ const query = params.body?.query;
995
+ searchTerm = query?.match?.chemical_name ||
996
+ query?.term?.chemical_name ||
997
+ query?.query_string?.query || '';
998
+ const searchResults = await this.searchBySynonym(searchTerm, limit);
999
+
1000
+ return {
1001
+ took: 1,
1002
+ timed_out: false,
1003
+ _shards: {
1004
+ total: 1,
1005
+ successful: 1,
1006
+ skipped: 0,
1007
+ failed: 0
1008
+ },
1009
+ hits: {
1010
+ total: {
1011
+ value: searchResults.results.length,
1012
+ relation: 'eq'
1013
+ },
1014
+ max_score: searchResults.results[0]?.score || 0,
1015
+ hits: searchResults.results.map(result => ({
1016
+ _index: params.index,
1017
+ _id: result.id,
1018
+ _score: result.score,
1019
+ _source: {
1020
+ postgres_id: result.id,
1021
+ chemical_name: result.name,
1022
+ cas_numbers: result.cas,
1023
+ identifier_values: result.identifiers,
1024
+ synonyms: result.synonyms
1025
+ }
1026
+ }))
1027
+ }
1028
+ };
1029
+ } else {
1030
+ const query = params.body?.query;
1031
+ searchTerm = query?.match?.chemical_name ||
1032
+ query?.term?.chemical_name ||
1033
+ query?.query_string?.query || '';
1034
+ const searchResults = await this.searchByName(searchTerm, limit);
1035
+
1036
+ return {
1037
+ took: 1,
1038
+ timed_out: false,
1039
+ _shards: {
1040
+ total: 1,
1041
+ successful: 1,
1042
+ skipped: 0,
1043
+ failed: 0
1044
+ },
1045
+ hits: {
1046
+ total: {
1047
+ value: searchResults.results.length,
1048
+ relation: 'eq'
1049
+ },
1050
+ max_score: searchResults.results[0]?.score || 0,
1051
+ hits: searchResults.results.map(result => ({
1052
+ _index: params.index,
1053
+ _id: result.id,
1054
+ _score: result.score,
1055
+ _source: {
1056
+ postgres_id: result.id,
1057
+ chemical_name: result.name,
1058
+ cas_numbers: result.cas,
1059
+ identifier_values: result.identifiers,
1060
+ synonyms: result.synonyms
1061
+ }
1062
+ }))
1063
+ }
1064
+ };
1065
+ }
1066
+ },
1067
+
1068
+ count: async (params) => {
1069
+ if (params.index === 'synonym_lookup_index') {
1070
+ return await this.getTotalSynonymCount();
1071
+ }
1072
+ return await this.countAll();
1073
+ }
1074
+ };
1075
+ }
1076
+
1077
+ registerElasticsearchHandlers(elasticsearchService) {
1078
+ const configurablePatterns = this.connection.config.indexRoutes?.chemicals || ['chemicals*'];
1079
+ const legacyPatterns = ['synonym_lookup_index', 'chemical_data_index', 'chemical_converter_index'];
1080
+ const allPatterns = [...new Set([...configurablePatterns, ...legacyPatterns])];
1081
+ const handlers = this._buildEsHandlers();
1082
+ allPatterns.forEach(pattern => {
1083
+ elasticsearchService.registerIndexRoute(pattern, handlers);
1084
+ });
1085
+ }
1086
+ }
1087
+
1029
1088
  module.exports = ChemicalsService;