@toxplanet/pegasus-sdk 1.2.11 → 1.2.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/lib/chemicals.js +68 -13
  2. package/package.json +1 -1
package/lib/chemicals.js CHANGED
@@ -9,6 +9,25 @@ const SEARCH_BOOST_PREFIX_SECONDARY = 10;
9
9
 
10
10
  const ALLOWED_IDENTIFIER_TYPES = new Set(['CAS', 'SMILES', 'InChI', 'InChIKey', 'PubChem', 'CID', 'DTXSID', 'EINECS', 'EC']);
11
11
 
12
// Heuristic: does the search term look like a chemical identifier (CAS, EC,
// InChIKey, PubChem CID, DTXSID) rather than a chemical name? Identifier-shaped
// terms are meant to be restricted to keyword matches so tokenized text-field
// matches on names do not add noise.
/**
 * @param {*} term - Raw search term. Falsy values are rejected; anything else
 *   is coerced with String() and trimmed before testing.
 * @returns {boolean} true when the trimmed term matches one of the identifier
 *   shapes below, false otherwise (including empty / whitespace-only input).
 */
function looksLikeIdentifier(term) {
  if (!term) return false;
  const candidate = String(term).trim();
  if (!candidate) return false;

  const identifierShapes = [
    // CAS / EC style: three digit groups joined by hyphens or slashes.
    // Deliberately loose (1-7, 1-3 and 1-2 digits), so it already covers EC
    // numbers such as 200-753-7; a separate, stricter EC pattern would never
    // change the outcome and is therefore not listed.
    /^\d{1,7}[-/]\d{1,3}[-/]\d{1,2}$/,
    // InChIKey: 14-10-1 uppercase letters separated by hyphens (case-sensitive).
    /^[A-Z]{14}-[A-Z]{10}-[A-Z]$/,
    // PubChem CID with its "CID" prefix (any case).
    /^CID\d+$/i,
    // DTXSID with its "DTXSID" prefix (any case).
    /^DTXSID\d+$/i,
  ];

  return identifierShapes.some((shape) => shape.test(candidate));
}
30
+
12
31
  // Equivalence groups for CDI identifier_key normalization. When any member of
13
32
  // a group is present on a row, the others are backfilled with the same values
14
33
  // so consumers can look up by their preferred spelling.
@@ -776,6 +795,10 @@ class ChemicalsService {
776
795
  query: {
777
796
  bool: {
778
797
  should: [
798
+ // v2 mapping: cas_numbers / identifier_values are keyword type.
799
+ { term: { 'cas_numbers': identifierValue } },
800
+ { term: { 'identifier_values': identifierValue } },
801
+ // Back-compat with legacy mapping where these were dynamically text + .keyword.
779
802
  { term: { 'cas_numbers.keyword': identifierValue } },
780
803
  { term: { 'identifier_values.keyword': identifierValue } }
781
804
  ],
@@ -924,6 +947,7 @@ class ChemicalsService {
924
947
  }
925
948
 
926
949
  try {
950
+ const isIdentifier = looksLikeIdentifier(searchTerm);
927
951
  const variations = getCasNumberVariations(searchTerm);
928
952
  const shouldClauses = [];
929
953
  for (const v of variations) {
@@ -935,12 +959,22 @@ class ChemicalsService {
935
959
  { term: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
936
960
  { prefix: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
937
961
  { term: { 'identifier_values': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
938
- { prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
939
- { match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
940
- { match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
941
- { match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
942
- { match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
962
+ { prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
943
963
  );
964
+ // Skip tokenized-text matches for identifier-shaped queries to avoid
965
+ // noise from the analyzer splitting "71-43-2" into 71/43/2 tokens.
966
+ if (!isIdentifier) {
967
+ shouldClauses.push(
968
+ // Whitespace-analyzer fields (preserve hyphenated compound names).
969
+ { match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
970
+ { match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
971
+ { match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
972
+ { match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
973
+ // Word-level (standard analyzer) sub-fields — let "ethane" still match "trichloroethane".
974
+ { match: { 'chemical_name.std': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
975
+ { match: { 'synonyms.std': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
976
+ );
977
+ }
944
978
  }
945
979
 
946
980
  const result = await this.connection.invokeOpenSearch({
@@ -989,6 +1023,7 @@ class ChemicalsService {
989
1023
  }
990
1024
 
991
1025
  try {
1026
+ const isIdentifier = looksLikeIdentifier(synonymTerm);
992
1027
  const variations = getCasNumberVariations(synonymTerm);
993
1028
  const shouldClauses = [];
994
1029
  for (const v of variations) {
@@ -1000,12 +1035,22 @@ class ChemicalsService {
1000
1035
  { term: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
1001
1036
  { prefix: { 'cas_numbers': { value: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
1002
1037
  { term: { 'identifier_values': { value: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
1003
- { prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
1004
- { match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
1005
- { match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
1006
- { match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
1007
- { match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
1038
+ { prefix: { 'identifier_values': { value: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
1008
1039
  );
1040
+ // Skip tokenized-text matches for identifier-shaped queries to avoid
1041
+ // noise from the analyzer splitting "71-43-2" into 71/43/2 tokens.
1042
+ if (!isIdentifier) {
1043
+ shouldClauses.push(
1044
+ // Whitespace-analyzer fields (preserve hyphenated compound names).
1045
+ { match: { 'synonyms': { query: v, boost: SEARCH_BOOST_EXACT_PRIMARY } } },
1046
+ { match: { 'chemical_name': { query: v, boost: SEARCH_BOOST_EXACT_SECONDARY } } },
1047
+ { match_phrase_prefix: { 'synonyms': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
1048
+ { match_phrase_prefix: { 'chemical_name': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } },
1049
+ // Word-level (standard analyzer) sub-fields — let "ethane" still match "trichloroethane".
1050
+ { match: { 'synonyms.std': { query: v, boost: SEARCH_BOOST_PREFIX_PRIMARY } } },
1051
+ { match: { 'chemical_name.std': { query: v, boost: SEARCH_BOOST_PREFIX_SECONDARY } } }
1052
+ );
1053
+ }
1009
1054
  }
1010
1055
 
1011
1056
  const result = await this.connection.invokeOpenSearch({
@@ -1305,7 +1350,7 @@ class ChemicalsService {
1305
1350
  search: async (params) => {
1306
1351
  let searchTerm = '';
1307
1352
  let limit = params.body?.size || 10;
1308
-
1353
+
1309
1354
  const toLegacySource = (r) => this._toLegacyChemicalSource({
1310
1355
  name: r.name,
1311
1356
  cas: r.cas || [],
@@ -1313,11 +1358,21 @@ class ChemicalsService {
1313
1358
  id: r.id
1314
1359
  });
1315
1360
 
1361
// Callers may pass an Elasticsearch query_string with reserved chars escaped
// (e.g. "71\-43\-2"). Strip a trailing wildcard and then the escapes so keyword
// term/prefix clauses against cas_numbers / identifier_values can match.
//
// raw: the query_string value (any type; falsy values yield '').
// Returns the unescaped string with one trailing *wildcard* removed. A trailing
// "*" that is itself escaped ("\*") is a literal asterisk and is kept.
const normalizeQueryString = (raw) => {
  if (!raw) return '';
  let text = String(raw);

  if (text.endsWith('*')) {
    // Count the backslashes immediately before the final "*". An odd count
    // means the asterisk is escaped (literal); an even count means the
    // backslashes pair up among themselves and the "*" is a real wildcard.
    let precedingBackslashes = 0;
    for (let i = text.length - 2; i >= 0 && text[i] === '\\'; i -= 1) {
      precedingBackslashes += 1;
    }
    if (precedingBackslashes % 2 === 0) {
      text = text.slice(0, -1);
    }
  }

  // Drop each escaping backslash, keeping the character it escaped.
  return text.replace(/\\(.)/g, '$1');
};
1370
+
1316
1371
  if (params.index === 'synonym_lookup_index') {
1317
1372
  const query = params.body?.query;
1318
1373
  searchTerm = query?.match?.chemical_name ||
1319
1374
  query?.term?.chemical_name ||
1320
- query?.query_string?.query || '';
1375
+ normalizeQueryString(query?.query_string?.query) || '';
1321
1376
  const searchResults = await this.searchBySynonym(searchTerm, limit);
1322
1377
 
1323
1378
  return {
@@ -1347,7 +1402,7 @@ class ChemicalsService {
1347
1402
  const query = params.body?.query;
1348
1403
  searchTerm = query?.match?.chemical_name ||
1349
1404
  query?.term?.chemical_name ||
1350
- query?.query_string?.query || '';
1405
+ normalizeQueryString(query?.query_string?.query) || '';
1351
1406
  const searchResults = await this.searchByName(searchTerm, limit);
1352
1407
 
1353
1408
  return {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@toxplanet/pegasus-sdk",
3
- "version": "1.2.11",
3
+ "version": "1.2.14",
4
4
  "description": "SDK for migrating chemical data to Pegasus PostgreSQL + OpenSearch architecture with Elasticsearch client compatibility",
5
5
  "main": "index.js",
6
6
  "type": "commonjs",