pelias-schema 7.2.0 → 8.1.0

This diff shows the changes between publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
@@ -6,12 +6,13 @@ jobs:
     strategy:
       matrix:
         os:
-          - ${{ vars.UBUNTU_VERSION }}
+          - ubuntu-22.04
         node-version: [18.x, 20.x, 22.x]
         es-version: [7.6.1]
         jdk-version: [oraclejdk11]
+        icuTokenizer: [true, false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install node.js ${{ matrix.node-version }}
         uses: actions/setup-node@v4
         with:
@@ -23,6 +24,10 @@ jobs:
         run: ./scripts/setup_ci.sh
       - name: Run integration tests
         run: |
+          if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
+            jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
+            export PELIAS_CONFIG=$(pwd)/config-icu.json
+          fi
          npm install
          curl http://127.0.0.1:9200/
          ./bin/create_index
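
The integration workflow above (and the unit-test workflow below) gains an `icuTokenizer` matrix dimension, so every job now runs once with the ICU tokenizer enabled and once without. When the flag is true, the job writes a throwaway pelias config with `jq` and points `PELIAS_CONFIG` at it. A minimal sketch of how that file is picked up on the Node side (the script name is hypothetical; `pelias-config` is the same module used throughout this diff):

    // check-icu.js (hypothetical) — run as: PELIAS_CONFIG=$(pwd)/config-icu.json node check-icu.js
    const config = require('pelias-config').generate();

    // pelias-config merges the file named by PELIAS_CONFIG over its defaults, so the
    // { schema: { icuTokenizer: true } } written by the workflow ends up here:
    console.log('ICU tokenizer enabled:', config.schema.icuTokenizer === true);
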
@@ -6,10 +6,11 @@ jobs:
     strategy:
       matrix:
         os:
-          - ${{ vars.UBUNTU_VERSION }}
+          - ubuntu-22.04
         node-version: [18.x, 20.x, 22.x]
+        icuTokenizer: [true, false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install node.js ${{ matrix.node-version }}
         uses: actions/setup-node@v4
         with:
@@ -17,4 +18,8 @@ jobs:
       - name: Run unit tests
         run: |
           npm install
-          npm run test
+          if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
+            jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
+            export PELIAS_CONFIG=$(pwd)/config-icu.json
+          fi
+          npm run test
@@ -11,9 +11,9 @@ jobs:
   npm-publish:
     needs: [unit-tests, integration-tests]
     if: github.ref == 'refs/heads/master' && github.event_name == 'push'
-    runs-on: ${{ vars.UBUNTU_VERSION }}
+    runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install Node.js
         uses: actions/setup-node@v4
         with:
@@ -29,9 +29,9 @@ jobs:
     # note: github actions won't run a job if you don't call one of the status check functions, so `always()` is called since it evalutes to `true`
     if: ${{ always() && needs.unit-tests.result == 'success' && (needs.npm-publish.result == 'success' || needs.npm-publish.result == 'skipped') }}
     needs: [unit-tests, integration-tests, npm-publish]
-    runs-on: ${{ vars.UBUNTU_VERSION }}
+    runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Build Docker images
         env:
           DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
@@ -2,10 +2,12 @@ const Joi = require('@hapi/joi');

 // Schema Configuration
 // schema.indexName: populated by defaults if not overridden
+// schema.icuTokenizer: boolean, optional, defaults to false
 // esclient: object, validation performed by elasticsearch module
 const schema = Joi.object().required().keys({
   schema: Joi.object().required().keys({
     indexName: Joi.string().required(),
+    icuTokenizer: Joi.boolean().optional()
   }),
   esclient: Joi.object().required()
 }).unknown(true);
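
A minimal sketch of what the widened validation accepts and rejects, standalone and using the same `@hapi/joi` schema shape as above (the sample config objects are illustrative):

    const Joi = require('@hapi/joi');

    // reduced copy of the schema above
    const schema = Joi.object().required().keys({
      schema: Joi.object().required().keys({
        indexName: Joi.string().required(),
        icuTokenizer: Joi.boolean().optional()
      }),
      esclient: Joi.object().required()
    }).unknown(true);

    // valid: icuTokenizer may be omitted entirely, or given as a boolean
    console.log(schema.validate({ schema: { indexName: 'pelias' }, esclient: {} }).error);                      // no error
    console.log(schema.validate({ schema: { indexName: 'pelias', icuTokenizer: true }, esclient: {} }).error);  // no error
    // invalid: non-boolean values are rejected
    console.log(schema.validate({ schema: { indexName: 'pelias', icuTokenizer: 'nope' }, esclient: {} }).error); // ValidationError
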
@@ -1,8 +1,9 @@
 // validate analyzer is behaving as expected

-var tape = require('tape'),
+const tape = require('tape'),
   Suite = require('../test/elastictest/Suite'),
-  punctuation = require('../punctuation');
+  punctuation = require('../punctuation'),
+  config = require('pelias-config').generate();

 module.exports.tests = {};

@@ -85,6 +86,15 @@ module.exports.tests.analyze = function(test, common){

     assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] );
     assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] );
+    if (config.schema.icuTokenizer) {
+      assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [
+        '0:ซ', '0:ซอ', '0:ซอย',
+        '1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ',
+        '2:f', '2:fo', '2:foo'] );
+    } else {
+      // no ICU tokenization, so we split only on spaces
+      assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', ['0:ซอยเพชรบุรี1foo']);
+    }

     suite.run( t.end );
   });
@@ -1,8 +1,9 @@
 // validate analyzer is behaving as expected

-var tape = require('tape'),
+const tape = require('tape'),
   Suite = require('../test/elastictest/Suite'),
-  punctuation = require('../punctuation');
+  punctuation = require('../punctuation'),
+  config = require('pelias-config').generate();

 module.exports.tests = {};

@@ -49,6 +50,33 @@ module.exports.tests.functional = function(test, common){
     assertAnalysis( 'place', 'Toys "R" Us!', [ 'toys', 'r', 'us' ]);
     assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]);

+    // complicated tokenization for some Asian languages
+    if (config.schema.icuTokenizer) {
+      assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
+      assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
+      assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
+      assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
+        ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
+      // correct word by word split according to native speaker: 马来西亚 / 霹雳州 / 怡保 / 31400, 怡保花园 / 第5巷 / 45号
+      assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号',
+        ["马来", "西亚", "霹", "雳", "州", "怡", "保", "31400", "怡", "保", "花园", "第", "5", "巷", "45", "号"]);
+      // correct word by word split: 马来西亚 / 柔佛新山 / 81200 / , / 士古来路 / , / 百万时尚广场
+      assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场',
+        ["马来", "西亚", "柔", "佛", "新山", "81200", "士", "古来", "路", "百万", "时尚", "广场"]);
+      // correct word by word split: 马来西亚/ 槟城 / 亚依淡 / 11500 / , / 极乐寺 / , / 回返路
+      assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路',
+        ["马来", "西亚", "槟", "城", "亚", "依", "淡", "11500", "极乐", "寺", "回", "返", "路"]);
+      // correct word by word split: 马来西亚 / 吉隆坡 / 50000 / , / 茨厂街 / 123号
+      assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号',
+        ["马来", "西亚", "吉隆坡", "50000", "茨", "厂", "街", "123", "号"]);
+
+      assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
+      assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
+      assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
+    } else {
+      // no ICU tokenization, so we split only on spaces
+      assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
+    }
     suite.run( t.end );
   });
 };
@@ -1,5 +1,6 @@
 // validate analyzer is behaving as expected
 const Suite = require('../test/elastictest/Suite')
+const config = require('pelias-config').generate()

 module.exports.tests = {};

@@ -22,6 +23,20 @@ module.exports.tests.analyze = function(test, common){
     assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] );
     assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] );

+    // complicated tokenization for some Asian languages
+    if (config.schema.icuTokenizer) {
+      assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
+      assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
+      assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
+      assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
+        ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
+      assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
+      assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
+      assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
+    } else {
+      // no ICU tokenization, so we split only on spaces
+      assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
+    }
     suite.run( t.end );
   });
 };
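
The expected token arrays in these tests come straight from Elasticsearch's `_analyze` API, so the behaviour is easy to reproduce against a locally created index. A sketch using Node 18's built-in `fetch` (the index name `pelias` and the analyzer name `peliasQuery` are assumptions; substitute whichever analyzer you are inspecting):

    // analyze.mjs — after ./bin/create_index has run against http://127.0.0.1:9200
    const res = await fetch('http://127.0.0.1:9200/pelias/_analyze', {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({ analyzer: 'peliasQuery', text: 'ซอยเพชรบุรี๑foo' })
    });
    const { tokens } = await res.json();

    // with schema.icuTokenizer enabled the text is split into word-like tokens;
    // without it, the analyzer only splits on whitespace
    console.log(tokens.map(t => t.token));
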
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "pelias-schema",
-  "version": "7.2.0",
+  "version": "8.1.0",
   "author": "pelias",
   "description": "Elasticsearch schema files and tooling for Pelias",
   "homepage": "https://github.com/pelias/schema",
@@ -0,0 +1,49 @@
+const _ = require('lodash');
+
+/**
+ * This module contains modifications to the Pelias schema to adopt the elastic ICU tokenizer.
+ * This tokenizer improves word-splitting of non-latin alphabets (particularly Asian languages).
+ *
+ * It can be enabled by setting `config.schema.icuTokenizer` in your `pelias.json` config.
+ * Note: this must be set *before* you create your elasticsearch index or it will have no effect.
+ *
+ * This feature is considered beta, we encourage testing & feedback from the community in order
+ * to adopt the ICU tokenizer as our default.
+ *
+ * https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-tokenizer.html
+ * https://github.com/pelias/schema/pull/498
+ */
+
+module.exports = (settings) => {
+
+  // replace pattern tokenizer with icu_tokenizer
+  _.set(settings, 'analysis.tokenizer.peliasTokenizer', {
+    'type': 'icu_tokenizer'
+  });
+
+  // add ampersand_replacer filter
+  // replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter)
+  _.set(settings, 'analysis.filter.ampersand_replacer', {
+    'type': 'pattern_replace',
+    'pattern': 'AMPERSANDPLACEHOLDER',
+    'replacement': '&'
+  });
+
+  // add ampersand_mapper char_filter
+  // icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it,
+  // as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter)
+  _.set(settings, 'analysis.char_filter.ampersand_mapper', {
+    'type': 'pattern_replace',
+    'pattern': '&',
+    'replacement': ' AMPERSANDPLACEHOLDER '
+  });
+
+  // prepend ampersand mapper/replacer to each analyzer
+  _.forEach(_.get(settings, 'analysis.analyzer'), (block) => {
+    if (block?.tokenizer !== 'peliasTokenizer') { return; }
+    block.filter.unshift('ampersand_replacer');
+    block.char_filter.unshift('ampersand_mapper');
+  });
+
+  return settings;
+}
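
A minimal sketch of what this new module (required as `./settings-icu` in settings.js below) does to a settings object, using a stripped-down stand-in for the full settings produced by `settings.js` (the analyzer block and char_filter name are illustrative):

    const settingsICU = require('./settings-icu'); // from inside the package

    let settings = {
      analysis: {
        tokenizer: { peliasTokenizer: { type: 'pattern', pattern: ' ' } },
        filter: {},
        char_filter: {},
        analyzer: {
          peliasQuery: { tokenizer: 'peliasTokenizer', filter: ['lowercase'], char_filter: ['punctuation'] }
        }
      }
    };

    settings = settingsICU(settings);

    console.log(settings.analysis.tokenizer.peliasTokenizer);        // { type: 'icu_tokenizer' }
    console.log(settings.analysis.analyzer.peliasQuery.filter);      // [ 'ampersand_replacer', 'lowercase' ]
    console.log(settings.analysis.analyzer.peliasQuery.char_filter); // [ 'ampersand_mapper', 'punctuation' ]
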
package/settings.js CHANGED
@@ -2,14 +2,15 @@ const _ = require('lodash');
 const peliasConfig = require('pelias-config');
 const punctuation = require('./punctuation');
 const synonyms = require('./synonyms/loader').load();
+const settingsICU = require('./settings-icu');

 require('./configValidation').validate(peliasConfig.generate());

 function generate(){
-  var config = peliasConfig.generate();
+  const config = peliasConfig.generate();

   // Default settings
-  var settings = {
+  let settings = {
     "index": {
       "similarity": {
         "peliasDefaultSimilarity": {
@@ -299,6 +300,11 @@ function generate(){
     };
   });

+  // Experimental ICU tokenizer
+  if (config.schema.icuTokenizer) {
+    settings = settingsICU(settings);
+  }
+
   // Merge settings from pelias/config
   settings = _.merge({}, settings, _.get(config, 'elasticsearch.settings', {}));

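
Because the ICU rewrite runs before the `_.merge` with `elasticsearch.settings` from pelias config, any explicit settings overrides in `pelias.json` still take precedence. End to end, enabling the feature looks roughly like this (a sketch run from a checkout of this repo, mirroring how `test/compile.js` regenerates settings; the config path is illustrative):

    // /tmp/config-icu.json contains the same JSON the CI workflows write:
    // { "schema": { "icuTokenizer": true } }
    process.env.PELIAS_CONFIG = '/tmp/config-icu.json';

    const settings = require('./settings')();
    console.log(settings.analysis.tokenizer.peliasTokenizer.type); // 'icu_tokenizer'

As the settings-icu docblock notes, the flag must be set before `./bin/create_index` runs; toggling it afterwards has no effect on an existing index.
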
package/test/compile.js CHANGED
@@ -2,7 +2,7 @@ const _ = require('lodash');
 const path = require('path');
 const schema = require('../');
 const fixture = require('./fixtures/expected.json');
-const config = require('pelias-config').generate();
+const fixtureICUTokenizer = require('./fixtures/expected-icu-tokenizer.json');

 const forEachDeep = (obj, cb) =>
   _.forEach(obj, (val, key) => {
@@ -97,6 +97,19 @@ module.exports.tests.analyzers = function (test, common) {
   });
 };

+function overridePeliasConfig(value, cb) {
+  const OLD_PELIAS_CONFIG = process.env.PELIAS_CONFIG;
+  process.env.PELIAS_CONFIG = value;
+
+  cb();
+
+  if (OLD_PELIAS_CONFIG) {
+    process.env.PELIAS_CONFIG = OLD_PELIAS_CONFIG;
+  } else {
+    delete process.env.PELIAS_CONFIG;
+  }
+}
+
 // current schema (compiled) - requires schema to be copied and settings to
 // be regenerated from a fixture in order to pass in CI environments.
 module.exports.tests.current_schema = function(test, common) {
@@ -106,9 +119,9 @@ module.exports.tests.current_schema = function(test, common) {
   var schemaCopy = JSON.parse( JSON.stringify( schema ) );

   // use the pelias config fixture instead of the local config
-  process.env.PELIAS_CONFIG = path.resolve( __dirname + '/fixtures/config.json' );
-  schemaCopy.settings = require('../settings')();
-  delete process.env.PELIAS_CONFIG;
+  overridePeliasConfig(path.resolve( __dirname + '/fixtures/config.json' ), () => {
+    schemaCopy.settings = require('../settings')();
+  });

   // code intentionally commented to allow quick debugging of expected.json
   // common.diff(schemaCopy, fixture);
@@ -121,6 +134,28 @@ module.exports.tests.current_schema = function(test, common) {
   t.deepEqual(schemaCopy, fixture);
   t.end();
   });
+
+  test('current schema vs. fixture with ICU tokenizer', function(t) {
+
+    // copy schema
+    var schemaCopy = JSON.parse( JSON.stringify( schema ) );
+
+    // use the pelias config fixture instead of the local config
+    overridePeliasConfig(path.resolve( __dirname + '/fixtures/config-icu-tokenizer.json' ), () => {
+      schemaCopy.settings = require('../settings')();
+    });
+
+    // code intentionally commented to allow quick debugging of expected.json
+    // common.diff(schemaCopy, fixtureICUTokenizer);
+    // console.error( JSON.stringify( schemaCopy, null, 2 ) );
+
+    // code to write expected output to the fixture
+    // const fs = require('fs');
+    // fs.writeFileSync(path.resolve( __dirname + '/fixtures/expected-icu-tokenizer.json' ), JSON.stringify(schemaCopy, null, 2));
+
+    t.deepEqual(schemaCopy, fixtureICUTokenizer);
+    t.end();
+  });
 };

 module.exports.all = function (tape, common) {
@@ -0,0 +1,15 @@
+{
+  "elasticsearch": {
+    "settings": {
+      "index": {
+        "number_of_replicas": "999",
+        "number_of_shards": "5",
+        "refresh_interval": "1m"
+      }
+    }
+  },
+  "schema": {
+    "icuTokenizer": true
+  }
+}
+