pelias-schema 7.2.0 → 8.1.0

This diff shows the changes between publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
@@ -6,12 +6,13 @@ jobs:
     strategy:
       matrix:
         os:
-          - ${{ vars.UBUNTU_VERSION }}
+          - ubuntu-22.04
         node-version: [18.x, 20.x, 22.x]
         es-version: [7.6.1]
         jdk-version: [oraclejdk11]
+        icuTokenizer: [true, false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install node.js ${{ matrix.node-version }}
         uses: actions/setup-node@v4
         with:
@@ -23,6 +24,10 @@ jobs:
         run: ./scripts/setup_ci.sh
       - name: Run integration tests
         run: |
+          if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
+            jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
+            export PELIAS_CONFIG=$(pwd)/config-icu.json
+          fi
          npm install
          curl http://127.0.0.1:9200/
          ./bin/create_index
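
The integration workflow above (and the unit-test workflow below) gains an `icuTokenizer` matrix dimension, so every job now runs once with the ICU tokenizer enabled and once without. When the flag is true, the job writes a throwaway pelias config with `jq` and points `PELIAS_CONFIG` at it. A minimal sketch of how that file is picked up on the Node side (the script name is hypothetical; `pelias-config` is the same module used throughout this diff):

    // check-icu.js (hypothetical) — run as: PELIAS_CONFIG=$(pwd)/config-icu.json node check-icu.js
    const config = require('pelias-config').generate();

    // pelias-config merges the file named by PELIAS_CONFIG over its defaults, so the
    // { schema: { icuTokenizer: true } } written by the workflow ends up here:
    console.log('ICU tokenizer enabled:', config.schema.icuTokenizer === true);
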
@@ -6,10 +6,11 @@ jobs:
     strategy:
       matrix:
         os:
-          - ${{ vars.UBUNTU_VERSION }}
+          - ubuntu-22.04
         node-version: [18.x, 20.x, 22.x]
+        icuTokenizer: [true, false]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install node.js ${{ matrix.node-version }}
         uses: actions/setup-node@v4
         with:
@@ -17,4 +18,8 @@ jobs:
       - name: Run unit tests
         run: |
           npm install
-          npm run test
+          if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
+            jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
+            export PELIAS_CONFIG=$(pwd)/config-icu.json
+          fi
+          npm run test
@@ -11,9 +11,9 @@ jobs:
   npm-publish:
     needs: [unit-tests, integration-tests]
     if: github.ref == 'refs/heads/master' && github.event_name == 'push'
-    runs-on: ${{ vars.UBUNTU_VERSION }}
+    runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Install Node.js
         uses: actions/setup-node@v4
         with:
@@ -29,9 +29,9 @@ jobs:
     # note: github actions won't run a job if you don't call one of the status check functions, so `always()` is called since it evalutes to `true`
     if: ${{ always() && needs.unit-tests.result == 'success' && (needs.npm-publish.result == 'success' || needs.npm-publish.result == 'skipped') }}
     needs: [unit-tests, integration-tests, npm-publish]
-    runs-on: ${{ vars.UBUNTU_VERSION }}
+    runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Build Docker images
         env:
           DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
@@ -2,10 +2,12 @@ const Joi = require('@hapi/joi');

 // Schema Configuration
 // schema.indexName: populated by defaults if not overridden
+// schema.icuTokenizer: boolean, optional, defaults to false
 // esclient: object, validation performed by elasticsearch module
 const schema = Joi.object().required().keys({
   schema: Joi.object().required().keys({
     indexName: Joi.string().required(),
+    icuTokenizer: Joi.boolean().optional()
   }),
   esclient: Joi.object().required()
 }).unknown(true);
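
A minimal sketch of what the widened validation accepts and rejects, standalone and using the same `@hapi/joi` schema shape as above (the sample config objects are illustrative):

    const Joi = require('@hapi/joi');

    // reduced copy of the schema above
    const schema = Joi.object().required().keys({
      schema: Joi.object().required().keys({
        indexName: Joi.string().required(),
        icuTokenizer: Joi.boolean().optional()
      }),
      esclient: Joi.object().required()
    }).unknown(true);

    // valid: icuTokenizer may be omitted entirely, or given as a boolean
    console.log(schema.validate({ schema: { indexName: 'pelias' }, esclient: {} }).error);                      // no error
    console.log(schema.validate({ schema: { indexName: 'pelias', icuTokenizer: true }, esclient: {} }).error);  // no error
    // invalid: non-boolean values are rejected
    console.log(schema.validate({ schema: { indexName: 'pelias', icuTokenizer: 'nope' }, esclient: {} }).error); // ValidationError
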
@@ -1,8 +1,9 @@
 // validate analyzer is behaving as expected

-var tape = require('tape'),
+const tape = require('tape'),
   Suite = require('../test/elastictest/Suite'),
-  punctuation = require('../punctuation');
+  punctuation = require('../punctuation'),
+  config = require('pelias-config').generate();

 module.exports.tests = {};

@@ -85,6 +86,15 @@ module.exports.tests.analyze = function(test, common){

     assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] );
     assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] );
+    if (config.schema.icuTokenizer) {
+      assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [
+        '0:ซ', '0:ซอ', '0:ซอย',
+        '1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ',
+        '2:f', '2:fo', '2:foo'] );
+    } else {
+      // no ICU tokenization, so we split only on spaces
+      assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', ['0:ซอยเพชรบุรี1foo']);
+    }

     suite.run( t.end );
   });
@@ -1,8 +1,9 @@
 // validate analyzer is behaving as expected

-var tape = require('tape'),
+const tape = require('tape'),
   Suite = require('../test/elastictest/Suite'),
-  punctuation = require('../punctuation');
+  punctuation = require('../punctuation'),
+  config = require('pelias-config').generate();

 module.exports.tests = {};

@@ -49,6 +50,33 @@ module.exports.tests.functional = function(test, common){
     assertAnalysis( 'place', 'Toys "R" Us!', [ 'toys', 'r', 'us' ]);
     assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]);

+    // complicated tokenization for some Asian languages
+    if (config.schema.icuTokenizer) {
+      assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
+      assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
+      assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
+      assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
+        ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
+      // correct word by word split according to native speaker: 马来西亚 / 霹雳州 / 怡保 / 31400, 怡保花园 / 第5巷 / 45号
+      assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号',
+        ["马来", "西亚", "霹", "雳", "州", "怡", "保", "31400", "怡", "保", "花园", "第", "5", "巷", "45", "号"]);
+      // correct word by word split: 马来西亚 / 柔佛新山 / 81200 / , / 士古来路 / , / 百万时尚广场
+      assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场',
+        ["马来", "西亚", "柔", "佛", "新山", "81200", "士", "古来", "路", "百万", "时尚", "广场"]);
+      // correct word by word split: 马来西亚/ 槟城 / 亚依淡 / 11500 / , / 极乐寺 / , / 回返路
+      assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路',
+        ["马来", "西亚", "槟", "城", "亚", "依", "淡", "11500", "极乐", "寺", "回", "返", "路"]);
+      // correct word by word split: 马来西亚 / 吉隆坡 / 50000 / , / 茨厂街 / 123号
+      assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号',
+        ["马来", "西亚", "吉隆坡", "50000", "茨", "厂", "街", "123", "号"]);
+
+      assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
+      assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
+      assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
+    } else {
+      // no ICU tokenization, so we split only on spaces
+      assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
+    }
     suite.run( t.end );
   });
 };
@@ -1,5 +1,6 @@
 // validate analyzer is behaving as expected
 const Suite = require('../test/elastictest/Suite')
+const config = require('pelias-config').generate()

 module.exports.tests = {};

@@ -22,6 +23,20 @@ module.exports.tests.analyze = function(test, common){
     assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] );
     assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] );

+    // complicated tokenization for some Asian languages
+    if (config.schema.icuTokenizer) {
+      assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
+      assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
+      assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
+      assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
+        ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
+      assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
+      assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
+      assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
+    } else {
+      // no ICU tokenization, so we split only on spaces
+      assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
+    }
     suite.run( t.end );
   });
 };
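
The expected token arrays in these tests come straight from Elasticsearch's `_analyze` API, so the behaviour is easy to reproduce against a locally created index. A sketch using Node 18's built-in `fetch` (the index name `pelias` and the analyzer name `peliasQuery` are assumptions; substitute whichever analyzer you are inspecting):

    // analyze.mjs — after ./bin/create_index has run against http://127.0.0.1:9200
    const res = await fetch('http://127.0.0.1:9200/pelias/_analyze', {
      method: 'POST',
      headers: { 'content-type': 'application/json' },
      body: JSON.stringify({ analyzer: 'peliasQuery', text: 'ซอยเพชรบุรี๑foo' })
    });
    const { tokens } = await res.json();

    // with schema.icuTokenizer enabled the text is split into word-like tokens;
    // without it, the analyzer only splits on whitespace
    console.log(tokens.map(t => t.token));
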
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "pelias-schema",
-  "version": "7.2.0",
+  "version": "8.1.0",
   "author": "pelias",
   "description": "Elasticsearch schema files and tooling for Pelias",
   "homepage": "https://github.com/pelias/schema",
@@ -0,0 +1,49 @@
+const _ = require('lodash');
+
+/**
+ * This module contains modifications to the Pelias schema to adopt the elastic ICU tokenizer.
+ * This tokenizer improves word-splitting of non-latin alphabets (particularly Asian languages).
+ *
+ * It can be enabled by setting `config.schema.icuTokenizer` in your `pelias.json` config.
+ * Note: this must be set *before* you create your elasticsearch index or it will have no effect.
+ *
+ * This feature is considered beta, we encourage testing & feedback from the community in order
+ * to adopt the ICU tokenizer as our default.
+ *
+ * https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-tokenizer.html
+ * https://github.com/pelias/schema/pull/498
+ */
+
+module.exports = (settings) => {
+
+  // replace pattern tokenizer with icu_tokenizer
+  _.set(settings, 'analysis.tokenizer.peliasTokenizer', {
+    'type': 'icu_tokenizer'
+  });
+
+  // add ampersand_replacer filter
+  // replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter)
+  _.set(settings, 'analysis.filter.ampersand_replacer', {
+    'type': 'pattern_replace',
+    'pattern': 'AMPERSANDPLACEHOLDER',
+    'replacement': '&'
+  });
+
+  // add ampersand_mapper char_filter
+  // icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it,
+  // as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter)
+  _.set(settings, 'analysis.char_filter.ampersand_mapper', {
+    'type': 'pattern_replace',
+    'pattern': '&',
+    'replacement': ' AMPERSANDPLACEHOLDER '
+  });
+
+  // prepend ampersand mapper/replacer to each analyzer
+  _.forEach(_.get(settings, 'analysis.analyzer'), (block) => {
+    if (block?.tokenizer !== 'peliasTokenizer') { return; }
+    block.filter.unshift('ampersand_replacer');
+    block.char_filter.unshift('ampersand_mapper');
+  });
+
+  return settings;
+}
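
A minimal sketch of what this new module (required as `./settings-icu` in settings.js below) does to a settings object, using a stripped-down stand-in for the full settings produced by `settings.js` (the analyzer block and char_filter name are illustrative):

    const settingsICU = require('./settings-icu'); // from inside the package

    let settings = {
      analysis: {
        tokenizer: { peliasTokenizer: { type: 'pattern', pattern: ' ' } },
        filter: {},
        char_filter: {},
        analyzer: {
          peliasQuery: { tokenizer: 'peliasTokenizer', filter: ['lowercase'], char_filter: ['punctuation'] }
        }
      }
    };

    settings = settingsICU(settings);

    console.log(settings.analysis.tokenizer.peliasTokenizer);        // { type: 'icu_tokenizer' }
    console.log(settings.analysis.analyzer.peliasQuery.filter);      // [ 'ampersand_replacer', 'lowercase' ]
    console.log(settings.analysis.analyzer.peliasQuery.char_filter); // [ 'ampersand_mapper', 'punctuation' ]
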
package/settings.js CHANGED
@@ -2,14 +2,15 @@ const _ = require('lodash');
 const peliasConfig = require('pelias-config');
 const punctuation = require('./punctuation');
 const synonyms = require('./synonyms/loader').load();
+const settingsICU = require('./settings-icu');

 require('./configValidation').validate(peliasConfig.generate());

 function generate(){
-  var config = peliasConfig.generate();
+  const config = peliasConfig.generate();

   // Default settings
-  var settings = {
+  let settings = {
     "index": {
       "similarity": {
         "peliasDefaultSimilarity": {
@@ -299,6 +300,11 @@ function generate(){
     };
   });

+  // Experimental ICU tokenizer
+  if (config.schema.icuTokenizer) {
+    settings = settingsICU(settings);
+  }
+
   // Merge settings from pelias/config
   settings = _.merge({}, settings, _.get(config, 'elasticsearch.settings', {}));

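
Because the ICU rewrite runs before the `_.merge` with `elasticsearch.settings` from pelias config, any explicit settings overrides in `pelias.json` still take precedence. End to end, enabling the feature looks roughly like this (a sketch run from a checkout of this repo, mirroring how `test/compile.js` regenerates settings; the config path is illustrative):

    // /tmp/config-icu.json contains the same JSON the CI workflows write:
    // { "schema": { "icuTokenizer": true } }
    process.env.PELIAS_CONFIG = '/tmp/config-icu.json';

    const settings = require('./settings')();
    console.log(settings.analysis.tokenizer.peliasTokenizer.type); // 'icu_tokenizer'

As the settings-icu docblock notes, the flag must be set before `./bin/create_index` runs; toggling it afterwards has no effect on an existing index.
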
package/test/compile.js CHANGED
@@ -2,7 +2,7 @@ const _ = require('lodash');
 const path = require('path');
 const schema = require('../');
 const fixture = require('./fixtures/expected.json');
-const config = require('pelias-config').generate();
+const fixtureICUTokenizer = require('./fixtures/expected-icu-tokenizer.json');

 const forEachDeep = (obj, cb) =>
   _.forEach(obj, (val, key) => {
@@ -97,6 +97,19 @@ module.exports.tests.analyzers = function (test, common) {
   });
 };

+function overridePeliasConfig(value, cb) {
+  const OLD_PELIAS_CONFIG = process.env.PELIAS_CONFIG;
+  process.env.PELIAS_CONFIG = value;
+
+  cb();
+
+  if (OLD_PELIAS_CONFIG) {
+    process.env.PELIAS_CONFIG = OLD_PELIAS_CONFIG;
+  } else {
+    delete process.env.PELIAS_CONFIG;
+  }
+}
+
 // current schema (compiled) - requires schema to be copied and settings to
 // be regenerated from a fixture in order to pass in CI environments.
 module.exports.tests.current_schema = function(test, common) {
@@ -106,9 +119,9 @@ module.exports.tests.current_schema = function(test, common) {
   var schemaCopy = JSON.parse( JSON.stringify( schema ) );

   // use the pelias config fixture instead of the local config
-  process.env.PELIAS_CONFIG = path.resolve( __dirname + '/fixtures/config.json' );
-  schemaCopy.settings = require('../settings')();
-  delete process.env.PELIAS_CONFIG;
+  overridePeliasConfig(path.resolve( __dirname + '/fixtures/config.json' ), () => {
+    schemaCopy.settings = require('../settings')();
+  });

   // code intentionally commented to allow quick debugging of expected.json
   // common.diff(schemaCopy, fixture);
@@ -121,6 +134,28 @@ module.exports.tests.current_schema = function(test, common) {
   t.deepEqual(schemaCopy, fixture);
   t.end();
   });
+
+  test('current schema vs. fixture with ICU tokenizer', function(t) {
+
+    // copy schema
+    var schemaCopy = JSON.parse( JSON.stringify( schema ) );
+
+    // use the pelias config fixture instead of the local config
+    overridePeliasConfig(path.resolve( __dirname + '/fixtures/config-icu-tokenizer.json' ), () => {
+      schemaCopy.settings = require('../settings')();
+    });
+
+    // code intentionally commented to allow quick debugging of expected.json
+    // common.diff(schemaCopy, fixtureICUTokenizer);
+    // console.error( JSON.stringify( schemaCopy, null, 2 ) );
+
+    // code to write expected output to the fixture
+    // const fs = require('fs');
+    // fs.writeFileSync(path.resolve( __dirname + '/fixtures/expected-icu-tokenizer.json' ), JSON.stringify(schemaCopy, null, 2));
+
+    t.deepEqual(schemaCopy, fixtureICUTokenizer);
+    t.end();
+  });
 };

 module.exports.all = function (tape, common) {
@@ -0,0 +1,15 @@
+{
+  "elasticsearch": {
+    "settings": {
+      "index": {
+        "number_of_replicas": "999",
+        "number_of_shards": "5",
+        "refresh_interval": "1m"
+      }
+    }
+  },
+  "schema": {
+    "icuTokenizer": true
+  }
+}
+