pelias-schema 7.2.0 → 8.1.0
- package/.github/workflows/_integration_tests.yml +7 -2
- package/.github/workflows/_unit_tests.yml +8 -3
- package/.github/workflows/push.yml +4 -4
- package/configValidation.js +2 -0
- package/integration/analyzer_peliasIndexOneEdgeGram.js +12 -2
- package/integration/analyzer_peliasQuery.js +30 -2
- package/integration/analyzer_peliasStreet.js +15 -0
- package/package.json +1 -1
- package/settings-icu.js +49 -0
- package/settings.js +8 -2
- package/test/compile.js +39 -4
- package/test/fixtures/config-icu-tokenizer.json +15 -0
- package/test/fixtures/expected-icu-tokenizer.json +3059 -0
- package/test/settings.js +31 -12
package/.github/workflows/_integration_tests.yml
CHANGED
@@ -6,12 +6,13 @@ jobs:
     strategy:
       matrix:
         os:
-          -
+          - ubuntu-22.04
         node-version: [18.x, 20.x, 22.x]
         es-version: [7.6.1]
         jdk-version: [oraclejdk11]
+        icuTokenizer: [true, false]
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v4
       - name: Install node.js ${{ matrix.node-version }}
         uses: actions/setup-node@v4
         with:
@@ -23,6 +24,10 @@ jobs:
         run: ./scripts/setup_ci.sh
       - name: Run integration tests
         run: |
+          if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
+            jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
+            export PELIAS_CONFIG=$(pwd)/config-icu.json
+          fi
          npm install
          curl http://127.0.0.1:9200/
          ./bin/create_index
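For context, the `jq` step above writes the one-line JSON document `{"schema":{"icuTokenizer":true}}` to `config-icu.json`, and `PELIAS_CONFIG` is the environment variable that `pelias-config` reads to locate an alternative config file. A minimal Node sketch of the same mechanism (the file name is just an example):

// write a config enabling the ICU tokenizer, then point pelias-config at it
const fs = require('fs');
fs.writeFileSync('config-icu.json', JSON.stringify({ schema: { icuTokenizer: true } }));
process.env.PELIAS_CONFIG = `${process.cwd()}/config-icu.json`;

const config = require('pelias-config').generate();
console.log(config.schema.icuTokenizer); // true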
package/.github/workflows/_unit_tests.yml
CHANGED
@@ -6,10 +6,11 @@ jobs:
     strategy:
       matrix:
         os:
-          -
+          - ubuntu-22.04
         node-version: [18.x, 20.x, 22.x]
+        icuTokenizer: [true, false]
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v4
       - name: Install node.js ${{ matrix.node-version }}
         uses: actions/setup-node@v4
         with:
@@ -17,4 +18,8 @@ jobs:
       - name: Run unit tests
         run: |
           npm install
-
+          if [ "${{ matrix.icuTokenizer }}" = "true" ]; then
+            jq -n '{ schema: { icuTokenizer: true } }' > $(pwd)/config-icu.json
+            export PELIAS_CONFIG=$(pwd)/config-icu.json
+          fi
+          npm run test
package/.github/workflows/push.yml
CHANGED
@@ -11,9 +11,9 @@ jobs:
   npm-publish:
     needs: [unit-tests, integration-tests]
     if: github.ref == 'refs/heads/master' && github.event_name == 'push'
-    runs-on:
+    runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v4
       - name: Install Node.js
         uses: actions/setup-node@v4
         with:
@@ -29,9 +29,9 @@ jobs:
     # note: github actions won't run a job if you don't call one of the status check functions, so `always()` is called since it evalutes to `true`
     if: ${{ always() && needs.unit-tests.result == 'success' && (needs.npm-publish.result == 'success' || needs.npm-publish.result == 'skipped') }}
     needs: [unit-tests, integration-tests, npm-publish]
-    runs-on:
+    runs-on: ubuntu-22.04
     steps:
-      - uses: actions/checkout@
+      - uses: actions/checkout@v4
       - name: Build Docker images
         env:
           DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
package/configValidation.js
CHANGED
@@ -2,10 +2,12 @@ const Joi = require('@hapi/joi');

 // Schema Configuration
 // schema.indexName: populated by defaults if not overridden
+// schema.icuTokenizer: boolean, optional, defaults to false
 // esclient: object, validation performed by elasticsearch module
 const schema = Joi.object().required().keys({
   schema: Joi.object().required().keys({
     indexName: Joi.string().required(),
+    icuTokenizer: Joi.boolean().optional()
   }),
   esclient: Joi.object().required()
 }).unknown(true);
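The new key slots into the existing Joi schema as an optional boolean. A hedged sketch of the behaviour, assuming `validate()` is the module's exported entry point (it is invoked that way from settings.js, later in this diff) and that it rejects a failing config:

// illustrative configs run through the validator
const { validate } = require('./configValidation');

validate({ schema: { indexName: 'pelias', icuTokenizer: true }, esclient: {} });     // ok
validate({ schema: { indexName: 'pelias' }, esclient: {} });                         // ok, the key is optional
validate({ schema: { indexName: 'pelias', icuTokenizer: 'maybe' }, esclient: {} });  // rejected: not a boolean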
package/integration/analyzer_peliasIndexOneEdgeGram.js
CHANGED
@@ -1,8 +1,9 @@
 // validate analyzer is behaving as expected

-
+const tape = require('tape'),
   Suite = require('../test/elastictest/Suite'),
-  punctuation = require('../punctuation')
+  punctuation = require('../punctuation'),
+  config = require('pelias-config').generate();

 module.exports.tests = {};

@@ -85,6 +86,15 @@ module.exports.tests.analyze = function(test, common){

   assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] );
   assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] );
+  if (config.schema.icuTokenizer) {
+    assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', [
+      '0:ซ', '0:ซอ', '0:ซอย',
+      '1:เพชรบุรี1', '1:เพชรบุรี', '1:เพชรบุร', '1:เพชรบุ', '1:เพชรบ', '1:เพชร', '1:เพช', '1:เพ', '1:เ',
+      '2:f', '2:fo', '2:foo'] );
+  } else {
+    // no ICU tokenization, so we split only on spaces
+    assertAnalysis('thai_address', 'ซอยเพชรบุรี๑foo', ['0:ซอยเพชรบุรี1foo']);
+  }

   suite.run( t.end );
 });
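In these expectations each token is prefixed with its position, so '0:ซ', '0:ซอ', '0:ซอย' are the edge n-grams of the first Thai word and '2:f', '2:fo', '2:foo' those of the trailing latin word. The same output can be inspected against a running index via Elasticsearch's _analyze API; a sketch, assuming a local node on port 9200 and the default pelias index name:

// print position:token pairs for the edge-gram analyzer (Node 18+, global fetch)
const body = { analyzer: 'peliasIndexOneEdgeGram', text: 'ซอยเพชรบุรี๑foo' };

fetch('http://127.0.0.1:9200/pelias/_analyze', {
  method: 'POST',
  headers: { 'Content-Type': 'application/json' },
  body: JSON.stringify(body)
})
  .then((res) => res.json())
  .then((json) => console.log(json.tokens.map((t) => `${t.position}:${t.token}`)));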
package/integration/analyzer_peliasQuery.js
CHANGED
@@ -1,8 +1,9 @@
 // validate analyzer is behaving as expected

-
+const tape = require('tape'),
   Suite = require('../test/elastictest/Suite'),
-  punctuation = require('../punctuation')
+  punctuation = require('../punctuation'),
+  config = require('pelias-config').generate();

 module.exports.tests = {};

@@ -49,6 +50,33 @@ module.exports.tests.functional = function(test, common){
   assertAnalysis( 'place', 'Toys "R" Us!', [ 'toys', 'r', 'us' ]);
   assertAnalysis( 'address', '101 mapzen place', [ '101', 'mapzen', 'place' ]);

+  // complicated tokenization for some Asian languages
+  if (config.schema.icuTokenizer) {
+    assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
+    assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
+    assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
+    assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
+      ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
+    // correct word by word split according to native speaker: 马来西亚 / 霹雳州 / 怡保 / 31400, 怡保花园 / 第5巷 / 45号
+    assertAnalysis('chinese_address2', '马来西亚霹雳州怡保31400怡保花园第5巷45号',
+      ["马来", "西亚", "霹", "雳", "州", "怡", "保", "31400", "怡", "保", "花园", "第", "5", "巷", "45", "号"]);
+    // correct word by word split: 马来西亚 / 柔佛新山 / 81200 / , / 士古来路 / , / 百万时尚广场
+    assertAnalysis('chinese_address3', '马来西亚柔佛新山81200士古来路百万时尚广场',
+      ["马来", "西亚", "柔", "佛", "新山", "81200", "士", "古来", "路", "百万", "时尚", "广场"]);
+    // correct word by word split: 马来西亚 / 槟城 / 亚依淡 / 11500 / , / 极乐寺 / , / 回返路
+    assertAnalysis('chinese_address4', '马来西亚槟城亚依淡11500极乐寺回返路',
+      ["马来", "西亚", "槟", "城", "亚", "依", "淡", "11500", "极乐", "寺", "回", "返", "路"]);
+    // correct word by word split: 马来西亚 / 吉隆坡 / 50000 / , / 茨厂街 / 123号
+    assertAnalysis('chinese_address5', '马来西亚吉隆坡50000茨厂街123号',
+      ["马来", "西亚", "吉隆坡", "50000", "茨", "厂", "街", "123", "号"]);
+
+    assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
+    assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
+    assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
+  } else {
+    // no ICU tokenization, so we split only on spaces
+    assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
+  }
   suite.run( t.end );
 });
 };
package/integration/analyzer_peliasStreet.js
CHANGED
@@ -1,5 +1,6 @@
 // validate analyzer is behaving as expected
 const Suite = require('../test/elastictest/Suite')
+const config = require('pelias-config').generate()

 module.exports.tests = {};

@@ -22,6 +23,20 @@ module.exports.tests.analyze = function(test, common){
   assertAnalysis( 'remove_ordinals', '1st 2nd 3rd 4th 5th', ['1','2','3','4','5'] );
   assertAnalysis( 'remove_ordinals', 'Ast th 101st', ['ast','th','101'] );

+  // complicated tokenization for some Asian languages
+  if (config.schema.icuTokenizer) {
+    assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอย', 'เพชรบุรี1'] );
+    assertAnalysis('thai_address2', 'ซอยเพชรบุรี๑foo', ['ซอย', 'เพชรบุรี1', 'foo'] );
+    assertAnalysis('thai_address3', 'บ้านเลขที่๑๒๓ถนนสุขุมวิทแขวงคลองตันเหนือเขตวัฒนา กรุงเทพมหานคร๑๐๑๑๐', ["บาน", "เลข", "ที123ถนน", "สุขุมวิท", "แขวง", "คลองตัน", "เหนือ", "เขต", "วัฒนา", "กรุงเทพมหานคร10110"]);
+    assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室',
+      ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
+    assertAnalysis('japanese_address', '東京都渋谷区渋谷2丁目21−1渋谷スクランブルスクエア4階', ["東京", "都", "渋谷", "区", "渋谷", "2", "丁目", "21", "1", "渋谷", "スクランフル", "スクエア", "4", "階"]);
+    assertAnalysis('khmer_address', 'ផ្ទះលេខ១២៣ផ្លូវព្រះសីហនុសង្កាត់ទន្លេបាសាក់ខណ្ឌចំការមនរាជធានីភ្នំពេញ', ["ផទះលេខ123ផលូវ", "ពរះសីហនុ", "សងកាត", "ទនលេបាសាក", "ខណឌចំការមន", "រាជធានី", "ភនំពេញ"]);
+    assertAnalysis('lao_address', 'ບ້ານເລກທີ່໑໕໕ຖະໜົນທ່ານຊານຂອງເຂດຈັນທະບູລີນະຄອນວຽງຈັນ', ["ບານ", "ເລກ", "ທີ155ຖະຫນົນ", "ທານ", "ຊານ", "ຂອງ", "ເຂດ", "ຈັນທະ", "ບູ", "ລີ", "ນະຄອນ", "ວຽງຈັນ"]);
+  } else {
+    // no ICU tokenization, so we split only on spaces
+    assertAnalysis('thai_address1', 'ซอยเพชรบุรี๑', ['ซอยเพชรบุรี1'] );
+  }
   suite.run( t.end );
 });
 };
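Worth noting in the expectations above: Thai and Lao digits come back as ASCII (๑๒๓ becomes 123, ໑໕໕ becomes 155) and tone marks are stripped in both branches, including the space-only fallback. That normalization therefore comes from the analyzers' existing filter chain (pelias applies ICU folding downstream of the tokenizer), not from the tokenizer swap this release makes configurable.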
package/package.json
CHANGED
package/settings-icu.js
ADDED
@@ -0,0 +1,49 @@
+const _ = require('lodash');
+
+/**
+ * This module contains modifications to the Pelias schema to adopt the elastic ICU tokenizer.
+ * This tokenizer improves word-splitting of non-latin alphabets (particularly Asian languages).
+ *
+ * It can be enabled by setting `config.schema.icuTokenizer` in your `pelias.json` config.
+ * Note: this must be set *before* you create your elasticsearch index or it will have no effect.
+ *
+ * This feature is considered beta, we encourage testing & feedback from the community in order
+ * to adopt the ICU tokenizer as our default.
+ *
+ * https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-tokenizer.html
+ * https://github.com/pelias/schema/pull/498
+ */
+
+module.exports = (settings) => {
+
+  // replace pattern tokenizer with icu_tokenizer
+  _.set(settings, 'analysis.tokenizer.peliasTokenizer', {
+    'type': 'icu_tokenizer'
+  });
+
+  // add ampersand_replacer filter
+  // replaces ampersand placeholders back to `&` (see `ampersand_mapper` char_filter)
+  _.set(settings, 'analysis.filter.ampersand_replacer', {
+    'type': 'pattern_replace',
+    'pattern': 'AMPERSANDPLACEHOLDER',
+    'replacement': '&'
+  });
+
+  // add ampersand_mapper char_filter
+  // icu-tokenizer treats ampersands as a word boundary, so we replace them with a placeholder to avoid it,
+  // as we want to handle them separately, we replace them back after tokenization (see `ampersand_replacer` filter)
+  _.set(settings, 'analysis.char_filter.ampersand_mapper', {
+    'type': 'pattern_replace',
+    'pattern': '&',
+    'replacement': ' AMPERSANDPLACEHOLDER '
+  });
+
+  // prepend ampersand mapper/replacer to each analyzer
+  _.forEach(_.get(settings, 'analysis.analyzer'), (block) => {
+    if (block?.tokenizer !== 'peliasTokenizer') { return; }
+    block.filter.unshift('ampersand_replacer');
+    block.char_filter.unshift('ampersand_mapper');
+  });
+
+  return settings;
+}
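A minimal sketch of the transform above in action, applied to a stripped-down settings object (the analyzer block and pattern below are illustrative placeholders, not the full pelias schema):

const settingsICU = require('./settings-icu');

const settings = settingsICU({
  analysis: {
    tokenizer: { peliasTokenizer: { type: 'pattern', pattern: '\\s+' } }, // placeholder
    analyzer: {
      peliasQuery: { tokenizer: 'peliasTokenizer', char_filter: [], filter: ['lowercase'] }
    }
  }
});

console.log(settings.analysis.tokenizer.peliasTokenizer);        // { type: 'icu_tokenizer' }
console.log(settings.analysis.analyzer.peliasQuery.char_filter); // ['ampersand_mapper']
console.log(settings.analysis.analyzer.peliasQuery.filter);      // ['ampersand_replacer', 'lowercase']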
package/settings.js
CHANGED
@@ -2,14 +2,15 @@ const _ = require('lodash');
 const peliasConfig = require('pelias-config');
 const punctuation = require('./punctuation');
 const synonyms = require('./synonyms/loader').load();
+const settingsICU = require('./settings-icu');

 require('./configValidation').validate(peliasConfig.generate());

 function generate(){
-
+  const config = peliasConfig.generate();

   // Default settings
-
+  let settings = {
     "index": {
       "similarity": {
         "peliasDefaultSimilarity": {
@@ -299,6 +300,11 @@ function generate(){
     };
   });

+  // Experimental ICU tokenizer
+  if (config.schema.icuTokenizer) {
+    settings = settingsICU(settings);
+  }
+
   // Merge settings from pelias/config
   settings = _.merge({}, settings, _.get(config, 'elasticsearch.settings', {}));

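Taken together: with `schema.icuTokenizer` set in pelias.json, `generate()` applies the ICU overrides before merging any user-supplied `elasticsearch.settings`, so explicit user settings still win. A hedged sketch of enabling the feature (assuming the settings module is reachable at this subpath, as it is for CommonJS packages without an exports map):

// pelias.json (hypothetical example):
// {
//   "schema": { "indexName": "pelias", "icuTokenizer": true }
// }
const settings = require('pelias-schema/settings')();
console.log(settings.analysis.tokenizer.peliasTokenizer.type); // 'icu_tokenizer'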
package/test/compile.js
CHANGED
@@ -2,7 +2,7 @@ const _ = require('lodash');
 const path = require('path');
 const schema = require('../');
 const fixture = require('./fixtures/expected.json');
-const
+const fixtureICUTokenizer = require('./fixtures/expected-icu-tokenizer.json');

 const forEachDeep = (obj, cb) =>
   _.forEach(obj, (val, key) => {
@@ -97,6 +97,19 @@ module.exports.tests.analyzers = function (test, common) {
   });
 };

+function overridePeliasConfig(value, cb) {
+  const OLD_PELIAS_CONFIG = process.env.PELIAS_CONFIG;
+  process.env.PELIAS_CONFIG = value;
+
+  cb();
+
+  if (OLD_PELIAS_CONFIG) {
+    process.env.PELIAS_CONFIG = OLD_PELIAS_CONFIG;
+  } else {
+    delete process.env.PELIAS_CONFIG;
+  }
+}
+
 // current schema (compiled) - requires schema to be copied and settings to
 // be regenerated from a fixture in order to pass in CI environments.
 module.exports.tests.current_schema = function(test, common) {
@@ -106,9 +119,9 @@ module.exports.tests.current_schema = function(test, common) {
     var schemaCopy = JSON.parse( JSON.stringify( schema ) );

     // use the pelias config fixture instead of the local config
-
-
-
+    overridePeliasConfig(path.resolve( __dirname + '/fixtures/config.json' ), () => {
+      schemaCopy.settings = require('../settings')();
+    });

     // code intentionally commented to allow quick debugging of expected.json
     // common.diff(schemaCopy, fixture);
@@ -121,6 +134,28 @@ module.exports.tests.current_schema = function(test, common) {
     t.deepEqual(schemaCopy, fixture);
     t.end();
   });
+
+  test('current schema vs. fixture with ICU tokenizer', function(t) {
+
+    // copy schema
+    var schemaCopy = JSON.parse( JSON.stringify( schema ) );
+
+    // use the pelias config fixture instead of the local config
+    overridePeliasConfig(path.resolve( __dirname + '/fixtures/config-icu-tokenizer.json' ), () => {
+      schemaCopy.settings = require('../settings')();
+    });
+
+    // code intentionally commented to allow quick debugging of expected.json
+    // common.diff(schemaCopy, fixtureICUTokenizer);
+    // console.error( JSON.stringify( schemaCopy, null, 2 ) );
+
+    // code to write expected output to the fixture
+    // const fs = require('fs');
+    // fs.writeFileSync(path.resolve( __dirname + '/fixtures/expected-icu-tokenizer.json' ), JSON.stringify(schemaCopy, null, 2));
+
+    t.deepEqual(schemaCopy, fixtureICUTokenizer);
+    t.end();
+  });
 };

 module.exports.all = function (tape, common) {