pelias-schema 8.1.0 → 8.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/_integration_tests.yml +1 -3
- package/integration/analyzer_peliasIndexOneEdgeGram.js +0 -1
- package/integration/analyzer_peliasPhrase.js +0 -4
- package/integration/analyzer_peliasQuery.js +18 -1
- package/package.json +1 -1
- package/scripts/setup_ci.sh +0 -3
- package/settings.js +0 -8
- package/test/fixtures/expected-icu-tokenizer.json +0 -8
- package/test/fixtures/expected.json +0 -8
- package/test/settings.js +0 -16
@@ -9,7 +9,6 @@ jobs:
|
|
9
9
|
- ubuntu-22.04
|
10
10
|
node-version: [18.x, 20.x, 22.x]
|
11
11
|
es-version: [7.6.1]
|
12
|
-
jdk-version: [oraclejdk11]
|
13
12
|
icuTokenizer: [true, false]
|
14
13
|
steps:
|
15
14
|
- uses: actions/checkout@v4
|
@@ -17,10 +16,9 @@ jobs:
|
|
17
16
|
uses: actions/setup-node@v4
|
18
17
|
with:
|
19
18
|
node-version: ${{ matrix.node-version }}
|
20
|
-
- name: Start elasticsearch ${{ matrix.es-version }}
|
19
|
+
- name: Start elasticsearch ${{ matrix.es-version }}
|
21
20
|
env:
|
22
21
|
ES_VERSION: ${{ matrix.es-version }}
|
23
|
-
JDK_VERSION: ${{ matrix.jdk-version }}
|
24
22
|
run: ./scripts/setup_ci.sh
|
25
23
|
- name: Run integration tests
|
26
24
|
run: |
|
@@ -52,7 +52,6 @@ module.exports.tests.analyze = function(test, common){
|
|
52
52
|
'4:a', '4:ab', '4:abc', '4:abcd', '4:abcde', '4:abcdef',
|
53
53
|
'4:abcdefg', '4:abcdefgh', '4:abcdefghi', '4:abcdefghij'
|
54
54
|
] );
|
55
|
-
assertAnalysis( 'removeAllZeroNumericPrefix', '00001', ['1'] );
|
56
55
|
|
57
56
|
assertAnalysis( 'unique', '1 1 1', ['1','1','1'] );
|
58
57
|
assertAnalysis( 'notnull', ' / / ', [] );
|
@@ -52,10 +52,6 @@ module.exports.tests.analyze = function(test, common){
|
|
52
52
|
assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] );
|
53
53
|
assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] );
|
54
54
|
|
55
|
-
// remove leading zeros from numeric input
|
56
|
-
assertAnalysis( 'leading_zeros', '01000', ['0:1000'] );
|
57
|
-
assertAnalysis( 'leading_zeros', '09999', ['0:9999'] );
|
58
|
-
|
59
55
|
suite.run( t.end );
|
60
56
|
});
|
61
57
|
};
|
@@ -14,6 +14,24 @@ module.exports.tests.analyze = function(test, common){
|
|
14
14
|
var assertAnalysis = common.analyze.bind( null, suite, t, 'peliasQuery' );
|
15
15
|
suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up
|
16
16
|
|
17
|
+
assertAnalysis('tokenizer', 'foo-bar baz/42', ['foo','bar','baz','42']);
|
18
|
+
assertAnalysis('tokenizer', 'foo-bar baz/42', ['foo','bar','baz','42']); // tab instead of space
|
19
|
+
assertAnalysis('tokenizer', 'foo---bar baz/42', ['foo','bar','baz','42']);
|
20
|
+
assertAnalysis('tokenizer', 'foo—bar baz/42', ['foobar','baz','42']); // dash is not a hyphen
|
21
|
+
assertAnalysis('tokenizer', 'foo-bar baz//42', ['foo','bar','baz','42']);
|
22
|
+
assertAnalysis('tokenizer', 'foo bar baz 42', ['foo','bar', 'baz', '42']);
|
23
|
+
assertAnalysis('tokenizer', 'foo-bar baz\\42', ['foo', 'bar','baz', '42']);
|
24
|
+
assertAnalysis('thai_digits', '๐๑๒๓๔๕๖๗ ๘๙', ['01234567', '89']); // leading zero remains
|
25
|
+
assertAnalysis('thai_digits', '๑๒๓๔๕๖๗๐ ๘๙', ['12345670', '89']);
|
26
|
+
assertAnalysis('digit_glued_to_word', 'john doe42', ['john', 'doe42']);
|
27
|
+
if (config.schema.icuTokenizer) {
|
28
|
+
assertAnalysis('thai_tonemarks', 'ก่ก้ก๊ก๋ข่ข้ข๊ข๋ค่ค้ค๊ค๋ฆ่ฆ้ฆ๊ฆ๋', ['กก', 'กก', 'ขขขขคคคคฆฆฆฆ']);
|
29
|
+
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
|
30
|
+
} else {
|
31
|
+
assertAnalysis('thai_tonemarks', 'ก่ก้ก๊ก๋ข่ข้ข๊ข๋ค่ค้ค๊ค๋ฆ่ฆ้ฆ๊ฆ๋', ['กกกกขขขขคคคคฆฆฆฆ']);
|
32
|
+
assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', ['北京市朝阳区东三环中路1号国际大厦a座1001室']);
|
33
|
+
}
|
34
|
+
|
17
35
|
assertAnalysis('asciifolding', 'é', ['e']);
|
18
36
|
assertAnalysis('asciifolding', 'ß', ['ss']);
|
19
37
|
assertAnalysis('asciifolding', 'æ', ['ae']);
|
@@ -23,7 +41,6 @@ module.exports.tests.analyze = function(test, common){
|
|
23
41
|
assertAnalysis('trim', ' f ', ['f']);
|
24
42
|
assertAnalysis('remove_ordinals', '26t', ['26']);
|
25
43
|
assertAnalysis('remove_ordinals', '26th', ['26']);
|
26
|
-
assertAnalysis('removeAllZeroNumericPrefix', '00001', ['1']);
|
27
44
|
assertAnalysis('unique', '1 1 1', ['1','1','1']);
|
28
45
|
assertAnalysis('notnull', ' / / ', []);
|
29
46
|
|
package/package.json
CHANGED
package/scripts/setup_ci.sh
CHANGED
@@ -4,9 +4,6 @@ set -e
|
|
4
4
|
# create elasticsearch directory
|
5
5
|
mkdir /tmp/elasticsearch
|
6
6
|
|
7
|
-
# allow switching the JDK version
|
8
|
-
curl -s https://raw.githubusercontent.com/michaelklishin/jdk_switcher/master/jdk_switcher.sh | bash -s use "${JDK_VERSION}"
|
9
|
-
|
10
7
|
# download and install elasticsearch with ICU plugin
|
11
8
|
FILENAME="elasticsearch-${ES_VERSION}-linux-x86_64.tar.gz"
|
12
9
|
STRIP_COMPONENTS=1
|
package/settings.js
CHANGED
@@ -57,7 +57,6 @@ function generate(){
|
|
57
57
|
"name_synonyms_multiplexer",
|
58
58
|
"icu_folding",
|
59
59
|
"remove_ordinals",
|
60
|
-
"removeAllZeroNumericPrefix",
|
61
60
|
"peliasOneEdgeGramFilter",
|
62
61
|
"unique_only_same_position",
|
63
62
|
"notnull",
|
@@ -73,7 +72,6 @@ function generate(){
|
|
73
72
|
"trim",
|
74
73
|
"icu_folding",
|
75
74
|
"remove_ordinals",
|
76
|
-
"removeAllZeroNumericPrefix",
|
77
75
|
"unique_only_same_position",
|
78
76
|
"notnull"
|
79
77
|
]
|
@@ -92,7 +90,6 @@ function generate(){
|
|
92
90
|
"name_synonyms_multiplexer",
|
93
91
|
"icu_folding",
|
94
92
|
"remove_ordinals",
|
95
|
-
"removeAllZeroNumericPrefix",
|
96
93
|
"unique_only_same_position",
|
97
94
|
"notnull",
|
98
95
|
"flatten_graph"
|
@@ -231,11 +228,6 @@ function generate(){
|
|
231
228
|
"min_gram" : 1,
|
232
229
|
"max_gram" : 24
|
233
230
|
},
|
234
|
-
"removeAllZeroNumericPrefix" :{
|
235
|
-
"type" : "pattern_replace",
|
236
|
-
"pattern" : "^(0*)",
|
237
|
-
"replacement" : ""
|
238
|
-
},
|
239
231
|
"remove_ordinals" : {
|
240
232
|
"type" : "pattern_replace",
|
241
233
|
"pattern": "(?i)((^| )((1)st?|(2)nd?|(3)rd?|([4-9])th?)|(([0-9]*)(1[0-9])th?)|(([0-9]*[02-9])((1)st?|(2)nd?|(3)rd?|([04-9])th?))($| ))",
|
@@ -58,7 +58,6 @@
|
|
58
58
|
"name_synonyms_multiplexer",
|
59
59
|
"icu_folding",
|
60
60
|
"remove_ordinals",
|
61
|
-
"removeAllZeroNumericPrefix",
|
62
61
|
"peliasOneEdgeGramFilter",
|
63
62
|
"unique_only_same_position",
|
64
63
|
"notnull",
|
@@ -79,7 +78,6 @@
|
|
79
78
|
"trim",
|
80
79
|
"icu_folding",
|
81
80
|
"remove_ordinals",
|
82
|
-
"removeAllZeroNumericPrefix",
|
83
81
|
"unique_only_same_position",
|
84
82
|
"notnull"
|
85
83
|
]
|
@@ -103,7 +101,6 @@
|
|
103
101
|
"name_synonyms_multiplexer",
|
104
102
|
"icu_folding",
|
105
103
|
"remove_ordinals",
|
106
|
-
"removeAllZeroNumericPrefix",
|
107
104
|
"unique_only_same_position",
|
108
105
|
"notnull",
|
109
106
|
"flatten_graph"
|
@@ -270,11 +267,6 @@
|
|
270
267
|
"min_gram": 1,
|
271
268
|
"max_gram": 24
|
272
269
|
},
|
273
|
-
"removeAllZeroNumericPrefix": {
|
274
|
-
"type": "pattern_replace",
|
275
|
-
"pattern": "^(0*)",
|
276
|
-
"replacement": ""
|
277
|
-
},
|
278
270
|
"remove_ordinals": {
|
279
271
|
"type": "pattern_replace",
|
280
272
|
"pattern": "(?i)((^| )((1)st?|(2)nd?|(3)rd?|([4-9])th?)|(([0-9]*)(1[0-9])th?)|(([0-9]*[02-9])((1)st?|(2)nd?|(3)rd?|([04-9])th?))($| ))",
|
@@ -55,7 +55,6 @@
|
|
55
55
|
"name_synonyms_multiplexer",
|
56
56
|
"icu_folding",
|
57
57
|
"remove_ordinals",
|
58
|
-
"removeAllZeroNumericPrefix",
|
59
58
|
"peliasOneEdgeGramFilter",
|
60
59
|
"unique_only_same_position",
|
61
60
|
"notnull",
|
@@ -74,7 +73,6 @@
|
|
74
73
|
"trim",
|
75
74
|
"icu_folding",
|
76
75
|
"remove_ordinals",
|
77
|
-
"removeAllZeroNumericPrefix",
|
78
76
|
"unique_only_same_position",
|
79
77
|
"notnull"
|
80
78
|
]
|
@@ -96,7 +94,6 @@
|
|
96
94
|
"name_synonyms_multiplexer",
|
97
95
|
"icu_folding",
|
98
96
|
"remove_ordinals",
|
99
|
-
"removeAllZeroNumericPrefix",
|
100
97
|
"unique_only_same_position",
|
101
98
|
"notnull",
|
102
99
|
"flatten_graph"
|
@@ -252,11 +249,6 @@
|
|
252
249
|
"min_gram": 1,
|
253
250
|
"max_gram": 24
|
254
251
|
},
|
255
|
-
"removeAllZeroNumericPrefix": {
|
256
|
-
"type": "pattern_replace",
|
257
|
-
"pattern": "^(0*)",
|
258
|
-
"replacement": ""
|
259
|
-
},
|
260
252
|
"remove_ordinals": {
|
261
253
|
"type": "pattern_replace",
|
262
254
|
"pattern": "(?i)((^| )((1)st?|(2)nd?|(3)rd?|([4-9])th?)|(([0-9]*)(1[0-9])th?)|(([0-9]*[02-9])((1)st?|(2)nd?|(3)rd?|([04-9])th?))($| ))",
|
package/test/settings.js
CHANGED
@@ -116,7 +116,6 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) {
|
|
116
116
|
"name_synonyms_multiplexer",
|
117
117
|
"icu_folding",
|
118
118
|
"remove_ordinals",
|
119
|
-
"removeAllZeroNumericPrefix",
|
120
119
|
"peliasOneEdgeGramFilter",
|
121
120
|
"unique_only_same_position",
|
122
121
|
"notnull",
|
@@ -145,7 +144,6 @@ module.exports.tests.peliasQueryAnalyzer = function (test, common) {
|
|
145
144
|
'trim',
|
146
145
|
'icu_folding',
|
147
146
|
'remove_ordinals',
|
148
|
-
'removeAllZeroNumericPrefix',
|
149
147
|
'unique_only_same_position',
|
150
148
|
'notnull'
|
151
149
|
]);
|
@@ -177,7 +175,6 @@ module.exports.tests.peliasPhraseAnalyzer = function(test, common) {
|
|
177
175
|
"name_synonyms_multiplexer",
|
178
176
|
"icu_folding",
|
179
177
|
"remove_ordinals",
|
180
|
-
"removeAllZeroNumericPrefix",
|
181
178
|
"unique_only_same_position",
|
182
179
|
"notnull",
|
183
180
|
"flatten_graph"
|
@@ -516,19 +513,6 @@ module.exports.tests.peliasOneEdgeGramFilter = function(test, common) {
|
|
516
513
|
});
|
517
514
|
};
|
518
515
|
|
519
|
-
// this filter removed leading 0 characters. eg. 0001 -> 1
|
520
|
-
module.exports.tests.removeAllZeroNumericPrefixFilter = function(test, common) {
|
521
|
-
test('has removeAllZeroNumericPrefix filter', function(t) {
|
522
|
-
var s = settings();
|
523
|
-
t.equal(typeof s.analysis.filter.removeAllZeroNumericPrefix, 'object', 'there is a removeAllZeroNumericPrefix filter');
|
524
|
-
var filter = s.analysis.filter.removeAllZeroNumericPrefix;
|
525
|
-
t.equal(filter.type, 'pattern_replace');
|
526
|
-
t.equal(filter.pattern, '^(0*)');
|
527
|
-
t.equal(filter.replacement, '');
|
528
|
-
t.end();
|
529
|
-
});
|
530
|
-
};
|
531
|
-
|
532
516
|
// this filter provides synonyms for street suffixes
|
533
517
|
// eg. road=>rd
|
534
518
|
module.exports.tests.streetSynonymFilter = function(test, common) {
|