pelias-schema 8.1.0 → 8.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,6 @@ jobs:
9
9
  - ubuntu-22.04
10
10
  node-version: [18.x, 20.x, 22.x]
11
11
  es-version: [7.6.1]
12
- jdk-version: [oraclejdk11]
13
12
  icuTokenizer: [true, false]
14
13
  steps:
15
14
  - uses: actions/checkout@v4
@@ -17,10 +16,9 @@ jobs:
17
16
  uses: actions/setup-node@v4
18
17
  with:
19
18
  node-version: ${{ matrix.node-version }}
20
- - name: Start elasticsearch ${{ matrix.es-version }} (${{ matrix.jdk-version }})
19
+ - name: Start elasticsearch ${{ matrix.es-version }}
21
20
  env:
22
21
  ES_VERSION: ${{ matrix.es-version }}
23
- JDK_VERSION: ${{ matrix.jdk-version }}
24
22
  run: ./scripts/setup_ci.sh
25
23
  - name: Run integration tests
26
24
  run: |
@@ -52,7 +52,6 @@ module.exports.tests.analyze = function(test, common){
52
52
  '4:a', '4:ab', '4:abc', '4:abcd', '4:abcde', '4:abcdef',
53
53
  '4:abcdefg', '4:abcdefgh', '4:abcdefghi', '4:abcdefghij'
54
54
  ] );
55
- assertAnalysis( 'removeAllZeroNumericPrefix', '00001', ['1'] );
56
55
 
57
56
  assertAnalysis( 'unique', '1 1 1', ['1','1','1'] );
58
57
  assertAnalysis( 'notnull', ' / / ', [] );
@@ -52,10 +52,6 @@ module.exports.tests.analyze = function(test, common){
52
52
  assertAnalysis( 'british_american_english', 'town theatre', ['0:town', '1:theatre', '1:theater'] );
53
53
  assertAnalysis( 'british_american_english', 'town theater', ['0:town', '1:theater', '1:theatre'] );
54
54
 
55
- // remove leading zeros from numeric input
56
- assertAnalysis( 'leading_zeros', '01000', ['0:1000'] );
57
- assertAnalysis( 'leading_zeros', '09999', ['0:9999'] );
58
-
59
55
  suite.run( t.end );
60
56
  });
61
57
  };
@@ -14,6 +14,24 @@ module.exports.tests.analyze = function(test, common){
14
14
  var assertAnalysis = common.analyze.bind( null, suite, t, 'peliasQuery' );
15
15
  suite.action( function( done ){ setTimeout( done, 500 ); }); // wait for es to bring some shards up
16
16
 
17
+ assertAnalysis('tokenizer', 'foo-bar baz/42', ['foo','bar','baz','42']);
18
+ assertAnalysis('tokenizer', 'foo-bar baz/42', ['foo','bar','baz','42']); // tab instead of space
19
+ assertAnalysis('tokenizer', 'foo---bar baz/42', ['foo','bar','baz','42']);
20
+ assertAnalysis('tokenizer', 'foo—bar baz/42', ['foobar','baz','42']); // dash is not a hyphen
21
+ assertAnalysis('tokenizer', 'foo-bar baz//42', ['foo','bar','baz','42']);
22
+ assertAnalysis('tokenizer', 'foo bar baz 42', ['foo','bar', 'baz', '42']);
23
+ assertAnalysis('tokenizer', 'foo-bar baz\\42', ['foo', 'bar','baz', '42']);
24
+ assertAnalysis('thai_digits', '๐๑๒๓๔๕๖๗ ๘๙', ['01234567', '89']); // leading zero remains
25
+ assertAnalysis('thai_digits', '๑๒๓๔๕๖๗๐ ๘๙', ['12345670', '89']);
26
+ assertAnalysis('digit_glued_to_word', 'john doe42', ['john', 'doe42']);
27
+ if (config.schema.icuTokenizer) {
28
+ assertAnalysis('thai_tonemarks', 'ก่ก้ก๊ก๋ข่ข้ข๊ข๋ค่ค้ค๊ค๋ฆ่ฆ้ฆ๊ฆ๋', ['กก', 'กก', 'ขขขขคคคคฆฆฆฆ']);
29
+ assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', ['北京市', '朝阳', '区', '东', '三', '环', '中路', '1', '号', '国际', '大厦', 'a', '座', '1001', '室']);
30
+ } else {
31
+ assertAnalysis('thai_tonemarks', 'ก่ก้ก๊ก๋ข่ข้ข๊ข๋ค่ค้ค๊ค๋ฆ่ฆ้ฆ๊ฆ๋', ['กกกกขขขขคคคคฆฆฆฆ']);
32
+ assertAnalysis('chinese_address', '北京市朝阳区东三环中路1号国际大厦A座1001室', ['北京市朝阳区东三环中路1号国际大厦a座1001室']);
33
+ }
34
+
17
35
  assertAnalysis('asciifolding', 'é', ['e']);
18
36
  assertAnalysis('asciifolding', 'ß', ['ss']);
19
37
  assertAnalysis('asciifolding', 'æ', ['ae']);
@@ -23,7 +41,6 @@ module.exports.tests.analyze = function(test, common){
23
41
  assertAnalysis('trim', ' f ', ['f']);
24
42
  assertAnalysis('remove_ordinals', '26t', ['26']);
25
43
  assertAnalysis('remove_ordinals', '26th', ['26']);
26
- assertAnalysis('removeAllZeroNumericPrefix', '00001', ['1']);
27
44
  assertAnalysis('unique', '1 1 1', ['1','1','1']);
28
45
  assertAnalysis('notnull', ' / / ', []);
29
46
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pelias-schema",
3
- "version": "8.1.0",
3
+ "version": "8.2.0",
4
4
  "author": "pelias",
5
5
  "description": "Elasticsearch schema files and tooling for Pelias",
6
6
  "homepage": "https://github.com/pelias/schema",
@@ -4,9 +4,6 @@ set -e
4
4
  # create elasticsearch directory
5
5
  mkdir /tmp/elasticsearch
6
6
 
7
- # allow switching the JDK version
8
- curl -s https://raw.githubusercontent.com/michaelklishin/jdk_switcher/master/jdk_switcher.sh | bash -s use "${JDK_VERSION}"
9
-
10
7
  # download and install elasticsearch with ICU plugin
11
8
  FILENAME="elasticsearch-${ES_VERSION}-linux-x86_64.tar.gz"
12
9
  STRIP_COMPONENTS=1
package/settings.js CHANGED
@@ -57,7 +57,6 @@ function generate(){
57
57
  "name_synonyms_multiplexer",
58
58
  "icu_folding",
59
59
  "remove_ordinals",
60
- "removeAllZeroNumericPrefix",
61
60
  "peliasOneEdgeGramFilter",
62
61
  "unique_only_same_position",
63
62
  "notnull",
@@ -73,7 +72,6 @@ function generate(){
73
72
  "trim",
74
73
  "icu_folding",
75
74
  "remove_ordinals",
76
- "removeAllZeroNumericPrefix",
77
75
  "unique_only_same_position",
78
76
  "notnull"
79
77
  ]
@@ -92,7 +90,6 @@ function generate(){
92
90
  "name_synonyms_multiplexer",
93
91
  "icu_folding",
94
92
  "remove_ordinals",
95
- "removeAllZeroNumericPrefix",
96
93
  "unique_only_same_position",
97
94
  "notnull",
98
95
  "flatten_graph"
@@ -231,11 +228,6 @@ function generate(){
231
228
  "min_gram" : 1,
232
229
  "max_gram" : 24
233
230
  },
234
- "removeAllZeroNumericPrefix" :{
235
- "type" : "pattern_replace",
236
- "pattern" : "^(0*)",
237
- "replacement" : ""
238
- },
239
231
  "remove_ordinals" : {
240
232
  "type" : "pattern_replace",
241
233
  "pattern": "(?i)((^| )((1)st?|(2)nd?|(3)rd?|([4-9])th?)|(([0-9]*)(1[0-9])th?)|(([0-9]*[02-9])((1)st?|(2)nd?|(3)rd?|([04-9])th?))($| ))",
@@ -58,7 +58,6 @@
58
58
  "name_synonyms_multiplexer",
59
59
  "icu_folding",
60
60
  "remove_ordinals",
61
- "removeAllZeroNumericPrefix",
62
61
  "peliasOneEdgeGramFilter",
63
62
  "unique_only_same_position",
64
63
  "notnull",
@@ -79,7 +78,6 @@
79
78
  "trim",
80
79
  "icu_folding",
81
80
  "remove_ordinals",
82
- "removeAllZeroNumericPrefix",
83
81
  "unique_only_same_position",
84
82
  "notnull"
85
83
  ]
@@ -103,7 +101,6 @@
103
101
  "name_synonyms_multiplexer",
104
102
  "icu_folding",
105
103
  "remove_ordinals",
106
- "removeAllZeroNumericPrefix",
107
104
  "unique_only_same_position",
108
105
  "notnull",
109
106
  "flatten_graph"
@@ -270,11 +267,6 @@
270
267
  "min_gram": 1,
271
268
  "max_gram": 24
272
269
  },
273
- "removeAllZeroNumericPrefix": {
274
- "type": "pattern_replace",
275
- "pattern": "^(0*)",
276
- "replacement": ""
277
- },
278
270
  "remove_ordinals": {
279
271
  "type": "pattern_replace",
280
272
  "pattern": "(?i)((^| )((1)st?|(2)nd?|(3)rd?|([4-9])th?)|(([0-9]*)(1[0-9])th?)|(([0-9]*[02-9])((1)st?|(2)nd?|(3)rd?|([04-9])th?))($| ))",
@@ -55,7 +55,6 @@
55
55
  "name_synonyms_multiplexer",
56
56
  "icu_folding",
57
57
  "remove_ordinals",
58
- "removeAllZeroNumericPrefix",
59
58
  "peliasOneEdgeGramFilter",
60
59
  "unique_only_same_position",
61
60
  "notnull",
@@ -74,7 +73,6 @@
74
73
  "trim",
75
74
  "icu_folding",
76
75
  "remove_ordinals",
77
- "removeAllZeroNumericPrefix",
78
76
  "unique_only_same_position",
79
77
  "notnull"
80
78
  ]
@@ -96,7 +94,6 @@
96
94
  "name_synonyms_multiplexer",
97
95
  "icu_folding",
98
96
  "remove_ordinals",
99
- "removeAllZeroNumericPrefix",
100
97
  "unique_only_same_position",
101
98
  "notnull",
102
99
  "flatten_graph"
@@ -252,11 +249,6 @@
252
249
  "min_gram": 1,
253
250
  "max_gram": 24
254
251
  },
255
- "removeAllZeroNumericPrefix": {
256
- "type": "pattern_replace",
257
- "pattern": "^(0*)",
258
- "replacement": ""
259
- },
260
252
  "remove_ordinals": {
261
253
  "type": "pattern_replace",
262
254
  "pattern": "(?i)((^| )((1)st?|(2)nd?|(3)rd?|([4-9])th?)|(([0-9]*)(1[0-9])th?)|(([0-9]*[02-9])((1)st?|(2)nd?|(3)rd?|([04-9])th?))($| ))",
package/test/settings.js CHANGED
@@ -116,7 +116,6 @@ module.exports.tests.peliasIndexOneEdgeGramAnalyzer = function(test, common) {
116
116
  "name_synonyms_multiplexer",
117
117
  "icu_folding",
118
118
  "remove_ordinals",
119
- "removeAllZeroNumericPrefix",
120
119
  "peliasOneEdgeGramFilter",
121
120
  "unique_only_same_position",
122
121
  "notnull",
@@ -145,7 +144,6 @@ module.exports.tests.peliasQueryAnalyzer = function (test, common) {
145
144
  'trim',
146
145
  'icu_folding',
147
146
  'remove_ordinals',
148
- 'removeAllZeroNumericPrefix',
149
147
  'unique_only_same_position',
150
148
  'notnull'
151
149
  ]);
@@ -177,7 +175,6 @@ module.exports.tests.peliasPhraseAnalyzer = function(test, common) {
177
175
  "name_synonyms_multiplexer",
178
176
  "icu_folding",
179
177
  "remove_ordinals",
180
- "removeAllZeroNumericPrefix",
181
178
  "unique_only_same_position",
182
179
  "notnull",
183
180
  "flatten_graph"
@@ -516,19 +513,6 @@ module.exports.tests.peliasOneEdgeGramFilter = function(test, common) {
516
513
  });
517
514
  };
518
515
 
519
- // this filter removed leading 0 characters. eg. 0001 -> 1
520
- module.exports.tests.removeAllZeroNumericPrefixFilter = function(test, common) {
521
- test('has removeAllZeroNumericPrefix filter', function(t) {
522
- var s = settings();
523
- t.equal(typeof s.analysis.filter.removeAllZeroNumericPrefix, 'object', 'there is a removeAllZeroNumericPrefix filter');
524
- var filter = s.analysis.filter.removeAllZeroNumericPrefix;
525
- t.equal(filter.type, 'pattern_replace');
526
- t.equal(filter.pattern, '^(0*)');
527
- t.equal(filter.replacement, '');
528
- t.end();
529
- });
530
- };
531
-
532
516
  // this filter provides synonyms for street suffixes
533
517
  // eg. road=>rd
534
518
  module.exports.tests.streetSynonymFilter = function(test, common) {