@natlibfi/marc-record-validators-melinda 12.0.0-alpha.6 → 12.0.0-alpha.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,11 @@
1
1
  // Const finnishTerms = ['ysa', 'yso', 'kassu', 'seko', 'valo', 'kulo', 'puho', 'oiko', 'mero', 'liito', 'fast', 'allars', 'kaunokki'];
2
2
  const finnishTerms = [
3
3
  /^(?:allars|bella|fast|juho|jupo|kassu|kauno|kaunokki|keko|koko|kulo|liiko|liito|local|mero|mts|musa|oiko|puho|seko|woto|valo|ysa|yso)$/u,
4
- /^(?:kauno|slm|yso)\//u
4
+ /^(?:kauno|slm|yso)\//u // <= yso/* etc
5
5
  ];
6
6
 
7
- const validPuncMarks = '?"-!,)]';
7
+ const validPuncMarks = '?-!.'; // NB! ')' and ']' are only valid for some fields!
8
+ const validQuoteChars = "\"'";
8
9
  // Configuration specification
9
10
  const confSpec = [
10
11
  { // 010-035 EI
@@ -500,7 +501,7 @@ const confSpec = [
500
501
  index: null,
501
502
  punc: false,
502
503
  special: {
503
- termField: '2',
504
+ termSubfieldCode: '2',
504
505
  finnishTerms,
505
506
  else: true
506
507
  }
@@ -516,7 +517,7 @@ const confSpec = [
516
517
  index: null,
517
518
  punc: false,
518
519
  special: {
519
- termField: '2',
520
+ termSubfieldCode: '2',
520
521
  finnishTerms,
521
522
  else: true
522
523
  }
@@ -666,4 +667,4 @@ const confSpec = [
666
667
  }
667
668
  ];
668
669
 
669
- export {finnishTerms, validPuncMarks, confSpec};
670
+ export {finnishTerms, validPuncMarks, validQuoteChars, confSpec};
@@ -27,7 +27,7 @@
27
27
  */
28
28
 
29
29
  // Import {validPuncMarks, finnishTerms, confSpec} from './ending-punctuation-conf.js';
30
- import {validPuncMarks, finnishTerms, confSpec} from './ending-punctuation-conf.js';
30
+ import {validPuncMarks, validQuoteChars, finnishTerms, confSpec} from './ending-punctuation-conf.js';
31
31
  import createDebugLogger from 'debug';
32
32
 
33
33
  const debug = createDebugLogger('@natlibfi/marc-record-validator-melinda/ending-punctuation');
@@ -65,40 +65,65 @@ export default function () {
65
65
 
66
66
  // Field validation with punctuation rules for normal and special cases in subfunction (to reduce complexity to please travisci)
67
67
  function validateField(field, linkedTag, fix, message) {
68
- // This is used to find last subfield that should have punctuation
69
- function findLastSubfield(field) {
70
- const subfields = field.subfields.filter(sf => isNaN(sf.code) && 'value' in sf);
71
- return subfields.slice(-1).shift();
68
// Selects the characters accepted as ending punctuation for the given tag.
// Starts from the shared validPuncMarks and appends ')' (and for 260 also ']') for the
// tags listed below; every other tag gets validPuncMarks unchanged.
function getDefaultPuncMarks(tag) {
  // Subject/term fields: we don't want ').' here either. However, Loppupisteohje is a bit iffy here.
  // BUG: Note that our generic rules will remove dot from Finnish terms such as https://finto.fi/yso-aika/fi/page/p1069910600
  const termTags = ['647', '648', '650', '651', '654', '655', '656', '657', '658', '662'];

  // X00/X10/X11/X30 in the 1XX, 6XX, 7XX and 8XX ranges plus 740, as defined in Loppupisteohje
  const isHeadingTag = tag.match(/^[1678](?:00|10|11|30)/u) || tag === '740';

  if (isHeadingTag || termTags.includes(tag)) {
    return `${validPuncMarks})`;
  }

  // 260 additionally accepts a closing square bracket
  return tag === '260' ? `${validPuncMarks})]` : validPuncMarks;
}
73
82
 
74
83
  // Punctuation rule (Boolean), Check no ending dot strict (Boolean)
75
84
  function normalPuncRules(subfield, punc, tag, checkEnd, overrideValidPuncMarks) {
76
- const puncMarks = overrideValidPuncMarks || validPuncMarks;
77
- const lastPuncMark = puncMarks.includes(subfield.value.slice(-1)); // If string ends to punctuation char
78
- const lastPuncDot = '.'.includes(subfield.value.slice(-1)); // If string ends to dot
85
+ const puncMarks = overrideValidPuncMarks || getDefaultPuncMarks(tag);
86
+ const lastChar = subfield.value.slice(-1);
87
+ const lastPuncMark = puncMarks.includes(lastChar); // If string ends to punctuation char
88
+ const lastPuncDot = '.'.includes(lastChar); // If string ends to dot
89
+ const penultimateCharacter = subfield.value.length >= 2 ? subfield.value.charAt(subfield.value.length - 2) : undefined;
90
+ const antePenultimateCharacter = subfield.value.length >= 3 ? subfield.value.charAt(subfield.value.length - 3) : undefined;
91
+
79
92
 
80
- // Last char should be punc, but its not one of marks nor dot
93
+ // Last char should be punc, but it's not one of listed punctuation marks nor dot
81
94
  if (punc && !(lastPuncMark || lastPuncDot)) {
82
- // Console.log("1. Invalid punctuation - missing")
83
- message.message.push(`Field ${tag} has invalid ending punctuation`);
84
- if (fix) {
85
- subfield.value = subfield.value.concat('.');
86
- message.fix.push(`Field ${tag} - Added punctuation to $${subfield.code}`);
95
+ //console.log(puncMarks)
96
+ if (penultimateCharacter && validQuoteChars.includes(lastChar) && puncMarks.includes(penultimateCharacter)) {
97
+ // Exception: do nothing! Ending in punc+quote combo is all right, and does not imply a missing punc
98
+ }
99
+ else {
100
+ // Console.log("1. Invalid punctuation - missing")
101
+ message.message.push(`Field ${tag} requires ending punctuation, ends in '${lastChar}'`);
102
+ if (fix) {
103
+ subfield.value = subfield.value.concat('.');
104
+ message.fix.push(`Field ${tag} - Added punctuation to $${subfield.code}`);
105
+ }
87
106
  }
88
107
 
89
108
  // Last char is dot, but previous char is one of punc marks, like 'Question?.'
90
- } else if (lastPuncDot && subfield.value.length > 1 && puncMarks.includes(subfield.value.charAt(subfield.value.length - 2))) {
109
+ } else if (lastPuncDot && penultimateCharacter && puncMarks.includes(penultimateCharacter)) {
91
110
  // Console.log("2. Invalid punctuation - duplicate, like '?.'")
92
- message.message.push(`Field ${tag} has invalid ending punctuation`);
111
+ message.message.push(`Field ${tag} has an extra dot after '${penultimateCharacter}'`);
93
112
  if (fix) {
94
113
  subfield.value = subfield.value.slice(0, -1);
95
- message.fix.push(`Field ${tag} - Removed double punctuation from $${subfield.code}`);
114
+ message.fix.push(`Field ${tag} - Removed dot after punctuation from $${subfield.code}`);
115
+ }
116
+ // Last char is dot, but previous two chars are punc+quote, like 'Lorum "Ipsum.".'
117
+ } else if (lastPuncDot && antePenultimateCharacter && validQuoteChars.includes(penultimateCharacter) && puncMarks.includes(antePenultimateCharacter)) {
118
+ message.message.push(`Field ${tag} has an extra dot in '${antePenultimateCharacter}${penultimateCharacter}${lastChar}'`);
119
+ if (fix) {
120
+ subfield.value = subfield.value.slice(0, -1);
121
+ message.fix.push(`Field ${tag} - Removed '${lastChar}' after '${antePenultimateCharacter}${penultimateCharacter}'`);
96
122
  }
97
-
98
123
  // Last char shouldn't be dot !! This is behind checkEnd boolean, because of dots at end of abbreviations, so this is checked only in special cases !!//
99
124
  } else if (checkEnd && (!punc && lastPuncDot)) {
100
125
  // Console.log("3. Invalid punctuation - Shouldn't be dot, is")
101
- message.message.push(`Field ${tag} has invalid ending punctuation`);
126
+ message.message.push(`Field ${tag} has unwanted ending punctuation '${lastChar}'`);
102
127
  if (fix) {
103
128
  subfield.value = subfield.value.slice(0, -1);
104
129
  message.fix.push(`Field ${tag} - Removed punctuation from $${subfield.code}`);
@@ -136,7 +161,7 @@ function validateField(field, linkedTag, fix, message) {
136
161
  if (res.special.ifInd2 && res.special.ifInd2.includes(field.ind2)) {
137
162
  normalPuncRules(lastSubField, res.special.ifBoth, tag, true, res.special.ifLastCharNot);
138
163
 
139
- // Matches execption to special rule, noPuncIfInd2 (likely with value 4, that indicates copyright mark)
164
+ // Matches exception to special rule, noPuncIfInd2 (likely with value 4, that indicates copyright mark)
140
165
  } else if (res.special.noPuncIfInd2 && field.ind2 && res.special.noPuncIfInd2.includes(field.ind2)) {
141
166
  normalPuncRules(lastSubField, !res.special.ifBoth, tag, true, res.special.ifLastCharNot);
142
167
 
@@ -158,12 +183,17 @@ function validateField(field, linkedTag, fix, message) {
158
183
  normalPuncRules(lastSubField, res.punc, tag, false, false);
159
184
 
160
185
  // Search for Finnish terms
161
- } else if (res.special.termField) {
186
+ } else if (res.special.termSubfieldCode) {
162
187
  lastSubField = findLastSubfield(field);
163
188
 
164
189
  if (lastSubField) {
165
- const languageField = field.subfields.find(({code}) => code === res.special.termField);
166
- if (languageField && languageField.value && finnishTerms.some(p => p.test(languageField.value))) {
190
+ const lexicon = getLexicon(field, res.special.termSubfieldCode);
191
+ const proceed = !finnishException(field, res.special.termSubfieldCode, false);
192
+
193
+
194
+ //const languageField = field.subfields.find(({code}) => code === res.special.termSubfieldCode);
195
+ //if (languageField && languageField.value && finnishTerms.some(p => p.test(languageField.value))) {
196
+ if (lexicon && finnishTerms.some(p => p.test(lexicon)) && proceed) {
167
197
  // If (languageField && languageField.value && finnishTerms.indexOf(languageField.value) > -1) {
168
198
  normalPuncRules(lastSubField, res.punc, tag, true, false);
169
199
  } else {
@@ -199,6 +229,15 @@ function validateField(field, linkedTag, fix, message) {
199
229
 
200
230
  validateField(field, linkedTag, fix, message);
201
231
  }
232
+ // fallback
233
+ else {
234
+ debug(`special is definedm but no rule applies`);
235
+ const lastSubField = findLastSubfield(field);
236
+
237
+ if (lastSubField) {
238
+ normalPuncRules(lastSubField, res.punc, field.tag, false, false, fix, message);
239
+ }
240
+ }
202
241
  }
203
242
 
204
243
  let res = null;
@@ -232,8 +271,12 @@ function validateField(field, linkedTag, fix, message) {
232
271
  return;
233
272
  }
234
273
 
274
+ const forceNormal = res.special ? finnishException(field, res.special.termSubfieldCode, true) : false;
235
275
  // Normal rules
236
- if (typeof res.special === 'undefined' || res.special === null) {
276
+ if (typeof res.special === 'undefined' || res.special === null || forceNormal) {
277
+ if (forceNormal) {
278
+ console.info("EXCEPTION. SKIP FINNISH RULES");
279
+ }
237
280
  lastSubField = findLastSubfield(field);
238
281
 
239
282
  if (lastSubField) {
@@ -260,3 +303,51 @@ export function validateSingleField(field, linkedTag, fix) {
260
303
  return message;
261
304
  }
262
305
 
306
/**
 * Reads the lexicon code from a field.
 *
 * Looks up the first subfield whose code equals `subfieldCode` and returns its value,
 * but only when that value matches at least one of the `finnishTerms` patterns.
 *
 * @param {{subfields: Array<{code: string, value?: string}>}} field - field to inspect
 * @param {string} subfieldCode - code of the subfield that carries the lexicon name
 * @returns {string|undefined} the matching lexicon value, otherwise undefined
 */
function getLexicon(field, subfieldCode) {
  const lexiconSubfield = field.subfields.find(sf => sf.code === subfieldCode);
  const candidate = lexiconSubfield ? lexiconSubfield.value : undefined;

  // Missing subfield or empty/missing value: nothing to report
  if (!candidate) {
    return undefined;
  }

  const isFinnishLexicon = finnishTerms.some(pattern => pattern.test(candidate));
  return isFinnishLexicon ? candidate : undefined;
}
316
+
317
/**
 * Detects yso-aika style terms in field 648 whose trailing dot belongs to the term itself
 * (e.g. 'eaa.' appears in prefLabels and 'eKr.' in altLabels), so the generic
 * "remove the ending dot from Finnish terms" rule must not be applied blindly.
 *
 * Only fields with tag 648 and lexicon 'yso/fin' or 'yso/swe' are examined; everything
 * else yields false.
 *
 * @param {{tag: string, subfields: Array<{code: string, value?: string}>}} field - field to inspect
 * @param {string} termSubfieldCode - code of the subfield that names the lexicon
 * @param {boolean} [hasDot=true] - true: test whether the last subfield ends in a listed
 *   abbreviation followed by a dot (a dot that is to be kept);
 *   false: test whether it ends in a listed abbreviation without the dot (a dot is needed)
 * @returns {boolean} true when the last subfield matches the selected pattern
 */
function finnishException(field, termSubfieldCode, hasDot = true) {
  const lexicon = getLexicon(field, termSubfieldCode);
  if (!lexicon) {
    return false;
  }

  const lastSubfield = findLastSubfield(field);
  if (!lastSubfield || !lastSubfield.value) {
    return false;
  }

  // Some terms can end in '.' that we want to keep. Only yso-aika terms in 648 are checked.
  // yso has 'MODEL.LA.' and 'Corel R.A.V.E.' but these are so rare they are not listed.
  if (field.tag !== '648') {
    return false;
  }

  if (lexicon === 'yso/fin') {
    // Both variants are anchored to the end of the value and require a preceding space.
    // The no-dot variant used to be / (?:eaa|[ej]Kr)|jaa$/u, which matched ' eaa'/' eKr'
    // anywhere in the value and any value that merely ended in 'jaa'.
    const pattern = hasDot ? / (?:eaa|[ej]Kr|jaa)\.$/u : / (?:eaa|[ej]Kr|jaa)$/u;
    return pattern.test(lastSubfield.value);
  }

  if (lexicon === 'yso/swe') {
    const pattern = hasDot ? / (?:[ef]\.Kr|f\.v\.t)\.$/u : / (?:[ef]\.Kr|f\.v\.t)$/u;
    return pattern.test(lastSubfield.value);
  }

  return false;
}
348
+
349
// Finds the last subfield that should carry the ending punctuation: the final subfield
// that has a non-numeric code and owns a 'value' property. Yields undefined when no
// subfield qualifies.
function findLastSubfield(field) {
  const eligible = field.subfields.filter(sf => isNaN(sf.code) && 'value' in sf);
  return eligible.length > 0 ? eligible[eligible.length - 1] : undefined;
}