npm - ilib-lint - Versions diffs - 2.2.1 → 2.3.0 - Mend

ilib-lint 2.2.1 → 2.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/docs/ReleaseNotes.md +9 -0
package/docs/resource-xml.md +58 -0
package/package.json +9 -8
package/src/formatters/AnsiConsoleFormatter.js +2 -1
package/src/plugins/BuiltinPlugin.js +4 -1
package/src/rules/ResourceXML.js +244 -0

package/docs/ReleaseNotes.md CHANGED Viewed

@@ -1,6 +1,15 @@
 Release Notes
 =============
+### v2.3.0
+- implemented the XML match rule. If there are XML tags and entities in the
+  source, then the translations must match. The order of XML tags can change,
+  as the grammar of other languages might require that, but the number and
+  type of XML tags must match or an error will be recorded.
+    - this rule will also record an error if the XML in the source is
+      well-formed, but the XML in the translation is not
 ### v2.2.1
 - fixed the output from the LintableFile class so that if there is only one

package/docs/resource-xml.md ADDED Viewed

@@ -0,0 +1,58 @@
+# resource-xml
+If the source string contains XML-like tags, then the translation must contain
+the same tags. The tags themselves may be reordered or nested differently than
+in the source, but:
+- they should include the same number of tags
+- the tags should have the same name as ones in the source
+- the XML must be well-formed. That is, tags are nested properly and every
+open tag has a corresponding closing tag
+- unnamed tags such as `<>` and `</>` are not allowed
+Self closing tags such as `<p/>` are allowed.
+Example of correctly matched tags in a German translation:
+- source: `You must <b>wait</b> for the <a href="url">job</a>.`
+- target: `Sie müssen auf den <a href="url">Job</a> <b>warten</b>.`
+Example of incorrectly matched tags in a German translation:
+- source: `You must <b>wait</b> for the <a href="url">job</a>.`
+- target: `Sie <b>müssen</c> auf den <a href="url">Job</a> <c>warten</c>.`
+Problems in the above translation:
+1. The `<b>` tag has a closing `</c>` tag, making it not well-formed
+2. The number of tags is different than the source
+3. The names of tags are different than the source
+## Exceptions for HTML Tags
+HTML4 tags that are commonly written without a closing tag are allowed.
+The code first checks if the tags are well-formed already. If not, then it
+treats these HTML tags as if they were a self-closing tag without having
+the trailing slash inside the angle brackets.
+Example: `<p>` (start paragraph) is treated as if it were `<p/>`
+Here is the list of HTML4 tags that are treated as if they were self-closing
+if they are not well-formed:
+- `<area>`
+- `<base>`
+- `<bdi>`
+- `<bdo>`
+- `<br>`
+- `<embed>`
+- `<hr>`
+- `<img>`
+- `<input>`
+- `<li>`
+- `<link>`
+- `<option>`
+- `<p>`
+- `<param>`
+- `<source>`
+- `<track>`

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "ilib-lint",
-    "version": "2.2.1",
+    "version": "2.3.0",
     "module": "./src/index.js",
     "type": "module",
     "bin": "./src/index.js",
@@ -61,16 +61,16 @@
     },
     "devDependencies": {
         "@tsconfig/node14": "^14.1.2",
-        "@types/node": "^20.14.10",
+        "@types/node": "^14.0.0",
         "docdash": "^2.0.2",
-        "ilib-lint-plugin-test": "file:test/ilib-lint-plugin-test",
-        "ilib-lint-plugin-obsolete": "file:test/ilib-lint-plugin-obsolete",
         "i18nlint-plugin-test-old": "file:test/i18nlint-plugin-test-old",
+        "ilib-lint-plugin-obsolete": "file:test/ilib-lint-plugin-obsolete",
+        "ilib-lint-plugin-test": "file:test/ilib-lint-plugin-test",
         "jest": "^29.7.0",
         "jsdoc": "^4.0.3",
-        "jsdoc-to-markdown": "^8.0.1",
+        "jsdoc-to-markdown": "^8.0.3",
         "npm-run-all": "^4.1.5",
-        "typescript": "^5.5.3"
+        "typescript": "^5.5.4"
     },
     "dependencies": {
         "@formatjs/intl": "^2.10.4",
@@ -78,11 +78,12 @@
         "ilib-lint-common": "^3.0.0",
         "ilib-locale": "^1.2.2",
         "ilib-localeinfo": "^1.1.0",
-        "ilib-tools-common": "^1.10.0",
+        "ilib-tools-common": "^1.11.0",
         "intl-messageformat": "^10.5",
         "json5": "^2.2.3",
         "log4js": "^6.9.1",
         "micromatch": "^4.0.7",
-        "options-parser": "^0.4.0"
+        "options-parser": "^0.4.0",
+        "xml-js": "^1.6.11"
     }
 }

package/src/formatters/AnsiConsoleFormatter.js CHANGED Viewed

@@ -61,7 +61,8 @@ class AnsiConsoleFormatter extends Formatter {
 `;
         // output ascii terminal escape sequences
-        output = output.replace(/<e\d><\/e\d>/g, "\u001B[91m \u001B[0m");
+        output = output.replace(/<e\d><\/e\d>/g, "\u001B[91m␣\u001B[0m");
+        output = output.replace(/<e\d\/>/g, "\u001B[91m␣\u001B[0m");
         output = output.replace(/<e\d>/g, "\u001B[91m");
         output = output.replace(/<\/e\d>/g, "\u001B[0m");
         if (typeof(result.rule.getLink) === 'function' && result.rule.getLink()) {

package/src/plugins/BuiltinPlugin.js CHANGED Viewed

@@ -36,6 +36,7 @@ import ResourceSourceICUPluralSyntax from '../rules/ResourceSourceICUPluralSynta
 import ResourceSourceICUPluralParams from '../rules/ResourceSourceICUPluralParams.js';
 import ResourceSourceICUPluralCategories from '../rules/ResourceSourceICUPluralCategories.js';
 import ResourceSourceICUUnexplainedParams from '../rules/ResourceSourceICUUnexplainedParams.js';
+import ResourceXML from '../rules/ResourceXML.js';
 // built-in declarative rules
 export const regexRules = [
@@ -241,6 +242,7 @@ export const builtInRulesets = {
         "resource-completeness": true,
         "resource-no-translation": true,
         "resource-icu-plurals-translated": true,
+        "resource-xml": true,
         // declarative rules from above
         "resource-url-match": true,
@@ -252,7 +254,7 @@ export const builtInRulesets = {
         "resource-no-space-between-double-and-single-byte-character": true,
         "resource-no-halfwidth-kana-characters": true,
         "resource-no-double-byte-space": true,
-        "resource-no-space-with-fullwidth-punctuation": true,
+        "resource-no-space-with-fullwidth-punctuation": true
     },
     source: {
@@ -310,6 +312,7 @@ class BuiltinPlugin extends Plugin {
             ResourceSourceICUPluralParams,
             ResourceSourceICUPluralCategories,
             ResourceSourceICUUnexplainedParams,
+            ResourceXML,
             ...regexRules
         ];
     }

package/src/rules/ResourceXML.js ADDED Viewed

@@ -0,0 +1,244 @@
+/*
+ * ResourceXML.js - rule to check that XML in the translations match
+ * XML in the source
+ *
+ * Copyright © 2024 JEDLSoft
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import { Result } from 'ilib-lint-common';
+import { xml2js } from 'xml-js';
+import { selfClosingTags } from 'ilib-tools-common';
+import ResourceRule from './ResourceRule.js';
+const htmlTags = Object.keys(selfClosingTags).concat(["p", "li"]);
+const selfClosingRe = new RegExp(`<(${htmlTags.join('|')})>`, "g");
+const endTagRe = new RegExp(`</(${htmlTags.join('|')})>`);
+const unnamedTagRe = /<\/?>/;
+/**
+ * @class Represent an ilib-lint rule.
+ */
+class ResourceXML extends ResourceRule {
+    /**
+     * Make a new rule instance.
+     * @constructor
+     */
+    constructor(options) {
+        super(options);
+        this.name = "resource-xml";
+        this.description = "Ensure that XML in translated resources match the source";
+        this.sourceLocale = (options && options.sourceLocale) || "en-US";
+        this.link = "https://gihub.com/ilib-js/ilib-lint/blob/main/docs/resource-xml.md";
+    }
+    /**
+     * @private
+     * @param {Node} node a node in the AST
+     * @param {Object} elements an object that maps each element found to the number of times it
+     * has been found
+     */
+    countElements(node, elements) {
+        if (Array.isArray(node)) {
+            for (let i in node) {
+                this.countElements(node[i], elements);
+            }
+        } else {
+            if (node.type === "element") {
+                if (!elements[node.name]) {
+                    elements[node.name] = 1;
+                } else {
+                    elements[node.name]++;
+                }
+            }
+            if (node.elements) {
+                this.countElements(node.elements, elements);
+            }
+        }
+    }
+    /**
+     * @private
+     * @param {Node} sourceAst the root of the AST of the source string
+     * @param {Node} targetAst the root of the AST of the target string
+     * @param {Resource} resource the resource instance where the source
+     * and target strings came from
+     */
+    matchElements(sourceAst, targetAst, resource) {
+        // first traverse the source tree looking for elements to count
+        let sourceElements = {}, targetElements = {};
+        let problems = [];
+        if (sourceAst?.elements?.length > 0) {
+            this.countElements(sourceAst?.elements, sourceElements);
+            if (targetAst?.elements?.length > 0) {
+                this.countElements(targetAst?.elements, targetElements);
+            }
+            for (let element in sourceElements) {
+                if (!targetElements[element] || sourceElements[element] !== targetElements[element]) {
+                    let opts = {
+                        severity: "error",
+                        rule: this,
+                        description: `The number of XML <${element}> elements in the target (${targetElements[element] ?? 0}) does not match the number in the source (${sourceElements[element]}).`,
+                        id: resource.getKey(),
+                        highlight: `Target: ${resource.getTarget()}<e0/>`,
+                        pathName: resource.getPath(),
+                        source: resource.getSource(),
+                        locale: resource.getTargetLocale()
+                    };
+                    problems.push(new Result(opts));
+                }
+            }
+            for (let element in targetElements) {
+                if (!sourceElements[element]) {
+                    const re = new RegExp(`<(?<tag>\/?${element}\/?)>`, "g");
+                    const highlight =
+                        resource.getTarget().replace(re, "<e0><$<tag>></e0>");
+                    let opts = {
+                        severity: "error",
+                        rule: this,
+                        description: `The XML element <${element}> in the target does not appear in the source.`,
+                        id: resource.getKey(),
+                        highlight: `Target: ${highlight}`,
+                        pathName: resource.getPath(),
+                        source: resource.getSource(),
+                        locale: resource.getTargetLocale()
+                    };
+                    problems.push(new Result(opts));
+                }
+            }
+        }
+        return problems;
+    }
+    /**
+     * Sometimes, the xml tags are really html, which has notorious problems
+     * with unclosed tags being considered valid, such as the <p> or
+     * <br> tags. The xml parser we are using does not recognize html,
+     * so we have to convert the unclosed html tags into valid xml before we
+     * attempt to parse them. This function does that by making those tags into
+     * self-closing tags. <p> becomes <p/>
+     *
+     * Note that if there is a <p> tag, we have to make sure there is also no
+     * </p> in the string as that is valid xml already. We should only convert
+     * the <p> tags when there are no </p> tags to go with it.
+     *
+     * @private
+     * @param {string} string the string to convert
+     * @returns {string}
+     */
+    convertUnclosedTags(string) {
+        let converted = string;
+        if (!endTagRe.test(string)) {
+            converted = string.replace(selfClosingRe, "<$1/>");
+        }
+        return converted;
+    }
+    /**
+     * @override
+     */
+    matchString({source, target, resource}) {
+        if (!target) return; // can't check "nothing" !
+        let srcObj, tgtObj;
+        let problems = [];
+        const prefix = '<?xml version="1.0" encoding="UTF-8"?><root>';
+        const suffix = '</root>';
+        // convert html tags to valid xml tags and wrap the strings with a prefix
+        // and suffix so that it forms a whole xml document before we attempt to
+        // call the parser on them
+        const wrappedSource = `${prefix}${this.convertUnclosedTags(source)}${suffix}`;
+        const wrappedTarget = `${prefix}${this.convertUnclosedTags(target)}${suffix}`;
+        // First, check the source string for problems. If there are any,
+        // don't even bother checking the target string for problems because
+        // we don't even know if they are valid problems. The translators may
+        // just have echoed the problems already in the source. There will be
+        // another rule that checks the well-formedness of the source string
+        // for the engineers to fix. It is not the job of this rule to report
+        // on the well-formedness of the source.
+        try {
+            srcObj = xml2js(wrappedSource, {
+                trim: false
+            });
+        } catch (e) {
+            // source is not well-formed, so don't even
+            // attempt to parse the target! Just bail.
+            return undefined;
+        }
+        try {
+            // Second, tags that have no name are a special type of un-well-formedness
+            // that we want to call out separately. If the target contains them, the
+            // xml2js parser below will find it, but it will show as an unclosed tag error.
+            // While that is true, it's a poor error message that doesn't help the
+            // translators fix the real problem, which is the unnamed tag.
+            if (unnamedTagRe.test(target)) {
+                const highlight =
+                    target.replace(/(<\/?>)/g, "<e0>$1</e0>");
+                let opts = {
+                    severity: "error",
+                    rule: this,
+                    description: `Empty XML elements <> and </> are not allowed in the target.`,
+                    id: resource.getKey(),
+                    highlight: `Target: ${highlight}`,
+                    pathName: resource.getPath(),
+                    source: resource.getSource(),
+                    locale: resource.getTargetLocale()
+                };
+                problems.push(new Result(opts));
+            }
+            // Third, parse the target string for well-formedness. If it does not parse properly,
+            // it throws the exception handled below
+            tgtObj = xml2js(wrappedTarget, {
+                trim: false
+            });
+            // And finally match the xml elements/tags from the source to the target
+            problems = problems.concat(this.matchElements(srcObj, tgtObj, resource));
+        } catch (e) {
+            const lines = e.message.split(/\n/g);
+            // find the column number in the 3rd line of the exception message and subtract off
+            // the length of the prefix text we added in wrappedTarget
+            const column = parseInt(lines[2].substring(8)) - prefix.length;
+            // create the highlight, but make sure to escape any less than characters so that
+            // it does not conflict with the highlight
+            const highlight = column >= target.length ?
+                target + '<e0/>' :
+                target.substring(0, column) + '<e0>' + target[column] + '</e0>' + target.substring(column+1);
+            let opts = {
+                severity: "error",
+                rule: this,
+                description: `XML in translation is not well-formed. Error: ${lines[0]}`,
+                id: resource.getKey(),
+                highlight: `Target: ${highlight}`,
+                pathName: resource.getPath(),
+                source: resource.getSource(),
+                locale: resource.getTargetLocale()
+            };
+            problems.push(new Result(opts));
+        }
+        return problems.length < 2 ? problems[0] : problems;
+    }
+}
+export default ResourceXML;