flappa-doormal 2.9.0 → 2.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -155,6 +155,154 @@ const makeDiacriticInsensitive = (text) => {
155
155
  return Array.from(norm).map((ch) => getEquivClass(ch) + diacriticsMatcher).join("");
156
156
  };
157
157
 
158
+ //#endregion
159
+ //#region src/segmentation/types.ts
160
+ /**
161
+ * Pattern type key names for split rules.
162
+ *
163
+ * Use this array to dynamically iterate over pattern types in UIs,
164
+ * or use the `PatternTypeKey` type for type-safe string unions.
165
+ *
166
+ * @example
167
+ * // Build a dropdown/select in UI
168
+ * PATTERN_TYPE_KEYS.map(key => <option value={key}>{key}</option>)
169
+ *
170
+ * @example
171
+ * // Type-safe pattern key validation
172
+ * const validateKey = (k: string): k is PatternTypeKey =>
173
+ * (PATTERN_TYPE_KEYS as readonly string[]).includes(k);
174
+ */
175
+ const PATTERN_TYPE_KEYS = [
176
+ "lineStartsWith",
177
+ "lineStartsAfter",
178
+ "lineEndsWith",
179
+ "template",
180
+ "regex"
181
+ ];
182
+
183
+ //#endregion
184
+ //#region src/segmentation/optimize-rules.ts
185
+ /**
186
+ * Rule optimization utilities for merging and sorting split rules.
187
+ *
188
+ * Provides `optimizeRules()` to:
189
+ * 1. Merge compatible rules with same pattern type and options
190
+ * 2. Deduplicate patterns within each rule
191
+ * 3. Sort rules by specificity (longer patterns first)
192
+ *
193
+ * @module optimize-rules
194
+ */
195
+ const MERGEABLE_KEYS = new Set([
196
+ "lineStartsWith",
197
+ "lineStartsAfter",
198
+ "lineEndsWith"
199
+ ]);
200
+ /**
201
+ * Get the pattern type key for a rule.
202
+ */
203
+ const getPatternKey = (rule) => {
204
+ for (const key of PATTERN_TYPE_KEYS) if (key in rule) return key;
205
+ return "regex";
206
+ };
207
+ /**
208
+ * Get the pattern array for a mergeable rule.
209
+ */
210
+ const getPatternArray = (rule, key) => {
211
+ const value = rule[key];
212
+ return Array.isArray(value) ? value : [];
213
+ };
214
+ /**
215
+ * Get a string representation of the pattern value (for specificity scoring).
216
+ */
217
+ const getPatternString = (rule, key) => {
218
+ const value = rule[key];
219
+ if (typeof value === "string") return value;
220
+ if (Array.isArray(value)) return value.join("\n");
221
+ return "";
222
+ };
223
+ /**
224
+ * Deduplicate and sort patterns by length (longest first).
225
+ */
226
+ const normalizePatterns = (patterns) => {
227
+ return [...new Set(patterns)].sort((a, b) => b.length - a.length || a.localeCompare(b));
228
+ };
229
+ /**
230
+ * Calculate specificity score for a rule (higher = more specific).
231
+ * Array-based rules score by their longest pattern; `template`/`regex` rules score by the length of the pattern string.
232
+ */
233
+ const getSpecificityScore = (rule) => {
234
+ const key = getPatternKey(rule);
235
+ if (MERGEABLE_KEYS.has(key)) return getPatternArray(rule, key).reduce((max, p) => Math.max(max, p.length), 0);
236
+ return getPatternString(rule, key).length;
237
+ };
238
+ /**
239
+ * Create a merge key for a rule based on pattern type and all non-pattern properties.
240
+ * Rules with the same merge key can have their patterns combined.
241
+ */
242
+ const createMergeKey = (rule) => {
243
+ const patternKey = getPatternKey(rule);
244
+ const { [patternKey]: _pattern, ...rest } = rule;
245
+ return `${patternKey}|${JSON.stringify(rest)}`;
246
+ };
247
+ /**
248
+ * Optimize split rules by merging compatible rules and sorting by specificity.
249
+ *
250
+ * This function:
251
+ * 1. **Merges compatible rules**: Rules with the same pattern type and identical
252
+ * options (meta, fuzzy, min/max, etc.) have their pattern arrays combined
253
+ * 2. **Deduplicates patterns**: Removes duplicate patterns within each rule
254
+ * 3. **Sorts by specificity**: Rules with longer patterns come first
255
+ *
256
+ * Only array-based pattern types (`lineStartsWith`, `lineStartsAfter`, `lineEndsWith`)
257
+ * can be merged. `template` and `regex` rules are kept separate.
258
+ *
259
+ * @param rules - Array of split rules to optimize
260
+ * @returns Optimized rules and count of merged rules
261
+ *
262
+ * @example
263
+ * import { optimizeRules } from 'flappa-doormal';
264
+ *
265
+ * const { rules, mergedCount } = optimizeRules([
266
+ * { lineStartsWith: ['{{kitab}}'], fuzzy: true, meta: { type: 'header' } },
267
+ * { lineStartsWith: ['{{bab}}'], fuzzy: true, meta: { type: 'header' } },
268
+ * { lineStartsAfter: ['{{numbered}}'], meta: { type: 'entry' } },
269
+ * ]);
270
+ *
271
+ * // rules[0] = { lineStartsAfter: ['{{numbered}}'], meta: { type: 'entry' } } (longest pattern, so sorted first)
271
+ * // rules[1] = { lineStartsWith: ['{{kitab}}', '{{bab}}'], fuzzy: true, meta: { type: 'header' } }
273
+ * // mergedCount = 1
274
+ */
275
+ const optimizeRules = (rules) => {
276
+ const output = [];
277
+ const indexByMergeKey = /* @__PURE__ */ new Map();
278
+ let mergedCount = 0;
279
+ for (const rule of rules) {
280
+ const patternKey = getPatternKey(rule);
281
+ if (!MERGEABLE_KEYS.has(patternKey)) {
282
+ output.push(rule);
283
+ continue;
284
+ }
285
+ const mergeKey = createMergeKey(rule);
286
+ const existingIndex = indexByMergeKey.get(mergeKey);
287
+ if (existingIndex === void 0) {
288
+ indexByMergeKey.set(mergeKey, output.length);
289
+ output.push({
290
+ ...rule,
291
+ [patternKey]: normalizePatterns(getPatternArray(rule, patternKey))
292
+ });
293
+ continue;
294
+ }
295
+ const existing = output[existingIndex];
296
+ existing[patternKey] = normalizePatterns([...getPatternArray(existing, patternKey), ...getPatternArray(rule, patternKey)]);
297
+ mergedCount++;
298
+ }
299
+ output.sort((a, b) => getSpecificityScore(b) - getSpecificityScore(a));
300
+ return {
301
+ mergedCount,
302
+ rules: output
303
+ };
304
+ };
305
+
158
306
  //#endregion
159
307
  //#region src/segmentation/tokens.ts
160
308
  /**
@@ -626,6 +774,51 @@ const shouldDefaultToFuzzy = (patterns) => {
626
774
  return FUZZY_TOKEN_REGEX.test(p);
627
775
  });
628
776
  };
777
+ /**
778
+ * Apply token mappings to a template string.
779
+ *
780
+ * Transforms `{{token}}` into `{{token:name}}` based on the provided mappings.
781
+ * Useful for applying user-configured capture names to a raw template.
782
+ *
783
+ * - Only affects exact matches of `{{token}}`.
784
+ * - Does NOT affect tokens that already have a capture name (e.g. `{{token:existing}}`).
785
+ * - Does NOT affect capture-only tokens (e.g. `{{:name}}`).
786
+ *
787
+ * @param template - The template string to transform
788
+ * @param mappings - Array of mappings from token name to capture name
789
+ * @returns Transformed template string with captures applied
790
+ *
791
+ * @example
792
+ * applyTokenMappings('{{raqms}} {{dash}}', [{ token: 'raqms', name: 'num' }])
793
+ * // → '{{raqms:num}} {{dash}}'
794
+ */
795
+ const applyTokenMappings = (template, mappings) => {
796
+ let result = template;
797
+ for (const { token, name } of mappings) {
798
+ if (!token || !name) continue;
799
+ const regex = new RegExp(`\\{\\{${token}\\}\\}`, "g");
800
+ result = result.replace(regex, `{{${token}:${name}}}`);
801
+ }
802
+ return result;
803
+ };
804
+ /**
805
+ * Strip token mappings from a template string.
806
+ *
807
+ * Transforms `{{token:name}}` back into `{{token}}`.
808
+ * Capture-only tokens (`{{:name}}`) have no token name before the colon, so they are not matched and are left unchanged.
809
+ *
810
+ * Useful for normalizing templates for storage or comparison.
811
+ *
812
+ * @param template - The template string to strip
813
+ * @returns Template string with capture names removed
814
+ *
815
+ * @example
816
+ * stripTokenMappings('{{raqms:num}} {{dash}}')
817
+ * // → '{{raqms}} {{dash}}'
818
+ */
819
+ const stripTokenMappings = (template) => {
820
+ return template.replace(/\{\{([^:}]+):[^}]+\}\}/g, "{{$1}}");
821
+ };
629
822
 
630
823
  //#endregion
631
824
  //#region src/segmentation/pattern-validator.ts
@@ -651,6 +844,7 @@ const validatePattern = (pattern, seenPatterns) => {
651
844
  };
652
845
  if (seenPatterns.has(pattern)) return {
653
846
  message: `Duplicate pattern: "${pattern}"`,
847
+ pattern,
654
848
  type: "duplicate"
655
849
  };
656
850
  seenPatterns.add(pattern);
@@ -660,6 +854,7 @@ const validatePattern = (pattern, seenPatterns) => {
660
854
  if (!KNOWN_TOKENS.has(tokenName)) return {
661
855
  message: `Unknown token: {{${tokenName}}}. Available tokens: ${[...KNOWN_TOKENS].slice(0, 5).join(", ")}...`,
662
856
  suggestion: `Check spelling or use a known token`,
857
+ token: tokenName,
663
858
  type: "unknown_token"
664
859
  };
665
860
  }
@@ -674,6 +869,7 @@ const validatePattern = (pattern, seenPatterns) => {
674
869
  if (before !== "{{" && after !== "}}") return {
675
870
  message: `Token "${tokenName}" appears to be missing {{}}. Did you mean "{{${fullMatch}}}"?`,
676
871
  suggestion: `{{${fullMatch}}}`,
872
+ token: tokenName,
677
873
  type: "missing_braces"
678
874
  };
679
875
  }
@@ -742,6 +938,39 @@ const validateRules = (rules) => {
742
938
  return hasIssues ? result : void 0;
743
939
  });
744
940
  };
941
+ /**
942
+ * Formats a validation result array into a list of human-readable error messages.
943
+ *
944
+ * Useful for displaying validation errors in UIs.
945
+ *
946
+ * @param results - The result array from `validateRules()`
947
+ * @returns Array of formatted error strings
948
+ *
949
+ * @example
950
+ * const issues = validateRules(rules);
951
+ * const errors = formatValidationReport(issues);
952
+ * // ["Rule 1, lineStartsWith: Missing {{}} around token..."]
953
+ */
954
+ const formatValidationReport = (results) => {
955
+ const errors = [];
956
+ results.forEach((result, ruleIndex) => {
957
+ if (!result) return;
958
+ const formatIssue = (issue, location) => {
959
+ if (!issue) return;
960
+ const type = issue.type;
961
+ if (type === "missing_braces" && issue.token) errors.push(`${location}: Missing {{}} around token "${issue.token}"`);
962
+ else if (type === "unknown_token" && issue.token) errors.push(`${location}: Unknown token "{{${issue.token}}}"`);
963
+ else if (type === "duplicate" && issue.pattern) errors.push(`${location}: Duplicate pattern "${issue.pattern}"`);
964
+ else if (issue.message) errors.push(`${location}: ${issue.message}`);
965
+ else errors.push(`${location}: ${type}`);
966
+ };
967
+ for (const [patternType, issues] of Object.entries(result)) {
968
+ const list = Array.isArray(issues) ? issues : [issues];
969
+ for (const issue of list) if (issue) formatIssue(issue, `Rule ${ruleIndex + 1}, ${patternType}`);
970
+ }
971
+ });
972
+ return errors;
973
+ };
745
974
 
746
975
  //#endregion
747
976
  //#region src/segmentation/replace.ts
@@ -2536,7 +2765,7 @@ const buildTokenPriority = () => {
2536
2765
  return TOKEN_PRIORITY_ORDER$1.filter((t) => allTokens.has(t));
2537
2766
  };
2538
2767
  const collapseWhitespace = (s) => s.replace(/\s+/g, " ").trim();
2539
- const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED\u0640]/gu, "");
2768
+ const stripArabicDiacritics = (s) => s.replace(/[\u064B-\u065F\u0670\u06D6-\u06ED]/gu, "");
2540
2769
  const compileTokenRegexes = (tokenNames) => {
2541
2770
  const compiled = [];
2542
2771
  for (const token of tokenNames) {
@@ -3618,5 +3847,5 @@ function recoverMistakenMarkersForRuns(runs, opts) {
3618
3847
  }
3619
3848
 
3620
3849
  //#endregion
3621
- export { TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, suggestPatternConfig, templateToRegex, validateRules };
3850
+ export { PATTERN_TYPE_KEYS, TOKEN_PATTERNS, analyzeCommonLineStarts, analyzeRepeatingSequences, analyzeTextForRule, applyReplacements, applyTokenMappings, containsTokens, detectTokenPatterns, escapeRegex, escapeTemplateBrackets, expandCompositeTokensInTemplate, expandTokens, expandTokensWithCaptures, formatValidationReport, generateTemplateFromText, getAvailableTokens, getTokenPattern, makeDiacriticInsensitive, optimizeRules, recoverMistakenLineStartsAfterMarkers, recoverMistakenMarkersForRuns, segmentPages, shouldDefaultToFuzzy, stripTokenMappings, suggestPatternConfig, templateToRegex, validateRules };
3622
3851
  //# sourceMappingURL=index.mjs.map