@datafog/fogclaw 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. package/.github/workflows/harness-docs.yml +30 -0
  2. package/AGENTS.md +28 -0
  3. package/LICENSE +21 -0
  4. package/README.md +208 -0
  5. package/dist/config.d.ts +4 -0
  6. package/dist/config.d.ts.map +1 -0
  7. package/dist/config.js +30 -0
  8. package/dist/config.js.map +1 -0
  9. package/dist/engines/gliner.d.ts +14 -0
  10. package/dist/engines/gliner.d.ts.map +1 -0
  11. package/dist/engines/gliner.js +75 -0
  12. package/dist/engines/gliner.js.map +1 -0
  13. package/dist/engines/regex.d.ts +5 -0
  14. package/dist/engines/regex.d.ts.map +1 -0
  15. package/dist/engines/regex.js +54 -0
  16. package/dist/engines/regex.js.map +1 -0
  17. package/dist/index.d.ts +19 -0
  18. package/dist/index.d.ts.map +1 -0
  19. package/dist/index.js +157 -0
  20. package/dist/index.js.map +1 -0
  21. package/dist/redactor.d.ts +3 -0
  22. package/dist/redactor.d.ts.map +1 -0
  23. package/dist/redactor.js +37 -0
  24. package/dist/redactor.js.map +1 -0
  25. package/dist/scanner.d.ts +11 -0
  26. package/dist/scanner.d.ts.map +1 -0
  27. package/dist/scanner.js +77 -0
  28. package/dist/scanner.js.map +1 -0
  29. package/dist/types.d.ts +31 -0
  30. package/dist/types.d.ts.map +1 -0
  31. package/dist/types.js +18 -0
  32. package/dist/types.js.map +1 -0
  33. package/docs/DATA.md +28 -0
  34. package/docs/DESIGN.md +17 -0
  35. package/docs/DOMAIN_DOCS.md +30 -0
  36. package/docs/FRONTEND.md +24 -0
  37. package/docs/OBSERVABILITY.md +25 -0
  38. package/docs/PLANS.md +171 -0
  39. package/docs/PRODUCT_SENSE.md +20 -0
  40. package/docs/RELIABILITY.md +60 -0
  41. package/docs/SECURITY.md +50 -0
  42. package/docs/design-docs/core-beliefs.md +17 -0
  43. package/docs/design-docs/index.md +8 -0
  44. package/docs/generated/README.md +36 -0
  45. package/docs/generated/memory.md +1 -0
  46. package/docs/plans/2026-02-16-fogclaw-design.md +172 -0
  47. package/docs/plans/2026-02-16-fogclaw-implementation.md +1606 -0
  48. package/docs/plans/README.md +15 -0
  49. package/docs/plans/active/2026-02-16-feat-openclaw-official-submission-plan.md +386 -0
  50. package/docs/plans/active/2026-02-17-feat-release-fogclaw-via-datafog-package-plan.md +318 -0
  51. package/docs/plans/active/2026-02-17-feat-submit-fogclaw-to-openclaw-plan.md +244 -0
  52. package/docs/plans/tech-debt-tracker.md +42 -0
  53. package/docs/plugins/fogclaw.md +95 -0
  54. package/docs/runbooks/address-review-findings.md +30 -0
  55. package/docs/runbooks/ci-failures.md +46 -0
  56. package/docs/runbooks/code-review.md +34 -0
  57. package/docs/runbooks/merge-change.md +28 -0
  58. package/docs/runbooks/pull-request.md +45 -0
  59. package/docs/runbooks/record-evidence.md +43 -0
  60. package/docs/runbooks/reproduce-bug.md +42 -0
  61. package/docs/runbooks/respond-to-feedback.md +42 -0
  62. package/docs/runbooks/review-findings.md +31 -0
  63. package/docs/runbooks/submit-openclaw-plugin.md +68 -0
  64. package/docs/runbooks/update-agents-md.md +59 -0
  65. package/docs/runbooks/update-domain-docs.md +42 -0
  66. package/docs/runbooks/validate-current-state.md +41 -0
  67. package/docs/runbooks/verify-release.md +69 -0
  68. package/docs/specs/2026-02-16-feat-openclaw-official-submission-spec.md +115 -0
  69. package/docs/specs/2026-02-17-feat-submit-fogclaw-to-openclaw.md +125 -0
  70. package/docs/specs/README.md +5 -0
  71. package/docs/specs/index.md +8 -0
  72. package/docs/spikes/README.md +8 -0
  73. package/fogclaw.config.example.json +15 -0
  74. package/openclaw.plugin.json +45 -0
  75. package/package.json +37 -0
  76. package/scripts/ci/he-docs-config.json +123 -0
  77. package/scripts/ci/he-docs-drift.sh +112 -0
  78. package/scripts/ci/he-docs-lint.sh +234 -0
  79. package/scripts/ci/he-plans-lint.sh +354 -0
  80. package/scripts/ci/he-runbooks-lint.sh +445 -0
  81. package/scripts/ci/he-specs-lint.sh +258 -0
  82. package/scripts/ci/he-spikes-lint.sh +249 -0
  83. package/scripts/runbooks/select-runbooks.sh +154 -0
  84. package/src/config.ts +46 -0
  85. package/src/engines/gliner.ts +88 -0
  86. package/src/engines/regex.ts +71 -0
  87. package/src/index.ts +223 -0
  88. package/src/redactor.ts +51 -0
  89. package/src/scanner.ts +90 -0
  90. package/src/types.ts +52 -0
  91. package/tests/config.test.ts +104 -0
  92. package/tests/gliner.test.ts +184 -0
  93. package/tests/plugin-smoke.test.ts +114 -0
  94. package/tests/redactor.test.ts +320 -0
  95. package/tests/regex.test.ts +345 -0
  96. package/tests/scanner.test.ts +199 -0
  97. package/tsconfig.json +20 -0
@@ -0,0 +1,249 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+
4
+ # ---------------------------------------------------------------------------
5
+ # he-spikes-lint.sh — Lint spike documents under docs/spikes
6
+ # ---------------------------------------------------------------------------
7
+
8
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
9
+ REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
10
+
11
+ DEFAULT_CONFIG_PATH="scripts/ci/he-docs-config.json"
12
+
13
+ # Default required headings (one per line for easy iteration)
14
+ DEFAULT_REQUIRED_HEADINGS=(
15
+ "## Context"
16
+ "## Validation Goal"
17
+ "## Approach"
18
+ "## Findings"
19
+ "## Decisions"
20
+ "## Recommendation"
21
+ "## Impact on Upstream Docs"
22
+ "## Spike Code"
23
+ "## Remaining Unknowns"
24
+ "## Time Spent"
25
+ "## Revision Notes"
26
+ )
27
+
28
+ # Counters
29
+ errors=0
30
+ warnings=0
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Helpers
34
+ # ---------------------------------------------------------------------------
35
+
36
+ gh_annotate() {
37
+ local level="$1" file="$2" title="$3" msg="$4"
38
+ if [[ -n "$file" ]]; then
39
+ echo "::${level} file=${file},title=${title}::${msg}"
40
+ else
41
+ echo "::${level} title=${title}::${msg}"
42
+ fi
43
+ }
44
+
45
+ emit() {
46
+ local level="$1" file="$2" title="$3" msg="$4"
47
+ gh_annotate "$level" "$file" "$title" "$msg"
48
+ local upper
49
+ upper="$(echo "$level" | tr '[:lower:]' '[:upper:]')"
50
+ echo "${upper}: ${msg}" >&2
51
+ if [[ "$level" == "error" ]]; then
52
+ (( errors++ )) || true
53
+ else
54
+ (( warnings++ )) || true
55
+ fi
56
+ }
57
+
58
+ # Extract YAML frontmatter (text between first two --- lines, exclusive).
59
+ # Prints frontmatter to stdout. Returns 1 if no valid frontmatter found.
60
+ extract_frontmatter() {
61
+ local file="$1"
62
+ local first_line
63
+ first_line="$(head -n1 "$file" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
64
+ if [[ "$first_line" != "---" ]]; then
65
+ return 1
66
+ fi
67
+ # Find the closing --- (skip line 1, start from line 2)
68
+ local line_num=0
69
+ local found=0
70
+ while IFS= read -r line; do
71
+ line_num=$((line_num + 1))
72
+ if [[ $line_num -eq 1 ]]; then
73
+ continue
74
+ fi
75
+ local trimmed
76
+ trimmed="$(echo "$line" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
77
+ if [[ "$trimmed" == "---" ]]; then
78
+ found=1
79
+ break
80
+ fi
81
+ echo "$line"
82
+ done < "$file"
83
+ if [[ $found -eq 0 ]]; then
84
+ return 1
85
+ fi
86
+ return 0
87
+ }
88
+
89
+ # Extract keys from frontmatter text (stdin).
90
+ # Outputs one key per line.
91
+ frontmatter_keys() {
92
+ while IFS= read -r raw; do
93
+ local line
94
+ line="$(echo "$raw" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
95
+ # skip blank lines and comments
96
+ [[ -z "$line" ]] && continue
97
+ [[ "$line" == \#* ]] && continue
98
+ # must contain a colon
99
+ [[ "$line" != *:* ]] && continue
100
+ # extract key (everything before first colon), trimmed
101
+ local key
102
+ key="$(echo "$line" | cut -d: -f1 | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
103
+ echo "$key"
104
+ done
105
+ }
106
+
107
+ # Check if a file contains an exact full line matching the needle.
108
+ has_exact_line() {
109
+ local file="$1" needle="$2"
110
+ grep -qFx "$needle" "$file"
111
+ }
112
+
113
+ # ---------------------------------------------------------------------------
114
+ # Config loading
115
+ # ---------------------------------------------------------------------------
116
+
117
+ load_config() {
118
+ local config_rel="${HARNESS_DOCS_CONFIG:-$DEFAULT_CONFIG_PATH}"
119
+ local config_path="$REPO_ROOT/$config_rel"
120
+ if [[ ! -f "$config_path" ]]; then
121
+ echo "Error: he-spikes-lint missing/invalid config: Missing config '${config_rel}'. Fix: create it (bootstrap should do this) or set HARNESS_DOCS_CONFIG." >&2
122
+ exit 2
123
+ fi
124
+ # Validate it is a JSON object
125
+ if ! jq -e 'type == "object"' "$config_path" >/dev/null 2>&1; then
126
+ echo "Error: he-spikes-lint missing/invalid config: Config must be a JSON object." >&2
127
+ exit 2
128
+ fi
129
+ CONFIG_PATH="$config_path"
130
+ }
131
+
132
+ # Read a JSON array from config as newline-delimited strings.
133
+ config_string_array() {
134
+ local key="$1"
135
+ jq -r "(.${key} // []) | if type == \"array\" then .[] else empty end" "$CONFIG_PATH" 2>/dev/null | while IFS= read -r v; do
136
+ # only emit strings
137
+ echo "$v"
138
+ done
139
+ }
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Per-spike checks
143
+ # ---------------------------------------------------------------------------
144
+
145
+ check_placeholders() {
146
+ local rel="$1" file="$2" fail_ph="$3"
147
+ shift 3
148
+ local patterns=("$@")
149
+ for p in "${patterns[@]}"; do
150
+ [[ -z "$p" ]] && continue
151
+ if grep -qF "$p" "$file"; then
152
+ local msg="Spike '${rel}' contains placeholder token '${p}'."
153
+ if [[ "$fail_ph" == "1" ]]; then
154
+ emit "error" "$rel" "Placeholder token" "$msg"
155
+ else
156
+ emit "warning" "$rel" "Placeholder token" "${msg} (Set HARNESS_FAIL_ON_ARTIFACT_PLACEHOLDERS=1 to enforce.)"
157
+ fi
158
+ break
159
+ fi
160
+ done
161
+ }
162
+
163
+ check_spike() {
164
+ local file="$1"
165
+ local rel="${file#"$REPO_ROOT"/}"
166
+
167
+ # --- frontmatter ---
168
+ local fm
169
+ if ! fm="$(extract_frontmatter "$file")"; then
170
+ emit "error" "$rel" "Missing YAML frontmatter" \
171
+ "Spike '${rel}' must start with YAML frontmatter delimited by '---' lines."
172
+ return
173
+ fi
174
+
175
+ # Check required frontmatter keys
176
+ local fm_keys
177
+ fm_keys="$(echo "$fm" | frontmatter_keys)"
178
+
179
+ local required_keys
180
+ required_keys="$(config_string_array "required_spike_frontmatter_keys")"
181
+
182
+ if [[ -n "$required_keys" ]]; then
183
+ while IFS= read -r k; do
184
+ [[ -z "$k" ]] && continue
185
+ if ! echo "$fm_keys" | grep -qFx "$k"; then
186
+ emit "error" "$rel" "Missing frontmatter key" \
187
+ "Spike '${rel}' missing YAML frontmatter key '${k}:'."
188
+ fi
189
+ done <<< "$required_keys"
190
+ fi
191
+
192
+ # --- required headings ---
193
+ for h in "${DEFAULT_REQUIRED_HEADINGS[@]}"; do
194
+ if ! has_exact_line "$file" "$h"; then
195
+ emit "error" "$rel" "Missing heading" \
196
+ "Spike '${rel}' missing required heading line '${h}'."
197
+ fi
198
+ done
199
+
200
+ # --- placeholder tokens ---
201
+ local placeholder_patterns=()
202
+ while IFS= read -r p; do
203
+ [[ -z "$p" ]] && continue
204
+ placeholder_patterns+=("$p")
205
+ done < <(config_string_array "artifact_placeholder_patterns")
206
+
207
+ local fail_ph="${HARNESS_FAIL_ON_ARTIFACT_PLACEHOLDERS:-0}"
208
+ if [[ ${#placeholder_patterns[@]} -gt 0 ]]; then
209
+ check_placeholders "$rel" "$file" "$fail_ph" "${placeholder_patterns[@]}"
210
+ fi
211
+ }
212
+
213
+ # ---------------------------------------------------------------------------
214
+ # Main
215
+ # ---------------------------------------------------------------------------
216
+
217
+ load_config
218
+
219
+ echo "he-spikes-lint: starting"
220
+ echo "Repro: bash scripts/ci/he-spikes-lint.sh"
221
+
222
+ spikes_dir="$REPO_ROOT/docs/spikes"
223
+ if [[ ! -d "$spikes_dir" ]]; then
224
+ echo "he-spikes-lint: OK (docs/spikes not present)"
225
+ exit 0
226
+ fi
227
+
228
+ # Collect spike files sorted
229
+ spike_files=()
230
+ while IFS= read -r -d '' f; do
231
+ spike_files+=("$f")
232
+ done < <(find "$spikes_dir" -maxdepth 1 -name '*-spike.md' -print0 | sort -z)
233
+
234
+ if [[ ${#spike_files[@]} -eq 0 ]]; then
235
+ echo "he-spikes-lint: OK (no spike files)"
236
+ exit 0
237
+ fi
238
+
239
+ for f in "${spike_files[@]}"; do
240
+ check_spike "$f"
241
+ done
242
+
243
+ if [[ $errors -gt 0 ]]; then
244
+ echo "he-spikes-lint: FAIL (${errors} error(s), ${warnings} warning(s))" >&2
245
+ exit 1
246
+ fi
247
+
248
+ echo "he-spikes-lint: OK (${warnings} warning(s))"
249
+ exit 0
@@ -0,0 +1,154 @@
1
#!/bin/bash
set -euo pipefail

# Select runbooks whose called_from frontmatter matches a skill or step name.
# Prints matching runbook paths (relative to repo root) to stdout.

# --- Repo root: two parents up from this script's directory ---
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# --- CLI argument parsing ---
usage() {
  echo "Usage: $0 --skill <name> [--step <name>]" >&2
  exit 1
}

SKILL=""
STEP=""

while [[ $# -gt 0 ]]; do
  case "$1" in
    --skill)
      # Guard against a flag with no value: under `set -u` a bare "$2"
      # aborted with an unhelpful "unbound variable" error instead of usage.
      [[ $# -ge 2 ]] || usage
      SKILL="$2"
      shift 2
      ;;
    --step)
      [[ $# -ge 2 ]] || usage
      STEP="$2"
      shift 2
      ;;
    *)
      usage
      ;;
  esac
done

if [[ -z "$SKILL" ]]; then
  echo "Error: --skill is required" >&2
  exit 1
fi

# --- Main logic ---
RUNBOOKS_DIR="$REPO_ROOT/docs/runbooks"

if [[ ! -d "$RUNBOOKS_DIR" ]]; then
  exit 0
fi

# Extract the frontmatter block (between first --- and next ---).
# Parse called_from entries. Print the file path if skill or step matches.
process_file() {
  local file="$1"
  local in_frontmatter=0
  local in_called_from=0
  local first_line=1
  local called_from_items=()

  # `|| [[ -n "$line" ]]` keeps a final line without a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    local trimmed
    trimmed="$(echo "$line" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"

    # Frontmatter must start at line 1 with ---
    if [[ "$first_line" -eq 1 ]]; then
      first_line=0
      if [[ "$trimmed" == "---" ]]; then
        in_frontmatter=1
        continue
      else
        # No frontmatter
        return
      fi
    fi

    # Inside frontmatter
    if [[ "$in_frontmatter" -eq 1 ]]; then
      # Closing delimiter
      if [[ "$trimmed" == "---" ]]; then
        break
      fi

      # Skip empty lines and comments. A blank line inside a YAML list block
      # does not terminate collection; scanning simply continues. (The old
      # code had a dead inner conditional here whose branches both continued.)
      if [[ -z "$trimmed" || "$trimmed" == \#* ]]; then
        continue
      fi

      # If we're collecting YAML list items for called_from
      if [[ "$in_called_from" -eq 1 ]]; then
        # Check if this is a list item (starts with -)
        if [[ "$trimmed" == -* ]]; then
          local item
          item="$(echo "$trimmed" | sed "s/^-[[:space:]]*//;s/^[\"']//;s/[\"']$//")"
          if [[ -n "$item" ]]; then
            called_from_items+=("$item")
          fi
          continue
        else
          # Not a list item; if it contains a colon it's a new key — stop collecting
          if echo "$trimmed" | grep -q ':'; then
            in_called_from=0
            # Fall through to process this line as a new key
          else
            continue
          fi
        fi
      fi

      # Check for key: value lines
      if echo "$trimmed" | grep -q ':'; then
        local key val
        key="$(echo "$trimmed" | sed 's/^\([^:]*\):.*/\1/' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
        val="$(echo "$trimmed" | sed 's/^[^:]*://' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"

        if [[ "$key" == "called_from" ]]; then
          # Inline list: called_from: [a, b]
          if [[ "$val" == \[* ]]; then
            local inner
            inner="$(echo "$val" | sed 's/^\[//;s/\]$//' | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
            if [[ -n "$inner" ]]; then
              IFS=',' read -ra parts <<< "$inner"
              for part in "${parts[@]}"; do
                local cleaned
                # FIX: trim whitespace AND surrounding quotes. Previously
                # quoted inline entries (called_from: ["build"]) kept their
                # quotes and could never match a skill/step name, unlike the
                # block-list form which did strip them.
                cleaned="$(echo "$part" | sed "s/^[[:space:]]*//;s/[[:space:]]*$//;s/^[\"']//;s/[\"']$//")"
                if [[ -n "$cleaned" ]]; then
                  called_from_items+=("$cleaned")
                fi
              done
            fi
          else
            # YAML list form — start collecting on subsequent lines
            in_called_from=1
          fi
        fi
      fi
    fi
  done < "$file"

  # Check for matches (guarded expansion keeps old bash happy on empty arrays)
  for item in "${called_from_items[@]+"${called_from_items[@]}"}"; do
    if [[ "$item" == "$SKILL" ]]; then
      echo "${file#"$REPO_ROOT"/}"
      return
    fi
    if [[ -n "$STEP" && "$item" == "$STEP" ]]; then
      echo "${file#"$REPO_ROOT"/}"
      return
    fi
  done
}

# Find all .md files, sorted for deterministic output
while IFS= read -r mdfile; do
  process_file "$mdfile"
done < <(find "$RUNBOOKS_DIR" -name '*.md' -type f | sort)

exit 0
package/src/config.ts ADDED
@@ -0,0 +1,46 @@
1
+ import type { FogClawConfig, GuardrailAction, RedactStrategy } from "./types.js";
2
+
3
+ const VALID_GUARDRAIL_MODES: GuardrailAction[] = ["redact", "block", "warn"];
4
+ const VALID_REDACT_STRATEGIES: RedactStrategy[] = ["token", "mask", "hash"];
5
+
6
+ export const DEFAULT_CONFIG: FogClawConfig = {
7
+ enabled: true,
8
+ guardrail_mode: "redact",
9
+ redactStrategy: "token",
10
+ model: "onnx-community/gliner_large-v2.1",
11
+ confidence_threshold: 0.5,
12
+ custom_entities: [],
13
+ entityActions: {},
14
+ };
15
+
16
+ export function loadConfig(overrides: Partial<FogClawConfig>): FogClawConfig {
17
+ const config: FogClawConfig = { ...DEFAULT_CONFIG, ...overrides };
18
+
19
+ if (!VALID_GUARDRAIL_MODES.includes(config.guardrail_mode)) {
20
+ throw new Error(
21
+ `Invalid guardrail_mode "${config.guardrail_mode}". Must be one of: ${VALID_GUARDRAIL_MODES.join(", ")}`,
22
+ );
23
+ }
24
+
25
+ if (!VALID_REDACT_STRATEGIES.includes(config.redactStrategy)) {
26
+ throw new Error(
27
+ `Invalid redactStrategy "${config.redactStrategy}". Must be one of: ${VALID_REDACT_STRATEGIES.join(", ")}`,
28
+ );
29
+ }
30
+
31
+ if (config.confidence_threshold < 0 || config.confidence_threshold > 1) {
32
+ throw new Error(
33
+ `confidence_threshold must be between 0 and 1, got ${config.confidence_threshold}`,
34
+ );
35
+ }
36
+
37
+ for (const [entityType, action] of Object.entries(config.entityActions)) {
38
+ if (!VALID_GUARDRAIL_MODES.includes(action)) {
39
+ throw new Error(
40
+ `Invalid action "${action}" for entity type "${entityType}". Must be one of: ${VALID_GUARDRAIL_MODES.join(", ")}`,
41
+ );
42
+ }
43
+ }
44
+
45
+ return config;
46
+ }
@@ -0,0 +1,88 @@
1
+ import type { Entity } from "../types.js";
2
+ import { canonicalType } from "../types.js";
3
+
4
// Entity labels always sent to GLiNER, independent of user configuration.
// NOTE(review): structured identifiers (email, phone, SSN, …) are absent here —
// presumably handled by the regex engine instead; confirm against scanner.ts.
const DEFAULT_NER_LABELS = [
  "person",
  "organization",
  "location",
  "address",
  "date of birth",
  "medical record number",
  "account number",
  "passport number",
];

/**
 * NER engine backed by the dynamically-imported `gliner` package running a
 * GLiNER ONNX model on CPU. Callers must await initialize() before scan().
 */
export class GlinerEngine {
  // Underlying Gliner model instance; `any` because the dependency is loaded
  // dynamically at runtime and may not be installed at type-check time.
  private model: any = null;
  // Model path/identifier; also reused as the tokenizer path below.
  private modelPath: string;
  // Minimum confidence score forwarded to inference().
  private threshold: number;
  // Extra labels configured at runtime via setCustomLabels().
  private customLabels: string[] = [];
  // Set to true only after initialize() completes successfully.
  private initialized = false;

  /**
   * @param modelPath Path/identifier of the GLiNER model (also used as the
   *   tokenizer path).
   * @param threshold Confidence threshold passed to inference (default 0.5).
   */
  constructor(modelPath: string, threshold: number = 0.5) {
    this.modelPath = modelPath;
    this.threshold = threshold;
  }

  /**
   * Dynamically import the `gliner` package and initialize the model.
   * Idempotent: returns immediately once initialization has succeeded.
   *
   * @throws Error wrapping the underlying failure if the import or model
   *   initialization fails (e.g. the optional dependency is not installed).
   */
  async initialize(): Promise<void> {
    if (this.initialized) return;

    try {
      const { Gliner } = await import("gliner");
      this.model = new Gliner({
        tokenizerPath: this.modelPath,
        onnxSettings: {
          modelPath: this.modelPath,
          executionProvider: "cpu",
        },
        // NOTE(review): maxWidth presumably caps entity span width in tokens —
        // confirm against the gliner package docs.
        maxWidth: 12,
        modelType: "gliner",
      });
      await this.model.initialize();
      this.initialized = true;
    } catch (err) {
      throw new Error(
        `Failed to initialize GLiNER model "${this.modelPath}": ${err instanceof Error ? err.message : String(err)}`,
      );
    }
  }

  /** Replace the custom labels merged into every subsequent scan(). */
  setCustomLabels(labels: string[]): void {
    this.customLabels = labels;
  }

  /**
   * Run NER over `text` and return detected entities.
   *
   * @param text Input text; an empty string short-circuits to [].
   * @param extraLabels Optional one-off labels merged (deduplicated) with the
   *   defaults and any configured custom labels.
   * @returns Entities with labels normalized via canonicalType and
   *   source "gliner".
   * @throws Error if initialize() has not completed successfully.
   */
  async scan(text: string, extraLabels?: string[]): Promise<Entity[]> {
    if (!text) return [];
    if (!this.model) {
      throw new Error("GLiNER engine not initialized. Call initialize() first.");
    }

    const labels = [
      ...DEFAULT_NER_LABELS,
      ...this.customLabels,
      ...(extraLabels ?? []),
    ];

    // Deduplicate labels
    const uniqueLabels = [...new Set(labels)];

    const results = await this.model.inference(text, uniqueLabels, {
      threshold: this.threshold,
    });

    // Map raw gliner results onto the shared Entity shape.
    return results.map(
      (r: { text: string; label: string; score: number; start: number; end: number }) => ({
        text: r.text,
        label: canonicalType(r.label),
        start: r.start,
        end: r.end,
        confidence: r.score,
        source: "gliner" as const,
      }),
    );
  }

  /** Whether initialize() has completed successfully. */
  get isInitialized(): boolean {
    return this.initialized;
  }
}
@@ -0,0 +1,71 @@
1
+ import type { Entity } from "../types.js";
2
+
3
+ interface PatternDef {
4
+ label: string;
5
+ pattern: RegExp;
6
+ }
7
+
8
+ const PATTERNS: PatternDef[] = [
9
+ {
10
+ label: "EMAIL",
11
+ pattern:
12
+ /(?<![A-Za-z0-9._%+\-@])(?![A-Za-z_]{2,20}=)[A-Za-z0-9!#$%&*+\-/=^_`{|}~][A-Za-z0-9!#$%&'*+\-/=?^_`{|}~.]*@(?:\.?[A-Za-z0-9-]+\.)+[A-Za-z]{2,}(?=$|[^A-Za-z])/gi,
13
+ },
14
+ {
15
+ label: "PHONE",
16
+ pattern:
17
+ /(?<![A-Za-z0-9])(?:(?:(?:\+?1)[-.\s]?)?(?:\(\d{3}\)|\d{3})[-.\s]?\d{3}[-.\s]?\d{4}|\+\d{1,3}[\s\-.]?\d{1,4}(?:[\s\-.]?\d{2,4}){2,3})(?![-A-Za-z0-9])/gi,
18
+ },
19
+ {
20
+ label: "SSN",
21
+ pattern:
22
+ /(?<!\d)(?:(?!000|666)\d{3}-(?!00)\d{2}-(?!0000)\d{4}|(?!000|666)\d{3}(?!00)\d{2}(?!0000)\d{4})(?!\d)/g,
23
+ },
24
+ {
25
+ label: "CREDIT_CARD",
26
+ pattern:
27
+ /\b(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|3[47]\d{13}|(?:(?:4\d{3}|5[1-5]\d{2}|3[47]\d{2})[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4})|(?:3[47]\d{2}[-\s]?\d{6}[-\s]?\d{5}))\b/g,
28
+ },
29
+ {
30
+ label: "IP_ADDRESS",
31
+ pattern:
32
+ /\b(?:(?:25[0-5]|2[0-4]\d|1?\d?\d)\.(?:25[0-5]|2[0-4]\d|1?\d?\d)\.(?:25[0-5]|2[0-4]\d|1?\d?\d)\.(?:25[0-5]|2[0-4]\d|1?\d?\d))\b/g,
33
+ },
34
+ {
35
+ label: "DATE",
36
+ pattern:
37
+ /\b(?:(?:0?[1-9]|1[0-2])[/-](?:0?[1-9]|[12]\d|3[01])[/-](?:\d{2}|\d{4})|(?:\d{4})-(?:0?[1-9]|1[0-2])-(?:0?[1-9]|[12]\d|3[01])|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:t(?:ember)?)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(?:0?[1-9]|[12]\d|3[01]),\s+(?:19|20)\d{2})\b/gi,
38
+ },
39
+ {
40
+ label: "ZIP_CODE",
41
+ pattern: /\b\d{5}(?:-\d{4})?\b/g,
42
+ },
43
+ ];
44
+
45
+ export class RegexEngine {
46
+ scan(text: string): Entity[] {
47
+ const entities: Entity[] = [];
48
+
49
+ for (const { label, pattern } of PATTERNS) {
50
+ // Reset lastIndex to avoid stale state from previous calls
51
+ pattern.lastIndex = 0;
52
+
53
+ let match: RegExpExecArray | null;
54
+ while ((match = pattern.exec(text)) !== null) {
55
+ entities.push({
56
+ text: match[0],
57
+ label,
58
+ start: match.index,
59
+ end: match.index + match[0].length,
60
+ confidence: 1.0,
61
+ source: "regex",
62
+ });
63
+ }
64
+ }
65
+
66
+ // Sort by start position for deterministic output
67
+ entities.sort((a, b) => a.start - b.start || a.end - b.end);
68
+
69
+ return entities;
70
+ }
71
+ }