objectivist-ner 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +355 -0
  3. package/index.ts +514 -0
  4. package/package.json +24 -0
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Richard Anaya
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,355 @@
1
+ # objectivist-ner
2
+
3
+ Objectivist-inspired Named Entity Recognition with grammar-constrained LLM output.
4
+
5
+ Uses [node-llama-cpp](https://github.com/withcatai/node-llama-cpp) to run a small language model locally, enforcing structured output via JSON schema grammars. No API keys, no network calls -- everything runs on your machine.
6
+
7
+ The CLI is installed as the `ner` command.
8
+
9
+ ## Features
10
+
11
+ - Exact span extraction -- entity `text` is the substring from the input, not a paraphrase
12
+ - Schema-constrained output via llama.cpp grammar (guaranteed valid JSON)
13
+ - Restrict entity classes, attribute keys, and attribute values
14
+ - Hierarchical class taxonomies
15
+ - Relation extraction between entities
16
+ - Coreference resolution (group mentions of the same entity)
17
+ - Negation and modality detection
18
+ - Confidence scores
19
+ - Schema definition files for reusable ontologies
20
+ - Long document chunking with `--file`
21
+ - Batch processing with `--batch`
22
+ - Three built-in model tiers: `--fast`, `--balanced`, `--best`
23
+ - Reads from argument, file, or stdin
24
+ - Compact JSON output for non-TTY / piping
25
+
26
+ ## Installation
27
+
28
+ ```bash
29
+ # Local development
30
+ bun install
31
+
32
+ # Install globally as the `ner` command
33
+ bun install -g objectivist-ner
34
+ ```
35
+
36
+ After global install, use the `ner` command directly.
37
+
38
+ ## Usage
39
+
40
+ ```bash
41
+ # Uses --best (4B) by default
42
+ ner "the cat is blue and is feeling sad"
43
+
44
+ # Pick a model tier
45
+ ner --fast "the cat is blue"
46
+ ner --balanced "John works at Google in NYC"
47
+ ner --best "complex medical research text"
48
+ ```
49
+
50
+ ### Entity constraints
51
+
52
+ ```bash
53
+ # Restrict entity classes
54
+ ner "John works at Google" --classes person,organization
55
+
56
+ # Restrict attribute keys
57
+ ner "Alice is sad in Paris" --attributes emotional_state,location
58
+
59
+ # Restrict attribute values with enums
60
+ ner "The sky is blue" --attr-values '{"color":["blue","red","green"]}'
61
+
62
+ # Hierarchical class taxonomy
63
+ ner "Dr. Chen lives in Boston with her cat" \
64
+ --taxonomy '{"organism":["person","animal"],"place":["city","country"]}'
65
+ ```
66
+
67
+ ### Relation extraction
68
+
69
+ ```bash
70
+ ner --relations "Dr. Chen works at MIT and collaborates with Prof. Wright"
71
+ ```
72
+
73
+ Output:
74
+
75
+ ```json
76
+ {
77
+ "entities": [
78
+ { "class": "person", "text": "Dr. Chen", "attributes": {} },
79
+ { "class": "organization", "text": "MIT", "attributes": {} },
80
+ { "class": "person", "text": "Prof. Wright", "attributes": {} }
81
+ ],
82
+ "relations": [
83
+ { "source": "Dr. Chen", "target": "MIT", "relation": "works at" },
84
+ {
85
+ "source": "Dr. Chen",
86
+ "target": "Prof. Wright",
87
+ "relation": "collaborates with"
88
+ }
89
+ ]
90
+ }
91
+ ```
92
+
93
+ ### Coreference resolution
94
+
95
+ ```bash
96
+ ner --resolve "Dr. Chen published a paper. She later won the Nobel Prize. The neurologist was celebrated."
97
+ ```
98
+
99
+ Output:
100
+
101
+ ```json
102
+ [
103
+ {
104
+ "class": "person",
105
+ "text": "Dr. Chen",
106
+ "attributes": {},
107
+ "entity_id": "e1"
108
+ },
109
+ { "class": "person", "text": "She", "attributes": {}, "entity_id": "e1" },
110
+ {
111
+ "class": "person",
112
+ "text": "The neurologist",
113
+ "attributes": {},
114
+ "entity_id": "e1"
115
+ },
116
+ {
117
+ "class": "event",
118
+ "text": "the Nobel Prize",
119
+ "attributes": {},
120
+ "entity_id": "e2"
121
+ }
122
+ ]
123
+ ```
124
+
125
+ `entity_id` is a grammar-enforced top-level field, not inside `attributes`.
126
+
127
+ ### Negation detection
128
+
129
+ ```bash
130
+ ner --detect-negation "The patient has diabetes but does not have cancer. He might develop hypertension."
131
+ ```
132
+
133
+ ### Confidence scores
134
+
135
+ ```bash
136
+ ner --include-confidence "Dr. Maria Chen works at MIT. Someone named Bob might be there too."
137
+ ```
138
+
139
+ ### Schema definition files
140
+
141
+ Define your ontology in a JSON file and reuse it:
142
+
143
+ ```json
144
+ {
145
+ "taxonomy": {
146
+ "organism": ["person", "animal"],
147
+ "place": ["city", "country", "building"],
148
+ "institution": ["company", "university", "government_agency"]
149
+ },
150
+ "attributes": ["role", "age", "location", "affiliation"],
151
+ "relations": ["works_at", "located_in", "affiliated_with"]
152
+ }
153
+ ```
154
+
155
+ ```bash
156
+ ner --schema schema.json "Dr. Chen works at MIT in Boston"
157
+ ```
158
+
159
+ Schema files support `taxonomy`, `classes`, `attributes`, `attrValues`, and `relations`. CLI flags override schema file values.
160
+
161
+ ### File and batch processing
162
+
163
+ ```bash
164
+ # Process a long document (auto-chunked)
165
+ ner --file document.txt
166
+
167
+ # Process a JSONL file (one text per line, outputs JSONL)
168
+ ner --batch inputs.jsonl
169
+
170
+ # Process a directory of .txt files
171
+ ner --batch ./documents/
172
+ ```
173
+
174
+ ### Other options
175
+
176
+ ```bash
177
+ # Append to the built-in system prompt
178
+ ner "text" --system-prompt-append "Focus only on emotions"
179
+
180
+ # Replace the system prompt entirely
181
+ ner "text" --system-prompt "You are a custom extractor."
182
+
183
+ # Read from stdin
184
+ echo "the cat is blue" | ner
185
+
186
+ # Compact JSON output
187
+ ner "the cat is blue" --compact
188
+ ```
189
+
190
+ ## Models
191
+
192
+ objectivist-ner ships with three built-in model tiers. Pick one with a flag -- the model is downloaded automatically on first use to `~/.fastner/models/`.
193
+
194
+ | Flag | Model | Size | Download | Best for |
195
+ | ------------ | ----------------- | ---- | -------- | ------------------------------- |
196
+ | `--fast` | Qwen3.5-0.8B Q8_0 | 0.8B | ~0.9 GB | Simple text, single entities |
197
+ | `--balanced` | Qwen3.5-2B Q8_0 | 2B | ~2.3 GB | Moderate complexity, most tasks |
198
+ | `--best` | Qwen3.5-4B Q8_0 | 4B | ~4.5 GB | Dense text, rare entity types |
199
+
200
+ `--best` is the default. See [Benchmarks](#benchmarks) for why.
201
+
202
+ ## Options
203
+
204
+ | Flag | Description |
205
+ | --------------------------------- | ---------------------------------------------- |
206
+ | `--fast` | Use 0.8B model -- quick, simple text only |
207
+ | `--balanced` | Use 2B model -- good accuracy/speed tradeoff |
208
+ | `--best` | Use 4B model -- best accuracy (default) |
209
+ | `-c, --classes <list>` | Comma-separated allowed entity classes |
210
+ | `-a, --attributes <list>` | Comma-separated allowed attribute keys |
211
+ | `--attr-values <json>` | JSON enum map for attribute values |
212
+ | `--taxonomy <json>` | Class hierarchy JSON |
213
+ | `--relations` | Extract relations between entities |
214
+ | `--resolve` | Resolve coreferences |
215
+ | `--include-confidence` | Include confidence scores per entity |
216
+ | `--detect-negation` | Detect negated/hypothetical entities |
217
+ | `--schema <path>` | Load schema definition from JSON file |
218
+ | `--file <path>` | Read input from file (with chunking) |
219
+ | `--batch <path>` | Process JSONL file or directory of .txt files |
220
+ | `--system-prompt <string>` | Replace the built-in system prompt entirely |
221
+ | `--system-prompt-append <string>` | Append to the built-in system prompt |
222
+ | `--compact` | Output compact JSON (auto-enabled for non-TTY) |
223
+ | `-m, --model <uri>` | Use any GGUF model (see below) |
224
+
225
+ ## Benchmarks
226
+
227
+ We tested all three tiers against a complex input containing 11 entities across 6 classes (person, organization, location, disease, drug, event):
228
+
229
+ > "Dr. Maria Chen, a 42-year-old neurologist at Massachusetts General Hospital in Boston, published a groundbreaking paper with her colleague Prof. James Wright from Oxford University about a rare genetic mutation called BRCA3-delta found in 12 patients from rural Bangladesh, while simultaneously consulting for Pfizer on their new drug Nexavion priced at 450 dollars per dose, which the WHO classified as a Category A essential medicine last Tuesday during their Geneva summit"
230
+
231
+ | Entity | `--fast` | `--balanced` | `--best` (default) |
232
+ | ------------------ | ---------- | ------------ | ------------------------ |
233
+ | Dr. Maria Chen | person | person | person |
234
+ | Prof. James Wright | person | person | person, role: colleague |
235
+ | MGH | - | org | org |
236
+ | Oxford University | - | org | org |
237
+ | BRCA3-delta | - | disease | disease |
238
+ | Bangladesh | - | - | location |
239
+ | Pfizer | - | org | org |
240
+ | Nexavion | - | drug | drug, price: 450 dollars |
241
+ | WHO | - | - | org, category: Cat A |
242
+ | Geneva summit | - | event | location |
243
+ | Boston | location | - | location |
244
+ | **Entities found** | **3 / 11** | **8 / 11** | **11 / 11** |
245
+
246
+ All three tiers produce zero hallucinations with the current prompt design.
247
+
248
+ ## Epistemological design
249
+
250
+ objectivist-ner's feature set is informed by Objectivist epistemology -- the theory that concepts are formed by abstracting essential characteristics from concretes, organized into hierarchical structures, and held in a specific relationship to reality.
251
+
252
+ ### Identity: A is A (`--resolve`)
253
+
254
+ The law of identity demands that we track _what a thing is_ across all its references. When a text says "Dr. Chen", "she", and "the neurologist", these are three linguistic expressions of one entity. Without coreference resolution, an NER system treats them as three unrelated extractions -- a failure to maintain identity. `--resolve` enforces that A remains A regardless of how it is named.
255
+
256
+ ### Hierarchical concept formation (`--taxonomy`)
257
+
258
+ Objectivist epistemology holds that concepts are organized hierarchically through a process of abstraction. "Cat" is subsumed under "animal", which is subsumed under "organism". Each level retains the essential characteristics of its parent while adding differentia. The `--taxonomy` flag mirrors this structure directly -- you define genus-species relationships between entity classes, and the model classifies at the most specific level it can justify. This isn't just organization; it's how valid concepts are formed.
259
+
260
+ ### Distinguishing existence from assertion (`--detect-negation`)
261
+
262
+ A concept must be connected to reality. "The patient has diabetes" and "the patient does not have diabetes" both contain the entity "diabetes", but their relationship to existence is opposite. Naive NER systems that extract "diabetes" from both sentences without distinguishing assertion from negation commit a fundamental error -- they detach the concept from its existential status. `--detect-negation` forces every entity to declare its relationship to reality: present, negated, or hypothetical.
263
+
264
+ ### Certainty and the hierarchy of evidence (`--include-confidence`)
265
+
266
+ Knowledge exists on a spectrum from certain to speculative. "Dr. Maria Chen" appearing with a full name and title is a high-confidence extraction. "Someone named Bob" is low-confidence. Objectivism rejects both dogmatism (asserting certainty where none exists) and skepticism (denying certainty where it does). `--include-confidence` makes the epistemic status of each extraction explicit, letting downstream systems apply appropriate thresholds.
267
+
268
+ ### Relations as conceptual integration (`--relations`)
269
+
270
+ Entities don't exist in isolation. The relationship "Dr. Chen works at MIT" is not a property of Chen or of MIT alone -- it's a fact about reality that connects two existents. Extracting entities without their relations is like forming concepts without integrating them into propositions. `--relations` extracts the connective tissue between entities, producing a knowledge graph rather than an isolated list.
271
+
272
+ ### Schema files as objective definitions (`--schema`)
273
+
274
+ Definitions, in Objectivist epistemology, identify the essential characteristics that distinguish a concept from all others. A schema file serves this function for NER: it defines your ontology once -- the class hierarchy, the valid attributes, the relation types -- and applies it consistently across all extractions. This is the difference between ad-hoc classification and principled concept formation.
275
+
276
+ ### Grammar enforcement as logical constraint
277
+
278
+ Several fields (`assertion`, `confidence`, `entity_id`, `class` enums) are enforced at the grammar level, not merely prompted. The model literally cannot produce an invalid value. This is the computational equivalent of the principle that contradictions cannot exist -- the system's structure makes certain errors impossible rather than merely unlikely.
279
+
280
+ ## Building Up Knowledge
281
+
282
+ objectivist-ner is designed as a tool for the Objectivist project of building knowledge from percepts through concepts to principles and finally to action — the exact process implemented in the companion project **[objectivist-lattice](https://github.com/richardanaya/objectivist-lattice)**.
283
+
284
+ ### The Epistemological Pipeline
285
+
286
+ Objectivism holds that all knowledge begins with **percepts** (raw sensory data), which are integrated into **concepts**, which are organized into **principles** (general truths), which are finally applied as **actions** in specific contexts.
287
+
288
+ `objectivist-lattice` enforces this hierarchy strictly on a filesystem of Markdown files with validation rules:
289
+
290
+ - **Axioms** and **percepts** are bedrock — they have no `reduces_to` links
291
+ - **Principles** must reduce to axioms or percepts
292
+ - **Applications** must reduce to principles
293
+ - Promotion from `Tentative/Hypothesis` to `Integrated/Validated` can only happen bottom-up
294
+
295
+ ### How NER Helps Build the Lattice
296
+
297
+ objectivist-ner acts as the **percept-to-concept extraction layer** for this system:
298
+
299
+ 1. **Percept Extraction** (`--detect-negation`)
300
+ - Identifies concrete entities from source material (books, articles, personal observations)
301
+ - Distinguishes what is asserted as present, negated, or hypothetical
302
+ - Feeds raw perceptual data into the `02-Percepts/` directory
303
+
304
+ 2. **Concept Formation** (`--classes`, `--taxonomy`, `--resolve`)
305
+ - Groups multiple mentions of the same entity (`entity_id`)
306
+ - Classifies entities into hierarchical taxonomies (`organism > person > neurologist`)
307
+ - Maintains identity across contexts — "Dr. Chen", "she", and "the neurologist" are recognized as the same existent
308
+
309
+ 3. **Principle Discovery** (`--relations`, `--schema`)
310
+ - Extracts relations between entities ("works at", "causes", "implies")
311
+ - Uses schema files to enforce your ontological commitments
312
+ - Surfaces potential principles by showing what consistently reduces to what
313
+
314
+ 4. **Action Guidance** (`--include-confidence`)
315
+ - Rates confidence in each extraction
316
+ - Helps distinguish high-certainty principles (suitable for action) from speculative ones (still tentative)
317
+
318
+ ### Practical Workflow
319
+
320
+ ```bash
321
+ # Extract entities from a book chapter
322
+ ner --file chapter1.txt --detect-negation --resolve --include-confidence > percepts.json
323
+
324
+ # Convert to lattice format
325
+ cat percepts.json | jq '.[] | {title: .text, level: "percept", proposition: (.text + " was observed")}' > 02-Percepts/20260315-percept-001.md
326
+
327
+ # Later, when forming principles
328
+ ner --relations --schema ontology.json "text from multiple chapters" > principles.json
329
+ ```
330
+
331
+ The combination of **objectivist-ner** (extraction) and **objectivist-lattice** (validation and organization) creates a complete pipeline:
332
+
333
+ **Percepts → Concepts → Principles → Validated Knowledge → Action**
334
+
335
+ This is not just information extraction. It is epistemological engineering — using computation to enforce the proper hierarchical structure of knowledge, preventing floating abstractions and ensuring every principle is grounded in percepts and axioms.
336
+
337
+ The grammar-enforced fields (`assertion`, `confidence`, `entity_id`) are not arbitrary features. They are computational implementations of fundamental epistemological requirements: every concept must have a relationship to reality, every claim must have an epistemic status, and identity must be maintained across contexts.
338
+
339
+ See the [objectivist-lattice](https://github.com/richardanaya/objectivist-lattice) repository for the validation and knowledge management layer that pairs with this tool.
340
+
341
+ ## Custom models
342
+
343
+ If the built-in tiers don't fit your needs, you can pass any GGUF model with `--model`. This overrides `--fast`/`--balanced`/`--best`.
344
+
345
+ ```bash
346
+ # HuggingFace URI
347
+ ner "text" --model "hf:unsloth/Qwen3-8B-GGUF:Qwen3-8B-Q4_K_M.gguf"
348
+
349
+ # Local file
350
+ ner "text" --model ./my-custom-model.gguf
351
+ ```
352
+
353
+ ## License
354
+
355
+ MIT © Richard Anaya
package/index.ts ADDED
@@ -0,0 +1,514 @@
1
+ import { program } from "commander";
2
+ import path from "path";
3
+ import os from "os";
4
+ import fs from "fs";
5
+ import { getLlama, LlamaChatSession, resolveModelFile } from "node-llama-cpp";
6
+
7
// === MODELS ===
// Built-in model tiers, each a node-llama-cpp model URI ("hf:<repo>:<file>").
// Selected by --fast / --balanced / --best; `best` is the fallback when no
// tier flag is given (see the modelUri resolution below).
const MODELS = {
  fast: "hf:unsloth/Qwen3.5-0.8B-GGUF:Qwen3.5-0.8B-Q8_0.gguf",
  balanced: "hf:unsloth/Qwen3.5-2B-GGUF:Qwen3.5-2B-Q8_0.gguf",
  best: "hf:unsloth/Qwen3.5-4B-GGUF:Qwen3.5-4B-Q8_0.gguf",
} as const;
13
+
14
// === SYSTEM PROMPTS ===
// Base instruction set used as the system prompt. buildSystemPrompt() appends
// feature-specific rules (assertion / confidence / entity_id) and the few-shot
// examples before sending it to the model.
const SYSTEM_PROMPT = `You are a named entity recognition (NER) system. Your task is to extract entities from text.

Rules:
- "text" must be the EXACT substring from the input that refers to the entity. Do NOT paraphrase or include extra words.
- "class" is the entity type (e.g. person, animal, location, organization).
- "attributes" are properties of the entity found in context.
- Return one object per distinct entity mention.
- If no entities are found, return an empty array [].`;
23
+
24
// Few-shot examples appended after the rules; each demonstrates exact-span
// extraction and the flat {class, text, attributes} entity shape the grammar
// enforces.
const FEW_SHOT_EXAMPLES = `Example 1:
Input: "the cat is blue and is feeling sad"
Output: [{"class":"animal","text":"cat","attributes":{"color":"blue","emotional_state":"sad"}}]

Example 2:
Input: "John Smith lives in New York City and works at Google"
Output: [{"class":"person","text":"John Smith","attributes":{"location":"New York City","employer":"Google"}},{"class":"location","text":"New York City","attributes":{}},{"class":"organization","text":"Google","attributes":{}}]

Example 3:
Input: "The quick brown fox jumps over the lazy dog near the river"
Output: [{"class":"animal","text":"fox","attributes":{"color":"brown","speed":"quick"}},{"class":"animal","text":"dog","attributes":{"temperament":"lazy"}},{"class":"location","text":"the river","attributes":{}}]

Example 4:
Input: "Researchers at MIT found that the drug Riluzole slows progression of ALS in a trial last March"
Output: [{"class":"organization","text":"MIT","attributes":{}},{"class":"drug","text":"Riluzole","attributes":{}},{"class":"disease","text":"ALS","attributes":{}},{"class":"event","text":"trial last March","attributes":{"date":"last March"}}]`;
39
+
40
// === SCHEMA FILE TYPES ===
// Shape of a --schema JSON file. All fields are optional; CLI flags take
// precedence over schema-file values during the merge step.
interface SchemaFile {
  /** Class hierarchy: parent class -> child classes. Takes precedence over `classes` when building the grammar enum. */
  taxonomy?: Record<string, string[]>;
  /** Flat list of allowed entity classes. */
  classes?: string[];
  /** Allowed attribute keys (prompt-level constraint). */
  attributes?: string[];
  /** Enum map restricting attribute values per key. */
  attrValues?: Record<string, string[]>;
  /** Allowed relation labels; presence also enables relation extraction. */
  relations?: string[];
}
48
+
49
+ // === TAXONOMY HELPERS ===
50
+ function flattenTaxonomy(taxonomy: Record<string, string[]>): string[] {
51
+ const all = new Set<string>();
52
+ for (const [parent, children] of Object.entries(taxonomy)) {
53
+ all.add(parent);
54
+ for (const child of children) all.add(child);
55
+ }
56
+ return [...all];
57
+ }
58
+
59
+ function taxonomyToPrompt(taxonomy: Record<string, string[]>): string {
60
+ const lines = Object.entries(taxonomy)
61
+ .map(([parent, children]) => ` ${parent}: ${children.join(", ")}`)
62
+ .join("\n");
63
+ return `Use the following class hierarchy. Classify at the most specific level.\n${lines}`;
64
+ }
65
+
66
+ // === CHUNKING ===
67
+ function chunkText(text: string, maxChars: number): string[] {
68
+ if (text.length <= maxChars) return [text];
69
+ const chunks: string[] = [];
70
+ const sentences = text.split(/(?<=[.!?])\s+/);
71
+ let current = "";
72
+ for (const sentence of sentences) {
73
+ if (current.length + sentence.length > maxChars && current.length > 0) {
74
+ chunks.push(current.trim());
75
+ current = "";
76
+ }
77
+ current += (current ? " " : "") + sentence;
78
+ }
79
+ if (current.trim()) chunks.push(current.trim());
80
+ return chunks;
81
+ }
82
+
83
+ // === CLI ===
84
+ program
85
+ .name("ner")
86
+ .description(
87
+ "Objectivist-inspired named entity recognition with grammar constraints",
88
+ )
89
+ .argument("[text]", "Text to extract entities from (omit to read from stdin)")
90
+ .option("-c, --classes <list>", "Comma-separated allowed entity classes")
91
+ .option("-a, --attributes <list>", "Comma-separated allowed attribute keys")
92
+ .option(
93
+ "--attr-values <json>",
94
+ 'JSON enum map for attributes e.g. {"color":["blue","red"]}',
95
+ )
96
+ .option(
97
+ "--taxonomy <json>",
98
+ 'Class hierarchy JSON e.g. {"organism":["animal","plant"]}',
99
+ )
100
+ .option("--relations", "Extract relations between entities")
101
+ .option("--resolve", "Resolve coreferences (group mentions of same entity)")
102
+ .option("--include-confidence", "Include confidence scores per entity")
103
+ .option("--detect-negation", "Detect negated/hypothetical entities")
104
+ .option("--schema <path>", "Load entity schema definition from a JSON file")
105
+ .option(
106
+ "--file <path>",
107
+ "Read input from a file (with chunking for long docs)",
108
+ )
109
+ .option(
110
+ "--batch <path>",
111
+ "Process JSONL file (one text per line) or directory of .txt files",
112
+ )
113
+ .option(
114
+ "--system-prompt <string>",
115
+ "Replace the built-in system prompt entirely",
116
+ )
117
+ .option(
118
+ "--system-prompt-append <string>",
119
+ "Append to the built-in system prompt",
120
+ )
121
+ .option("-m, --model <uri>", "Model URI or path to GGUF file")
122
+ .option("--fast", "Use smallest model (0.8B) -- quick, simple text only")
123
+ .option(
124
+ "--balanced",
125
+ "Use mid-size model (2B) -- good accuracy/speed tradeoff",
126
+ )
127
+ .option("--best", "Use largest model (4B) -- best accuracy (default)")
128
+ .option("--compact", "Output compact JSON (also auto-enabled for non-TTY)")
129
+ .addHelpText(
130
+ "after",
131
+ `
132
+ Examples:
133
+ fastner "the cat is blue"
134
+ fastner --fast "simple short text"
135
+ fastner "John works at Google" --classes person,organization
136
+ fastner "sky is blue" --attr-values '{"color":["blue","red"]}'
137
+ fastner --relations "Dr. Chen works at MIT"
138
+ fastner --resolve "Dr. Chen published a paper. She won an award."
139
+ fastner --detect-negation "The patient does not have diabetes"
140
+ fastner --schema schema.json "complex text"
141
+ fastner --file document.txt
142
+ fastner --batch inputs.jsonl
143
+ echo "the cat is blue" | fastner
144
+ `,
145
+ )
146
+ .parse();
147
+
148
const opts = program.opts();

// === VALIDATIONS ===
// At most one model-tier flag may be given; same for the two system-prompt
// flags (replace vs append are contradictory).
const tierFlags = [opts.fast, opts.balanced, opts.best].filter(Boolean).length;
if (tierFlags > 1) {
  console.error(
    "Error: --fast, --balanced, and --best are mutually exclusive.",
  );
  process.exit(1);
}

if (opts.systemPrompt && opts.systemPromptAppend) {
  console.error(
    "Error: --system-prompt and --system-prompt-append are mutually exclusive.",
  );
  process.exit(1);
}

// === LOAD SCHEMA FILE ===
// NOTE(review): the parsed JSON is cast to SchemaFile without runtime
// validation -- a malformed schema file surfaces later as odd behavior
// rather than an upfront error; consider validating the shape here.
let schemaFile: SchemaFile | undefined;
if (opts.schema) {
  try {
    const raw = fs.readFileSync(opts.schema, "utf-8");
    schemaFile = JSON.parse(raw);
  } catch (e) {
    console.error(`Error: Failed to load schema file: ${(e as Error).message}`);
    process.exit(1);
  }
}

// === MERGE OPTIONS (CLI flags override schema file) ===
const allowedClasses: string[] | undefined = opts.classes
  ? opts.classes.split(",").map((s: string) => s.trim())
  : schemaFile?.classes;

const allowedAttrs: string[] | undefined = opts.attributes
  ? opts.attributes.split(",").map((s: string) => s.trim())
  : schemaFile?.attributes;

// --attr-values (inline JSON) wins over the schema file's attrValues.
let attrValuesMap: Record<string, string[]> | undefined;
if (opts.attrValues) {
  try {
    attrValuesMap = JSON.parse(opts.attrValues);
  } catch (e) {
    console.error(
      `Error: Invalid JSON for --attr-values: ${(e as Error).message}`,
    );
    process.exit(1);
  }
} else if (schemaFile?.attrValues) {
  attrValuesMap = schemaFile.attrValues;
}

// --taxonomy (inline JSON) wins over the schema file's taxonomy.
let taxonomy: Record<string, string[]> | undefined;
if (opts.taxonomy) {
  try {
    taxonomy = JSON.parse(opts.taxonomy);
  } catch (e) {
    console.error(
      `Error: Invalid JSON for --taxonomy: ${(e as Error).message}`,
    );
    process.exit(1);
  }
} else if (schemaFile?.taxonomy) {
  taxonomy = schemaFile.taxonomy;
}

// Feature switches. A schema file that lists relations implicitly enables
// relation extraction even without the --relations flag; relation labels are
// grammar-enforced only when the schema file supplies them.
const enableRelations = opts.relations || !!schemaFile?.relations;
const relationTypes: string[] | undefined = schemaFile?.relations || undefined;
const enableResolve = !!opts.resolve;
const enableConfidence = !!opts.includeConfidence;
const enableNegation = !!opts.detectNegation;
220
+
221
+ // === READ INPUT ===
222
+ let inputTexts: string[] = [];
223
+
224
+ if (opts.batch) {
225
+ const batchPath = opts.batch as string;
226
+ const stat = fs.statSync(batchPath);
227
+ if (stat.isDirectory()) {
228
+ const files = fs
229
+ .readdirSync(batchPath)
230
+ .filter((f: string) => f.endsWith(".txt"));
231
+ inputTexts = files.map((f: string) =>
232
+ fs.readFileSync(path.join(batchPath, f), "utf-8").trim(),
233
+ );
234
+ } else {
235
+ const content = fs.readFileSync(batchPath, "utf-8").trim();
236
+ inputTexts = content.split("\n").map((line: string) => {
237
+ try {
238
+ const parsed = JSON.parse(line);
239
+ return typeof parsed === "string" ? parsed : parsed.text || line;
240
+ } catch {
241
+ return line;
242
+ }
243
+ });
244
+ }
245
+ } else if (opts.file) {
246
+ const content = fs.readFileSync(opts.file, "utf-8").trim();
247
+ inputTexts = chunkText(content, 2000);
248
+ } else {
249
+ let text = program.args[0];
250
+ if (!text) {
251
+ const chunks: Buffer[] = [];
252
+ for await (const chunk of process.stdin) {
253
+ chunks.push(chunk as Buffer);
254
+ }
255
+ text = Buffer.concat(chunks).toString().trim();
256
+ }
257
+ if (!text) {
258
+ console.error(
259
+ "Error: No input text provided. Pass as argument, --file, --batch, or stdin.",
260
+ );
261
+ process.exit(1);
262
+ }
263
+ inputTexts = [text];
264
+ }
265
+
266
// === RESOLVE MODEL ===
// --model (any GGUF URI or local path) overrides the tier flags; otherwise
// fall through the tiers, defaulting to `best`.
const modelUri = opts.model
  ? opts.model
  : opts.fast
    ? MODELS.fast
    : opts.balanced
      ? MODELS.balanced
      : MODELS.best;

// Models are cached under ~/.fastner/models. resolveModelFile returns a local
// path -- presumably fetching remote `hf:` URIs on first use (node-llama-cpp
// behavior; confirm against its docs).
const modelsDir = path.join(os.homedir(), ".fastner", "models");
const modelPath = await resolveModelFile(modelUri, modelsDir);

// Load the model and create an inference context (top-level await; runs once
// at startup before any extraction).
const llama = await getLlama();
const model = await llama.loadModel({ modelPath });
const context = await model.createContext();
281
+
282
+ // === BUILD SYSTEM PROMPT ===
283
+ function buildSystemPrompt(): string {
284
+ if (opts.systemPrompt) return opts.systemPrompt;
285
+
286
+ let base = SYSTEM_PROMPT;
287
+
288
+ if (enableNegation) {
289
+ base += `\n- Every entity has a top-level "assertion" field: "present", "negated", or "hypothetical". "negated" means the text explicitly denies it (e.g. "does not have"). "hypothetical" means it is speculative (e.g. "might develop").`;
290
+ }
291
+ if (enableConfidence) {
292
+ base += `\n- Every entity has a top-level "confidence" field: "low", "medium", or "high".`;
293
+ }
294
+ if (enableResolve) {
295
+ base += `\n- Every entity has a top-level "entity_id" field. If multiple text spans refer to the same real-world entity (e.g. "Dr. Chen" and "she"), they share the same entity_id. Use short IDs like "e1", "e2".`;
296
+ }
297
+
298
+ let prompt = `${base}\n\n${FEW_SHOT_EXAMPLES}`;
299
+
300
+ if (opts.systemPromptAppend) {
301
+ prompt += `\n\n${opts.systemPromptAppend}`;
302
+ }
303
+
304
+ return prompt;
305
+ }
306
+
307
// === BUILD GRAMMAR SCHEMA ===
/**
 * Build the JSON schema that constrains the model's entity output at the
 * grammar level: an array of { class, text, attributes, ...feature fields }.
 * Classes become a grammar enum when --taxonomy or --classes restricts them
 * (taxonomy takes precedence).
 *
 * NOTE(review): `attributes` is not in `required`, so entities may omit it --
 * confirm downstream consumers tolerate a missing attributes object.
 */
function buildGrammarSchema() {
  // Determine allowed classes from taxonomy or explicit list
  const classEnum = taxonomy ? flattenTaxonomy(taxonomy) : allowedClasses;

  // attributes is an open string -> string map; allowed keys/values are only
  // constrained via the prompt, not the grammar.
  const attributesSchema: any = {
    type: "object",
    additionalProperties: { type: "string" },
  };

  const properties: any = {
    class: {
      type: "string",
      ...(classEnum && { enum: classEnum }),
    },
    text: { type: "string" },
    attributes: attributesSchema,
  };
  const required: string[] = ["class", "text"];

  // Grammar-enforced fields for enabled features.
  // These are top-level entity properties (not inside attributes)
  // so the grammar can enforce them as required on every entity.
  if (enableNegation) {
    properties.assertion = {
      type: "string",
      enum: ["present", "negated", "hypothetical"],
    };
    required.push("assertion");
  }
  if (enableConfidence) {
    properties.confidence = {
      type: "string",
      enum: ["low", "medium", "high"],
    };
    required.push("confidence");
  }
  if (enableResolve) {
    // entity_id is free-form text; the prompt asks for short IDs like "e1".
    properties.entity_id = { type: "string" };
    required.push("entity_id");
  }

  // additionalProperties: false -- the model literally cannot emit fields
  // outside this schema.
  const schema: any = {
    type: "array",
    items: {
      type: "object",
      properties,
      required,
      additionalProperties: false,
    },
  };

  return schema;
}
361
+
362
+ // === BUILD RELATIONS SCHEMA ===
363
+ function buildRelationsSchema() {
364
+ const relSchema: any = {
365
+ type: "object",
366
+ properties: {
367
+ entities: buildGrammarSchema(),
368
+ relations: {
369
+ type: "array",
370
+ items: {
371
+ type: "object",
372
+ properties: {
373
+ source: { type: "string" },
374
+ target: { type: "string" },
375
+ relation: {
376
+ type: "string",
377
+ ...(relationTypes && { enum: relationTypes }),
378
+ },
379
+ },
380
+ required: ["source", "target", "relation"],
381
+ additionalProperties: false,
382
+ },
383
+ },
384
+ },
385
+ required: ["entities", "relations"],
386
+ additionalProperties: false,
387
+ };
388
+ return relSchema;
389
+ }
390
+
391
+ // === BUILD PROMPT CONSTRAINTS ===
392
+ function buildConstraints(): string {
393
+ let constraints = "";
394
+
395
+ if (taxonomy) {
396
+ constraints += `\n${taxonomyToPrompt(taxonomy)}`;
397
+ } else if (allowedClasses) {
398
+ constraints += `\nAllowed entity classes: ${allowedClasses.join(", ")}. Only use these classes.`;
399
+ }
400
+
401
+ if (attrValuesMap) {
402
+ const desc = Object.entries(attrValuesMap)
403
+ .map(([k, v]) => `${k}: ${v.join(", ")}`)
404
+ .join("; ");
405
+ constraints += `\nOnly use these attribute keys and values: ${desc}. Omit attributes that don't apply to an entity.`;
406
+ } else if (allowedAttrs) {
407
+ constraints += `\nOnly use these attribute keys: ${allowedAttrs.join(", ")}. Omit attributes that don't apply to an entity.`;
408
+ }
409
+
410
+ if (enableNegation) {
411
+ constraints += `\nEvery entity has an "assertion" field (not in attributes). Example: [{"class":"disease","text":"diabetes","assertion":"present","attributes":{}},{"class":"disease","text":"cancer","assertion":"negated","attributes":{}}]`;
412
+ }
413
+ if (enableConfidence) {
414
+ constraints += `\nEvery entity has a "confidence" field (not in attributes). Example: [{"class":"person","text":"John","confidence":"high","attributes":{}}]`;
415
+ }
416
+ if (enableResolve) {
417
+ constraints += `\nEvery entity has an "entity_id" field (not in attributes). Coreferent mentions share the same entity_id. Example: [{"class":"person","text":"Dr. Chen","entity_id":"e1","attributes":{}},{"class":"person","text":"She","entity_id":"e1","attributes":{}}]`;
418
+ }
419
+ if (enableRelations) {
420
+ constraints += `\nAlso extract relations between entities. Return {"entities": [...], "relations": [{"source": "entity text", "target": "entity text", "relation": "relation type"}]}.`;
421
+ if (relationTypes) {
422
+ constraints += ` Allowed relation types: ${relationTypes.join(", ")}.`;
423
+ }
424
+ }
425
+
426
+ return constraints;
427
+ }
428
+
429
+ // === PROCESS A SINGLE TEXT ===
430
+ async function processText(
431
+ inputText: string,
432
+ session: LlamaChatSession,
433
+ ): Promise<any> {
434
+ const constraints = buildConstraints();
435
+ const prompt = `Extract all named entities from the following text.${constraints}\n\nText: ${inputText}`;
436
+
437
+ const schema = enableRelations
438
+ ? buildRelationsSchema()
439
+ : buildGrammarSchema();
440
+ const grammar = await llama.createGrammarForJsonSchema(schema);
441
+
442
+ const res = await session.prompt(prompt, { grammar });
443
+
444
+ let parsed: any;
445
+ try {
446
+ parsed = grammar.parse(res);
447
+ } catch {
448
+ try {
449
+ parsed = JSON.parse(res.trim());
450
+ } catch {
451
+ console.error(
452
+ "Warning: Failed to parse model output. Raw response:",
453
+ res,
454
+ );
455
+ parsed = enableRelations ? { entities: [], relations: [] } : [];
456
+ }
457
+ }
458
+
459
+ return parsed;
460
+ }
461
+
462
+ // === MAIN ===
463
+ const systemPrompt = buildSystemPrompt();
464
+ const compact = opts.compact || !process.stdout.isTTY;
465
+
466
+ const contextSequence = context.getSequence();
467
+
468
+ if (inputTexts.length === 1) {
469
+ const session = new LlamaChatSession({
470
+ contextSequence,
471
+ systemPrompt,
472
+ });
473
+ const result = await processText(inputTexts[0]!, session);
474
+ console.log(JSON.stringify(result, null, compact ? 0 : 2));
475
+ } else {
476
+ // Batch / chunked: process each text, collect results
477
+ const allResults: any[] = [];
478
+ for (const inputText of inputTexts) {
479
+ // Erase context and create fresh session for each input
480
+ await contextSequence.eraseContextTokenRanges([
481
+ { start: 0, end: contextSequence.nextTokenIndex },
482
+ ]);
483
+ const session = new LlamaChatSession({
484
+ contextSequence,
485
+ systemPrompt,
486
+ });
487
+ const result = await processText(inputText, session);
488
+ allResults.push(result);
489
+ }
490
+
491
+ if (opts.file) {
492
+ // Merge chunked results into one
493
+ if (enableRelations) {
494
+ const merged = {
495
+ entities: allResults.flatMap((r) => r.entities || []),
496
+ relations: allResults.flatMap((r) => r.relations || []),
497
+ };
498
+ console.log(JSON.stringify(merged, null, compact ? 0 : 2));
499
+ } else {
500
+ const merged = allResults.flat();
501
+ console.log(JSON.stringify(merged, null, compact ? 0 : 2));
502
+ }
503
+ } else {
504
+ // Batch: output one result per line (JSONL)
505
+ for (const result of allResults) {
506
+ console.log(JSON.stringify(result));
507
+ }
508
+ }
509
+ }
510
+
511
+ // === CLEANUP ===
512
+ // Bun segfaults if process.exit() triggers synchronous native addon unloading.
513
+ // Setting exitCode lets the event loop drain naturally, avoiding the crash.
514
+ process.exitCode = 0;
package/package.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "name": "objectivist-ner",
3
+ "version": "0.0.0",
4
+ "description": "Objectivist-inspired Named Entity Recognition with grammar-constrained LLM output",
5
+ "bin": {
6
+ "ner": "index.ts"
7
+ },
8
+ "module": "index.ts",
9
+ "type": "module",
10
+ "license": "MIT",
11
+ "author": "Richard Anaya",
12
+ "repository": "git@github.com:richardanaya/objectivist-ner.git",
13
+ "devDependencies": {
14
+ "@types/bun": "latest",
15
+ "prettier": "^3.8.1"
16
+ },
17
+ "peerDependencies": {
18
+ "typescript": "^5"
19
+ },
20
+ "dependencies": {
21
+ "commander": "^14.0.3",
22
+ "node-llama-cpp": "^3.17.1"
23
+ }
24
+ }