objectivist-ner 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +355 -0
- package/index.ts +514 -0
- package/package.json +24 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Richard Anaya
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
# objectivist-ner
|
|
2
|
+
|
|
3
|
+
Objectivist-inspired Named Entity Recognition with grammar-constrained LLM output.
|
|
4
|
+
|
|
5
|
+
Uses [node-llama-cpp](https://github.com/withcatai/node-llama-cpp) to run a small language model locally, enforcing structured output via JSON schema grammars. No API keys, no network calls -- everything runs on your machine.
|
|
6
|
+
|
|
7
|
+
The CLI is installed as the `ner` command.
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- Exact span extraction -- entity `text` is the substring from the input, not a paraphrase
|
|
12
|
+
- Schema-constrained output via llama.cpp grammar (guaranteed valid JSON)
|
|
13
|
+
- Restrict entity classes, attribute keys, and attribute values
|
|
14
|
+
- Hierarchical class taxonomies
|
|
15
|
+
- Relation extraction between entities
|
|
16
|
+
- Coreference resolution (group mentions of the same entity)
|
|
17
|
+
- Negation and modality detection
|
|
18
|
+
- Confidence scores
|
|
19
|
+
- Schema definition files for reusable ontologies
|
|
20
|
+
- Long document chunking with `--file`
|
|
21
|
+
- Batch processing with `--batch`
|
|
22
|
+
- Three built-in model tiers: `--fast`, `--balanced`, `--best`
|
|
23
|
+
- Reads from argument, file, or stdin
|
|
24
|
+
- Compact JSON output for non-TTY / piping
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
# Local development
|
|
30
|
+
bun install
|
|
31
|
+
|
|
32
|
+
# Install globally as the `ner` command
|
|
33
|
+
bun install -g objectivist-ner
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
After global install, use the `ner` command directly.
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
# Uses --best (4B) by default
|
|
42
|
+
ner "the cat is blue and is feeling sad"
|
|
43
|
+
|
|
44
|
+
# Pick a model tier
|
|
45
|
+
ner --fast "the cat is blue"
|
|
46
|
+
ner --balanced "John works at Google in NYC"
|
|
47
|
+
ner --best "complex medical research text"
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Entity constraints
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Restrict entity classes
|
|
54
|
+
ner "John works at Google" --classes person,organization
|
|
55
|
+
|
|
56
|
+
# Restrict attribute keys
|
|
57
|
+
ner "Alice is sad in Paris" --attributes emotional_state,location
|
|
58
|
+
|
|
59
|
+
# Restrict attribute values with enums
|
|
60
|
+
ner "The sky is blue" --attr-values '{"color":["blue","red","green"]}'
|
|
61
|
+
|
|
62
|
+
# Hierarchical class taxonomy
|
|
63
|
+
ner "Dr. Chen lives in Boston with her cat" \
|
|
64
|
+
--taxonomy '{"organism":["person","animal"],"place":["city","country"]}'
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Relation extraction
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
ner --relations "Dr. Chen works at MIT and collaborates with Prof. Wright"
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Output:
|
|
74
|
+
|
|
75
|
+
```json
|
|
76
|
+
{
|
|
77
|
+
"entities": [
|
|
78
|
+
{ "class": "person", "text": "Dr. Chen", "attributes": {} },
|
|
79
|
+
{ "class": "organization", "text": "MIT", "attributes": {} },
|
|
80
|
+
{ "class": "person", "text": "Prof. Wright", "attributes": {} }
|
|
81
|
+
],
|
|
82
|
+
"relations": [
|
|
83
|
+
{ "source": "Dr. Chen", "target": "MIT", "relation": "works at" },
|
|
84
|
+
{
|
|
85
|
+
"source": "Dr. Chen",
|
|
86
|
+
"target": "Prof. Wright",
|
|
87
|
+
"relation": "collaborates with"
|
|
88
|
+
}
|
|
89
|
+
]
|
|
90
|
+
}
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Coreference resolution
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
ner --resolve "Dr. Chen published a paper. She later won the Nobel Prize. The neurologist was celebrated."
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Output:
|
|
100
|
+
|
|
101
|
+
```json
|
|
102
|
+
[
|
|
103
|
+
{
|
|
104
|
+
"class": "person",
|
|
105
|
+
"text": "Dr. Chen",
|
|
106
|
+
"attributes": {},
|
|
107
|
+
"entity_id": "e1"
|
|
108
|
+
},
|
|
109
|
+
{ "class": "person", "text": "She", "attributes": {}, "entity_id": "e1" },
|
|
110
|
+
{
|
|
111
|
+
"class": "person",
|
|
112
|
+
"text": "The neurologist",
|
|
113
|
+
"attributes": {},
|
|
114
|
+
"entity_id": "e1"
|
|
115
|
+
},
|
|
116
|
+
{
|
|
117
|
+
"class": "event",
|
|
118
|
+
"text": "the Nobel Prize",
|
|
119
|
+
"attributes": {},
|
|
120
|
+
"entity_id": "e2"
|
|
121
|
+
}
|
|
122
|
+
]
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
`entity_id` is a grammar-enforced top-level field, not inside `attributes`.
|
|
126
|
+
|
|
127
|
+
### Negation detection
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
ner --detect-negation "The patient has diabetes but does not have cancer. He might develop hypertension."
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Confidence scores
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
ner --include-confidence "Dr. Maria Chen works at MIT. Someone named Bob might be there too."
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Schema definition files
|
|
140
|
+
|
|
141
|
+
Define your ontology in a JSON file and reuse it:
|
|
142
|
+
|
|
143
|
+
```json
|
|
144
|
+
{
|
|
145
|
+
"taxonomy": {
|
|
146
|
+
"organism": ["person", "animal"],
|
|
147
|
+
"place": ["city", "country", "building"],
|
|
148
|
+
"institution": ["company", "university", "government_agency"]
|
|
149
|
+
},
|
|
150
|
+
"attributes": ["role", "age", "location", "affiliation"],
|
|
151
|
+
"relations": ["works_at", "located_in", "affiliated_with"]
|
|
152
|
+
}
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
ner --schema schema.json "Dr. Chen works at MIT in Boston"
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
Schema files support `taxonomy`, `classes`, `attributes`, `attrValues`, and `relations`. CLI flags override schema file values.
|
|
160
|
+
|
|
161
|
+
### File and batch processing
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
# Process a long document (auto-chunked)
|
|
165
|
+
ner --file document.txt
|
|
166
|
+
|
|
167
|
+
# Process a JSONL file (one text per line, outputs JSONL)
|
|
168
|
+
ner --batch inputs.jsonl
|
|
169
|
+
|
|
170
|
+
# Process a directory of .txt files
|
|
171
|
+
ner --batch ./documents/
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
### Other options
|
|
175
|
+
|
|
176
|
+
```bash
|
|
177
|
+
# Append to the built-in system prompt
|
|
178
|
+
ner "text" --system-prompt-append "Focus only on emotions"
|
|
179
|
+
|
|
180
|
+
# Replace the system prompt entirely
|
|
181
|
+
ner "text" --system-prompt "You are a custom extractor."
|
|
182
|
+
|
|
183
|
+
# Read from stdin
|
|
184
|
+
echo "the cat is blue" | ner
|
|
185
|
+
|
|
186
|
+
# Compact JSON output
|
|
187
|
+
ner "the cat is blue" --compact
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Models
|
|
191
|
+
|
|
192
|
+
objectivist-ner ships with three built-in model tiers. Pick one with a flag -- the model is downloaded automatically on first use to `~/.fastner/models/`.
|
|
193
|
+
|
|
194
|
+
| Flag | Model | Size | Download | Best for |
|
|
195
|
+
| ------------ | ----------------- | ---- | -------- | ------------------------------- |
|
|
196
|
+
| `--fast` | Qwen3.5-0.8B Q8_0 | 0.8B | ~0.9 GB | Simple text, single entities |
|
|
197
|
+
| `--balanced` | Qwen3.5-2B Q8_0 | 2B | ~2.3 GB | Moderate complexity, most tasks |
|
|
198
|
+
| `--best` | Qwen3.5-4B Q8_0 | 4B | ~4.5 GB | Dense text, rare entity types |
|
|
199
|
+
|
|
200
|
+
`--best` is the default. See [Benchmarks](#benchmarks) for why.
|
|
201
|
+
|
|
202
|
+
## Options
|
|
203
|
+
|
|
204
|
+
| Flag | Description |
|
|
205
|
+
| --------------------------------- | ---------------------------------------------- |
|
|
206
|
+
| `--fast` | Use 0.8B model -- quick, simple text only |
|
|
207
|
+
| `--balanced` | Use 2B model -- good accuracy/speed tradeoff |
|
|
208
|
+
| `--best` | Use 4B model -- best accuracy (default) |
|
|
209
|
+
| `-c, --classes <list>` | Comma-separated allowed entity classes |
|
|
210
|
+
| `-a, --attributes <list>` | Comma-separated allowed attribute keys |
|
|
211
|
+
| `--attr-values <json>` | JSON enum map for attribute values |
|
|
212
|
+
| `--taxonomy <json>` | Class hierarchy JSON |
|
|
213
|
+
| `--relations` | Extract relations between entities |
|
|
214
|
+
| `--resolve` | Resolve coreferences |
|
|
215
|
+
| `--include-confidence` | Include confidence scores per entity |
|
|
216
|
+
| `--detect-negation` | Detect negated/hypothetical entities |
|
|
217
|
+
| `--schema <path>` | Load schema definition from JSON file |
|
|
218
|
+
| `--file <path>` | Read input from file (with chunking) |
|
|
219
|
+
| `--batch <path>` | Process JSONL file or directory of .txt files |
|
|
220
|
+
| `--system-prompt <string>` | Replace the built-in system prompt entirely |
|
|
221
|
+
| `--system-prompt-append <string>` | Append to the built-in system prompt |
|
|
222
|
+
| `--compact` | Output compact JSON (auto-enabled for non-TTY) |
|
|
223
|
+
| `-m, --model <uri>` | Use any GGUF model (see below) |
|
|
224
|
+
|
|
225
|
+
## Benchmarks
|
|
226
|
+
|
|
227
|
+
We tested all three tiers against a complex input containing 11 entities across 6 classes (person, organization, location, disease, drug, event):
|
|
228
|
+
|
|
229
|
+
> "Dr. Maria Chen, a 42-year-old neurologist at Massachusetts General Hospital in Boston, published a groundbreaking paper with her colleague Prof. James Wright from Oxford University about a rare genetic mutation called BRCA3-delta found in 12 patients from rural Bangladesh, while simultaneously consulting for Pfizer on their new drug Nexavion priced at 450 dollars per dose, which the WHO classified as a Category A essential medicine last Tuesday during their Geneva summit"
|
|
230
|
+
|
|
231
|
+
| Entity | `--fast` | `--balanced` | `--best` (default) |
|
|
232
|
+
| ------------------ | ---------- | ------------ | ------------------------ |
|
|
233
|
+
| Dr. Maria Chen | person | person | person |
|
|
234
|
+
| Prof. James Wright | person | person | person, role: colleague |
|
|
235
|
+
| MGH | - | org | org |
|
|
236
|
+
| Oxford University | - | org | org |
|
|
237
|
+
| BRCA3-delta | - | disease | disease |
|
|
238
|
+
| Bangladesh | - | - | location |
|
|
239
|
+
| Pfizer | - | org | org |
|
|
240
|
+
| Nexavion | - | drug | drug, price: 450 dollars |
|
|
241
|
+
| WHO | - | - | org, category: Cat A |
|
|
242
|
+
| Geneva summit | - | event | location |
|
|
243
|
+
| Boston | location | - | location |
|
|
244
|
+
| **Entities found** | **3 / 11** | **8 / 11** | **11 / 11** |
|
|
245
|
+
|
|
246
|
+
All three tiers produce zero hallucinations with the current prompt design.
|
|
247
|
+
|
|
248
|
+
## Epistemological design
|
|
249
|
+
|
|
250
|
+
objectivist-ner's feature set is informed by Objectivist epistemology -- the theory that concepts are formed by abstracting essential characteristics from concretes, organized into hierarchical structures, and held in a specific relationship to reality.
|
|
251
|
+
|
|
252
|
+
### Identity: A is A (`--resolve`)
|
|
253
|
+
|
|
254
|
+
The law of identity demands that we track _what a thing is_ across all its references. When a text says "Dr. Chen", "she", and "the neurologist", these are three linguistic expressions of one entity. Without coreference resolution, an NER system treats them as three unrelated extractions -- a failure to maintain identity. `--resolve` enforces that A remains A regardless of how it is named.
|
|
255
|
+
|
|
256
|
+
### Hierarchical concept formation (`--taxonomy`)
|
|
257
|
+
|
|
258
|
+
Objectivist epistemology holds that concepts are organized hierarchically through a process of abstraction. "Cat" is subsumed under "animal", which is subsumed under "organism". Each level retains the essential characteristics of its parent while adding differentia. The `--taxonomy` flag mirrors this structure directly -- you define genus-species relationships between entity classes, and the model classifies at the most specific level it can justify. This isn't just organization; it's how valid concepts are formed.
|
|
259
|
+
|
|
260
|
+
### Distinguishing existence from assertion (`--detect-negation`)
|
|
261
|
+
|
|
262
|
+
A concept must be connected to reality. "The patient has diabetes" and "the patient does not have diabetes" both contain the entity "diabetes", but their relationship to existence is opposite. Naive NER systems that extract "diabetes" from both sentences without distinguishing assertion from negation commit a fundamental error -- they detach the concept from its existential status. `--detect-negation` forces every entity to declare its relationship to reality: present, negated, or hypothetical.
|
|
263
|
+
|
|
264
|
+
### Certainty and the hierarchy of evidence (`--include-confidence`)
|
|
265
|
+
|
|
266
|
+
Knowledge exists on a spectrum from certain to speculative. "Dr. Maria Chen" appearing with a full name and title is a high-confidence extraction. "Someone named Bob" is low-confidence. Objectivism rejects both dogmatism (asserting certainty where none exists) and skepticism (denying certainty where it does). `--include-confidence` makes the epistemic status of each extraction explicit, letting downstream systems apply appropriate thresholds.
|
|
267
|
+
|
|
268
|
+
### Relations as conceptual integration (`--relations`)
|
|
269
|
+
|
|
270
|
+
Entities don't exist in isolation. The relationship "Dr. Chen works at MIT" is not a property of Chen or of MIT alone -- it's a fact about reality that connects two existents. Extracting entities without their relations is like forming concepts without integrating them into propositions. `--relations` extracts the connective tissue between entities, producing a knowledge graph rather than an isolated list.
|
|
271
|
+
|
|
272
|
+
### Schema files as objective definitions (`--schema`)
|
|
273
|
+
|
|
274
|
+
Definitions, in Objectivist epistemology, identify the essential characteristics that distinguish a concept from all others. A schema file serves this function for NER: it defines your ontology once -- the class hierarchy, the valid attributes, the relation types -- and applies it consistently across all extractions. This is the difference between ad-hoc classification and principled concept formation.
|
|
275
|
+
|
|
276
|
+
### Grammar enforcement as logical constraint
|
|
277
|
+
|
|
278
|
+
Several fields (`assertion`, `confidence`, `entity_id`, `class` enums) are enforced at the grammar level, not merely prompted. The model literally cannot produce an invalid value. This is the computational equivalent of the principle that contradictions cannot exist -- the system's structure makes certain errors impossible rather than merely unlikely.
|
|
279
|
+
|
|
280
|
+
## Building Up Knowledge
|
|
281
|
+
|
|
282
|
+
objectivist-ner is designed as a tool for the Objectivist project of building knowledge from percepts through concepts to principles and finally to action — the exact process implemented in the companion project **[objectivist-lattice](https://github.com/richardanaya/objectivist-lattice)**.
|
|
283
|
+
|
|
284
|
+
### The Epistemological Pipeline
|
|
285
|
+
|
|
286
|
+
Objectivism holds that all knowledge begins with **percepts** (raw sensory data), which are integrated into **concepts**, which are organized into **principles** (general truths), which are finally applied as **actions** in specific contexts.
|
|
287
|
+
|
|
288
|
+
`objectivist-lattice` enforces this hierarchy strictly on a filesystem of Markdown files with validation rules:
|
|
289
|
+
|
|
290
|
+
- **Axioms** and **percepts** are bedrock — they have no `reduces_to` links
|
|
291
|
+
- **Principles** must reduce to axioms or percepts
|
|
292
|
+
- **Applications** must reduce to principles
|
|
293
|
+
- Promotion from `Tentative/Hypothesis` to `Integrated/Validated` can only happen bottom-up
|
|
294
|
+
|
|
295
|
+
### How NER Helps Build the Lattice
|
|
296
|
+
|
|
297
|
+
objectivist-ner acts as the **percept-to-concept extraction layer** for this system:
|
|
298
|
+
|
|
299
|
+
1. **Percept Extraction** (`--detect-negation`)
|
|
300
|
+
- Identifies concrete entities from source material (books, articles, personal observations)
|
|
301
|
+
- Distinguishes what is asserted as present, negated, or hypothetical
|
|
302
|
+
- Feeds raw perceptual data into the `02-Percepts/` directory
|
|
303
|
+
|
|
304
|
+
2. **Concept Formation** (`--classes`, `--taxonomy`, `--resolve`)
|
|
305
|
+
- Groups multiple mentions of the same entity (`entity_id`)
|
|
306
|
+
- Classifies entities into hierarchical taxonomies (`organism > person > neurologist`)
|
|
307
|
+
- Maintains identity across contexts — "Dr. Chen", "she", and "the neurologist" are recognized as the same existent
|
|
308
|
+
|
|
309
|
+
3. **Principle Discovery** (`--relations`, `--schema`)
|
|
310
|
+
- Extracts relations between entities ("works at", "causes", "implies")
|
|
311
|
+
- Uses schema files to enforce your ontological commitments
|
|
312
|
+
- Surfaces potential principles by showing what consistently reduces to what
|
|
313
|
+
|
|
314
|
+
4. **Action Guidance** (`--include-confidence`)
|
|
315
|
+
- Rates confidence in each extraction
|
|
316
|
+
- Helps distinguish high-certainty principles (suitable for action) from speculative ones (still tentative)
|
|
317
|
+
|
|
318
|
+
### Practical Workflow
|
|
319
|
+
|
|
320
|
+
```bash
|
|
321
|
+
# Extract entities from a book chapter
|
|
322
|
+
ner --file chapter1.txt --detect-negation --resolve --include-confidence > percepts.json
|
|
323
|
+
|
|
324
|
+
# Convert to lattice format
|
|
325
|
+
cat percepts.json | jq '.[] | {title: .text, level: "percept", proposition: (.text + " was observed")}' > 02-Percepts/20260315-percept-001.md
|
|
326
|
+
|
|
327
|
+
# Later, when forming principles
|
|
328
|
+
ner --relations --schema ontology.json "text from multiple chapters" > principles.json
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
The combination of **objectivist-ner** (extraction) and **objectivist-lattice** (validation and organization) creates a complete pipeline:
|
|
332
|
+
|
|
333
|
+
**Percepts → Concepts → Principles → Validated Knowledge → Action**
|
|
334
|
+
|
|
335
|
+
This is not just information extraction. It is epistemological engineering — using computation to enforce the proper hierarchical structure of knowledge, preventing floating abstractions and ensuring every principle is grounded in percepts and axioms.
|
|
336
|
+
|
|
337
|
+
The grammar-enforced fields (`assertion`, `confidence`, `entity_id`) are not arbitrary features. They are computational implementations of fundamental epistemological requirements: every concept must have a relationship to reality, every claim must have an epistemic status, and identity must be maintained across contexts.
|
|
338
|
+
|
|
339
|
+
See the [objectivist-lattice](https://github.com/richardanaya/objectivist-lattice) repository for the validation and knowledge management layer that pairs with this tool.
|
|
340
|
+
|
|
341
|
+
## Custom models
|
|
342
|
+
|
|
343
|
+
If the built-in tiers don't fit your needs, you can pass any GGUF model with `--model`. This overrides `--fast`/`--balanced`/`--best`.
|
|
344
|
+
|
|
345
|
+
```bash
|
|
346
|
+
# HuggingFace URI
|
|
347
|
+
ner "text" --model "hf:unsloth/Qwen3-8B-GGUF:Qwen3-8B-Q4_K_M.gguf"
|
|
348
|
+
|
|
349
|
+
# Local file
|
|
350
|
+
ner "text" --model ./my-custom-model.gguf
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
## License
|
|
354
|
+
|
|
355
|
+
MIT © Richard Anaya
|
package/index.ts
ADDED
|
@@ -0,0 +1,514 @@
|
|
|
1
|
+
import { program } from "commander";
|
|
2
|
+
import path from "path";
|
|
3
|
+
import os from "os";
|
|
4
|
+
import fs from "fs";
|
|
5
|
+
import { getLlama, LlamaChatSession, resolveModelFile } from "node-llama-cpp";
|
|
6
|
+
|
|
7
|
+
// === MODELS ===
// Built-in model tiers, keyed by CLI flag name (--fast / --balanced / --best).
// Each value is a node-llama-cpp HuggingFace URI ("hf:<repo>:<file>") that is
// resolved and downloaded on first use by resolveModelFile below.
const MODELS = {
  fast: "hf:unsloth/Qwen3.5-0.8B-GGUF:Qwen3.5-0.8B-Q8_0.gguf",
  balanced: "hf:unsloth/Qwen3.5-2B-GGUF:Qwen3.5-2B-Q8_0.gguf",
  best: "hf:unsloth/Qwen3.5-4B-GGUF:Qwen3.5-4B-Q8_0.gguf",
} as const;
|
|
13
|
+
|
|
14
|
+
// === SYSTEM PROMPTS ===
// Base instruction set sent as the chat system prompt. Feature flags
// (--detect-negation, --include-confidence, etc.) append extra rules to this
// at runtime; --system-prompt replaces it entirely and --system-prompt-append
// extends it.
const SYSTEM_PROMPT = `You are a named entity recognition (NER) system. Your task is to extract entities from text.

Rules:
- "text" must be the EXACT substring from the input that refers to the entity. Do NOT paraphrase or include extra words.
- "class" is the entity type (e.g. person, animal, location, organization).
- "attributes" are properties of the entity found in context.
- Return one object per distinct entity mention.
- If no entities are found, return an empty array [].`;
|
|
23
|
+
|
|
24
|
+
// Few-shot examples appended after the system prompt to anchor the expected
// output shape: a JSON array of {class, text, attributes} objects whose "text"
// values are exact substrings of the input.
const FEW_SHOT_EXAMPLES = `Example 1:
Input: "the cat is blue and is feeling sad"
Output: [{"class":"animal","text":"cat","attributes":{"color":"blue","emotional_state":"sad"}}]

Example 2:
Input: "John Smith lives in New York City and works at Google"
Output: [{"class":"person","text":"John Smith","attributes":{"location":"New York City","employer":"Google"}},{"class":"location","text":"New York City","attributes":{}},{"class":"organization","text":"Google","attributes":{}}]

Example 3:
Input: "The quick brown fox jumps over the lazy dog near the river"
Output: [{"class":"animal","text":"fox","attributes":{"color":"brown","speed":"quick"}},{"class":"animal","text":"dog","attributes":{"temperament":"lazy"}},{"class":"location","text":"the river","attributes":{}}]

Example 4:
Input: "Researchers at MIT found that the drug Riluzole slows progression of ALS in a trial last March"
Output: [{"class":"organization","text":"MIT","attributes":{}},{"class":"drug","text":"Riluzole","attributes":{}},{"class":"disease","text":"ALS","attributes":{}},{"class":"event","text":"trial last March","attributes":{"date":"last March"}}]`;
|
|
39
|
+
|
|
40
|
+
// === SCHEMA FILE TYPES ===
// Shape of the JSON document accepted by --schema. All fields are optional;
// the corresponding CLI flags override schema-file values when both are given.
interface SchemaFile {
  // Parent class -> child classes; mirrors the --taxonomy flag.
  taxonomy?: Record<string, string[]>;
  // Allowed entity classes; mirrors --classes.
  classes?: string[];
  // Allowed attribute keys; mirrors --attributes.
  attributes?: string[];
  // Attribute key -> allowed enum values; mirrors --attr-values.
  attrValues?: Record<string, string[]>;
  // Relation type names; presence also enables relation extraction.
  relations?: string[];
}
|
|
48
|
+
|
|
49
|
+
// === TAXONOMY HELPERS ===
|
|
50
|
+
function flattenTaxonomy(taxonomy: Record<string, string[]>): string[] {
|
|
51
|
+
const all = new Set<string>();
|
|
52
|
+
for (const [parent, children] of Object.entries(taxonomy)) {
|
|
53
|
+
all.add(parent);
|
|
54
|
+
for (const child of children) all.add(child);
|
|
55
|
+
}
|
|
56
|
+
return [...all];
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
function taxonomyToPrompt(taxonomy: Record<string, string[]>): string {
|
|
60
|
+
const lines = Object.entries(taxonomy)
|
|
61
|
+
.map(([parent, children]) => ` ${parent}: ${children.join(", ")}`)
|
|
62
|
+
.join("\n");
|
|
63
|
+
return `Use the following class hierarchy. Classify at the most specific level.\n${lines}`;
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// === CHUNKING ===
|
|
67
|
+
function chunkText(text: string, maxChars: number): string[] {
|
|
68
|
+
if (text.length <= maxChars) return [text];
|
|
69
|
+
const chunks: string[] = [];
|
|
70
|
+
const sentences = text.split(/(?<=[.!?])\s+/);
|
|
71
|
+
let current = "";
|
|
72
|
+
for (const sentence of sentences) {
|
|
73
|
+
if (current.length + sentence.length > maxChars && current.length > 0) {
|
|
74
|
+
chunks.push(current.trim());
|
|
75
|
+
current = "";
|
|
76
|
+
}
|
|
77
|
+
current += (current ? " " : "") + sentence;
|
|
78
|
+
}
|
|
79
|
+
if (current.trim()) chunks.push(current.trim());
|
|
80
|
+
return chunks;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// === CLI ===
|
|
84
|
+
program
|
|
85
|
+
.name("ner")
|
|
86
|
+
.description(
|
|
87
|
+
"Objectivist-inspired named entity recognition with grammar constraints",
|
|
88
|
+
)
|
|
89
|
+
.argument("[text]", "Text to extract entities from (omit to read from stdin)")
|
|
90
|
+
.option("-c, --classes <list>", "Comma-separated allowed entity classes")
|
|
91
|
+
.option("-a, --attributes <list>", "Comma-separated allowed attribute keys")
|
|
92
|
+
.option(
|
|
93
|
+
"--attr-values <json>",
|
|
94
|
+
'JSON enum map for attributes e.g. {"color":["blue","red"]}',
|
|
95
|
+
)
|
|
96
|
+
.option(
|
|
97
|
+
"--taxonomy <json>",
|
|
98
|
+
'Class hierarchy JSON e.g. {"organism":["animal","plant"]}',
|
|
99
|
+
)
|
|
100
|
+
.option("--relations", "Extract relations between entities")
|
|
101
|
+
.option("--resolve", "Resolve coreferences (group mentions of same entity)")
|
|
102
|
+
.option("--include-confidence", "Include confidence scores per entity")
|
|
103
|
+
.option("--detect-negation", "Detect negated/hypothetical entities")
|
|
104
|
+
.option("--schema <path>", "Load entity schema definition from a JSON file")
|
|
105
|
+
.option(
|
|
106
|
+
"--file <path>",
|
|
107
|
+
"Read input from a file (with chunking for long docs)",
|
|
108
|
+
)
|
|
109
|
+
.option(
|
|
110
|
+
"--batch <path>",
|
|
111
|
+
"Process JSONL file (one text per line) or directory of .txt files",
|
|
112
|
+
)
|
|
113
|
+
.option(
|
|
114
|
+
"--system-prompt <string>",
|
|
115
|
+
"Replace the built-in system prompt entirely",
|
|
116
|
+
)
|
|
117
|
+
.option(
|
|
118
|
+
"--system-prompt-append <string>",
|
|
119
|
+
"Append to the built-in system prompt",
|
|
120
|
+
)
|
|
121
|
+
.option("-m, --model <uri>", "Model URI or path to GGUF file")
|
|
122
|
+
.option("--fast", "Use smallest model (0.8B) -- quick, simple text only")
|
|
123
|
+
.option(
|
|
124
|
+
"--balanced",
|
|
125
|
+
"Use mid-size model (2B) -- good accuracy/speed tradeoff",
|
|
126
|
+
)
|
|
127
|
+
.option("--best", "Use largest model (4B) -- best accuracy (default)")
|
|
128
|
+
.option("--compact", "Output compact JSON (also auto-enabled for non-TTY)")
|
|
129
|
+
.addHelpText(
|
|
130
|
+
"after",
|
|
131
|
+
`
|
|
132
|
+
Examples:
|
|
133
|
+
fastner "the cat is blue"
|
|
134
|
+
fastner --fast "simple short text"
|
|
135
|
+
fastner "John works at Google" --classes person,organization
|
|
136
|
+
fastner "sky is blue" --attr-values '{"color":["blue","red"]}'
|
|
137
|
+
fastner --relations "Dr. Chen works at MIT"
|
|
138
|
+
fastner --resolve "Dr. Chen published a paper. She won an award."
|
|
139
|
+
fastner --detect-negation "The patient does not have diabetes"
|
|
140
|
+
fastner --schema schema.json "complex text"
|
|
141
|
+
fastner --file document.txt
|
|
142
|
+
fastner --batch inputs.jsonl
|
|
143
|
+
echo "the cat is blue" | fastner
|
|
144
|
+
`,
|
|
145
|
+
)
|
|
146
|
+
.parse();
|
|
147
|
+
|
|
148
|
+
const opts = program.opts();
|
|
149
|
+
|
|
150
|
+
// === VALIDATIONS ===
// The tier flags each select exactly one model, so at most one may be set.
const tierFlags = [opts.fast, opts.balanced, opts.best].filter(Boolean).length;
if (tierFlags > 1) {
  console.error(
    "Error: --fast, --balanced, and --best are mutually exclusive.",
  );
  process.exit(1);
}

// Replacing the system prompt and appending to it are contradictory intents,
// so reject the combination rather than silently picking one.
if (opts.systemPrompt && opts.systemPromptAppend) {
  console.error(
    "Error: --system-prompt and --system-prompt-append are mutually exclusive.",
  );
  process.exit(1);
}
|
|
165
|
+
|
|
166
|
+
// === LOAD SCHEMA FILE ===
// Parse the optional --schema JSON document (see SchemaFile above). Exits on
// read/parse failure rather than continuing with a half-applied ontology.
let schemaFile: SchemaFile | undefined;
if (opts.schema) {
  try {
    const raw = fs.readFileSync(opts.schema, "utf-8");
    schemaFile = JSON.parse(raw);
  } catch (e) {
    console.error(`Error: Failed to load schema file: ${(e as Error).message}`);
    process.exit(1);
  }
}
|
|
177
|
+
|
|
178
|
+
// === MERGE OPTIONS (CLI flags override schema file) ===
// Allowed entity classes: --classes (comma list) wins; else schema "classes".
const allowedClasses: string[] | undefined = opts.classes
  ? opts.classes.split(",").map((s: string) => s.trim())
  : schemaFile?.classes;

// Allowed attribute keys: --attributes wins; else schema "attributes".
const allowedAttrs: string[] | undefined = opts.attributes
  ? opts.attributes.split(",").map((s: string) => s.trim())
  : schemaFile?.attributes;

// Attribute-value enums: --attr-values (inline JSON) wins; else schema
// "attrValues". Invalid JSON is a hard error.
let attrValuesMap: Record<string, string[]> | undefined;
if (opts.attrValues) {
  try {
    attrValuesMap = JSON.parse(opts.attrValues);
  } catch (e) {
    console.error(
      `Error: Invalid JSON for --attr-values: ${(e as Error).message}`,
    );
    process.exit(1);
  }
} else if (schemaFile?.attrValues) {
  attrValuesMap = schemaFile.attrValues;
}

// Class hierarchy: --taxonomy (inline JSON) wins; else schema "taxonomy".
let taxonomy: Record<string, string[]> | undefined;
if (opts.taxonomy) {
  try {
    taxonomy = JSON.parse(opts.taxonomy);
  } catch (e) {
    console.error(
      `Error: Invalid JSON for --taxonomy: ${(e as Error).message}`,
    );
    process.exit(1);
  }
} else if (schemaFile?.taxonomy) {
  taxonomy = schemaFile.taxonomy;
}
|
|
214
|
+
|
|
215
|
+
// Relation extraction is enabled by --relations OR by a schema file that
// declares relation types; the schema's list (if any) also constrains the
// allowed relation names.
const enableRelations = opts.relations || !!schemaFile?.relations;
const relationTypes: string[] | undefined = schemaFile?.relations || undefined;
// The remaining feature toggles come only from CLI flags.
const enableResolve = !!opts.resolve;
const enableConfidence = !!opts.includeConfidence;
const enableNegation = !!opts.detectNegation;
|
|
220
|
+
|
|
221
|
+
// === READ INPUT ===
// Collect one or more input texts, in priority order: --batch (JSONL file or
// directory of .txt), --file (chunked long document), positional argument,
// then stdin. Uses top-level await, so this module must run as ESM.
let inputTexts: string[] = [];

if (opts.batch) {
  const batchPath = opts.batch as string;
  const stat = fs.statSync(batchPath);
  if (stat.isDirectory()) {
    // Directory mode: every .txt file becomes one input text.
    const files = fs
      .readdirSync(batchPath)
      .filter((f: string) => f.endsWith(".txt"));
    inputTexts = files.map((f: string) =>
      fs.readFileSync(path.join(batchPath, f), "utf-8").trim(),
    );
  } else {
    // JSONL mode: each line may be a JSON string, an object with a "text"
    // field, or plain text (the raw line is the fallback when JSON parsing
    // fails or the object has no "text").
    const content = fs.readFileSync(batchPath, "utf-8").trim();
    inputTexts = content.split("\n").map((line: string) => {
      try {
        const parsed = JSON.parse(line);
        return typeof parsed === "string" ? parsed : parsed.text || line;
      } catch {
        return line;
      }
    });
  }
} else if (opts.file) {
  // Long documents are split into ~2000-char sentence-aligned chunks.
  const content = fs.readFileSync(opts.file, "utf-8").trim();
  inputTexts = chunkText(content, 2000);
} else {
  let text = program.args[0];
  if (!text) {
    // No positional argument: consume all of stdin.
    const chunks: Buffer[] = [];
    for await (const chunk of process.stdin) {
      chunks.push(chunk as Buffer);
    }
    text = Buffer.concat(chunks).toString().trim();
  }
  if (!text) {
    console.error(
      "Error: No input text provided. Pass as argument, --file, --batch, or stdin.",
    );
    process.exit(1);
  }
  inputTexts = [text];
}
|
|
265
|
+
|
|
266
|
+
// === RESOLVE MODEL ===
|
|
267
|
+
const modelUri = opts.model
|
|
268
|
+
? opts.model
|
|
269
|
+
: opts.fast
|
|
270
|
+
? MODELS.fast
|
|
271
|
+
: opts.balanced
|
|
272
|
+
? MODELS.balanced
|
|
273
|
+
: MODELS.best;
|
|
274
|
+
|
|
275
|
+
const modelsDir = path.join(os.homedir(), ".fastner", "models");
|
|
276
|
+
const modelPath = await resolveModelFile(modelUri, modelsDir);
|
|
277
|
+
|
|
278
|
+
const llama = await getLlama();
|
|
279
|
+
const model = await llama.loadModel({ modelPath });
|
|
280
|
+
const context = await model.createContext();
|
|
281
|
+
|
|
282
|
+
// === BUILD SYSTEM PROMPT ===
|
|
283
|
+
function buildSystemPrompt(): string {
|
|
284
|
+
if (opts.systemPrompt) return opts.systemPrompt;
|
|
285
|
+
|
|
286
|
+
let base = SYSTEM_PROMPT;
|
|
287
|
+
|
|
288
|
+
if (enableNegation) {
|
|
289
|
+
base += `\n- Every entity has a top-level "assertion" field: "present", "negated", or "hypothetical". "negated" means the text explicitly denies it (e.g. "does not have"). "hypothetical" means it is speculative (e.g. "might develop").`;
|
|
290
|
+
}
|
|
291
|
+
if (enableConfidence) {
|
|
292
|
+
base += `\n- Every entity has a top-level "confidence" field: "low", "medium", or "high".`;
|
|
293
|
+
}
|
|
294
|
+
if (enableResolve) {
|
|
295
|
+
base += `\n- Every entity has a top-level "entity_id" field. If multiple text spans refer to the same real-world entity (e.g. "Dr. Chen" and "she"), they share the same entity_id. Use short IDs like "e1", "e2".`;
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
let prompt = `${base}\n\n${FEW_SHOT_EXAMPLES}`;
|
|
299
|
+
|
|
300
|
+
if (opts.systemPromptAppend) {
|
|
301
|
+
prompt += `\n\n${opts.systemPromptAppend}`;
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
return prompt;
|
|
305
|
+
}
|
|
306
|
+
|
|
307
|
+
// === BUILD GRAMMAR SCHEMA ===
|
|
308
|
+
function buildGrammarSchema() {
|
|
309
|
+
// Determine allowed classes from taxonomy or explicit list
|
|
310
|
+
const classEnum = taxonomy ? flattenTaxonomy(taxonomy) : allowedClasses;
|
|
311
|
+
|
|
312
|
+
const attributesSchema: any = {
|
|
313
|
+
type: "object",
|
|
314
|
+
additionalProperties: { type: "string" },
|
|
315
|
+
};
|
|
316
|
+
|
|
317
|
+
const properties: any = {
|
|
318
|
+
class: {
|
|
319
|
+
type: "string",
|
|
320
|
+
...(classEnum && { enum: classEnum }),
|
|
321
|
+
},
|
|
322
|
+
text: { type: "string" },
|
|
323
|
+
attributes: attributesSchema,
|
|
324
|
+
};
|
|
325
|
+
const required: string[] = ["class", "text"];
|
|
326
|
+
|
|
327
|
+
// Grammar-enforced fields for enabled features.
|
|
328
|
+
// These are top-level entity properties (not inside attributes)
|
|
329
|
+
// so the grammar can enforce them as required on every entity.
|
|
330
|
+
if (enableNegation) {
|
|
331
|
+
properties.assertion = {
|
|
332
|
+
type: "string",
|
|
333
|
+
enum: ["present", "negated", "hypothetical"],
|
|
334
|
+
};
|
|
335
|
+
required.push("assertion");
|
|
336
|
+
}
|
|
337
|
+
if (enableConfidence) {
|
|
338
|
+
properties.confidence = {
|
|
339
|
+
type: "string",
|
|
340
|
+
enum: ["low", "medium", "high"],
|
|
341
|
+
};
|
|
342
|
+
required.push("confidence");
|
|
343
|
+
}
|
|
344
|
+
if (enableResolve) {
|
|
345
|
+
properties.entity_id = { type: "string" };
|
|
346
|
+
required.push("entity_id");
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
const schema: any = {
|
|
350
|
+
type: "array",
|
|
351
|
+
items: {
|
|
352
|
+
type: "object",
|
|
353
|
+
properties,
|
|
354
|
+
required,
|
|
355
|
+
additionalProperties: false,
|
|
356
|
+
},
|
|
357
|
+
};
|
|
358
|
+
|
|
359
|
+
return schema;
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
// === BUILD RELATIONS SCHEMA ===
|
|
363
|
+
function buildRelationsSchema() {
|
|
364
|
+
const relSchema: any = {
|
|
365
|
+
type: "object",
|
|
366
|
+
properties: {
|
|
367
|
+
entities: buildGrammarSchema(),
|
|
368
|
+
relations: {
|
|
369
|
+
type: "array",
|
|
370
|
+
items: {
|
|
371
|
+
type: "object",
|
|
372
|
+
properties: {
|
|
373
|
+
source: { type: "string" },
|
|
374
|
+
target: { type: "string" },
|
|
375
|
+
relation: {
|
|
376
|
+
type: "string",
|
|
377
|
+
...(relationTypes && { enum: relationTypes }),
|
|
378
|
+
},
|
|
379
|
+
},
|
|
380
|
+
required: ["source", "target", "relation"],
|
|
381
|
+
additionalProperties: false,
|
|
382
|
+
},
|
|
383
|
+
},
|
|
384
|
+
},
|
|
385
|
+
required: ["entities", "relations"],
|
|
386
|
+
additionalProperties: false,
|
|
387
|
+
};
|
|
388
|
+
return relSchema;
|
|
389
|
+
}
|
|
390
|
+
|
|
391
|
+
// === BUILD PROMPT CONSTRAINTS ===
|
|
392
|
+
function buildConstraints(): string {
|
|
393
|
+
let constraints = "";
|
|
394
|
+
|
|
395
|
+
if (taxonomy) {
|
|
396
|
+
constraints += `\n${taxonomyToPrompt(taxonomy)}`;
|
|
397
|
+
} else if (allowedClasses) {
|
|
398
|
+
constraints += `\nAllowed entity classes: ${allowedClasses.join(", ")}. Only use these classes.`;
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
if (attrValuesMap) {
|
|
402
|
+
const desc = Object.entries(attrValuesMap)
|
|
403
|
+
.map(([k, v]) => `${k}: ${v.join(", ")}`)
|
|
404
|
+
.join("; ");
|
|
405
|
+
constraints += `\nOnly use these attribute keys and values: ${desc}. Omit attributes that don't apply to an entity.`;
|
|
406
|
+
} else if (allowedAttrs) {
|
|
407
|
+
constraints += `\nOnly use these attribute keys: ${allowedAttrs.join(", ")}. Omit attributes that don't apply to an entity.`;
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
if (enableNegation) {
|
|
411
|
+
constraints += `\nEvery entity has an "assertion" field (not in attributes). Example: [{"class":"disease","text":"diabetes","assertion":"present","attributes":{}},{"class":"disease","text":"cancer","assertion":"negated","attributes":{}}]`;
|
|
412
|
+
}
|
|
413
|
+
if (enableConfidence) {
|
|
414
|
+
constraints += `\nEvery entity has a "confidence" field (not in attributes). Example: [{"class":"person","text":"John","confidence":"high","attributes":{}}]`;
|
|
415
|
+
}
|
|
416
|
+
if (enableResolve) {
|
|
417
|
+
constraints += `\nEvery entity has an "entity_id" field (not in attributes). Coreferent mentions share the same entity_id. Example: [{"class":"person","text":"Dr. Chen","entity_id":"e1","attributes":{}},{"class":"person","text":"She","entity_id":"e1","attributes":{}}]`;
|
|
418
|
+
}
|
|
419
|
+
if (enableRelations) {
|
|
420
|
+
constraints += `\nAlso extract relations between entities. Return {"entities": [...], "relations": [{"source": "entity text", "target": "entity text", "relation": "relation type"}]}.`;
|
|
421
|
+
if (relationTypes) {
|
|
422
|
+
constraints += ` Allowed relation types: ${relationTypes.join(", ")}.`;
|
|
423
|
+
}
|
|
424
|
+
}
|
|
425
|
+
|
|
426
|
+
return constraints;
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
// === PROCESS A SINGLE TEXT ===
|
|
430
|
+
async function processText(
|
|
431
|
+
inputText: string,
|
|
432
|
+
session: LlamaChatSession,
|
|
433
|
+
): Promise<any> {
|
|
434
|
+
const constraints = buildConstraints();
|
|
435
|
+
const prompt = `Extract all named entities from the following text.${constraints}\n\nText: ${inputText}`;
|
|
436
|
+
|
|
437
|
+
const schema = enableRelations
|
|
438
|
+
? buildRelationsSchema()
|
|
439
|
+
: buildGrammarSchema();
|
|
440
|
+
const grammar = await llama.createGrammarForJsonSchema(schema);
|
|
441
|
+
|
|
442
|
+
const res = await session.prompt(prompt, { grammar });
|
|
443
|
+
|
|
444
|
+
let parsed: any;
|
|
445
|
+
try {
|
|
446
|
+
parsed = grammar.parse(res);
|
|
447
|
+
} catch {
|
|
448
|
+
try {
|
|
449
|
+
parsed = JSON.parse(res.trim());
|
|
450
|
+
} catch {
|
|
451
|
+
console.error(
|
|
452
|
+
"Warning: Failed to parse model output. Raw response:",
|
|
453
|
+
res,
|
|
454
|
+
);
|
|
455
|
+
parsed = enableRelations ? { entities: [], relations: [] } : [];
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
|
|
459
|
+
return parsed;
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
// === MAIN ===
|
|
463
|
+
const systemPrompt = buildSystemPrompt();
|
|
464
|
+
const compact = opts.compact || !process.stdout.isTTY;
|
|
465
|
+
|
|
466
|
+
const contextSequence = context.getSequence();
|
|
467
|
+
|
|
468
|
+
if (inputTexts.length === 1) {
|
|
469
|
+
const session = new LlamaChatSession({
|
|
470
|
+
contextSequence,
|
|
471
|
+
systemPrompt,
|
|
472
|
+
});
|
|
473
|
+
const result = await processText(inputTexts[0]!, session);
|
|
474
|
+
console.log(JSON.stringify(result, null, compact ? 0 : 2));
|
|
475
|
+
} else {
|
|
476
|
+
// Batch / chunked: process each text, collect results
|
|
477
|
+
const allResults: any[] = [];
|
|
478
|
+
for (const inputText of inputTexts) {
|
|
479
|
+
// Erase context and create fresh session for each input
|
|
480
|
+
await contextSequence.eraseContextTokenRanges([
|
|
481
|
+
{ start: 0, end: contextSequence.nextTokenIndex },
|
|
482
|
+
]);
|
|
483
|
+
const session = new LlamaChatSession({
|
|
484
|
+
contextSequence,
|
|
485
|
+
systemPrompt,
|
|
486
|
+
});
|
|
487
|
+
const result = await processText(inputText, session);
|
|
488
|
+
allResults.push(result);
|
|
489
|
+
}
|
|
490
|
+
|
|
491
|
+
if (opts.file) {
|
|
492
|
+
// Merge chunked results into one
|
|
493
|
+
if (enableRelations) {
|
|
494
|
+
const merged = {
|
|
495
|
+
entities: allResults.flatMap((r) => r.entities || []),
|
|
496
|
+
relations: allResults.flatMap((r) => r.relations || []),
|
|
497
|
+
};
|
|
498
|
+
console.log(JSON.stringify(merged, null, compact ? 0 : 2));
|
|
499
|
+
} else {
|
|
500
|
+
const merged = allResults.flat();
|
|
501
|
+
console.log(JSON.stringify(merged, null, compact ? 0 : 2));
|
|
502
|
+
}
|
|
503
|
+
} else {
|
|
504
|
+
// Batch: output one result per line (JSONL)
|
|
505
|
+
for (const result of allResults) {
|
|
506
|
+
console.log(JSON.stringify(result));
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
|
|
511
|
+
// === CLEANUP ===
// Bun segfaults if process.exit() triggers synchronous native addon unloading.
// Setting exitCode lets the event loop drain naturally, avoiding the crash.
// NOTE(review): this relies on nothing keeping the event loop alive after
// this point (no open handles/timers) -- otherwise the process would hang.
process.exitCode = 0;
|
package/package.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "objectivist-ner",
|
|
3
|
+
"version": "0.0.0",
|
|
4
|
+
"description": "Objectivist-inspired Named Entity Recognition with grammar-constrained LLM output",
|
|
5
|
+
"bin": {
|
|
6
|
+
"ner": "index.ts"
|
|
7
|
+
},
|
|
8
|
+
"module": "index.ts",
|
|
9
|
+
"type": "module",
|
|
10
|
+
"license": "MIT",
|
|
11
|
+
"author": "Richard Anaya",
|
|
12
|
+
  "repository": "github:richardanaya/objectivist-ner",
|
|
13
|
+
"devDependencies": {
|
|
14
|
+
"@types/bun": "latest",
|
|
15
|
+
"prettier": "^3.8.1"
|
|
16
|
+
},
|
|
17
|
+
"peerDependencies": {
|
|
18
|
+
"typescript": "^5"
|
|
19
|
+
},
|
|
20
|
+
"dependencies": {
|
|
21
|
+
"commander": "^14.0.3",
|
|
22
|
+
"node-llama-cpp": "^3.17.1"
|
|
23
|
+
}
|
|
24
|
+
}
|