objectivist-ner 0.0.0 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +260 -241
- package/index.ts +155 -35
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,83 +1,139 @@
|
|
|
1
1
|
# objectivist-ner
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Most Named Entity Recognition tools treat language as a bag of words to be statistically tagged.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
This tool takes a different approach.
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
It is built on the Objectivist recognition that concepts are not arbitrary labels — they are integrations of observed reality, formed by identifying essential characteristics and omitting measurements. A valid concept must be grounded in percepts, organized hierarchically, and maintain identity across contexts.
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
That is why `objectivist-ner` emphasizes:
|
|
10
10
|
|
|
11
|
-
- Exact
|
|
12
|
-
-
|
|
13
|
-
-
|
|
14
|
-
-
|
|
15
|
-
-
|
|
16
|
-
- Coreference resolution (group mentions of the same entity)
|
|
17
|
-
- Negation and modality detection
|
|
18
|
-
- Confidence scores
|
|
19
|
-
- Schema definition files for reusable ontologies
|
|
20
|
-
- Long document chunking with `--file`
|
|
21
|
-
- Batch processing with `--batch`
|
|
22
|
-
- Three built-in model tiers: `--fast`, `--balanced`, `--best`
|
|
23
|
-
- Reads from argument, file, or stdin
|
|
24
|
-
- Compact JSON output for non-TTY / piping
|
|
11
|
+
- **Exact entity spans** — because a concept must refer to something specific in reality
|
|
12
|
+
- **Hierarchical classification** — because proper concept formation requires understanding genus and differentia
|
|
13
|
+
- **Negation detection** — because the relationship of a concept to existence is epistemologically essential
|
|
14
|
+
- **Coreference resolution** — because the law of identity demands we recognize the same existent across multiple descriptions
|
|
15
|
+
- **Relations** — because concepts do not exist in isolation, they integrate into propositions
|
|
25
16
|
|
|
26
|
-
|
|
17
|
+
It runs completely locally using a small language model. No API keys. No data leaves your machine.
|
|
18
|
+
|
|
19
|
+
## What Makes This Different
|
|
20
|
+
|
|
21
|
+
### 1. Assertion vs Negation vs Hypothetical
|
|
22
|
+
|
|
23
|
+
> "The patient has diabetes but does not have cancer. He might develop hypertension."
|
|
24
|
+
|
|
25
|
+
**Typical NER** sees three diseases. **objectivist-ner** sees three different relationships to reality:
|
|
27
26
|
|
|
28
27
|
```bash
|
|
29
|
-
|
|
30
|
-
|
|
28
|
+
ner --detect-negation "The patient has diabetes but does not have cancer. He might develop hypertension."
|
|
29
|
+
```
|
|
31
30
|
|
|
32
|
-
|
|
33
|
-
|
|
31
|
+
```json
|
|
32
|
+
[
|
|
33
|
+
{ "class": "disease", "text": "diabetes", "assertion": "present" },
|
|
34
|
+
{ "class": "disease", "text": "cancer", "assertion": "negated" },
|
|
35
|
+
{ "class": "disease", "text": "hypertension", "assertion": "hypothetical" }
|
|
36
|
+
]
|
|
34
37
|
```
|
|
35
38
|
|
|
36
|
-
|
|
39
|
+
The `assertion` field tells you whether the text claims something is **present**, **negated**, or **hypothetical**.
|
|
40
|
+
|
|
41
|
+
### 2. Identity Across References
|
|
42
|
+
|
|
43
|
+
> "Dr. Chen published a paper. She later won the Nobel Prize. The neurologist was celebrated."
|
|
37
44
|
|
|
38
|
-
|
|
45
|
+
**Typical NER** sees three separate people. **objectivist-ner** knows they are the same person:
|
|
39
46
|
|
|
40
47
|
```bash
|
|
41
|
-
|
|
42
|
-
|
|
48
|
+
ner --resolve "Dr. Chen published a paper. She later won the Nobel Prize. The neurologist was celebrated."
|
|
49
|
+
```
|
|
43
50
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
51
|
+
```json
|
|
52
|
+
[
|
|
53
|
+
{
|
|
54
|
+
"class": "person",
|
|
55
|
+
"text": "Dr. Chen",
|
|
56
|
+
"entity_id": "e1",
|
|
57
|
+
"is_canonical": true
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
"class": "person",
|
|
61
|
+
"text": "She",
|
|
62
|
+
"entity_id": "e1",
|
|
63
|
+
"is_canonical": false
|
|
64
|
+
},
|
|
65
|
+
{
|
|
66
|
+
"class": "person",
|
|
67
|
+
"text": "The neurologist",
|
|
68
|
+
"entity_id": "e1",
|
|
69
|
+
"is_canonical": false
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
"class": "event",
|
|
73
|
+
"text": "the Nobel Prize",
|
|
74
|
+
"entity_id": "e2",
|
|
75
|
+
"is_canonical": true
|
|
76
|
+
}
|
|
77
|
+
]
|
|
48
78
|
```
|
|
49
79
|
|
|
50
|
-
|
|
80
|
+
`entity_id` groups coreferent mentions. `is_canonical` marks the most specific reference.
|
|
51
81
|
|
|
52
|
-
|
|
53
|
-
# Restrict entity classes
|
|
54
|
-
ner "John works at Google" --classes person,organization
|
|
82
|
+
### 3. Hierarchical Classification
|
|
55
83
|
|
|
56
|
-
|
|
57
|
-
ner "Alice is sad in Paris" --attributes emotional_state,location
|
|
84
|
+
Define your ontology as a tree with mixed arrays (leaf nodes) and objects (nested hierarchies):
|
|
58
85
|
|
|
59
|
-
|
|
60
|
-
|
|
86
|
+
```
|
|
87
|
+
organism
|
|
88
|
+
├── person
|
|
89
|
+
└── animal
|
|
90
|
+
├── dog
|
|
91
|
+
└── cat
|
|
92
|
+
|
|
93
|
+
idea
|
|
94
|
+
├── dream
|
|
95
|
+
└── principle
|
|
96
|
+
```
|
|
61
97
|
|
|
62
|
-
|
|
63
|
-
ner "
|
|
64
|
-
|
|
98
|
+
```bash
|
|
99
|
+
ner --taxonomy '{"organism":["person",{"animal":["dog","cat"]}],"idea":["dream","principle"]}' \
|
|
100
|
+
"The child recounted a vivid dream about the golden retriever."
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
```json
|
|
104
|
+
[
|
|
105
|
+
{
|
|
106
|
+
"class": "person",
|
|
107
|
+
"text": "The child",
|
|
108
|
+
"taxonomyPath": ["organism", "person"]
|
|
109
|
+
},
|
|
110
|
+
{
|
|
111
|
+
"class": "dream",
|
|
112
|
+
"text": "a vivid dream",
|
|
113
|
+
"taxonomyPath": ["idea", "dream"]
|
|
114
|
+
},
|
|
115
|
+
{
|
|
116
|
+
"class": "dog",
|
|
117
|
+
"text": "the golden retriever",
|
|
118
|
+
"taxonomyPath": ["organism", "animal", "dog"]
|
|
119
|
+
}
|
|
120
|
+
]
|
|
65
121
|
```
|
|
66
122
|
|
|
67
|
-
|
|
123
|
+
The model classifies at the most specific (leaf) level, and `taxonomyPath` preserves the full hierarchy.
|
|
124
|
+
|
|
125
|
+
### 4. Conceptual Integration (Relations)
|
|
68
126
|
|
|
69
127
|
```bash
|
|
70
128
|
ner --relations "Dr. Chen works at MIT and collaborates with Prof. Wright"
|
|
71
129
|
```
|
|
72
130
|
|
|
73
|
-
Output:
|
|
74
|
-
|
|
75
131
|
```json
|
|
76
132
|
{
|
|
77
133
|
"entities": [
|
|
78
|
-
{ "class": "person", "text": "Dr. Chen"
|
|
79
|
-
{ "class": "organization", "text": "MIT"
|
|
80
|
-
{ "class": "person", "text": "Prof. Wright"
|
|
134
|
+
{ "class": "person", "text": "Dr. Chen" },
|
|
135
|
+
{ "class": "organization", "text": "MIT" },
|
|
136
|
+
{ "class": "person", "text": "Prof. Wright" }
|
|
81
137
|
],
|
|
82
138
|
"relations": [
|
|
83
139
|
{ "source": "Dr. Chen", "target": "MIT", "relation": "works at" },
|
|
@@ -90,266 +146,229 @@ Output:
|
|
|
90
146
|
}
|
|
91
147
|
```
|
|
92
148
|
|
|
93
|
-
|
|
149
|
+
Relations show how entities connect — extracting the "connective tissue" between concepts.
|
|
150
|
+
|
|
151
|
+
You can also categorize relations by class:
|
|
94
152
|
|
|
95
153
|
```bash
|
|
96
|
-
ner --
|
|
154
|
+
ner --relations --relation-classes "employment,location,causal,professional" \
|
|
155
|
+
"Dr. Chen works at MIT and collaborates with Prof. Wright"
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
```json
|
|
159
|
+
{
|
|
160
|
+
"entities": [
|
|
161
|
+
{ "class": "person", "text": "Dr. Chen" },
|
|
162
|
+
{ "class": "organization", "text": "MIT" },
|
|
163
|
+
{ "class": "person", "text": "Prof. Wright" }
|
|
164
|
+
],
|
|
165
|
+
"relations": [
|
|
166
|
+
{
|
|
167
|
+
"source": "Dr. Chen",
|
|
168
|
+
"target": "MIT",
|
|
169
|
+
"relation": "works at",
|
|
170
|
+
"class": "employment"
|
|
171
|
+
},
|
|
172
|
+
{
|
|
173
|
+
"source": "Dr. Chen",
|
|
174
|
+
"target": "Prof. Wright",
|
|
175
|
+
"relation": "collaborates with",
|
|
176
|
+
"class": "professional"
|
|
177
|
+
}
|
|
178
|
+
]
|
|
179
|
+
}
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
The `class` field categorizes the relation type (e.g., employment, causal, spatial), allowing you to group and analyze connections by category.
|
|
183
|
+
|
|
184
|
+
## Installation
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
bun install -g objectivist-ner
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Quick Start
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
# Basic extraction
|
|
194
|
+
ner "the cat is blue and is feeling sad"
|
|
195
|
+
|
|
196
|
+
# Choose quality vs speed
|
|
197
|
+
ner --fast "simple text"
|
|
198
|
+
ner --balanced "moderate text"
|
|
199
|
+
ner --best "complex text"
|
|
97
200
|
```
|
|
98
201
|
|
|
99
|
-
|
|
202
|
+
## Usage Examples
|
|
203
|
+
|
|
204
|
+
### Constrain entity classes
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
ner "John works at Google" --classes person,organization
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
```json
|
|
211
|
+
[
|
|
212
|
+
{ "class": "person", "text": "John" },
|
|
213
|
+
{ "class": "organization", "text": "Google" }
|
|
214
|
+
]
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
### Constrain attribute keys
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
ner "Alice is sad in Paris" --attributes emotional_state,location
|
|
221
|
+
```
|
|
100
222
|
|
|
101
223
|
```json
|
|
102
224
|
[
|
|
103
225
|
{
|
|
104
226
|
"class": "person",
|
|
105
|
-
"text": "
|
|
106
|
-
"attributes": {}
|
|
107
|
-
"entity_id": "e1"
|
|
108
|
-
},
|
|
109
|
-
{ "class": "person", "text": "She", "attributes": {}, "entity_id": "e1" },
|
|
110
|
-
{
|
|
111
|
-
"class": "person",
|
|
112
|
-
"text": "The neurologist",
|
|
113
|
-
"attributes": {},
|
|
114
|
-
"entity_id": "e1"
|
|
115
|
-
},
|
|
116
|
-
{
|
|
117
|
-
"class": "event",
|
|
118
|
-
"text": "the Nobel Prize",
|
|
119
|
-
"attributes": {},
|
|
120
|
-
"entity_id": "e2"
|
|
227
|
+
"text": "Alice",
|
|
228
|
+
"attributes": { "emotional_state": "sad", "location": "Paris" }
|
|
121
229
|
}
|
|
122
230
|
]
|
|
123
231
|
```
|
|
124
232
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
### Negation detection
|
|
233
|
+
### Constrain attribute values
|
|
128
234
|
|
|
129
235
|
```bash
|
|
130
|
-
ner
|
|
236
|
+
ner "The sky is blue" --attr-values '{"color":["blue","red","green"]}'
|
|
131
237
|
```
|
|
132
238
|
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
239
|
+
```json
|
|
240
|
+
[
|
|
241
|
+
{
|
|
242
|
+
"class": "object",
|
|
243
|
+
"text": "sky",
|
|
244
|
+
"attributes": { "color": "blue" }
|
|
245
|
+
}
|
|
246
|
+
]
|
|
137
247
|
```
|
|
138
248
|
|
|
139
|
-
### Schema
|
|
249
|
+
### Schema files
|
|
140
250
|
|
|
141
|
-
Define your ontology
|
|
251
|
+
Define your ontology once and reuse it:
|
|
142
252
|
|
|
143
253
|
```json
|
|
144
254
|
{
|
|
145
255
|
"taxonomy": {
|
|
146
256
|
"organism": ["person", "animal"],
|
|
147
|
-
"
|
|
148
|
-
"institution": ["company", "university", "government_agency"]
|
|
257
|
+
"animal": ["dog", "cat"]
|
|
149
258
|
},
|
|
150
|
-
"attributes": ["role", "
|
|
151
|
-
"relations": ["works_at", "
|
|
259
|
+
"attributes": ["role", "location"],
|
|
260
|
+
"relations": ["works_at", "collaborates_with"]
|
|
152
261
|
}
|
|
153
262
|
```
|
|
154
263
|
|
|
155
264
|
```bash
|
|
156
|
-
ner --schema
|
|
265
|
+
ner --schema ontology.json "Dr. Chen works at MIT"
|
|
157
266
|
```
|
|
158
267
|
|
|
159
|
-
Schema files support `taxonomy`, `classes`, `attributes`, `attrValues`, and `relations`. CLI flags override schema file values.
|
|
160
|
-
|
|
161
268
|
### File and batch processing
|
|
162
269
|
|
|
163
270
|
```bash
|
|
164
271
|
# Process a long document (auto-chunked)
|
|
165
272
|
ner --file document.txt
|
|
166
273
|
|
|
167
|
-
# Process a JSONL file
|
|
274
|
+
# Process a JSONL file
|
|
168
275
|
ner --batch inputs.jsonl
|
|
169
276
|
|
|
170
277
|
# Process a directory of .txt files
|
|
171
278
|
ner --batch ./documents/
|
|
172
279
|
```
|
|
173
280
|
|
|
174
|
-
###
|
|
281
|
+
### Read from stdin
|
|
175
282
|
|
|
176
283
|
```bash
|
|
177
|
-
# Append to the built-in system prompt
|
|
178
|
-
ner "text" --system-prompt-append "Focus only on emotions"
|
|
179
|
-
|
|
180
|
-
# Replace the system prompt entirely
|
|
181
|
-
ner "text" --system-prompt "You are a custom extractor."
|
|
182
|
-
|
|
183
|
-
# Read from stdin
|
|
184
284
|
echo "the cat is blue" | ner
|
|
185
|
-
|
|
186
|
-
# Compact JSON output
|
|
187
|
-
ner "the cat is blue" --compact
|
|
285
|
+
cat article.txt | ner --detect-negation
|
|
188
286
|
```
|
|
189
287
|
|
|
190
|
-
##
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
|
195
|
-
|
|
|
196
|
-
| `--
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
|
205
|
-
|
|
|
206
|
-
| `--
|
|
207
|
-
|
|
|
208
|
-
|
|
|
209
|
-
|
|
|
210
|
-
|
|
|
211
|
-
| `--
|
|
212
|
-
| `--
|
|
213
|
-
| `--
|
|
214
|
-
| `--
|
|
215
|
-
| `--include-confidence` |
|
|
216
|
-
| `--
|
|
217
|
-
| `--
|
|
218
|
-
| `--
|
|
219
|
-
| `--
|
|
220
|
-
| `--system-prompt <string>`
|
|
221
|
-
| `--
|
|
222
|
-
|
|
|
223
|
-
| `-m, --model <uri>` | Use any GGUF model (see below) |
|
|
288
|
+
## Model Tiers
|
|
289
|
+
|
|
290
|
+
| Flag | Size | Download | Best for |
|
|
291
|
+
| ------------ | ------ | -------- | ------------------------------- |
|
|
292
|
+
| `--fast` | Small | ~0.9 GB | Simple text, single entities |
|
|
293
|
+
| `--balanced` | Medium | ~2.3 GB | Moderate complexity, most tasks |
|
|
294
|
+
| `--best` | Large | ~4.5 GB | Dense text, rare entity types |
|
|
295
|
+
|
|
296
|
+
`--best` is the default. See [Benchmarks](#benchmarks).
|
|
297
|
+
|
|
298
|
+
## Options Reference
|
|
299
|
+
|
|
300
|
+
| Flag | Description |
|
|
301
|
+
| --------------------------------- | -------------------------------------------------- |
|
|
302
|
+
| `--fast` | Use smallest model |
|
|
303
|
+
| `--balanced` | Use mid-size model |
|
|
304
|
+
| `--best` | Use largest model (default) |
|
|
305
|
+
| `-c, --classes <list>` | Allowed entity classes |
|
|
306
|
+
| `-a, --attributes <list>` | Allowed attribute keys |
|
|
307
|
+
| `--attr-values <json>` | Enum map for attribute values |
|
|
308
|
+
| `--taxonomy <json>` | Class hierarchy (parent → children) |
|
|
309
|
+
| `--relations` | Extract relations between entities |
|
|
310
|
+
| `--relation-classes <list>` | Allowed relation classes (e.g. employment,causal) |
|
|
311
|
+
| `--resolve` | Resolve coreferences (adds entity_id and is_canonical) |
|
|
312
|
+
| `--detect-negation` | Add assertion field (present/negated/hypothetical) |
|
|
313
|
+
| `--include-confidence` | Add confidence field (low/medium/high) |
|
|
314
|
+
| `--schema <path>` | Load schema from JSON file |
|
|
315
|
+
| `--file <path>` | Read from file (with chunking) |
|
|
316
|
+
| `--batch <path>` | Process JSONL file or directory |
|
|
317
|
+
| `--system-prompt <string>` | Replace system prompt |
|
|
318
|
+
| `--system-prompt-append <string>` | Append to system prompt |
|
|
319
|
+
| `--compact` | Compact JSON output |
|
|
320
|
+
| `-m, --model <uri>` | Use custom GGUF model |
|
|
224
321
|
|
|
225
322
|
## Benchmarks
|
|
226
323
|
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
> "Dr. Maria Chen, a 42-year-old neurologist at Massachusetts General Hospital in Boston, published a groundbreaking paper with her colleague Prof. James Wright from Oxford University about a rare genetic mutation called BRCA3-delta found in 12 patients from rural Bangladesh, while simultaneously consulting for Pfizer on their new drug Nexavion priced at 450 dollars per dose, which the WHO classified as a Category A essential medicine last Tuesday during their Geneva summit"
|
|
230
|
-
|
|
231
|
-
| Entity | `--fast` | `--balanced` | `--best` (default) |
|
|
232
|
-
| ------------------ | ---------- | ------------ | ------------------------ |
|
|
233
|
-
| Dr. Maria Chen | person | person | person |
|
|
234
|
-
| Prof. James Wright | person | person | person, role: colleague |
|
|
235
|
-
| MGH | - | org | org |
|
|
236
|
-
| Oxford University | - | org | org |
|
|
237
|
-
| BRCA3-delta | - | disease | disease |
|
|
238
|
-
| Bangladesh | - | - | location |
|
|
239
|
-
| Pfizer | - | org | org |
|
|
240
|
-
| Nexavion | - | drug | drug, price: 450 dollars |
|
|
241
|
-
| WHO | - | - | org, category: Cat A |
|
|
242
|
-
| Geneva summit | - | event | location |
|
|
243
|
-
| Boston | location | - | location |
|
|
244
|
-
| **Entities found** | **3 / 11** | **8 / 11** | **11 / 11** |
|
|
245
|
-
|
|
246
|
-
All three tiers produce zero hallucinations with the current prompt design.
|
|
247
|
-
|
|
248
|
-
## Epistemological design
|
|
249
|
-
|
|
250
|
-
fastner's feature set is informed by Objectivist epistemology -- the theory that concepts are formed by abstracting essential characteristics from concretes, organized into hierarchical structures, and held in a specific relationship to reality.
|
|
251
|
-
|
|
252
|
-
### Identity: A is A (`--resolve`)
|
|
253
|
-
|
|
254
|
-
The law of identity demands that we track _what a thing is_ across all its references. When a text says "Dr. Chen", "she", and "the neurologist", these are three linguistic expressions of one entity. Without coreference resolution, an NER system treats them as three unrelated extractions -- a failure to maintain identity. `--resolve` enforces that A remains A regardless of how it is named.
|
|
255
|
-
|
|
256
|
-
### Hierarchical concept formation (`--taxonomy`)
|
|
257
|
-
|
|
258
|
-
Objectivist epistemology holds that concepts are organized hierarchically through a process of abstraction. "Cat" is subsumed under "animal", which is subsumed under "organism". Each level retains the essential characteristics of its parent while adding differentia. The `--taxonomy` flag mirrors this structure directly -- you define genus-species relationships between entity classes, and the model classifies at the most specific level it can justify. This isn't just organization; it's how valid concepts are formed.
|
|
259
|
-
|
|
260
|
-
### Distinguishing existence from assertion (`--detect-negation`)
|
|
261
|
-
|
|
262
|
-
A concept must be connected to reality. "The patient has diabetes" and "the patient does not have diabetes" both contain the entity "diabetes", but their relationship to existence is opposite. Naive NER systems that extract "diabetes" from both sentences without distinguishing assertion from negation commit a fundamental error -- they detach the concept from its existential status. `--detect-negation` forces every entity to declare its relationship to reality: present, negated, or hypothetical.
|
|
263
|
-
|
|
264
|
-
### Certainty and the hierarchy of evidence (`--include-confidence`)
|
|
265
|
-
|
|
266
|
-
Knowledge exists on a spectrum from certain to speculative. "Dr. Maria Chen" appearing with a full name and title is a high-confidence extraction. "Someone named Bob" is low-confidence. Objectivism rejects both dogmatism (asserting certainty where none exists) and skepticism (denying certainty where it does). `--include-confidence` makes the epistemic status of each extraction explicit, letting downstream systems apply appropriate thresholds.
|
|
267
|
-
|
|
268
|
-
### Relations as conceptual integration (`--relations`)
|
|
324
|
+
Tested on a complex input with 11 entities across 6 classes:
|
|
269
325
|
|
|
270
|
-
|
|
326
|
+
| Entity | `--fast` | `--balanced` | `--best` |
|
|
327
|
+
| ------------------ | -------- | ------------ | --------- |
|
|
328
|
+
| Dr. Maria Chen | person | person | person |
|
|
329
|
+
| Prof. James Wright | person | person | person |
|
|
330
|
+
| MGH | — | org | org |
|
|
331
|
+
| Oxford University | — | org | org |
|
|
332
|
+
| BRCA3-delta | — | disease | disease |
|
|
333
|
+
| Bangladesh | — | — | location |
|
|
334
|
+
| Pfizer | — | org | org |
|
|
335
|
+
| Nexavion | — | drug | drug |
|
|
336
|
+
| WHO | — | — | org |
|
|
337
|
+
| Geneva summit | — | event | location |
|
|
338
|
+
| Boston | location | — | location |
|
|
339
|
+
| **Found** | **3/11** | **8/11** | **11/11** |
|
|
271
340
|
|
|
272
|
-
|
|
341
|
+
## Integration with objectivist-lattice
|
|
273
342
|
|
|
274
|
-
|
|
343
|
+
This tool is designed to work with **[objectivist-lattice](https://github.com/richardanaya/objectivist-lattice)** — a knowledge management system that enforces the Objectivist hierarchy: percepts → concepts → principles → actions.
|
|
275
344
|
|
|
276
|
-
|
|
345
|
+
**objectivist-ner** extracts the percepts and concepts. **objectivist-lattice** validates and organizes them into principles you can act on.
|
|
277
346
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
## Building Up Knowledge
|
|
281
|
-
|
|
282
|
-
fastner is designed as a tool for the Objectivist project of building knowledge from percepts through concepts to principles and finally to action — the exact process implemented in the companion project **[objectivist-lattice](https://github.com/richardanaya/objectivist-lattice)**.
|
|
283
|
-
|
|
284
|
-
### The Epistemological Pipeline
|
|
285
|
-
|
|
286
|
-
Objectivism holds that all knowledge begins with **percepts** (raw sensory data), which are integrated into **concepts**, which are organized into **principles** (general truths), which are finally applied as **actions** in specific contexts.
|
|
287
|
-
|
|
288
|
-
`objectivist-lattice` enforces this hierarchy strictly on a filesystem of Markdown files with validation rules:
|
|
289
|
-
|
|
290
|
-
- **Axioms** and **percepts** are bedrock — they have no `reduces_to` links
|
|
291
|
-
- **Principles** must reduce to axioms or percepts
|
|
292
|
-
- **Applications** must reduce to principles
|
|
293
|
-
- Promotion from `Tentative/Hypothesis` to `Integrated/Validated` can only happen bottom-up
|
|
294
|
-
|
|
295
|
-
### How NER Helps Build the Lattice
|
|
296
|
-
|
|
297
|
-
fastner acts as the **percept-to-concept extraction layer** for this system:
|
|
298
|
-
|
|
299
|
-
1. **Percept Extraction** (`--detect-negation`)
|
|
300
|
-
- Identifies concrete entities from source material (books, articles, personal observations)
|
|
301
|
-
- Distinguishes what is asserted as present, negated, or hypothetical
|
|
302
|
-
- Feeds raw perceptual data into the `02-Percepts/` directory
|
|
303
|
-
|
|
304
|
-
2. **Concept Formation** (`--classes`, `--taxonomy`, `--resolve`)
|
|
305
|
-
- Groups multiple mentions of the same entity (`entity_id`)
|
|
306
|
-
- Classifies entities into hierarchical taxonomies (`organism > person > neurologist`)
|
|
307
|
-
- Maintains identity across contexts — "Dr. Chen", "she", and "the neurologist" are recognized as the same existent
|
|
308
|
-
|
|
309
|
-
3. **Principle Discovery** (`--relations`, `--schema`)
|
|
310
|
-
- Extracts relations between entities ("works at", "causes", "implies")
|
|
311
|
-
- Uses schema files to enforce your ontological commitments
|
|
312
|
-
- Surfaces potential principles by showing what consistently reduces to what
|
|
313
|
-
|
|
314
|
-
4. **Action Guidance** (`--include-confidence`)
|
|
315
|
-
- Rates confidence in each extraction
|
|
316
|
-
- Helps distinguish high-certainty principles (suitable for action) from speculative ones (still tentative)
|
|
317
|
-
|
|
318
|
-
### Practical Workflow
|
|
347
|
+
### Workflow
|
|
319
348
|
|
|
320
349
|
```bash
|
|
321
|
-
# Extract
|
|
322
|
-
ner --file chapter1.txt --detect-negation --resolve
|
|
350
|
+
# Extract structured observations from text
|
|
351
|
+
ner --file chapter1.txt --detect-negation --resolve > percepts.json
|
|
323
352
|
|
|
324
|
-
#
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
# Later, when forming principles
|
|
328
|
-
ner --relations --schema ontology.json "text from multiple chapters" > principles.json
|
|
353
|
+
# Import into your knowledge lattice
|
|
354
|
+
# (See objectivist-lattice documentation for details)
|
|
329
355
|
```
|
|
330
356
|
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
**Percepts → Concepts → Principles → Validated Knowledge → Action**
|
|
357
|
+
## Epistemological Design
|
|
334
358
|
|
|
335
|
-
|
|
359
|
+
Each feature maps to an Objectivist principle:
|
|
336
360
|
|
|
337
|
-
|
|
361
|
+
- **`--resolve`** — The law of identity (A is A)
|
|
362
|
+
- **`--taxonomy`** — Hierarchical concept formation (genus and differentia)
|
|
363
|
+
- **`--detect-negation`** — Grounding concepts in reality (existence vs non-existence)
|
|
364
|
+
- **`--relations`** — Conceptual integration (concepts form connected propositions)
|
|
365
|
+
- **Grammar enforcement** — Non-contradiction (structure prevents invalid values)
|
|
338
366
|
|
|
339
|
-
|
|
367
|
+
## Custom Models
|
|
340
368
|
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
If the built-in tiers don't fit your needs, you can pass any GGUF model with `--model`. This overrides `--fast`/`--balanced`/`--best`.
|
|
369
|
+
Use any GGUF model:
|
|
344
370
|
|
|
345
371
|
```bash
|
|
346
|
-
# HuggingFace URI
|
|
347
372
|
ner "text" --model "hf:unsloth/Qwen3-8B-GGUF:Qwen3-8B-Q4_K_M.gguf"
|
|
348
|
-
|
|
349
|
-
# Local file
|
|
350
|
-
ner "text" --model ./my-custom-model.gguf
|
|
373
|
+
ner "text" --model ./my-model.gguf
|
|
351
374
|
```
|
|
352
|
-
|
|
353
|
-
## License
|
|
354
|
-
|
|
355
|
-
MIT © Richard Anaya
|
package/index.ts
CHANGED
|
@@ -44,23 +44,104 @@ interface SchemaFile {
|
|
|
44
44
|
attributes?: string[];
|
|
45
45
|
attrValues?: Record<string, string[]>;
|
|
46
46
|
relations?: string[];
|
|
47
|
+
relationClasses?: string[];
|
|
47
48
|
}
|
|
48
49
|
|
|
49
50
|
// === TAXONOMY HELPERS ===
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
51
|
+
// Taxonomy format: {"organism": ["person", {"animal": ["dog", "cat"]}], "idea": ["dream", "principle"]}
|
|
52
|
+
// Arrays contain leaf nodes, objects contain nested taxonomies
|
|
53
|
+
|
|
54
|
+
function getLeafNodes(taxonomy: Record<string, any>): string[] {
|
|
55
|
+
const leaves: string[] = [];
|
|
56
|
+
|
|
57
|
+
function traverse(node: any) {
|
|
58
|
+
if (Array.isArray(node)) {
|
|
59
|
+
for (const item of node) {
|
|
60
|
+
if (typeof item === "string") {
|
|
61
|
+
leaves.push(item);
|
|
62
|
+
} else if (typeof item === "object" && item !== null) {
|
|
63
|
+
traverse(item);
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
} else if (typeof node === "object" && node !== null) {
|
|
67
|
+
for (const [, value] of Object.entries(node)) {
|
|
68
|
+
traverse(value);
|
|
69
|
+
}
|
|
70
|
+
}
|
|
55
71
|
}
|
|
56
|
-
|
|
72
|
+
|
|
73
|
+
traverse(taxonomy);
|
|
74
|
+
return leaves;
|
|
57
75
|
}
|
|
58
76
|
|
|
59
|
-
function
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
77
|
+
function getTaxonomyPath(
|
|
78
|
+
leaf: string,
|
|
79
|
+
taxonomy: Record<string, any>,
|
|
80
|
+
): string[] {
|
|
81
|
+
function findPath(
|
|
82
|
+
node: any,
|
|
83
|
+
target: string,
|
|
84
|
+
currentPath: string[],
|
|
85
|
+
): string[] | null {
|
|
86
|
+
if (Array.isArray(node)) {
|
|
87
|
+
for (const item of node) {
|
|
88
|
+
if (typeof item === "string" && item === target) {
|
|
89
|
+
return [...currentPath, item];
|
|
90
|
+
} else if (typeof item === "object" && item !== null) {
|
|
91
|
+
const result = findPath(item, target, currentPath);
|
|
92
|
+
if (result) return result;
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
} else if (typeof node === "object" && node !== null) {
|
|
96
|
+
for (const [key, value] of Object.entries(node)) {
|
|
97
|
+
const result = findPath(value, target, [...currentPath, key]);
|
|
98
|
+
if (result) return result;
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
return null;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// Try each root node
|
|
105
|
+
for (const [rootKey, rootValue] of Object.entries(taxonomy)) {
|
|
106
|
+
const path = findPath(rootValue, leaf, [rootKey]);
|
|
107
|
+
if (path) return path;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
return [leaf];
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
function taxonomyToPrompt(taxonomy: Record<string, any>): string {
|
|
114
|
+
function formatNode(node: any, indent: string): string[] {
|
|
115
|
+
const lines: string[] = [];
|
|
116
|
+
|
|
117
|
+
if (Array.isArray(node)) {
|
|
118
|
+
for (const item of node) {
|
|
119
|
+
if (typeof item === "string") {
|
|
120
|
+
lines.push(`${indent}- ${item}`);
|
|
121
|
+
} else if (typeof item === "object" && item !== null) {
|
|
122
|
+
lines.push(...formatNode(item, indent));
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
} else if (typeof node === "object" && node !== null) {
|
|
126
|
+
for (const [key, value] of Object.entries(node)) {
|
|
127
|
+
lines.push(`${indent}- ${key}`);
|
|
128
|
+
lines.push(...formatNode(value, indent + " "));
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
return lines;
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
const lines: string[] = [
|
|
136
|
+
"Use the following class hierarchy. Classify at the most specific (leaf) level:",
|
|
137
|
+
];
|
|
138
|
+
|
|
139
|
+
for (const [rootKey, rootValue] of Object.entries(taxonomy)) {
|
|
140
|
+
lines.push(`- ${rootKey}`);
|
|
141
|
+
lines.push(...formatNode(rootValue, " "));
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
return lines.join("\n");
|
|
64
145
|
}
|
|
65
146
|
|
|
66
147
|
// === CHUNKING ===
|
|
@@ -98,6 +179,10 @@ program
|
|
|
98
179
|
'Class hierarchy JSON e.g. {"organism":["animal","plant"]}',
|
|
99
180
|
)
|
|
100
181
|
.option("--relations", "Extract relations between entities")
|
|
182
|
+
.option(
|
|
183
|
+
"--relation-classes <list>",
|
|
184
|
+
"Comma-separated allowed relation classes (e.g. employment,location,causal)",
|
|
185
|
+
)
|
|
101
186
|
.option("--resolve", "Resolve coreferences (group mentions of same entity)")
|
|
102
187
|
.option("--include-confidence", "Include confidence scores per entity")
|
|
103
188
|
.option("--detect-negation", "Detect negated/hypothetical entities")
|
|
@@ -130,17 +215,16 @@ program
|
|
|
130
215
|
"after",
|
|
131
216
|
`
|
|
132
217
|
Examples:
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
echo "the cat is blue" | fastner
|
|
218
|
+
ner "the cat is blue"
|
|
219
|
+
ner "John works at Google" --classes person,organization
|
|
220
|
+
ner "sky is blue" --attr-values '{"color":["blue","red"]}'
|
|
221
|
+
ner --relations "Dr. Chen works at MIT"
|
|
222
|
+
ner --resolve "Dr. Chen published a paper. She won an award."
|
|
223
|
+
ner --detect-negation "The patient does not have cancer"
|
|
224
|
+
ner --schema schema.json "complex text"
|
|
225
|
+
ner --file document.txt
|
|
226
|
+
ner --batch inputs.jsonl
|
|
227
|
+
echo "the cat is blue" | ner
|
|
144
228
|
`,
|
|
145
229
|
)
|
|
146
230
|
.parse();
|
|
@@ -214,6 +298,9 @@ if (opts.taxonomy) {
|
|
|
214
298
|
|
|
215
299
|
const enableRelations = opts.relations || !!schemaFile?.relations;
|
|
216
300
|
const relationTypes: string[] | undefined = schemaFile?.relations || undefined;
|
|
301
|
+
const relationClasses: string[] | undefined = opts.relationClasses
|
|
302
|
+
? opts.relationClasses.split(",").map((s: string) => s.trim())
|
|
303
|
+
: schemaFile?.relationClasses;
|
|
217
304
|
const enableResolve = !!opts.resolve;
|
|
218
305
|
const enableConfidence = !!opts.includeConfidence;
|
|
219
306
|
const enableNegation = !!opts.detectNegation;
|
|
@@ -293,6 +380,7 @@ function buildSystemPrompt(): string {
|
|
|
293
380
|
}
|
|
294
381
|
if (enableResolve) {
|
|
295
382
|
base += `\n- Every entity has a top-level "entity_id" field. If multiple text spans refer to the same real-world entity (e.g. "Dr. Chen" and "she"), they share the same entity_id. Use short IDs like "e1", "e2".`;
|
|
383
|
+
base += `\n- When multiple mentions share an entity_id, exactly ONE of them must have "is_canonical": true (the most specific reference like a proper name). The others must have "is_canonical": false.`;
|
|
296
384
|
}
|
|
297
385
|
|
|
298
386
|
let prompt = `${base}\n\n${FEW_SHOT_EXAMPLES}`;
|
|
@@ -306,8 +394,9 @@ function buildSystemPrompt(): string {
|
|
|
306
394
|
|
|
307
395
|
// === BUILD GRAMMAR SCHEMA ===
|
|
308
396
|
function buildGrammarSchema() {
|
|
309
|
-
//
|
|
310
|
-
|
|
397
|
+
// When using taxonomy, only allow leaf nodes as valid classes.
|
|
398
|
+
// This forces the model to classify at the most specific level.
|
|
399
|
+
const classEnum = taxonomy ? getLeafNodes(taxonomy) : allowedClasses;
|
|
311
400
|
|
|
312
401
|
const attributesSchema: any = {
|
|
313
402
|
type: "object",
|
|
@@ -343,7 +432,9 @@ function buildGrammarSchema() {
|
|
|
343
432
|
}
|
|
344
433
|
if (enableResolve) {
|
|
345
434
|
properties.entity_id = { type: "string" };
|
|
435
|
+
properties.is_canonical = { type: "boolean" };
|
|
346
436
|
required.push("entity_id");
|
|
437
|
+
required.push("is_canonical");
|
|
347
438
|
}
|
|
348
439
|
|
|
349
440
|
const schema: any = {
|
|
@@ -361,6 +452,23 @@ function buildGrammarSchema() {
|
|
|
361
452
|
|
|
362
453
|
// === BUILD RELATIONS SCHEMA ===
|
|
363
454
|
function buildRelationsSchema() {
|
|
455
|
+
const relationProperties: any = {
|
|
456
|
+
source: { type: "string" },
|
|
457
|
+
target: { type: "string" },
|
|
458
|
+
relation: {
|
|
459
|
+
type: "string",
|
|
460
|
+
...(relationTypes && { enum: relationTypes }),
|
|
461
|
+
},
|
|
462
|
+
};
|
|
463
|
+
|
|
464
|
+
// Add class field if relationClasses is specified
|
|
465
|
+
if (relationClasses) {
|
|
466
|
+
relationProperties.class = {
|
|
467
|
+
type: "string",
|
|
468
|
+
enum: relationClasses,
|
|
469
|
+
};
|
|
470
|
+
}
|
|
471
|
+
|
|
364
472
|
const relSchema: any = {
|
|
365
473
|
type: "object",
|
|
366
474
|
properties: {
|
|
@@ -369,15 +477,10 @@ function buildRelationsSchema() {
|
|
|
369
477
|
type: "array",
|
|
370
478
|
items: {
|
|
371
479
|
type: "object",
|
|
372
|
-
properties:
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
type: "string",
|
|
377
|
-
...(relationTypes && { enum: relationTypes }),
|
|
378
|
-
},
|
|
379
|
-
},
|
|
380
|
-
required: ["source", "target", "relation"],
|
|
480
|
+
properties: relationProperties,
|
|
481
|
+
required: relationClasses
|
|
482
|
+
? ["source", "target", "relation", "class"]
|
|
483
|
+
: ["source", "target", "relation"],
|
|
381
484
|
additionalProperties: false,
|
|
382
485
|
},
|
|
383
486
|
},
|
|
@@ -414,13 +517,21 @@ function buildConstraints(): string {
|
|
|
414
517
|
constraints += `\nEvery entity has a "confidence" field (not in attributes). Example: [{"class":"person","text":"John","confidence":"high","attributes":{}}]`;
|
|
415
518
|
}
|
|
416
519
|
if (enableResolve) {
|
|
417
|
-
constraints += `\nEvery entity has
|
|
520
|
+
constraints += `\nEvery entity has "entity_id" and "is_canonical" fields. Coreferent mentions share entity_id; exactly one per group has is_canonical:true (the most specific reference). Example: [{"class":"person","text":"Dr. Chen","entity_id":"e1","is_canonical":true,"attributes":{}},{"class":"person","text":"She","entity_id":"e1","is_canonical":false,"attributes":{}}]`;
|
|
418
521
|
}
|
|
419
522
|
if (enableRelations) {
|
|
420
|
-
|
|
523
|
+
let relDesc = `\nAlso extract relations between entities.`;
|
|
524
|
+
if (relationClasses) {
|
|
525
|
+
relDesc += ` Each relation must have a "class" field categorizing the relation type.`;
|
|
526
|
+
}
|
|
527
|
+
relDesc += ` Return {"entities": [...], "relations": [{"source": "entity text", "target": "entity text", "relation": "relation type"${relationClasses ? ', "class": "relation class"' : ""}}]}.`;
|
|
528
|
+
constraints += relDesc;
|
|
421
529
|
if (relationTypes) {
|
|
422
530
|
constraints += ` Allowed relation types: ${relationTypes.join(", ")}.`;
|
|
423
531
|
}
|
|
532
|
+
if (relationClasses) {
|
|
533
|
+
constraints += ` Allowed relation classes: ${relationClasses.join(", ")}.`;
|
|
534
|
+
}
|
|
424
535
|
}
|
|
425
536
|
|
|
426
537
|
return constraints;
|
|
@@ -456,6 +567,15 @@ async function processText(
|
|
|
456
567
|
}
|
|
457
568
|
}
|
|
458
569
|
|
|
570
|
+
// If taxonomy is used, add taxonomyPath showing full hierarchy
|
|
571
|
+
if (taxonomy && !enableRelations && Array.isArray(parsed)) {
|
|
572
|
+
for (const entity of parsed) {
|
|
573
|
+
if (entity.class && typeof entity.class === "string") {
|
|
574
|
+
entity.taxonomyPath = getTaxonomyPath(entity.class, taxonomy);
|
|
575
|
+
}
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
|
|
459
579
|
return parsed;
|
|
460
580
|
}
|
|
461
581
|
|