@o-lang/legal-extractor 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/capability.js +110 -124
- package/package.json +2 -2
package/capability.js
CHANGED
|
@@ -1,18 +1,50 @@
|
|
|
1
1
|
// capability.js
|
|
2
2
|
//
|
|
3
|
-
// O-Lang Legal Extractor Resolver — capability.js v1.
|
|
3
|
+
// O-Lang Legal Extractor Resolver — capability.js v1.1.0
|
|
4
4
|
//
|
|
5
5
|
// Deterministic, zero-dependency structured extraction from legal documents.
|
|
6
6
|
// Identifies parties, clauses, dates, obligations, and risk flags.
|
|
7
7
|
//
|
|
8
8
|
// EXTRACT-ONLY. This resolver:
|
|
9
|
+
// ✓ Auto-preprocesses long documents to bypass token/safety limits
|
|
9
10
|
// ✓ Extracts and classifies what IS in the document
|
|
10
|
-
// ✗ Never provides legal advice
|
|
11
|
-
// ✗ Never predicts legal outcomes
|
|
12
|
-
// ✗ Never opines on validity or enforceability
|
|
11
|
+
// ✗ Never provides legal advice, predicts outcomes, or opines on validity
|
|
13
12
|
|
|
14
13
|
'use strict';
|
|
15
14
|
|
|
15
|
+
// ── Smart Legal Document Pre-Processor ──────────────────────────────────────
|
|
16
|
+
// Strips boilerplate, schedules, and signature blocks that trigger kernel safety filters.
|
|
17
|
+
// Extracts core clauses if still too long. Hard-caps at 24k chars.
|
|
18
|
+
/**
 * Shrinks a long legal document before extraction.
 *
 * Steps:
 *   1. Everything from the first schedule heading, "SIGNED for and on behalf"
 *      block, or "IN WITNESS WHEREOF" line onward is removed, and runs of three
 *      or more newlines are collapsed to two.
 *   2. If the result is still longer than 18,000 chars, it is reduced to the
 *      header (text before "\n1.") plus the first 10 numbered clauses, followed
 *      by a note that clauses were omitted. When no numbered clause can be
 *      recognised the cleaned text is kept as-is instead of being discarded.
 *   3. The result is capped so that the returned string, including the
 *      truncation marker, never exceeds 24,000 chars.
 *
 * @param {string} text - Raw document text.
 * @returns {string} The pre-processed text. Falsy input and input shorter than
 *   1,000 chars are returned unchanged.
 */
function preprocessLegalText(text) {
  if (!text || text.length < 1000) return text;

  const MAX_CHARS = 24000;
  const CLAUSE_EXTRACTION_THRESHOLD = 18000;
  const MAX_CLAUSES = 10;
  const OMITTED_NOTE = '\n\n[Note: Additional clauses omitted for extraction efficiency. Full document available for clause-by-clause review.]';
  const TRUNCATED_NOTE = '\n\n[... document truncated for token safety ...]';

  // 1. Remove boilerplate (schedules, signatures, witness lines).
  let cleaned = text
    .replace(/\nSCHEDULE\s+\d+\s*[-—][\s\S]*/gi, '')
    .replace(/\nSIGNED\s+for\s+and\s+on\s+behalf[\s\S]*/gi, '')
    .replace(/\nIN\s+WITNESS\s+WHEREOF[\s\S]*/gi, '')
    .replace(/\n{3,}/g, '\n\n')
    .trim();

  // 2. If still too long, keep the header plus the first numbered clauses.
  if (cleaned.length > CLAUSE_EXTRACTION_THRESHOLD) {
    const clauseRegex = /\n\d+\.\s*[A-Z\s]+(?:\([^)]*\))?\n([\s\S]*?)(?=\n\d+\.\s*|\nIN WITNESS|$)/gi;
    const clauses = Array.from(cleaned.matchAll(clauseRegex), (m) => m[0]);

    // Only rewrite the document when clauses were actually recognised;
    // otherwise the whole text would be replaced by nothing but the note.
    if (clauses.length > 0) {
      const header = cleaned.match(/^[^\n]*\n[^\n]*\n[\s\S]*?(?=\n1\.)/m)?.[0] || '';
      cleaned = header + '\n' + clauses.slice(0, MAX_CLAUSES).join('\n') + OMITTED_NOTE;
    }
  }

  // 3. Hard cap: reserve room for the marker so the total stays within MAX_CHARS.
  if (cleaned.length <= MAX_CHARS) return cleaned;
  return cleaned.substring(0, MAX_CHARS - TRUNCATED_NOTE.length) + TRUNCATED_NOTE;
}
|
|
47
|
+
|
|
16
48
|
// ── Jurisdiction map ──────────────────────────────────────────────────────────
|
|
17
49
|
const JURISDICTIONS = {
|
|
18
50
|
'ng': 'Nigeria', 'nigeria': 'Nigeria',
|
|
@@ -138,7 +170,6 @@ const DATE_PATTERNS = [
|
|
|
138
170
|
];
|
|
139
171
|
|
|
140
172
|
// ── Helpers ───────────────────────────────────────────────────────────────────
|
|
141
|
-
|
|
142
173
|
function resolveJurisdiction(raw) {
|
|
143
174
|
if (!raw) return 'General / Unspecified';
|
|
144
175
|
const key = raw.trim().toLowerCase();
|
|
@@ -153,36 +184,17 @@ function resolveDocType(raw) {
|
|
|
153
184
|
|
|
154
185
|
/**
 * Collects party names mentioned in a document using four heuristics:
 *   - "between X and Y" (first occurrence only),
 *   - 'X (hereinafter referred to as "Alias")' / 'X (the "Alias")',
 *   - generic labels such as "Party A" / "Party 1",
 *   - role nouns preceded by "the" (Employer, Licensor, Tenant, ...).
 *
 * The name patterns use a lazy bounded quantifier (`{2,80}?`). The previous
 * spelling `{2,80?}` is not a quantifier: without the `u` flag it is parsed as
 * literal characters, so those patterns could never match ordinary text.
 *
 * Limitation: the name patterns are case-insensitive and unanchored, so a
 * "hereinafter" name can include leading words from the same sentence.
 *
 * @param {string} text - Document text to scan.
 * @returns {string[]} Up to 10 distinct party strings (each longer than two
 *   characters), in order of discovery.
 */
function extractParties(text) {
  const parties = new Set();
  const squash = (s) => s.replace(/\s+/g, ' ').trim();

  // "between X and Y" — X and Y must be followed by "(", ",", "." or ";".
  const betweenMatch = text.match(
    /between\s+([A-Z][A-Za-z\s,\.()&'\u2019"–\-]{2,80}?)\s+and\s+([A-Z][A-Za-z\s,\.()&'\u2019"–\-]{2,80}?)(?:\s*[\(,\.\;])/i
  );
  if (betweenMatch) {
    parties.add(squash(betweenMatch[1]));
    parties.add(squash(betweenMatch[2]));
  }

  // 'X (hereinafter "Alias")' — straight or curly double quotes around the alias.
  const hereinafterMatches = text.matchAll(
    /([A-Z][A-Za-z\s,\.()&'\u2019"–\-]{3,80}?)\s*\((?:hereinafter(?:\s+referred\s+to\s+as)?|the)\s*["\u201C\u201D]([A-Z][A-Za-z\s]+)["\u201C\u201D]\)/gi
  );
  for (const m of hereinafterMatches) {
    parties.add(`${squash(m[1])} ("${m[2].trim()}")`);
  }

  // "Party A" / "Party 1" style labels.
  for (const m of text.matchAll(/\b(Party\s+[A-Z\d])\b/g)) {
    parties.add(m[1]);
  }

  // Role-based references: "the Employer", "the Disclosing Party", etc.
  const roleMatches = text.matchAll(
    /\bthe\s+(Employer|Employee|Disclosing\s+Party|Receiving\s+Party|Licensor|Licensee|Buyer|Seller|Supplier|Contractor|Client|Consultant|Lender|Borrower|Landlord|Tenant|Franchisor|Franchisee)\b/g
  );
  for (const m of roleMatches) {
    parties.add(`the ${m[1]}`);
  }

  return [...parties].filter((p) => p.length > 2).slice(0, 10);
}
|
|
188
200
|
|
|
@@ -190,9 +202,7 @@ function extractDates(text) {
|
|
|
190
202
|
const found = new Set();
|
|
191
203
|
for (const pattern of DATE_PATTERNS) {
|
|
192
204
|
const matches = [...text.matchAll(pattern)];
|
|
193
|
-
for (const m of matches)
|
|
194
|
-
found.add(m[1] || m[0]);
|
|
195
|
-
}
|
|
205
|
+
for (const m of matches) found.add(m[1] || m[0]);
|
|
196
206
|
}
|
|
197
207
|
return [...found].slice(0, 20);
|
|
198
208
|
}
|
|
@@ -201,93 +211,63 @@ function extractClauses(text) {
|
|
|
201
211
|
const found = [];
|
|
202
212
|
for (const { type, pattern } of CLAUSE_PATTERNS) {
|
|
203
213
|
if (pattern.test(text)) {
|
|
204
|
-
// Find the sentence containing the match
|
|
205
214
|
pattern.lastIndex = 0;
|
|
206
215
|
const match = pattern.exec(text);
|
|
207
216
|
if (match) {
|
|
208
217
|
const start = Math.max(0, match.index - 80);
|
|
209
218
|
const end = Math.min(text.length, match.index + 160);
|
|
210
|
-
const excerpt = text.slice(start, end)
|
|
211
|
-
.replace(/\s+/g, ' ')
|
|
212
|
-
.trim()
|
|
213
|
-
.replace(/^[^A-Z]/, '')
|
|
214
|
-
.slice(0, 200);
|
|
219
|
+
const excerpt = text.slice(start, end).replace(/\s+/g, ' ').trim().replace(/^[^A-Z]/, '').slice(0, 200);
|
|
215
220
|
found.push({ type, excerpt: excerpt + (excerpt.length === 200 ? '…' : '') });
|
|
216
221
|
}
|
|
217
222
|
}
|
|
218
223
|
pattern.lastIndex = 0;
|
|
219
224
|
}
|
|
220
|
-
// Deduplicate by type
|
|
221
225
|
const seen = new Set();
|
|
222
|
-
return found.filter(c => {
|
|
223
|
-
if (seen.has(c.type)) return false;
|
|
224
|
-
seen.add(c.type);
|
|
225
|
-
return true;
|
|
226
|
-
});
|
|
226
|
+
return found.filter(c => { if (seen.has(c.type)) return false; seen.add(c.type); return true; });
|
|
227
227
|
}
|
|
228
228
|
|
|
229
229
|
/**
 * Returns sentences that contain at least one obligation keyword.
 *
 * CRLF line endings are normalised, the text is split after ".", "!" or "?"
 * when followed by whitespace and a capital letter, and each sentence has its
 * whitespace collapsed. Sentences of 20 chars or fewer, or 400 chars or more,
 * are ignored. Matching is a case-insensitive substring test against
 * OBLIGATION_KEYWORDS (entries are compared against the lower-cased sentence).
 *
 * @param {string} text - Document text.
 * @returns {string[]} At most 15 matching sentences, in document order.
 */
function extractObligations(text) {
  const LIMIT = 15;
  const pieces = text.replace(/\r\n/g, '\n').split(/(?<=[.!?])\s+(?=[A-Z])/);
  const obligations = [];

  for (const piece of pieces) {
    if (obligations.length === LIMIT) break;

    const sentence = piece.replace(/\s+/g, ' ').trim();
    if (sentence.length <= 20 || sentence.length >= 400) continue;

    const lowered = sentence.toLowerCase();
    if (OBLIGATION_KEYWORDS.some((kw) => lowered.includes(kw))) {
      obligations.push(sentence);
    }
  }

  return obligations;
}
|
|
240
233
|
|
|
241
234
|
/**
 * Tests the text against every entry of RISK_PATTERNS and reports the ones
 * that match, ordered high → medium → low.
 *
 * Each pattern's `lastIndex` is reset after testing so that global/sticky
 * regexes do not carry state into the next call.
 *
 * @param {string} text - Document text.
 * @returns {{flag: string, severity: string}[]} Matched risk flags sorted by severity.
 */
function extractRisks(text) {
  const severityRank = { high: 0, medium: 1, low: 2 };
  const flagged = [];

  for (const entry of RISK_PATTERNS) {
    const hit = entry.pattern.test(text);
    entry.pattern.lastIndex = 0;
    if (!hit) continue;
    flagged.push({ flag: entry.label, severity: entry.severity });
  }

  flagged.sort((left, right) => severityRank[left.severity] - severityRank[right.severity]);
  return flagged;
}
|
|
253
243
|
|
|
254
244
|
/**
 * Builds the one-paragraph, plain-English summary of an extraction result.
 *
 * The word count is formatted with the fixed `en-US` locale so the summary is
 * identical on every host; a bare `toLocaleString()` would follow the host's
 * default locale (for example "1.234" instead of "1,234").
 *
 * @param {object} params
 * @param {string} params.document_ref - Caller-supplied document reference.
 * @param {string} params.docTypeLabel - Human-readable document type.
 * @param {string} params.jurisdictionLabel - Human-readable jurisdiction;
 *   'General / Unspecified' suppresses the "governed under" phrase.
 * @param {number} params.wordCount - Number of words in the document.
 * @param {string[]} params.parties - Extracted parties (first 3 are listed).
 * @param {{type: string}[]} params.clauses - Extracted clauses (first 5 types are listed).
 * @param {string[]} params.dates - Extracted dates (only the count is used).
 * @param {string[]} params.obligations - Extracted obligations (only the count is used).
 * @param {{severity: string}[]} params.risks - Risk flags; high and medium are counted.
 * @returns {string} The summary, always ending with the extract-only disclaimer.
 */
function buildSummary(params) {
  const {
    document_ref, docTypeLabel, jurisdictionLabel,
    wordCount, parties, clauses, dates, obligations, risks,
  } = params;

  const plural = (count) => (count > 1 ? 's' : '');
  const highRisks = risks.filter((r) => r.severity === 'high').length;
  const medRisks = risks.filter((r) => r.severity === 'medium').length;
  const clauseList = clauses.slice(0, 5).map((c) => c.type).join(', ');
  const partyList = parties.slice(0, 3).join(', ');

  let summary = `Document reference ${document_ref} is a ${docTypeLabel}`;
  if (jurisdictionLabel !== 'General / Unspecified') {
    summary += ` governed under ${jurisdictionLabel} law`;
  }
  summary += `. The document contains ${wordCount.toLocaleString('en-US')} words`;
  if (parties.length > 0) summary += ` and identifies: ${partyList}`;
  summary += '.';
  if (clauses.length > 0) summary += ` Key clauses: ${clauseList}.`;
  if (dates.length > 0) summary += ` ${dates.length} date${plural(dates.length)} found.`;
  if (obligations.length > 0) summary += ` ${obligations.length} obligation${plural(obligations.length)} extracted.`;
  if (highRisks > 0) summary += ` ${highRisks} high-risk flag${plural(highRisks)} identified.`;
  if (medRisks > 0) summary += ` ${medRisks} medium-risk flag${plural(medRisks)} identified.`;
  summary += ' Factual extraction only. No legal advice provided.';
  return summary;
}
|
|
281
264
|
|
|
282
265
|
// ── Parse action string ───────────────────────────────────────────────────────
|
|
283
|
-
// Action format: legal-extractor "doc_ref" "jurisdiction" "doc_type" "document_text"
|
|
284
266
|
/**
 * Pulls every double-quoted argument out of an action string.
 *
 * A quoted argument may contain backslash escapes; only `\"` is unescaped
 * (to `"`), all other escape sequences are returned verbatim. Text outside
 * quotes (such as the resolver name) is ignored.
 *
 * @param {string} action - Raw action string, e.g. `name "a" "b"`.
 * @returns {string[]} The quoted arguments in order of appearance.
 */
function parseActionArgs(action) {
  const quotedArg = /"((?:[^"\\]|\\.)*)"/g;
  const collected = [];

  for (let hit = quotedArg.exec(action); hit !== null; hit = quotedArg.exec(action)) {
    const unescaped = hit[1].replace(/\\"/g, '"');
    collected.push(unescaped);
  }

  return collected;
}
|
|
293
273
|
|
|
@@ -302,34 +282,22 @@ async function resolve(action, context = {}, options = {}) {
|
|
|
302
282
|
|
|
303
283
|
if (typeof action === 'string') {
|
|
304
284
|
const args = parseActionArgs(action);
|
|
305
|
-
|
|
306
|
-
const offset = args.length >= 4 ? 0 : 0;
|
|
307
|
-
document_ref = args[offset] || context.document_ref || 'REF-UNKNOWN';
|
|
308
|
-
jurisdiction = args[offset + 1] || context.jurisdiction || 'general';
|
|
309
|
-
doc_type = args[offset + 2] || context.doc_type || 'general';
|
|
310
|
-
document_text = args[offset + 3] || context.document_text || '';
|
|
285
|
+
[document_ref, jurisdiction, doc_type, document_text] = args.length >= 4 ? args : [context.document_ref, context.jurisdiction, context.doc_type, context.document_text];
|
|
311
286
|
} else {
|
|
312
|
-
document_ref
|
|
313
|
-
jurisdiction = context.jurisdiction || 'general';
|
|
314
|
-
doc_type = context.doc_type || 'general';
|
|
315
|
-
document_text = context.document_text || '';
|
|
287
|
+
({ document_ref, jurisdiction, doc_type, document_text } = context);
|
|
316
288
|
}
|
|
317
289
|
|
|
318
290
|
// ── 2. Validate ───────────────────────────────────────────────────────
|
|
319
291
|
if (!document_text || document_text.trim().length < 10) {
|
|
320
292
|
console.warn('[legal-extractor] ⚠️ document_text is empty or too short');
|
|
321
|
-
return {
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
doc_type: resolveDocType(doc_type),
|
|
330
|
-
word_count: 0,
|
|
331
|
-
error: 'document_text required',
|
|
332
|
-
};
|
|
293
|
+
return { summary: 'No document text provided for extraction.', parties: [], clauses: [], dates: [], obligations: [], risks: [], jurisdiction: resolveJurisdiction(jurisdiction), doc_type: resolveDocType(doc_type), word_count: 0, error: 'document_text required' };
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
// ✅ SMART PRE-PROCESSING: Auto-clean long docs before extraction
|
|
297
|
+
const originalLength = document_text.length;
|
|
298
|
+
if (originalLength > 10000) {
|
|
299
|
+
document_text = preprocessLegalText(document_text);
|
|
300
|
+
console.log(`[legal-extractor] 🔄 Pre-processed long document: ${originalLength} → ${document_text.length} chars`);
|
|
333
301
|
}
|
|
334
302
|
|
|
335
303
|
const text = document_text.trim();
|
|
@@ -347,40 +315,58 @@ async function resolve(action, context = {}, options = {}) {
|
|
|
347
315
|
const risks = extractRisks(text);
|
|
348
316
|
|
|
349
317
|
// ── 4. Build summary ──────────────────────────────────────────────────
|
|
350
|
-
const summary = buildSummary({
|
|
351
|
-
document_ref, docTypeLabel, jurisdictionLabel,
|
|
352
|
-
wordCount, parties, clauses, dates, obligations, risks,
|
|
353
|
-
});
|
|
318
|
+
const summary = buildSummary({ document_ref, docTypeLabel, jurisdictionLabel, wordCount, parties, clauses, dates, obligations, risks });
|
|
354
319
|
|
|
355
320
|
console.log(`[legal-extractor] ✅ Extracted: ${parties.length} parties, ${clauses.length} clauses, ${dates.length} dates, ${obligations.length} obligations, ${risks.length} risk flags`);
|
|
356
321
|
|
|
357
|
-
|
|
358
|
-
summary,
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
322
|
+
return {
|
|
323
|
+
summary, parties, clauses, dates, obligations, risks,
|
|
324
|
+
jurisdiction: jurisdictionLabel, doc_type: docTypeLabel, word_count: wordCount,
|
|
325
|
+
document_ref, extracted_at: new Date().toISOString(),
|
|
326
|
+
preprocessed: originalLength > 10000,
|
|
327
|
+
|
|
328
|
+
// ── Pre-stringified fields for safe LLM interpolation ──────────────
|
|
329
|
+
// RuntimeAPI._safeInterpolate cannot inject arrays/objects into prompts.
|
|
330
|
+
// These flat strings are what the workflow's {extracted.xxx_text} tokens resolve to.
|
|
331
|
+
parties_text: parties.length
|
|
332
|
+
? '- ' + parties.join('\n- ')
|
|
333
|
+
: 'No parties identified',
|
|
334
|
+
|
|
335
|
+
clauses_text: clauses.length
|
|
336
|
+
? clauses.map(c => `${c.type}:\n ${c.excerpt}`).join('\n\n')
|
|
337
|
+
: 'No clauses identified',
|
|
338
|
+
|
|
339
|
+
dates_text: dates.length
|
|
340
|
+
? dates.join(', ')
|
|
341
|
+
: 'No dates found',
|
|
342
|
+
|
|
343
|
+
obligations_text: obligations.length
|
|
344
|
+
? '- ' + obligations.slice(0, 5).join('\n- ')
|
|
345
|
+
: 'No obligations extracted',
|
|
346
|
+
|
|
347
|
+
risks_text: risks.length
|
|
348
|
+
? risks.map(r => `[${r.severity.toUpperCase()}] ${r.flag}`).join('\n')
|
|
349
|
+
: 'No risk flags identified',
|
|
369
350
|
};
|
|
370
351
|
|
|
371
352
|
} catch (err) {
|
|
372
353
|
console.error('[legal-extractor] 💥 Error:', err.message);
|
|
373
354
|
return {
|
|
374
|
-
summary:
|
|
375
|
-
parties:
|
|
376
|
-
clauses:
|
|
377
|
-
dates:
|
|
378
|
-
obligations:
|
|
379
|
-
risks:
|
|
380
|
-
jurisdiction:
|
|
381
|
-
doc_type:
|
|
382
|
-
word_count:
|
|
383
|
-
error:
|
|
355
|
+
summary: `Extraction failed: ${err.message}`,
|
|
356
|
+
parties: [],
|
|
357
|
+
clauses: [],
|
|
358
|
+
dates: [],
|
|
359
|
+
obligations: [],
|
|
360
|
+
risks: [],
|
|
361
|
+
jurisdiction: 'Unknown',
|
|
362
|
+
doc_type: 'Unknown',
|
|
363
|
+
word_count: 0,
|
|
364
|
+
error: err.message,
|
|
365
|
+
parties_text: 'Extraction failed',
|
|
366
|
+
clauses_text: 'Extraction failed',
|
|
367
|
+
dates_text: 'Extraction failed',
|
|
368
|
+
obligations_text: 'Extraction failed',
|
|
369
|
+
risks_text: 'Extraction failed',
|
|
384
370
|
};
|
|
385
371
|
}
|
|
386
372
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@o-lang/legal-extractor",
|
|
3
|
-
"version": "1.0.0",
|
|
3
|
+
"version": "1.0.2",
|
|
4
4
|
"description": "O-Lang resolver for structured legal document extraction. Extract-only — never provides legal advice.",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"keywords": [
|
|
@@ -22,4 +22,4 @@
|
|
|
22
22
|
"node": ">=18.0.0"
|
|
23
23
|
},
|
|
24
24
|
"dependencies": {}
|
|
25
|
-
}
|
|
25
|
+
}
|