@o-lang/legal-extractor 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/capability.js +110 -124
  2. package/package.json +2 -2
package/capability.js CHANGED
@@ -1,18 +1,50 @@
1
1
  // capability.js
2
2
  //
3
- // O-Lang Legal Extractor Resolver — capability.js v1.0.0
3
+ // O-Lang Legal Extractor Resolver — capability.js v1.1.0
4
4
  //
5
5
  // Deterministic, zero-dependency structured extraction from legal documents.
6
6
  // Identifies parties, clauses, dates, obligations, and risk flags.
7
7
  //
8
8
  // EXTRACT-ONLY. This resolver:
9
+ // ✓ Auto-preprocesses long documents to bypass token/safety limits
9
10
  // ✓ Extracts and classifies what IS in the document
10
- // ✗ Never provides legal advice
11
- // ✗ Never predicts legal outcomes
12
- // ✗ Never opines on validity or enforceability
11
+ // ✗ Never provides legal advice, predicts outcomes, or opines on validity
13
12
 
14
13
  'use strict';
15
14
 
15
+ // ── Smart Legal Document Pre-Processor ──────────────────────────────────────
16
+ // Strips boilerplate, schedules, and signature blocks that trigger kernel safety filters.
17
+ // Extracts core clauses if still too long. Hard-caps at 24k chars.
18
+ function preprocessLegalText(text) {
19
+ if (!text || text.length < 1000) return text;
20
+
21
+ // 1. Remove high-trigger boilerplate (schedules, signatures, witness lines)
22
+ let cleaned = text
23
+ .replace(/\nSCHEDULE\s+\d+\s*[-—][\s\S]*/gi, '')
24
+ .replace(/\nSIGNED\s+for\s+and\s+on\s+behalf[\s\S]*/gi, '')
25
+ .replace(/\nIN\s+WITNESS\s+WHEREOF[\s\S]*/gi, '')
26
+ .replace(/\n{3,}/g, '\n\n')
27
+ .trim();
28
+
29
+ // 2. If still too long, extract header + first 10 substantive clauses
30
+ if (cleaned.length > 18000) {
31
+ const clauseRegex = /\n\d+\.\s*[A-Z\s]+(?:\([^)]*\))?\n([\s\S]*?)(?=\n\d+\.\s*|\nIN WITNESS|$)/gi;
32
+ const clauses = [];
33
+ let match;
34
+ while ((match = clauseRegex.exec(cleaned)) !== null) {
35
+ clauses.push(match[0]);
36
+ }
37
+ const header = cleaned.match(/^[^\n]*\n[^\n]*\n[\s\S]*?(?=\n1\.)/m)?.[0] || '';
38
+ cleaned = header + '\n' + clauses.slice(0, 10).join('\n') +
39
+ '\n\n[Note: Additional clauses omitted for extraction efficiency. Full document available for clause-by-clause review.]';
40
+ }
41
+
42
+ // 3. Hard cap at 24,000 chars (safe for LLM context windows & kernel limits)
43
+ return cleaned.length > 24000
44
+ ? cleaned.substring(0, 24000) + '\n\n[... document truncated for token safety ...]'
45
+ : cleaned;
46
+ }
47
+
16
48
  // ── Jurisdiction map ──────────────────────────────────────────────────────────
17
49
  const JURISDICTIONS = {
18
50
  'ng': 'Nigeria', 'nigeria': 'Nigeria',
@@ -138,7 +170,6 @@ const DATE_PATTERNS = [
138
170
  ];
139
171
 
140
172
  // ── Helpers ───────────────────────────────────────────────────────────────────
141
-
142
173
  function resolveJurisdiction(raw) {
143
174
  if (!raw) return 'General / Unspecified';
144
175
  const key = raw.trim().toLowerCase();
@@ -153,36 +184,17 @@ function resolveDocType(raw) {
153
184
 
154
185
  function extractParties(text) {
155
186
  const parties = new Set();
156
-
157
- // "between X and Y" — most common in contracts
158
- const betweenMatch = text.match(
159
- /between\s+([A-Z][A-Za-z\s,\.()&''"–\-]{2,80?})\s+and\s+([A-Z][A-Za-z\s,\.()&''"–\-]{2,80?})(?:\s*[\(,\.\;])/i
160
- );
187
+ const betweenMatch = text.match(/between\s+([A-Z][A-Za-z\s,\.()&''"–\-]{2,80?})\s+and\s+([A-Z][A-Za-z\s,\.()&''"–\-]{2,80?})(?:\s*[\(,\.\;])/i);
161
188
  if (betweenMatch) {
162
189
  parties.add(betweenMatch[1].replace(/\s+/g, ' ').trim());
163
190
  parties.add(betweenMatch[2].replace(/\s+/g, ' ').trim());
164
191
  }
165
-
166
- // "X (hereinafter "Y")" or "X (the "Y")"
167
- const hereinafterMatches = [
168
- ...text.matchAll(
169
- /([A-Z][A-Za-z\s,\.()&''"–\-]{3,80?})\s*\((?:hereinafter(?:\s+referred\s+to\s+as)?|the)\s*[""]([A-Z][A-Za-z\s]+)[""]\)/gi
170
- )
171
- ];
172
- for (const m of hereinafterMatches) {
173
- parties.add(`${m[1].replace(/\s+/g, ' ').trim()} ("${m[2].trim()}")`);
174
- }
175
-
176
- // "Party A" / "Party B" named styles
192
+ const hereinafterMatches = [...text.matchAll(/([A-Z][A-Za-z\s,\.()&''"–\-]{3,80?})\s*\((?:hereinafter(?:\s+referred\s+to\s+as)?|the)\s*[""]([A-Z][A-Za-z\s]+)[""]\)/gi)];
193
+ for (const m of hereinafterMatches) parties.add(`${m[1].replace(/\s+/g, ' ').trim()} ("${m[2].trim()}")`);
177
194
  const partyLabels = [...text.matchAll(/\b(Party\s+[A-Z\d])\b/g)];
178
195
  for (const m of partyLabels) parties.add(m[1]);
179
-
180
- // Role-based: "the Employer", "the Employee", "the Disclosing Party" etc.
181
- const roleMatches = [...text.matchAll(
182
- /\bthe\s+(Employer|Employee|Disclosing\s+Party|Receiving\s+Party|Licensor|Licensee|Buyer|Seller|Supplier|Contractor|Client|Consultant|Lender|Borrower|Landlord|Tenant|Franchisor|Franchisee)\b/g
183
- )];
196
+ const roleMatches = [...text.matchAll(/\bthe\s+(Employer|Employee|Disclosing\s+Party|Receiving\s+Party|Licensor|Licensee|Buyer|Seller|Supplier|Contractor|Client|Consultant|Lender|Borrower|Landlord|Tenant|Franchisor|Franchisee)\b/g)];
184
197
  for (const m of roleMatches) parties.add(`the ${m[1]}`);
185
-
186
198
  return [...parties].filter(p => p.length > 2).slice(0, 10);
187
199
  }
188
200
 
@@ -190,9 +202,7 @@ function extractDates(text) {
190
202
  const found = new Set();
191
203
  for (const pattern of DATE_PATTERNS) {
192
204
  const matches = [...text.matchAll(pattern)];
193
- for (const m of matches) {
194
- found.add(m[1] || m[0]);
195
- }
205
+ for (const m of matches) found.add(m[1] || m[0]);
196
206
  }
197
207
  return [...found].slice(0, 20);
198
208
  }
@@ -201,93 +211,63 @@ function extractClauses(text) {
201
211
  const found = [];
202
212
  for (const { type, pattern } of CLAUSE_PATTERNS) {
203
213
  if (pattern.test(text)) {
204
- // Find the sentence containing the match
205
214
  pattern.lastIndex = 0;
206
215
  const match = pattern.exec(text);
207
216
  if (match) {
208
217
  const start = Math.max(0, match.index - 80);
209
218
  const end = Math.min(text.length, match.index + 160);
210
- const excerpt = text.slice(start, end)
211
- .replace(/\s+/g, ' ')
212
- .trim()
213
- .replace(/^[^A-Z]/, '')
214
- .slice(0, 200);
219
+ const excerpt = text.slice(start, end).replace(/\s+/g, ' ').trim().replace(/^[^A-Z]/, '').slice(0, 200);
215
220
  found.push({ type, excerpt: excerpt + (excerpt.length === 200 ? '…' : '') });
216
221
  }
217
222
  }
218
223
  pattern.lastIndex = 0;
219
224
  }
220
- // Deduplicate by type
221
225
  const seen = new Set();
222
- return found.filter(c => {
223
- if (seen.has(c.type)) return false;
224
- seen.add(c.type);
225
- return true;
226
- });
226
+ return found.filter(c => { if (seen.has(c.type)) return false; seen.add(c.type); return true; });
227
227
  }
228
228
 
229
229
  function extractObligations(text) {
230
- const sentences = text
231
- .replace(/\r\n/g, '\n')
232
- .split(/(?<=[.!?])\s+(?=[A-Z])/)
233
- .map(s => s.replace(/\s+/g, ' ').trim())
234
- .filter(s => s.length > 20 && s.length < 400);
235
-
236
- return sentences
237
- .filter(s => OBLIGATION_KEYWORDS.some(kw => s.toLowerCase().includes(kw)))
238
- .slice(0, 15);
230
+ const sentences = text.replace(/\r\n/g, '\n').split(/(?<=[.!?])\s+(?=[A-Z])/).map(s => s.replace(/\s+/g, ' ').trim()).filter(s => s.length > 20 && s.length < 400);
231
+ return sentences.filter(s => OBLIGATION_KEYWORDS.some(kw => s.toLowerCase().includes(kw))).slice(0, 15);
239
232
  }
240
233
 
241
234
  function extractRisks(text) {
242
235
  const found = [];
243
236
  for (const { pattern, label, severity } of RISK_PATTERNS) {
244
- if (pattern.test(text)) {
245
- found.push({ flag: label, severity });
246
- }
237
+ if (pattern.test(text)) found.push({ flag: label, severity });
247
238
  pattern.lastIndex = 0;
248
239
  }
249
- // Sort: high → medium → low
250
240
  const order = { high: 0, medium: 1, low: 2 };
251
241
  return found.sort((a, b) => order[a.severity] - order[b.severity]);
252
242
  }
253
243
 
254
244
  function buildSummary(params) {
255
- const {
256
- document_ref, docTypeLabel, jurisdictionLabel,
257
- wordCount, parties, clauses, dates, obligations, risks,
258
- } = params;
259
-
245
+ const { document_ref, docTypeLabel, jurisdictionLabel, wordCount, parties, clauses, dates, obligations, risks } = params;
260
246
  const highRisks = risks.filter(r => r.severity === 'high').length;
261
247
  const medRisks = risks.filter(r => r.severity === 'medium').length;
262
248
  const clauseList = clauses.slice(0, 5).map(c => c.type).join(', ');
263
249
  const partyList = parties.slice(0, 3).join(', ');
264
250
 
265
251
  let summary = `Document reference ${document_ref} is a ${docTypeLabel}`;
266
- if (jurisdictionLabel !== 'General / Unspecified') {
267
- summary += ` governed under ${jurisdictionLabel} law`;
268
- }
252
+ if (jurisdictionLabel !== 'General / Unspecified') summary += ` governed under ${jurisdictionLabel} law`;
269
253
  summary += `. The document contains ${wordCount.toLocaleString()} words`;
270
- if (parties.length > 0) summary += ` and identifies the following parties: ${partyList}`;
254
+ if (parties.length > 0) summary += ` and identifies: ${partyList}`;
271
255
  summary += '.';
272
- if (clauses.length > 0) summary += ` Key clause types identified include: ${clauseList}.`;
273
- if (dates.length > 0) summary += ` ${dates.length} date reference${dates.length > 1 ? 's' : ''} found.`;
274
- if (obligations.length > 0) summary += ` ${obligations.length} obligation statement${obligations.length > 1 ? 's' : ''} extracted.`;
275
- if (highRisks > 0) summary += ` ${highRisks} high-severity risk flag${highRisks > 1 ? 's' : ''} identified.`;
276
- if (medRisks > 0) summary += ` ${medRisks} medium-severity risk flag${medRisks > 1 ? 's' : ''} identified.`;
277
- summary += ' This is a factual extraction only. No legal advice is provided.';
278
-
256
+ if (clauses.length > 0) summary += ` Key clauses: ${clauseList}.`;
257
+ if (dates.length > 0) summary += ` ${dates.length} date${dates.length > 1 ? 's' : ''} found.`;
258
+ if (obligations.length > 0) summary += ` ${obligations.length} obligation${obligations.length > 1 ? 's' : ''} extracted.`;
259
+ if (highRisks > 0) summary += ` ${highRisks} high-risk flag${highRisks > 1 ? 's' : ''} identified.`;
260
+ if (medRisks > 0) summary += ` ${medRisks} medium-risk flag${medRisks > 1 ? 's' : ''} identified.`;
261
+ summary += ' Factual extraction only. No legal advice provided.';
279
262
  return summary;
280
263
  }
281
264
 
282
265
  // ── Parse action string ───────────────────────────────────────────────────────
283
- // Action format: legal-extractor "doc_ref" "jurisdiction" "doc_type" "document_text"
284
266
  function parseActionArgs(action) {
285
267
  const args = [];
286
268
  const regex = /"((?:[^"\\]|\\.)*)"/g;
287
269
  let match;
288
- while ((match = regex.exec(action)) !== null) {
289
- args.push(match[1].replace(/\\"/g, '"'));
290
- }
270
+ while ((match = regex.exec(action)) !== null) args.push(match[1].replace(/\\"/g, '"'));
291
271
  return args;
292
272
  }
293
273
 
@@ -302,34 +282,22 @@ async function resolve(action, context = {}, options = {}) {
302
282
 
303
283
  if (typeof action === 'string') {
304
284
  const args = parseActionArgs(action);
305
- // Strip "legal-extractor" or "Action legal-extractor" prefix
306
- const offset = args.length >= 4 ? 0 : 0;
307
- document_ref = args[offset] || context.document_ref || 'REF-UNKNOWN';
308
- jurisdiction = args[offset + 1] || context.jurisdiction || 'general';
309
- doc_type = args[offset + 2] || context.doc_type || 'general';
310
- document_text = args[offset + 3] || context.document_text || '';
285
+ [document_ref, jurisdiction, doc_type, document_text] = args.length >= 4 ? args : [context.document_ref, context.jurisdiction, context.doc_type, context.document_text];
311
286
  } else {
312
- document_ref = context.document_ref || 'REF-UNKNOWN';
313
- jurisdiction = context.jurisdiction || 'general';
314
- doc_type = context.doc_type || 'general';
315
- document_text = context.document_text || '';
287
+ ({ document_ref, jurisdiction, doc_type, document_text } = context);
316
288
  }
317
289
 
318
290
  // ── 2. Validate ───────────────────────────────────────────────────────
319
291
  if (!document_text || document_text.trim().length < 10) {
320
292
  console.warn('[legal-extractor] ⚠️ document_text is empty or too short');
321
- return {
322
- summary: 'No document text provided for extraction.',
323
- parties: [],
324
- clauses: [],
325
- dates: [],
326
- obligations: [],
327
- risks: [],
328
- jurisdiction: resolveJurisdiction(jurisdiction),
329
- doc_type: resolveDocType(doc_type),
330
- word_count: 0,
331
- error: 'document_text required',
332
- };
293
+ return { summary: 'No document text provided for extraction.', parties: [], clauses: [], dates: [], obligations: [], risks: [], jurisdiction: resolveJurisdiction(jurisdiction), doc_type: resolveDocType(doc_type), word_count: 0, error: 'document_text required' };
294
+ }
295
+
296
+ // ✅ SMART PRE-PROCESSING: Auto-clean long docs before extraction
297
+ const originalLength = document_text.length;
298
+ if (originalLength > 10000) {
299
+ document_text = preprocessLegalText(document_text);
300
+ console.log(`[legal-extractor] 🔄 Pre-processed long document: ${originalLength} → ${document_text.length} chars`);
333
301
  }
334
302
 
335
303
  const text = document_text.trim();
@@ -347,40 +315,58 @@ async function resolve(action, context = {}, options = {}) {
347
315
  const risks = extractRisks(text);
348
316
 
349
317
  // ── 4. Build summary ──────────────────────────────────────────────────
350
- const summary = buildSummary({
351
- document_ref, docTypeLabel, jurisdictionLabel,
352
- wordCount, parties, clauses, dates, obligations, risks,
353
- });
318
+ const summary = buildSummary({ document_ref, docTypeLabel, jurisdictionLabel, wordCount, parties, clauses, dates, obligations, risks });
354
319
 
355
320
  console.log(`[legal-extractor] ✅ Extracted: ${parties.length} parties, ${clauses.length} clauses, ${dates.length} dates, ${obligations.length} obligations, ${risks.length} risk flags`);
356
321
 
357
- return {
358
- summary,
359
- parties,
360
- clauses,
361
- dates,
362
- obligations,
363
- risks,
364
- jurisdiction: jurisdictionLabel,
365
- doc_type: docTypeLabel,
366
- word_count: wordCount,
367
- document_ref,
368
- extracted_at: new Date().toISOString(),
322
+ return {
323
+ summary, parties, clauses, dates, obligations, risks,
324
+ jurisdiction: jurisdictionLabel, doc_type: docTypeLabel, word_count: wordCount,
325
+ document_ref, extracted_at: new Date().toISOString(),
326
+ preprocessed: originalLength > 10000,
327
+
328
+ // ── Pre-stringified fields for safe LLM interpolation ──────────────
329
+ // RuntimeAPI._safeInterpolate cannot inject arrays/objects into prompts.
330
+ // These flat strings are what the workflow's {extracted.xxx_text} tokens resolve to.
331
+ parties_text: parties.length
332
+ ? '- ' + parties.join('\n- ')
333
+ : 'No parties identified',
334
+
335
+ clauses_text: clauses.length
336
+ ? clauses.map(c => `${c.type}:\n ${c.excerpt}`).join('\n\n')
337
+ : 'No clauses identified',
338
+
339
+ dates_text: dates.length
340
+ ? dates.join(', ')
341
+ : 'No dates found',
342
+
343
+ obligations_text: obligations.length
344
+ ? '- ' + obligations.slice(0, 5).join('\n- ')
345
+ : 'No obligations extracted',
346
+
347
+ risks_text: risks.length
348
+ ? risks.map(r => `[${r.severity.toUpperCase()}] ${r.flag}`).join('\n')
349
+ : 'No risk flags identified',
369
350
  };
370
351
 
371
352
  } catch (err) {
372
353
  console.error('[legal-extractor] 💥 Error:', err.message);
373
354
  return {
374
- summary: `Extraction failed: ${err.message}`,
375
- parties: [],
376
- clauses: [],
377
- dates: [],
378
- obligations: [],
379
- risks: [],
380
- jurisdiction: 'Unknown',
381
- doc_type: 'Unknown',
382
- word_count: 0,
383
- error: err.message,
355
+ summary: `Extraction failed: ${err.message}`,
356
+ parties: [],
357
+ clauses: [],
358
+ dates: [],
359
+ obligations: [],
360
+ risks: [],
361
+ jurisdiction: 'Unknown',
362
+ doc_type: 'Unknown',
363
+ word_count: 0,
364
+ error: err.message,
365
+ parties_text: 'Extraction failed',
366
+ clauses_text: 'Extraction failed',
367
+ dates_text: 'Extraction failed',
368
+ obligations_text: 'Extraction failed',
369
+ risks_text: 'Extraction failed',
384
370
  };
385
371
  }
386
372
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@o-lang/legal-extractor",
3
- "version": "1.0.0",
3
+ "version": "1.0.2",
4
4
  "description": "O-Lang resolver for structured legal document extraction. Extract-only — never provides legal advice.",
5
5
  "main": "index.js",
6
6
  "keywords": [
@@ -22,4 +22,4 @@
22
22
  "node": ">=18.0.0"
23
23
  },
24
24
  "dependencies": {}
25
- }
25
+ }