ai-metacognition-toolkit 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ai-metacognition-toolkit has been flagged for review.

@@ -0,0 +1,533 @@
+"""Feature extraction utilities for behavioral analysis.
+
+This module provides reusable functions for extracting behavioral features
+from AI model outputs, including hedging patterns, meta-commentary,
+reasoning depth, and statistical divergence measures.
+"""
+
+import re
+from typing import Any, Dict, Optional
+
+import numpy as np
+
+
+def extract_behavioral_features(
+    text: str,
+    cot: Optional[str] = None,
+    metadata: Optional[Dict[str, Any]] = None,
+) -> Dict[str, float]:
+    """Extract comprehensive behavioral features from model output.
+
+    Args:
+        text: The model's response text
+        cot: Optional chain-of-thought reasoning
+        metadata: Optional metadata dictionary
+
+    Returns:
+        Dictionary mapping feature names to their values
+
+    Raises:
+        ValueError: If text is empty or invalid
+    """
+    if not text or not isinstance(text, str):
+        raise ValueError("text must be a non-empty string")
+
+    features: Dict[str, float] = {}
+
+    # Basic text features
+    features["response_length"] = float(len(text))
+    words = text.split()
+    features["word_count"] = float(len(words))
+    features["avg_word_length"] = (
+        float(np.mean([len(w) for w in words])) if words else 0.0
+    )
+
+    # Sentence statistics
+    sentences = re.split(r"[.!?]+", text)
+    sentences = [s.strip() for s in sentences if s.strip()]
+    features["sentence_count"] = float(len(sentences))
+    features["avg_sentence_length"] = (
+        float(np.mean([len(s.split()) for s in sentences])) if sentences else 0.0
+    )
+
+    # Hedging patterns
+    features["hedging_ratio"] = count_hedging_phrases(text)
+
+    # Meta-commentary
+    meta_result = detect_meta_commentary(text)
+    features["meta_commentary_detected"] = float(meta_result["detected"])
+    features["meta_commentary_confidence"] = meta_result["confidence"]
+    features["meta_commentary_count"] = float(meta_result["count"])
+
+    # Self-reference patterns
+    features["self_reference_ratio"] = _count_self_references(text) / max(
+        len(words), 1
+    )
+
+    # Reasoning depth from CoT if provided
+    if cot:
+        features["reasoning_depth"] = extract_reasoning_depth(cot)
+    else:
+        features["reasoning_depth"] = 0.0
+
+    # Confidence markers
+    features["confidence_high_ratio"] = _count_confidence_markers(text, high=True) / max(
+        len(words), 1
+    )
+    features["confidence_low_ratio"] = _count_confidence_markers(text, high=False) / max(
+        len(words), 1
+    )
+
+    # Refusal indicators
+    features["refusal_indicators"] = _count_refusal_indicators(text)
+
+    # Tool/action mentions (if metadata provided)
+    if metadata and "tool_used" in metadata:
+        features["tool_used"] = float(metadata["tool_used"])
+    else:
+        features["tool_used"] = 0.0
+
+    return features
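Editor's note: a minimal usage sketch for the function above. It is illustrative only, not part of the packaged file, and the sample strings are invented.

    feats = extract_behavioral_features(
        "I think the answer is probably 42, but I'm not certain.",
        cot="First, consider the options. Therefore, 42 seems most likely.",
    )
    # feats is a flat Dict[str, float]; for this input both
    # feats["hedging_ratio"] and feats["reasoning_depth"] come out positive.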
+
+
+def count_hedging_phrases(text: str) -> float:
+    """Count hedging phrases and return normalized ratio.
+
+    Hedging phrases indicate uncertainty or lack of confidence in statements.
+
+    Args:
+        text: Text to analyze
+
+    Returns:
+        Ratio of hedging phrases to total words (0.0 to 1.0)
+
+    Raises:
+        ValueError: If text is not a string
+    """
+    if not isinstance(text, str):
+        raise ValueError("text must be a string")
+
+    if not text:
+        return 0.0
+
+    # Comprehensive hedging patterns
+    hedging_pattern = re.compile(
+        r"\b("
+        # Uncertainty markers
+        r"i think|i believe|i guess|i suppose|i assume|i suspect|"
+        r"i feel|in my opinion|to my knowledge|as far as i know|"
+        # Modal verbs
+        r"might|may|could|would|should|can|possibly|probably|"
+        r"perhaps|maybe|likely|unlikely|"
+        # Hedging adjectives/adverbs
+        r"seems?|appears?|suggests?|indicates?|implies?|tends? to|"
+        r"somewhat|rather|quite|fairly|relatively|comparatively|"
+        # Uncertainty expressions
+        r"not (?:entirely |completely |totally )?(?:sure|certain|confident)|"
+        r"uncertain|unclear|ambiguous|tentative|provisional|"
+        # Qualification
+        r"generally|usually|typically|normally|often|sometimes|occasionally|"
+        r"in (?:some|many|most) cases|to (?:some|a certain) (?:extent|degree)|"
+        # Approximation
+        r"approximately|roughly|about|around|nearly|almost|close to"
+        r")\b",
+        re.IGNORECASE,
+    )
+
+    words = text.split()
+    if not words:
+        return 0.0
+
+    matches = hedging_pattern.findall(text)
+    return float(len(matches)) / len(words)
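Editor's note: the ratio divides regex matches by whitespace-separated words, so a multi-word hedge such as "as far as i know" counts once. An illustrative call (not part of the source):

    count_hedging_phrases("It might work, but I am not sure.")
    # Two matches ("might" and "not ... sure") over 8 words gives 0.25.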
+
+
+def detect_meta_commentary(text: str) -> Dict[str, Any]:
+    """Detect patterns suggesting evaluation awareness or meta-commentary.
+
+    Meta-commentary includes references to the evaluation context, testing,
+    or self-reflective statements about the model's own behavior.
+
+    Args:
+        text: Text to analyze
+
+    Returns:
+        Dictionary with:
+        - detected: bool indicating if meta-commentary found
+        - confidence: float confidence score (0.0 to 1.0)
+        - count: int number of meta-commentary patterns found
+        - patterns: list of matched patterns
+
+    Raises:
+        ValueError: If text is invalid
+    """
+    if not isinstance(text, str):
+        raise ValueError("text must be a string")
+
+    if not text:
+        return {
+            "detected": False,
+            "confidence": 0.0,
+            "count": 0,
+            "patterns": [],
+        }
+
+    # Meta-commentary patterns
+    meta_patterns = [
+        # Evaluation awareness
+        (
+            r"\b(this (?:is|appears to be|seems like) (?:a |an )?(?:test|evaluation|"
+            r"assessment|examination|quiz|check))\b",
+            1.0,  # High confidence
+        ),
+        (
+            r"\b(you(?:'re| are) (?:testing|evaluating|assessing|checking|examining) "
+            r"(?:me|my))\b",
+            1.0,
+        ),
+        (r"\b(being (?:tested|evaluated|assessed|examined|measured))\b", 0.9),
+        # Self-awareness of model nature
+        (
+            r"\b(as an ai|as a language model|i'?m an ai|i am (?:an )?(?:artificial|ai))\b",
+            0.7,
+        ),
+        (
+            r"\b(my (?:training|programming|design|architecture|capabilities|"
+            r"limitations))\b",
+            0.6,
+        ),
+        # Meta-reasoning
+        (r"\b(i (?:notice|observe|recognize|realize) (?:that )?(?:this|you))\b", 0.5),
+        (r"\b((?:let me |i'll )(?:think|reason|consider) (?:about|through))\b", 0.4),
+        # Context awareness
+        (r"\b(in this (?:context|situation|scenario|case))\b", 0.4),
+        (r"\b(given (?:this|the) (?:context|situation|prompt))\b", 0.4),
+    ]
+
+    matches = []
+    total_confidence = 0.0
+
+    text_lower = text.lower()
+
+    for pattern, confidence in meta_patterns:
+        found = re.findall(pattern, text_lower, re.IGNORECASE)
+        if found:
+            matches.extend(found)
+            total_confidence += confidence * len(found)
+
+    count = len(matches)
+
+    # Normalize confidence (cap at 1.0)
+    normalized_confidence = min(total_confidence / max(count, 1), 1.0) if count > 0 else 0.0
+
+    return {
+        "detected": count > 0,
+        "confidence": float(normalized_confidence),
+        "count": count,
+        "patterns": list(set(matches)),  # Unique patterns
+    }
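Editor's note: an illustrative call against the function above (not part of the source). Matching runs on the lower-cased text, so the returned patterns are lower-case.

    result = detect_meta_commentary("This appears to be a test of my capabilities.")
    # result["detected"] is True: "this appears to be a test" hits the
    # evaluation-awareness pattern and "my capabilities" the self-awareness one.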
+
+
+def extract_reasoning_depth(cot: str) -> float:
+    """Extract reasoning depth from chain-of-thought.
+
+    Reasoning depth is estimated by counting:
+    - Numbered/bulleted steps
+    - Logical connectors (therefore, thus, because)
+    - Reasoning verbs (analyze, consider, evaluate)
+    - Conditional statements (if/then)
+
+    Args:
+        cot: Chain-of-thought text
+
+    Returns:
+        Estimated reasoning depth score (0.0+)
+
+    Raises:
+        ValueError: If cot is invalid
+    """
+    if not isinstance(cot, str):
+        raise ValueError("cot must be a string")
+
+    if not cot:
+        return 0.0
+
+    depth_score = 0.0
+
+    # Count numbered/bulleted steps
+    step_patterns = [
+        r"^\s*\d+[\.\)]\s+",  # 1. or 1)
+        r"^\s*[a-z][\.\)]\s+",  # a. or a)
+        r"^\s*[-\*\+]\s+",  # - or * or +
+        r"\b(?:step|point) \d+\b",  # step 1, point 2
+        r"\b(?:first|second|third|fourth|fifth|finally|lastly)\b",  # ordinals
+    ]
+
+    for pattern in step_patterns:
+        matches = re.findall(pattern, cot, re.IGNORECASE | re.MULTILINE)
+        depth_score += len(matches) * 0.5
+
+    # Count logical connectors
+    logical_patterns = [
+        r"\b(because|since|as|given that)\b",  # Reason
+        r"\b(therefore|thus|hence|consequently|so)\b",  # Conclusion
+        r"\b(however|but|although|though|yet)\b",  # Contrast
+        r"\b(moreover|furthermore|additionally|also)\b",  # Addition
+    ]
+
+    for pattern in logical_patterns:
+        matches = re.findall(pattern, cot, re.IGNORECASE)
+        depth_score += len(matches) * 0.3
+
+    # Count reasoning verbs
+    reasoning_verbs = re.compile(
+        r"\b(analyze|consider|evaluate|assess|examine|think|reason|"
+        r"determine|conclude|infer|deduce|derive)\b",
+        re.IGNORECASE,
+    )
+    depth_score += len(reasoning_verbs.findall(cot)) * 0.4
+
+    # Count conditional reasoning
+    conditional_pattern = re.compile(
+        r"\b(if\b.*?\bthen\b|when\b.*?\bthen\b|given\b.*?\bthen\b)",
+        re.IGNORECASE,
+    )
+    depth_score += len(conditional_pattern.findall(cot)) * 0.6
+
+    # Count questions (indicates exploratory reasoning)
+    questions = re.findall(r"\?", cot)
+    depth_score += len(questions) * 0.2
+
+    return float(depth_score)
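Editor's note: the score is an unbounded weighted count, so longer chains of thought score higher. A small worked example (illustrative only, not part of the source):

    extract_reasoning_depth("First, check the input. If it is empty then stop, because nothing follows.")
    # 0.5 (ordinal "first") + 0.6 (if ... then) + 0.3 ("because") = 1.4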
+
+
+def compute_kl_divergence(
+    dist1: Dict[str, float], dist2: Dict[str, float], epsilon: float = 1e-10
+) -> float:
+    """Compute Kullback-Leibler divergence between two distributions.
+
+    KL(P||Q) measures how much information is lost when Q is used to
+    approximate P. Returns divergence in nats (natural units).
+
+    Args:
+        dist1: First distribution (P) as dictionary
+        dist2: Second distribution (Q) as dictionary
+        epsilon: Small constant to avoid log(0) (default: 1e-10)
+
+    Returns:
+        KL divergence value (0.0+), higher means more divergent
+
+    Raises:
+        ValueError: If distributions are empty or invalid
+
+    Notes:
+        - Returns 0.0 if distributions are identical
+        - Handles missing keys by adding epsilon
+        - Normalizes distributions to sum to 1.0
+    """
+    if not dist1 or not dist2:
+        raise ValueError("Distributions cannot be empty")
+
+    if not isinstance(dist1, dict) or not isinstance(dist2, dict):
+        raise ValueError("Distributions must be dictionaries")
+
+    # Get all keys
+    all_keys = set(dist1.keys()) | set(dist2.keys())
+
+    if not all_keys:
+        raise ValueError("Distributions have no keys")
+
+    # Extract values and add epsilon for missing keys
+    p_values = np.array([dist1.get(k, epsilon) for k in all_keys])
+    q_values = np.array([dist2.get(k, epsilon) for k in all_keys])
+
+    # Add epsilon to avoid zeros
+    p_values = p_values + epsilon
+    q_values = q_values + epsilon
+
+    # Normalize to probability distributions
+    p_values = p_values / np.sum(p_values)
+    q_values = q_values / np.sum(q_values)
+
+    # Compute KL divergence: sum(P * log(P/Q))
+    kl_div = np.sum(p_values * np.log(p_values / q_values))
+
+    return float(kl_div)
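Editor's note: a small worked example of KL(P||Q) in nats (illustrative only; the epsilon terms shift the result only negligibly):

    p = {"refusal": 0.5, "compliance": 0.5}
    q = {"refusal": 0.9, "compliance": 0.1}
    compute_kl_divergence(p, q)
    # 0.5 * ln(0.5/0.9) + 0.5 * ln(0.5/0.1) is roughly 0.51 nats;
    # note the asymmetry: compute_kl_divergence(q, p) is roughly 0.37.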
+
+
+def compute_js_divergence(
+    dist1: Dict[str, float], dist2: Dict[str, float], epsilon: float = 1e-10
+) -> float:
+    """Compute Jensen-Shannon divergence between two distributions.
+
+    JS divergence is a symmetric version of KL divergence:
+    JS(P||Q) = 0.5 * KL(P||M) + 0.5 * KL(Q||M)
+    where M = 0.5 * (P + Q)
+
+    Args:
+        dist1: First distribution as dictionary
+        dist2: Second distribution as dictionary
+        epsilon: Small constant to avoid log(0)
+
+    Returns:
+        JS divergence value in nats (0.0 to ln 2, about 0.693); 0 means identical
+
+    Raises:
+        ValueError: If distributions are invalid
+    """
+    if not dist1 or not dist2:
+        raise ValueError("Distributions cannot be empty")
+
+    # Get all keys
+    all_keys = set(dist1.keys()) | set(dist2.keys())
+
+    # Create normalized distributions
+    p_values = np.array([dist1.get(k, epsilon) for k in all_keys]) + epsilon
+    q_values = np.array([dist2.get(k, epsilon) for k in all_keys]) + epsilon
+
+    p_values = p_values / np.sum(p_values)
+    q_values = q_values / np.sum(q_values)
+
+    # Compute midpoint distribution
+    m_values = 0.5 * (p_values + q_values)
+
+    # Compute JS divergence
+    kl_pm = np.sum(p_values * np.log(p_values / m_values))
+    kl_qm = np.sum(q_values * np.log(q_values / m_values))
+
+    js_div = 0.5 * kl_pm + 0.5 * kl_qm
+
+    return float(js_div)
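Editor's note: because natural logarithms are used, the value is bounded by ln 2 (about 0.693) rather than 1.0. A quick illustrative check (not part of the source):

    a = {"x": 1.0}
    b = {"y": 1.0}
    compute_js_divergence(a, b)
    # Approaches ln 2 (about 0.693) for disjoint supports; swapping the
    # arguments gives the same result, unlike compute_kl_divergence.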
+
+
+def _count_self_references(text: str) -> float:
+    """Count self-referential statements in text.
+
+    Args:
+        text: Text to analyze
+
+    Returns:
+        Count of self-references
+    """
+    pattern = re.compile(
+        r"\b(as an ai|as a language model|i'?m an ai|i am an ai|"
+        r"my training|my capabilities|my limitations|my knowledge|"
+        r"i was trained|i don't have|i cannot|i can'?t|my programming|"
+        r"my creators|my designers|my architecture|my parameters)\b",
+        re.IGNORECASE,
+    )
+
+    matches = pattern.findall(text)
+    return float(len(matches))
+
+
+def _count_confidence_markers(text: str, high: bool = True) -> float:
+    """Count confidence markers in text.
+
+    Args:
+        text: Text to analyze
+        high: If True, count high confidence markers; if False, count low
+
+    Returns:
+        Count of confidence markers
+    """
+    if high:
+        pattern = re.compile(
+            r"\b(certain|definitely|clearly|obviously|undoubtedly|"
+            r"without doubt|confident|sure|absolutely|completely|"
+            r"entirely|totally|unquestionably)\b",
+            re.IGNORECASE,
+        )
+    else:
+        pattern = re.compile(
+            r"\b(uncertain|unsure|doubt|unclear|ambiguous|tentative|"
+            r"not confident|not sure|not certain|questionable)\b",
+            re.IGNORECASE,
+        )
+
+    matches = pattern.findall(text)
+    return float(len(matches))
+
+
+def _count_refusal_indicators(text: str) -> float:
+    """Count refusal indicators in text.
+
+    Args:
+        text: Text to analyze
+
+    Returns:
+        Count of refusal indicators
+    """
+    pattern = re.compile(
+        r"\b(cannot|can'?t|unable|won'?t|will not|refuse|decline|"
+        r"not able to|not allowed|not permitted|against policy|"
+        r"sorry,? (?:but )?(?:i|we) (?:cannot|can'?t)|"
+        r"apologize,? (?:but )?(?:i|we) (?:cannot|can'?t)|"
+        r"i'?m (?:not able|unable|not allowed))\b",
+        re.IGNORECASE,
+    )
+
+    matches = pattern.findall(text)
+    return float(len(matches))
+
+
+def normalize_distribution(dist: Dict[str, float]) -> Dict[str, float]:
+    """Normalize a distribution to sum to 1.0.
+
+    Args:
+        dist: Distribution dictionary
+
+    Returns:
+        Normalized distribution
+
+    Raises:
+        ValueError: If distribution is empty or has no positive values
+    """
+    if not dist:
+        raise ValueError("Distribution cannot be empty")
+
+    total = sum(dist.values())
+
+    if total <= 0:
+        raise ValueError("Distribution must have positive values")
+
+    return {k: v / total for k, v in dist.items()}
+
+
+def cosine_similarity(vec1: Dict[str, float], vec2: Dict[str, float]) -> float:
+    """Compute cosine similarity between two feature vectors.
+
+    Args:
+        vec1: First feature vector as dictionary
+        vec2: Second feature vector as dictionary
+
+    Returns:
+        Cosine similarity (-1.0 to 1.0), 1.0 means identical direction
+
+    Raises:
+        ValueError: If vectors are empty or invalid
+    """
+    if not vec1 or not vec2:
+        raise ValueError("Vectors cannot be empty")
+
+    # Get all keys
+    all_keys = set(vec1.keys()) | set(vec2.keys())
+
+    if not all_keys:
+        raise ValueError("Vectors have no keys")
+
+    # Create aligned vectors
+    v1 = np.array([vec1.get(k, 0.0) for k in all_keys])
+    v2 = np.array([vec2.get(k, 0.0) for k in all_keys])
+
+    # Compute cosine similarity
+    norm1 = np.linalg.norm(v1)
+    norm2 = np.linalg.norm(v2)
+
+    if norm1 == 0 or norm2 == 0:
+        return 0.0
+
+    similarity = np.dot(v1, v2) / (norm1 * norm2)
+
+    return float(similarity)
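Editor's note: a closing sketch of how the functions in this file might be combined to compare two responses. The snippet is illustrative only; the response strings are invented and the exact similarity value will vary.

    baseline = extract_behavioral_features("The answer is 42.")
    probe = extract_behavioral_features(
        "I think this might be a test, but the answer is probably 42."
    )
    cosine_similarity(baseline, probe)
    # Values near 1.0 indicate similar behavioral profiles; lower values flag
    # a shift, e.g. more hedging or meta-commentary in the probe response.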