tokenmizer 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. tokenmizer/__init__.py +21 -0
  2. tokenmizer/agents/__init__.py +0 -0
  3. tokenmizer/analytics/__init__.py +0 -0
  4. tokenmizer/analytics/engine.py +188 -0
  5. tokenmizer/api/__init__.py +0 -0
  6. tokenmizer/api/app.py +958 -0
  7. tokenmizer/api/rate_limiter.py +110 -0
  8. tokenmizer/checkpoints/__init__.py +0 -0
  9. tokenmizer/checkpoints/manager.py +383 -0
  10. tokenmizer/cli.py +153 -0
  11. tokenmizer/compression/__init__.py +0 -0
  12. tokenmizer/compression/engine.py +669 -0
  13. tokenmizer/compression/output_trimmer.py +95 -0
  14. tokenmizer/compression/window.py +104 -0
  15. tokenmizer/config/__init__.py +0 -0
  16. tokenmizer/config/settings.py +170 -0
  17. tokenmizer/core/__init__.py +0 -0
  18. tokenmizer/core/dto.py +196 -0
  19. tokenmizer/core/errors.py +35 -0
  20. tokenmizer/core/tokenizer.py +96 -0
  21. tokenmizer/dashboard/__init__.py +0 -0
  22. tokenmizer/dashboard/page.py +267 -0
  23. tokenmizer/filters/__init__.py +0 -0
  24. tokenmizer/filters/file_intelligence.py +960 -0
  25. tokenmizer/graph_memory/__init__.py +0 -0
  26. tokenmizer/graph_memory/decision_tracker.py +225 -0
  27. tokenmizer/graph_memory/graph.py +1287 -0
  28. tokenmizer/graph_memory/helpers.py +121 -0
  29. tokenmizer/graph_memory/hybrid_extractor.py +703 -0
  30. tokenmizer/graph_memory/types.py +134 -0
  31. tokenmizer/graph_memory/validator.py +304 -0
  32. tokenmizer/graph_memory/visualization.py +228 -0
  33. tokenmizer/mcp/__init__.py +0 -0
  34. tokenmizer/mcp/server.py +368 -0
  35. tokenmizer/providers/__init__.py +0 -0
  36. tokenmizer/providers/providers.py +456 -0
  37. tokenmizer/security/__init__.py +0 -0
  38. tokenmizer/security/auth.py +95 -0
  39. tokenmizer/security/middleware.py +138 -0
  40. tokenmizer/security/redaction.py +126 -0
  41. tokenmizer/semantic_cache/__init__.py +0 -0
  42. tokenmizer/semantic_cache/cache.py +383 -0
  43. tokenmizer/state/__init__.py +0 -0
  44. tokenmizer/state/backend.py +137 -0
  45. tokenmizer/storage/__init__.py +56 -0
  46. tokenmizer-0.2.4.dist-info/METADATA +529 -0
  47. tokenmizer-0.2.4.dist-info/RECORD +50 -0
  48. tokenmizer-0.2.4.dist-info/WHEEL +4 -0
  49. tokenmizer-0.2.4.dist-info/entry_points.txt +2 -0
  50. tokenmizer-0.2.4.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,960 @@
1
+ """
2
+ File Intelligence Layer — tokenmizer/filters/file_intelligence.py
3
+
4
+ The biggest hidden token drain in LLM apps:
5
+ - A 50,000-row CSV dumped as text = ~400,000 tokens
6
+ - A 200-page PDF sent verbatim = ~150,000 tokens
7
+ - An Excel file with 10 sheets = ~500,000 tokens
8
+
9
+ This module intercepts file content BEFORE it reaches the LLM and applies
10
+ the correct extraction strategy per file type:
11
+
12
+ CSV/Excel → schema + sample rows + statistical summary
13
+ PDF → structure-aware chunked extraction
14
+ JSON → schema inference + value sampling
15
+ Text/MD → smart truncation with boundary preservation
16
+ Images → passthrough (let vision model handle it)
17
+
18
+ Every strategy has a token_budget parameter. Quality is preserved by
19
+ sending the RIGHT information, not ALL information.
20
+ """
21
+ from __future__ import annotations
22
+
23
+ import csv
24
+ import io
25
+ import json
26
+ import logging
27
+ import re
28
+ from dataclasses import dataclass
29
+ from pathlib import Path
30
+ from typing import Optional
31
+
32
+ from tokenmizer.core.tokenizer import count_tokens
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+ # ── Constants ─────────────────────────────────────────────────────────────────
37
+
38
# Format token costs: JSON is expensive, TSV is cheap
# ref: TSV uses ~50% fewer tokens than JSON for tabular data
# Values are relative multipliers against JSON (1.0 = baseline).
# NOTE(review): this table is not referenced anywhere in the part of the
# module that was reviewed — confirm it is still used before relying on it.
_FORMAT_OVERHEAD = {
    "json": 1.0,  # baseline
    "tsv": 0.45,  # ~55% cheaper than JSON
    "csv": 0.55,
    "text": 0.40,
}

# Maps a lower-cased filename suffix (including the leading dot) to the
# extraction strategy name returned by detect_file_type(). Several suffixes
# deliberately share one strategy (e.g. ".xls"/".ods" are handled as "excel",
# ".doc" as "docx").
_FILE_EXTENSIONS = {
    # Tabular
    ".csv": "csv", ".tsv": "tsv",
    ".xlsx": "excel", ".xls": "excel", ".ods": "excel",
    # Document
    ".pdf": "pdf",
    ".docx": "docx", ".doc": "docx",
    # Data
    ".json": "json", ".jsonl": "jsonl", ".ndjson": "jsonl",
    ".xml": "xml", ".yaml": "yaml", ".yml": "yaml", ".toml": "toml",
    # Text
    ".txt": "text", ".md": "text", ".rst": "text", ".log": "text",
    # Code
    ".py": "code", ".js": "code", ".ts": "code", ".go": "code",
    ".java": "code", ".cpp": "code", ".c": "code", ".rs": "code",
    ".rb": "code", ".php": "code", ".sh": "code", ".sql": "code",
}
64
+
65
+
66
+ # ── Result dataclass ──────────────────────────────────────────────────────────
67
+
68
@dataclass
class FileExtractionResult:
    """Outcome of running one extractor over one file.

    Every extractor in this module returns an instance of this class. The
    token fields let callers log how much was saved; ``content`` is the text
    that should be sent to the LLM in place of the raw file.
    """

    file_type: str  # strategy family that produced the result ("csv", "json", "pdf", ...)
    original_size_bytes: int  # size of the input as received by the extractor
    original_tokens: int  # estimated tokens if sent raw
    extracted_tokens: int  # actual tokens after extraction
    tokens_saved: int  # extractors set this to original_tokens - extracted_tokens
    savings_pct: float  # percentage on a 0-100 scale; fallback paths report 0.0
    content: str  # what to actually send to LLM
    summary: str  # one-line description for logging
    strategy_used: str  # short identifier of the code path taken (e.g. "schema+sample")
    was_truncated: bool = False  # True when content was cut to fit the token budget
80
+
81
+
82
+ # ── Utility ───────────────────────────────────────────────────────────────────
83
+
84
def detect_file_type(filename: str, content_bytes: bytes) -> str:
    """Classify a file by its extension, falling back to a content sniff.

    The filename suffix is looked up (case-insensitively) in
    ``_FILE_EXTENSIONS``. When the suffix is unknown, the first 512 bytes are
    decoded leniently and inspected for JSON, XML, TSV and CSV markers, in
    that order.

    Args:
        filename: Original file name; only its suffix is used.
        content_bytes: Raw file content used for sniffing.

    Returns:
        A strategy name such as ``"csv"`` or ``"json"``; ``"text"`` when
        nothing more specific is recognised or sniffing fails.
    """
    by_extension = _FILE_EXTENSIONS.get(Path(filename).suffix.lower())
    if by_extension is not None:
        return by_extension

    # Sniff by content
    try:
        snippet = content_bytes[:512].decode("utf-8", errors="ignore")
        leading = snippet.strip()
        if leading.startswith(("{", "[")):
            return "json"
        # "<?xml" also starts with "<", so one check covers both cases.
        if leading.startswith("<"):
            return "xml"
        multiline = "\n" in snippet
        if multiline and "\t" in snippet:
            return "tsv"
        if multiline and "," in snippet:
            return "csv"
    except Exception as e:
        logger.debug(f"File type sniff failed, defaulting to text: {e}")
    return "text"
102
+
103
+
104
def _truncate_to_budget(text: str, token_budget: int) -> tuple[str, bool]:
    """Truncate text to a token budget, preferring line boundaries.

    Whole lines are kept from the top of ``text`` for as long as they fit.
    If not even one non-blank line fits (e.g. minified JSON or any other
    single very long line), the text is cut at character level instead so
    the caller still receives the start of the content rather than nothing.

    Args:
        text: Text to shorten.
        token_budget: Maximum number of tokens (per ``count_tokens``) allowed.

    Returns:
        ``(result, was_truncated)``. ``result`` is ``text`` itself when it
        already fits; otherwise a prefix of it.
    """
    if count_tokens(text) <= token_budget:
        return text, False
    if token_budget <= 0:
        return "", True

    # Linear scan: keep whole lines while the running total stays in budget.
    kept: list[str] = []
    running = 0
    for line in text.split("\n"):
        line_tokens = count_tokens(line)
        if running + line_tokens > token_budget:
            break
        kept.append(line)
        running += line_tokens

    # Per-line counts ignore the newline separators, which may cost tokens
    # of their own, so re-check the joined text and drop lines until it fits.
    truncated = "\n".join(kept)
    while kept and count_tokens(truncated) > token_budget:
        kept.pop()
        truncated = "\n".join(kept)

    if truncated.strip():
        return truncated, True

    # Nothing substantive fits on a line boundary: binary-search the longest
    # character prefix that fits. `lo` only ever moves to a verified length.
    # NOTE(review): assumes count_tokens grows (roughly) with prefix length —
    # confirm against the tokenizer in tokenmizer.core.tokenizer.
    lo, hi = 0, len(text)
    while lo < hi:
        mid = (lo + hi + 1) // 2
        if count_tokens(text[:mid]) <= token_budget:
            lo = mid
        else:
            hi = mid - 1
    return text[:lo], True
122
+
123
+
124
+ # ── CSV / TSV extractor ───────────────────────────────────────────────────────
125
+
126
class CSVExtractor:
    """
    Strategy for CSV/TSV files.

    Instead of: 50,000 rows × 10 columns = 500,000 tokens
    We send:
    1. Schema (column names + inferred types) ~20 tokens
    2. Statistical summary (min/max/mean/n) ~80 tokens
    3. Representative sample rows (stratified) ~200 tokens
    4. Shape information ~10 tokens

    Total: ~310 tokens instead of 500,000. Quality: preserved for
    analysis tasks. If user needs specific rows, they can ask and
    we do a targeted query.
    """

    # Formatting characters removed before a cell is parsed as a number.
    # Shared by type inference, stats and outlier detection so that all three
    # agree on what counts as numeric.
    _NUMERIC_NOISE = str.maketrans("", "", ",$%")

    def extract(
        self,
        content: str,
        filename: str,
        token_budget: int = 400,
        sample_rows: int = 5,
        delimiter: str = ",",
    ) -> FileExtractionResult:
        """Summarise delimited text as shape + schema + stats + sample rows.

        Args:
            content: Full text of the CSV/TSV file.
            filename: Name shown in the summary and in log messages.
            token_budget: Upper bound for the tokens of the returned content.
            sample_rows: Number of representative rows to include.
            delimiter: Field separator passed to ``csv.DictReader``.

        Returns:
            A ``FileExtractionResult``. When parsing fails the content is a
            plain truncation of the input (``strategy_used`` is
            ``"fallback_truncation"``); when there are no rows or columns
            an "empty" result is returned.
        """
        original_tokens = count_tokens(content)

        try:
            reader = csv.DictReader(io.StringIO(content), delimiter=delimiter)
            rows = list(reader)
            columns = reader.fieldnames or []
        except Exception as e:
            logger.warning(f"CSV parse failed for {filename}: {e}")
            truncated, was_cut = _truncate_to_budget(content, token_budget)
            return FileExtractionResult(
                file_type="csv", original_size_bytes=len(content.encode()),
                original_tokens=original_tokens,
                extracted_tokens=count_tokens(truncated),
                tokens_saved=original_tokens - count_tokens(truncated),
                savings_pct=0.0, content=truncated,
                summary=f"CSV parse failed, truncated to {token_budget} tokens",
                strategy_used="fallback_truncation", was_truncated=was_cut,
            )

        if not rows or not columns:
            return self._empty_result(filename, content, original_tokens)

        total_rows = len(rows)
        parts: list[str] = []

        # 1. Shape
        parts.append(f"File: {filename} | {total_rows:,} rows × {len(columns)} columns")

        # 2. Schema with inferred types
        type_map = self._infer_types(rows, columns)
        schema_line = "Columns: " + ", ".join(
            f"{col} ({type_map.get(col, 'text')})" for col in columns
        )
        parts.append(schema_line)

        # 3. Statistical summary (numeric columns)
        stats = self._compute_stats(rows, columns, type_map)
        if stats:
            parts.append("Stats:\n" + stats)

        # 4. Categorical summary (text columns with few unique values)
        cats = self._compute_categoricals(rows, columns, type_map)
        if cats:
            parts.append("Categories:\n" + cats)

        # 5. Sample rows — use TSV format (fewer tokens than CSV/JSON)
        sampled = self._stratified_sample(rows, sample_rows, columns=columns, type_map=type_map)
        header = "\t".join(columns)
        sample_lines = [header] + [
            "\t".join(self._cell(r, c) for c in columns) for r in sampled
        ]
        parts.append("Sample rows (TSV):\n" + "\n".join(sample_lines))

        # 6. Missing value note
        missing = self._missing_summary(rows, columns)
        if missing:
            parts.append("Missing values: " + missing)

        extracted = "\n\n".join(parts)
        extracted, was_truncated = _truncate_to_budget(extracted, token_budget)
        extracted_tokens = count_tokens(extracted)

        return FileExtractionResult(
            file_type="csv",
            original_size_bytes=len(content.encode()),
            original_tokens=original_tokens,
            extracted_tokens=extracted_tokens,
            tokens_saved=original_tokens - extracted_tokens,
            savings_pct=round((1 - extracted_tokens / max(1, original_tokens)) * 100, 1),
            content=extracted,
            summary=f"CSV: {total_rows:,} rows × {len(columns)} cols → {extracted_tokens} tokens",
            strategy_used="schema+stats+sample",
            was_truncated=was_truncated,
        )

    @staticmethod
    def _cell(row: dict, col: str) -> str:
        """Return the cell as text, mapping a missing/None cell to ""."""
        # csv.DictReader fills cells missing from short rows with None;
        # str(None) would otherwise leak the literal text "None" into output.
        value = row.get(col)
        return "" if value is None else str(value)

    @classmethod
    def _to_float(cls, v) -> float | None:
        """Parse a cell as a float, or return None if it is not a usable number."""
        if v is None:
            return None
        try:
            number = float(str(v).translate(cls._NUMERIC_NOISE))
        except ValueError:
            return None
        # NaN compares unequal to itself. It would poison min/max/mean and
        # freeze the outlier search, so it is treated as "no value".
        return None if number != number else number

    def _infer_types(self, rows: list[dict], columns: list[str]) -> dict[str, str]:
        """Classify each column as "number", "date" or "text".

        Only the non-empty cells of the first 100 rows are inspected. A column
        is "number" when more than 80% of those cells parse as numbers, "date"
        when any of the first 20 looks like a date, otherwise "text".
        """
        types = {}
        for col in columns:
            values = [r.get(col, "") for r in rows[:100] if r.get(col)]
            numeric = sum(1 for v in values if self._is_numeric(v))
            if numeric > len(values) * 0.8:
                types[col] = "number"
            elif any(self._looks_like_date(v) for v in values[:20]):
                types[col] = "date"
            else:
                types[col] = "text"
        return types

    def _is_numeric(self, v: str) -> bool:
        """Return True when ``v`` parses as a float once ",", "$" and "%" are removed."""
        try:
            float(str(v).translate(self._NUMERIC_NOISE))
            return True
        except ValueError:
            return False

    def _looks_like_date(self, v: str) -> bool:
        """Return True when ``v`` starts with a YYYY-MM-DD, DD/MM/YYYY or DD-MM-YYYY shape."""
        patterns = [
            r"\d{4}-\d{2}-\d{2}",
            r"\d{2}/\d{2}/\d{4}",
            r"\d{2}-\d{2}-\d{4}",
        ]
        return any(re.match(p, str(v)) for p in patterns)

    def _compute_stats(self, rows: list[dict], columns: list[str],
                       type_map: dict) -> str:
        """Build one "min/max/mean/n" line per numeric column.

        Cells that do not parse as a number (including NaN) are skipped; a
        column with no parseable cells produces no line.
        """
        lines = []
        for col in columns:
            if type_map.get(col) != "number":
                continue
            parsed = (self._to_float(r.get(col, "")) for r in rows)
            vals = [v for v in parsed if v is not None]
            if not vals:
                continue
            mn, mx, avg = min(vals), max(vals), sum(vals) / len(vals)
            lines.append(f" {col}: min={mn:.2f} max={mx:.2f} mean={avg:.2f} n={len(vals)}")
        return "\n".join(lines)

    def _compute_categoricals(self, rows: list[dict], columns: list[str],
                              type_map: dict, max_unique: int = 10) -> str:
        """List the distinct values of text columns that have 2..max_unique of them."""
        lines = []
        for col in columns:
            if type_map.get(col) != "text":
                continue
            unique = {r.get(col, "") for r in rows if r.get(col)}
            if 2 <= len(unique) <= max_unique:
                lines.append(f" {col}: {', '.join(sorted(unique)[:max_unique])}")
        return "\n".join(lines)

    def _stratified_sample(
        self, rows: list[dict], n: int,
        columns: list[str] | None = None,
        type_map: dict | None = None,
    ) -> list[dict]:
        """
        Genuinely stratified sample — not just evenly-spaced indices.

        Selection order (each step stops once ``n`` rows are selected):
        1. First and last row (temporal/sequence boundaries)
        2. Rows containing the MIN and MAX of every numeric column
           (outliers are otherwise invisible to the LLM — it would see
           "max=50000" in stats but never the row that has it)
        3. One row per rare value (≤3 occurrences) of the first text column
           (e.g. status="cancelled" appearing once in 10,000 rows)
        4. Evenly-spaced rows for general coverage, topped up with the
           earliest unselected rows if the evenly-spaced picks collide with
           rows chosen above.

        Returns ``rows`` unchanged when it has at most ``n`` entries;
        otherwise exactly ``n`` rows in their original order.
        """
        if len(rows) <= n:
            return rows

        selected_indices: set[int] = set()
        columns = columns or (list(rows[0].keys()) if rows else [])
        type_map = type_map or {}

        # 1. Boundaries
        selected_indices.add(0)
        selected_indices.add(len(rows) - 1)

        # 2. Outliers — min/max rows across ALL numeric columns.
        #    (Checking only the first numeric column is wrong: it's often a
        #    sequential ID/index whose min/max are just row 0 and row N-1,
        #    already covered by boundaries — the REAL outlier in e.g. an
        #    "amount" column would be missed entirely.)
        numeric_cols = [c for c in columns if type_map.get(c) == "number"]
        for col in numeric_cols:
            if len(selected_indices) >= n:
                break
            best_min_idx = best_max_idx = None
            best_min_val = best_max_val = None
            for i, r in enumerate(rows):
                v = self._to_float(r.get(col, ""))
                if v is None:
                    continue
                if best_min_val is None or v < best_min_val:
                    best_min_val, best_min_idx = v, i
                if best_max_val is None or v > best_max_val:
                    best_max_val, best_max_idx = v, i
            if best_min_idx is not None:
                selected_indices.add(best_min_idx)
            if best_max_idx is not None and len(selected_indices) < n:
                selected_indices.add(best_max_idx)

        # 3. Rare categorical values — one row per rare value (≤3 occurrences)
        #    in the first text column
        text_cols = [c for c in columns if type_map.get(c) == "text"]
        if text_cols and len(selected_indices) < n:
            col = text_cols[0]
            value_counts: dict[str, list[int]] = {}
            for i, r in enumerate(rows):
                v = self._cell(r, col)
                if v:
                    value_counts.setdefault(v, []).append(i)
            # Rare = appears <=3 times in the dataset
            for idxs in value_counts.values():
                if len(idxs) <= 3 and len(selected_indices) < n:
                    selected_indices.add(idxs[0])

        # 4. Fill remaining budget with evenly-spaced rows for general coverage
        remaining = n - len(selected_indices)
        if remaining > 0:
            step = max(1, len(rows) // (remaining + 1))
            for i in range(1, remaining + 1):
                if len(selected_indices) >= n:
                    break
                selected_indices.add(min(i * step, len(rows) - 1))

        # Evenly-spaced picks can land on rows that were already selected,
        # which would silently return fewer than n rows. Top up with the
        # earliest rows not yet chosen (len(rows) > n, so this always fills).
        if len(selected_indices) < n:
            for idx in range(len(rows)):
                if idx not in selected_indices:
                    selected_indices.add(idx)
                    if len(selected_indices) >= n:
                        break

        # Return in original row order
        ordered = sorted(selected_indices)[:n]
        return [rows[i] for i in ordered]

    def _missing_summary(self, rows: list[dict], columns: list[str]) -> str:
        """Describe columns in which more than 5% of the cells are empty."""
        parts = []
        for col in columns:
            empty = sum(1 for r in rows if not r.get(col))
            if empty > 0:
                pct = empty / len(rows) * 100
                if pct > 5:
                    parts.append(f"{col}: {pct:.0f}% missing")
        return ", ".join(parts)

    def _empty_result(self, filename, content, original_tokens):
        """Build the result returned when the file has no rows or no header."""
        return FileExtractionResult(
            file_type="csv", original_size_bytes=len(content.encode()),
            original_tokens=original_tokens, extracted_tokens=0,
            tokens_saved=original_tokens, savings_pct=100.0,
            content=f"File: {filename} (empty or unreadable)",
            summary="Empty CSV", strategy_used="empty", was_truncated=False,
        )
388
+
389
+
390
+ # ── JSON extractor ────────────────────────────────────────────────────────────
391
+
392
class JSONExtractor:
    """
    Strategy for JSON files.

    Instead of: raw JSON with all values = huge tokens
    We send:
    1. Schema (key paths + value types)
    2. Array stats (length, sample items)
    3. Value samples for leaf nodes

    <ref: TSV format uses ~50% fewer tokens than JSON for structured data>
    """

    # Upper bound on the number of JSONL lines that are parsed.
    _MAX_JSONL_LINES = 1000

    def extract(
        self,
        content: str,
        filename: str,
        token_budget: int = 500,
    ) -> FileExtractionResult:
        """Summarise a JSON or JSONL document as schema + sample.

        Args:
            content: Full text of the file.
            filename: Name shown in the summary.
            token_budget: Upper bound for the tokens of the returned content.

        Returns:
            A ``FileExtractionResult``. When the text is neither valid JSON
            nor valid JSONL (this includes empty input) the content is a
            plain truncation of the input and ``strategy_used`` is
            ``"fallback_truncation"``.
        """
        original_tokens = count_tokens(content)

        try:
            data, jsonl_total = self._parse(content)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically deep nesting.
            truncated, was_cut = _truncate_to_budget(content, token_budget)
            return FileExtractionResult(
                file_type="json", original_size_bytes=len(content.encode()),
                original_tokens=original_tokens,
                extracted_tokens=count_tokens(truncated),
                tokens_saved=original_tokens - count_tokens(truncated),
                savings_pct=0.0, content=truncated,
                summary="JSON parse failed, truncated",
                strategy_used="fallback_truncation", was_truncated=was_cut,
            )

        parts = [f"File: {filename}"]
        schema = self._extract_schema(data, max_depth=4)
        parts.append("Schema:\n" + schema)

        if isinstance(data, list):
            if jsonl_total is not None and jsonl_total > len(data):
                # Only a prefix of the JSONL file was parsed: report the real
                # size instead of the parse cap.
                parts.append(
                    f"Array length: {jsonl_total:,} items (first {len(data):,} parsed)"
                )
            else:
                parts.append(f"Array length: {len(data):,} items")
            sample = data[:3]
            if sample:
                parts.append(
                    f"First {len(sample)} items:\n" + json.dumps(sample, indent=2)[:800]
                )
        elif isinstance(data, dict):
            parts.append(f"Top-level keys: {', '.join(list(data.keys())[:20])}")

        extracted = "\n\n".join(parts)
        extracted, was_truncated = _truncate_to_budget(extracted, token_budget)
        extracted_tokens = count_tokens(extracted)

        return FileExtractionResult(
            file_type="json",
            original_size_bytes=len(content.encode()),
            original_tokens=original_tokens,
            extracted_tokens=extracted_tokens,
            tokens_saved=original_tokens - extracted_tokens,
            savings_pct=round((1 - extracted_tokens / max(1, original_tokens)) * 100, 1),
            content=extracted,
            summary=f"JSON: schema+sample → {extracted_tokens} tokens",
            strategy_used="schema+sample",
            was_truncated=was_truncated,
        )

    def _parse(self, content: str):
        """Parse ``content`` as one JSON document, or else as JSON Lines.

        Returns:
            ``(data, None)`` for a single JSON document, or
            ``(items, total_lines)`` for JSONL, where ``items`` holds at most
            ``_MAX_JSONL_LINES`` parsed lines and ``total_lines`` is the
            number of non-blank lines in the input.

        Raises:
            ValueError: the text is neither JSON nor JSONL, or has no
                non-blank line at all.
        """
        try:
            return json.loads(content), None
        except (json.JSONDecodeError, RecursionError):
            pass  # fall through to the line-by-line attempt

        lines = [line.strip() for line in content.splitlines() if line.strip()]
        if not lines:
            # An empty file must not be reported as an empty JSON array.
            raise ValueError("no JSON content")
        items = [json.loads(line) for line in lines[: self._MAX_JSONL_LINES]]
        return items, len(lines)

    def _extract_schema(self, data, prefix="", max_depth=4, depth=0) -> str:
        """Render key paths with value types (and short leaf samples).

        Dicts contribute up to 20 keys each; lists are described by their
        length and the shape of their first element. Recursion stops at
        ``max_depth``.
        """
        if depth >= max_depth:
            return ""
        lines = []
        if isinstance(data, dict):
            for k, v in list(data.items())[:20]:
                path = f"{prefix}.{k}" if prefix else k
                type_name = type(v).__name__
                if isinstance(v, (dict, list)) and depth < max_depth - 1:
                    lines.append(f" {path}: {type_name}")
                    lines.append(self._extract_schema(v, path, max_depth, depth + 1))
                else:
                    # Containers at the depth limit are shown as a placeholder.
                    sample = str(v)[:40] if not isinstance(v, (dict, list)) else f"[{type_name}]"
                    lines.append(f" {path}: {type_name} = {sample}")
        elif isinstance(data, list) and data:
            lines.append(f" {prefix}[]: array({len(data)})")
            lines.append(self._extract_schema(data[0], f"{prefix}[]", max_depth, depth + 1))
        # Nested calls return "" when they have nothing to add.
        return "\n".join(line for line in lines if line)
478
+
479
+
480
+ # ── PDF extractor ─────────────────────────────────────────────────────────────
481
+
482
class PDFExtractor:
    """
    Strategy for PDF files.

    <ref: page-level chunking won NVIDIA's 2024 benchmarks with 0.648 accuracy>
    <ref: adaptive chunking aligned to logical topic boundaries hit 87% accuracy>

    We extract:
    1. Document metadata (title, author, page count)
    2. Table of contents / heading structure
    3. First N pages verbatim (usually has context + objectives)
    4. Last page (often has conclusions/next steps)
    5. Query-relevant pages if a query is provided
    """

    # Lines that look like headings: optionally numbered, either ALL CAPS or
    # starting with a capitalised word.
    _HEADING_RE = re.compile(
        r"^(?:\d+\.?\s+)?([A-Z][A-Z\s]{3,50}|[A-Z][a-z].{5,60})$", re.MULTILINE
    )

    def extract(
        self,
        content_bytes: bytes,
        filename: str,
        token_budget: int = 2000,
        query: str = "",
    ) -> FileExtractionResult:
        """Summarise a PDF as metadata + headings + key pages.

        Args:
            content_bytes: Raw bytes of the PDF file.
            filename: Name shown in the summary and in log messages.
            token_budget: Upper bound for the tokens of the returned content.
            query: Optional user query; pages sharing words with it are added.

        Returns:
            A ``FileExtractionResult``. Without pypdf/PyPDF2 installed the
            bytes are decoded leniently and truncated
            (``strategy_used="text_fallback"``); when the parser rejects the
            file an error description is returned (``strategy_used="error"``)
            instead of raising.
        """
        original_tokens = count_tokens(content_bytes.decode("utf-8", errors="ignore"))

        try:
            reader = self._open_reader(content_bytes)
            pages = list(reader.pages) if reader is not None else []
        except Exception as e:
            # The parsers raise library-specific errors for corrupt or
            # encrypted files; report them the way ExcelExtractor does
            # rather than crashing the caller.
            logger.warning(f"PDF parse failed for {filename}: {e}")
            message = f"File: {filename}\nPDF parse error: {e}"
            message_tokens = count_tokens(message)
            return FileExtractionResult(
                file_type="pdf", original_size_bytes=len(content_bytes),
                original_tokens=original_tokens,
                extracted_tokens=message_tokens,
                tokens_saved=original_tokens - message_tokens,
                savings_pct=0.0, content=message,
                summary="PDF parse error",
                strategy_used="error", was_truncated=False,
            )

        if reader is None:
            # Fallback: treat as text
            text = content_bytes.decode("utf-8", errors="ignore")
            truncated, was_cut = _truncate_to_budget(text, token_budget)
            return FileExtractionResult(
                file_type="pdf", original_size_bytes=len(content_bytes),
                original_tokens=original_tokens,
                extracted_tokens=count_tokens(truncated),
                tokens_saved=original_tokens - count_tokens(truncated),
                savings_pct=0.0, content=truncated,
                summary="PDF (no parser, text fallback)",
                strategy_used="text_fallback", was_truncated=was_cut,
            )

        num_pages = len(pages)
        parts = [f"File: {filename} | {num_pages} pages"]

        # Metadata
        meta = reader.metadata or {}
        if meta.get("/Title"):
            parts.append(f"Title: {meta['/Title']}")
        if meta.get("/Author"):
            parts.append(f"Author: {meta['/Author']}")

        # Extract page texts
        page_texts: list[str] = []
        for i, page in enumerate(pages):
            try:
                page_texts.append(page.extract_text() or "")
            except Exception as e:
                # Non-fatal: one corrupted page shouldn't block extracting
                # the rest of the document. Logged (not silent) so a
                # document with many failing pages is at least visible.
                logger.debug(f"Failed to extract text from page {i} of {filename}: {e}")
                page_texts.append("")

        # Heading structure (lines that look like headings)
        headings = self._extract_headings(page_texts)
        if headings:
            parts.append("Structure:\n" + "\n".join(headings[:20]))

        # Budget allocation
        budget_per_section = token_budget // 3

        # First 2 pages (intro/context)
        first_pages = "\n\n".join(page_texts[:2])
        first_trimmed, _ = _truncate_to_budget(first_pages, budget_per_section)
        if first_trimmed.strip():
            parts.append(f"[Pages 1-2]\n{first_trimmed}")

        # Query-relevant pages (if query provided)
        if query and len(page_texts) > 3:
            # Pages 1-2 and the last page get their own sections, so
            # repeating them here would only spend budget twice.
            already_shown = {0, 1, num_pages - 1}
            relevant = self._find_relevant_pages(
                page_texts, query, top_k=2, exclude=already_shown
            )
            for page_num, page_text in relevant:
                trimmed, _ = _truncate_to_budget(page_text, budget_per_section // 2)
                if trimmed.strip():
                    parts.append(f"[Page {page_num + 1} — relevant to query]\n{trimmed}")

        # Last page (conclusions/next steps)
        if num_pages > 2:
            last_trimmed, _ = _truncate_to_budget(page_texts[-1], budget_per_section // 2)
            if last_trimmed.strip():
                parts.append(f"[Last page]\n{last_trimmed}")

        extracted = "\n\n".join(parts)
        extracted, was_truncated = _truncate_to_budget(extracted, token_budget)
        extracted_tokens = count_tokens(extracted)

        return FileExtractionResult(
            file_type="pdf",
            original_size_bytes=len(content_bytes),
            original_tokens=original_tokens,
            extracted_tokens=extracted_tokens,
            tokens_saved=original_tokens - extracted_tokens,
            savings_pct=round((1 - extracted_tokens / max(1, original_tokens)) * 100, 1),
            content=extracted,
            summary=f"PDF: {num_pages}pp → {extracted_tokens} tokens (structure+key pages)",
            strategy_used="structure+key_pages",
            was_truncated=was_truncated,
        )

    @staticmethod
    def _open_reader(content_bytes: bytes):
        """Return a PdfReader from pypdf (or PyPDF2), or None if neither is installed.

        Errors raised by the parser while opening the document propagate to
        the caller.
        """
        try:
            import pypdf  # type: ignore
        except ImportError:
            try:
                import PyPDF2 as pypdf  # type: ignore
            except ImportError:
                return None
        return pypdf.PdfReader(io.BytesIO(content_bytes))

    def _extract_headings(self, page_texts: list[str]) -> list[str]:
        """Collect heading-like lines (11-79 chars) from the first 50 pages.

        Returns at most 25 entries formatted as "p<page>: <heading>".
        """
        headings = []
        for i, text in enumerate(page_texts[:50]):
            for m in self._HEADING_RE.finditer(text):
                h = m.group(0).strip()
                if 10 < len(h) < 80:
                    headings.append(f" p{i+1}: {h}")
        return headings[:25]

    def _find_relevant_pages(self, page_texts: list[str], query: str,
                             top_k: int = 2,
                             exclude=None) -> list[tuple[int, str]]:
        """Rank pages by the number of distinct words shared with ``query``.

        Args:
            page_texts: Text of every page, indexed by zero-based page number.
            query: User query; compared case-insensitively on whitespace-split words.
            top_k: Maximum number of pages to return.
            exclude: Optional collection of page indices to leave out.

        Returns:
            Up to ``top_k`` ``(page_index, page_text)`` pairs, best match
            first and the earlier page first on ties. Pages that share no
            word with the query are never returned.
        """
        query_words = set(query.lower().split())
        skip = exclude or ()
        scored: list[tuple[int, int]] = []
        for i, text in enumerate(page_texts):
            if i in skip or not text.strip():
                continue
            overlap = len(query_words & set(text.lower().split()))
            if overlap:
                scored.append((overlap, i))
        scored.sort(key=lambda item: (-item[0], item[1]))
        return [(i, page_texts[i]) for _, i in scored[:top_k]]
619
+
620
+
621
+ # ── Excel extractor ───────────────────────────────────────────────────────────
622
+
623
class ExcelExtractor:
    """
    Strategy for Excel files (.xlsx/.xls).

    Excel files have multiple sheets — each can be a separate dataset.
    We extract per-sheet summaries using the same CSV strategy.
    """

    _MAX_SHEETS = 8  # sheets summarised per workbook
    _MAX_ROWS_PER_SHEET = 1000  # data rows (after the header) read per sheet

    def extract(
        self,
        content_bytes: bytes,
        filename: str,
        token_budget: int = 800,
    ) -> FileExtractionResult:
        """Summarise a workbook sheet by sheet using ``CSVExtractor``.

        Args:
            content_bytes: Raw bytes of the workbook.
            filename: Name shown in the summary.
            token_budget: Upper bound for the tokens of the returned content;
                split evenly between the sheets that are summarised.

        Returns:
            A ``FileExtractionResult``. When openpyxl is missing or cannot
            open the file, the content is an install hint or an error
            description instead of a summary.
        """
        original_tokens = len(content_bytes) // 3  # rough estimate for binary

        try:
            import openpyxl  # type: ignore
            wb = openpyxl.load_workbook(io.BytesIO(content_bytes), read_only=True, data_only=True)
            sheet_names = wb.sheetnames
        except ImportError:
            return FileExtractionResult(
                file_type="excel", original_size_bytes=len(content_bytes),
                original_tokens=original_tokens, extracted_tokens=50,
                tokens_saved=original_tokens - 50, savings_pct=99.0,
                content=f"File: {filename}\nInstall openpyxl to extract Excel: pip install openpyxl",
                summary="Excel (openpyxl not installed)",
                strategy_used="install_hint", was_truncated=False,
            )
        except Exception as e:
            return FileExtractionResult(
                file_type="excel", original_size_bytes=len(content_bytes),
                original_tokens=original_tokens, extracted_tokens=30,
                tokens_saved=original_tokens - 30, savings_pct=99.0,
                content=f"File: {filename}\nExcel parse error: {e}",
                summary="Excel parse error",
                strategy_used="error", was_truncated=False,
            )

        try:
            shown = sheet_names[: self._MAX_SHEETS]
            parts = [f"File: {filename} | {len(sheet_names)} sheets: {', '.join(sheet_names)}"]
            if len(sheet_names) > len(shown):
                # Make the omission visible instead of silently dropping sheets.
                parts.append(f"(only the first {len(shown)} sheets are summarised)")

            # Split the budget between the sheets actually summarised, not
            # between all sheets in the workbook.
            budget_per_sheet = token_budget // max(1, len(shown))
            csv_extractor = CSVExtractor()

            for sheet_name in shown:
                ws = wb[sheet_name]
                sheet_csv = self._sheet_to_csv(
                    ws.iter_rows(values_only=True), self._MAX_ROWS_PER_SHEET
                )
                if sheet_csv is None:
                    parts.append(f"\n[Sheet: {sheet_name}] — empty")
                    continue

                result = csv_extractor.extract(
                    sheet_csv,
                    f"{filename}[{sheet_name}]",
                    token_budget=budget_per_sheet,
                    sample_rows=3,
                )
                parts.append(f"\n[Sheet: {sheet_name}]\n{result.content}")
        finally:
            # openpyxl asks for read-only workbooks to be closed explicitly.
            wb.close()

        extracted = "\n".join(parts)
        extracted, was_truncated = _truncate_to_budget(extracted, token_budget)
        extracted_tokens = count_tokens(extracted)
        total_saved = original_tokens - extracted_tokens

        return FileExtractionResult(
            file_type="excel",
            original_size_bytes=len(content_bytes),
            original_tokens=original_tokens,
            extracted_tokens=extracted_tokens,
            tokens_saved=total_saved,
            savings_pct=round((1 - extracted_tokens / max(1, original_tokens)) * 100, 1),
            content=extracted,
            summary=f"Excel: {len(sheet_names)} sheets → {extracted_tokens} tokens",
            strategy_used="per_sheet_csv_strategy",
            was_truncated=was_truncated,
        )

    @staticmethod
    def _sheet_to_csv(row_iter, max_rows: int) -> str | None:
        """Serialise a sheet's rows to CSV text, reading them lazily.

        Args:
            row_iter: Iterable of row tuples; the first row is the header.
            max_rows: Maximum number of data rows (after the header) to write.

        Returns:
            CSV text with a header line, or ``None`` when the sheet has no
            data row. Missing header cells become ``col_<index>`` and empty
            data cells become "".
        """
        buffer = io.StringIO()
        writer = csv.writer(buffer)
        written = 0
        for index, row in enumerate(row_iter):
            if index == 0:
                writer.writerow(
                    [str(h) if h is not None else f"col_{i}" for i, h in enumerate(row)]
                )
                continue
            if written >= max_rows:
                break  # stop pulling rows: large sheets are never fully loaded
            writer.writerow([str(c) if c is not None else "" for c in row])
            written += 1
        return buffer.getvalue() if written else None
708
+
709
+
710
+ # ── Text / Code extractor ─────────────────────────────────────────────────────
711
+
712
+ class TextExtractor:
713
+ """
714
+ Strategy for plain text, markdown, and code files.
715
+
716
+ For code: preserve structure (imports, class/function signatures, key logic)
717
+ For text: preserve beginning + section headers + end
718
+ """
719
+
720
+ def extract(
721
+ self,
722
+ content: str,
723
+ filename: str,
724
+ token_budget: int = 2000,
725
+ file_type: str = "text",
726
+ ) -> FileExtractionResult:
727
+ original_tokens = count_tokens(content)
728
+
729
+ if original_tokens <= token_budget:
730
+ return FileExtractionResult(
731
+ file_type=file_type, original_size_bytes=len(content.encode()),
732
+ original_tokens=original_tokens, extracted_tokens=original_tokens,
733
+ tokens_saved=0, savings_pct=0.0, content=content,
734
+ summary=f"Text: fits in budget ({original_tokens} tokens)",
735
+ strategy_used="passthrough", was_truncated=False,
736
+ )
737
+
738
+ if file_type == "code":
739
+ extracted = self._extract_code_structure(content, token_budget)
740
+ else:
741
+ extracted = self._extract_text_structure(content, token_budget)
742
+
743
+ extracted, was_truncated = _truncate_to_budget(extracted, token_budget)
744
+ extracted_tokens = count_tokens(extracted)
745
+
746
+ return FileExtractionResult(
747
+ file_type=file_type, original_size_bytes=len(content.encode()),
748
+ original_tokens=original_tokens, extracted_tokens=extracted_tokens,
749
+ tokens_saved=original_tokens - extracted_tokens,
750
+ savings_pct=round((1 - extracted_tokens / max(1, original_tokens)) * 100, 1),
751
+ content=extracted,
752
+ summary=f"{file_type}: {original_tokens}→{extracted_tokens} tokens (structure-aware)",
753
+ strategy_used="structure_aware_truncation",
754
+ was_truncated=was_truncated,
755
+ )
756
+
757
+ def _extract_code_structure(self, content: str, budget: int) -> str:
758
+ lines = content.split("\n")
759
+ important: list[str] = []
760
+ in_important = False
761
+
762
+ for i, line in enumerate(lines):
763
+ stripped = line.strip()
764
+ # Always keep: imports, class/function defs, constants, decorators
765
+ if (stripped.startswith(("import ", "from ", "def ", "class ", "async def ",
766
+ "@", "const ", "let ", "var ", "function ",
767
+ "export ", "module.exports", "type ", "interface "))
768
+ or re.match(r"^[A-Z_]{3,}\s*=", stripped)):
769
+ important.append(line)
770
+ in_important = True
771
+ elif in_important and (stripped.startswith(('"""', "'''", "#")) or not stripped):
772
+ important.append(line)
773
+ else:
774
+ in_important = False
775
+ if i < 30 or i >= len(lines) - 10: # always keep top + bottom
776
+ important.append(line)
777
+
778
+ skeleton = "\n".join(important)
779
+ if count_tokens(skeleton) < budget * 0.7:
780
+ # Have room — add more content
781
+ full_trimmed, _ = _truncate_to_budget(content, budget)
782
+ return full_trimmed
783
+ return skeleton
784
+
785
def _extract_text_structure(self, content: str, budget: int) -> str:
    """
    Build an outline of a text document: head, headings, and tail.

    The document is split into three non-overlapping regions so that no
    line is ever emitted twice:
      - head:   the first 20 lines, kept verbatim;
      - middle: only lines that look like headings (``#``-style markdown
                headings, or a short capitalised line ending in ``:``);
      - tail:   the last 10 lines, kept verbatim.

    For documents shorter than 30 lines the head and tail would overlap;
    the tail is clipped so it starts where the head ends.

    Args:
        content: full document text.
        budget:  token budget for the extracted content.

    Returns:
        The outline, passed through ``_truncate_to_budget`` when
        ``count_tokens`` reports it is over ``budget``.
    """
    lines = content.split("\n")
    heading_pattern = re.compile(r"^#{1,4}\s|^[A-Z].{0,60}:\s*$")

    # Disjoint region boundaries: [0, head_end) [head_end, tail_start) [tail_start, n)
    head_end = min(20, len(lines))
    tail_start = max(head_end, len(lines) - 10)

    result: list[str] = lines[:head_end]
    result.extend(
        line for line in lines[head_end:tail_start]
        if heading_pattern.match(line.strip())
    )
    result.extend(lines[tail_start:])

    skeleton = "\n".join(result)
    if count_tokens(skeleton) > budget:
        skeleton, _ = _truncate_to_budget(skeleton, budget)
    return skeleton
806
+
807
+
808
+ # ── Master dispatcher ─────────────────────────────────────────────────────────
809
+
810
class FileIntelligence:
    """
    Main entry point. Auto-detects file type and applies correct strategy.

    Usage in app.py:
        fi = FileIntelligence()
        result = fi.process(content, filename, token_budget=500, query=user_query)
        # inject result.content into messages instead of raw content
        # log result.tokens_saved
    """

    def __init__(self):
        """Create one extractor instance per supported file family."""
        self._csv = CSVExtractor()
        self._json = JSONExtractor()
        self._pdf = PDFExtractor()
        self._excel = ExcelExtractor()
        self._text = TextExtractor()

    def process(
        self,
        content: bytes | str,
        filename: str,
        token_budget: int = 1000,
        query: str = "",
    ) -> FileExtractionResult:
        """
        Process any file. Returns extracted content within token_budget.

        Args:
            content: raw file bytes or text string
            filename: original filename (used for type detection)
            token_budget: max tokens to use for this file's content
            query: current user query (only forwarded to the PDF extractor)

        Returns:
            The FileExtractionResult produced by the extractor selected for
            the detected file type. Unrecognised types are handled as text.
        """
        # Extractors need either bytes (pdf, excel) or text (everything
        # else), so both views are prepared up front. Undecodable bytes are
        # dropped rather than raising.
        content_bytes = content if isinstance(content, bytes) else content.encode("utf-8")
        content_str = content_bytes.decode("utf-8", errors="ignore")

        file_type = detect_file_type(filename, content_bytes)

        logger.info(f"FileIntelligence: {filename} ({file_type}, "
                    f"{len(content_bytes):,} bytes, budget={token_budget})")

        if file_type == "csv":
            return self._csv.extract(content_str, filename, token_budget)
        elif file_type == "tsv":
            return self._csv.extract(content_str, filename, token_budget, delimiter="\t")
        elif file_type in ("json", "jsonl"):
            return self._json.extract(content_str, filename, token_budget)
        elif file_type == "pdf":
            return self._pdf.extract(content_bytes, filename, token_budget, query)
        elif file_type == "excel":
            return self._excel.extract(content_bytes, filename, token_budget)
        elif file_type == "code":
            return self._text.extract(content_str, filename, token_budget, "code")
        else:
            return self._text.extract(content_str, filename, token_budget, "text")

    def process_message_files(
        self,
        messages: list[dict],
        token_budget_per_file: int = 800,
        query: str = "",
    ) -> tuple[list[dict], int]:
        """
        Scan messages for large file content blocks and extract intelligently.
        Detects patterns like:
          - "Here is my CSV file: <large content>"
          - Multi-line data blocks embedded in user messages

        Messages whose content is not a string, is shorter than 500
        characters, or contains no detectable file block are passed through
        unchanged (same dict object). Other messages are shallow-copied with
        a rewritten "content".

        Returns (processed_messages, total_tokens_saved)
        """
        total_saved = 0
        processed = []

        for msg in messages:
            content = msg.get("content", "")
            if not isinstance(content, str) or len(content) < 500:
                processed.append(msg)
                continue

            # Detect if content contains a large file block
            file_block, filename, pre, post = self._extract_file_block(content)
            if file_block is None:
                processed.append(msg)
                continue

            result = self.process(file_block, filename, token_budget_per_file, query)
            total_saved += result.tokens_saved

            # Re-assemble: surrounding prose stays verbatim, the file block
            # is replaced by a one-line header plus the extracted content.
            new_content = (
                (pre + "\n" if pre else "") +
                f"[File: {filename} — {result.summary}]\n{result.content}" +
                ("\n" + post if post else "")
            )
            processed.append({**msg, "content": new_content})

        return processed, total_saved

    def _extract_file_block(
        self, content: str
    ) -> tuple[Optional[str], str, str, str]:
        """
        Detect embedded file content in a message.

        Two patterns are recognised, in this order:
          1. A fenced block (```lang ... ```) whose body is at least 500
             characters. The fence language becomes the filename extension.
          2. A message of more than 50 lines whose first data lines look
             delimiter-separated (average of >= 2 commas or >= 1 tab per
             non-empty line over a 5-line sample).

        Returns (file_content, filename, text_before, text_after), or
        (None, "", content, "") when nothing is detected.
        """
        # Pattern: "```csv\n<content>\n```"
        code_fence = re.search(
            r"```(\w+)?\n([\s\S]{500,}?)\n```",
            content,
        )
        if code_fence:
            lang = code_fence.group(1) or "text"
            block = code_fence.group(2)
            filename = f"attachment.{lang}" if lang != "text" else "attachment.txt"
            pre = content[:code_fence.start()].strip()
            post = content[code_fence.end():].strip()
            return block, filename, pre, post

        # Pattern: very long line-separated content (likely CSV/TSV)
        lines = content.split("\n")
        if len(lines) > 50:
            # Users commonly type a short intro ("Analyze this data:")
            # before pasting the data. Skip up to a few leading non-tabular
            # lines so they do not dilute the delimiter averages below.
            non_tabular_skip_limit = 3  # generous enough for a short intro line
            start = 0
            for probe in lines[:non_tabular_skip_limit]:
                if probe.count(",") >= 2 or probe.count("\t") >= 1:
                    break
                start += 1

            sample = [line for line in lines[start:start + 5] if line]
            avg_commas = sum(line.count(",") for line in sample) / max(1, len(sample))
            avg_tabs = sum(line.count("\t") for line in sample) / max(1, len(sample))

            if avg_commas >= 2 or avg_tabs >= 1:
                # Hand the skipped intro back as text_before instead of
                # leaving it inside the data, where it would be read as the
                # first (header) row of the table.
                preamble = "\n".join(lines[:start]).strip()
                data = "\n".join(lines[start:])
                # Comma detection keeps priority; tab-only data is labelled
                # .tsv so it can be routed to the tab-delimited branch of
                # process(). NOTE(review): assumes detect_file_type maps the
                # .tsv extension to "tsv" — confirm.
                filename = "inline_data.csv" if avg_commas >= 2 else "inline_data.tsv"
                return data, filename, preamble, ""

        return None, "", content, ""