agent-runtime-core 0.7.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,508 @@
1
+ """
2
+ Built-in file processors for common file types.
3
+
4
+ Each processor handles specific file types and extracts text/metadata.
5
+ Optional dependencies are checked at runtime.
6
+ """
7
+
8
+ import csv
9
+ import io
10
+ import json
11
+ import time
12
+ from pathlib import Path
13
+ from typing import Optional
14
+
15
+ from .base import (
16
+ FileProcessor,
17
+ FileType,
18
+ ProcessedFile,
19
+ ProcessingOptions,
20
+ )
21
+
22
+
23
class TextFileProcessor(FileProcessor):
    """Processor for plain text and text-like files (Markdown, JSON, HTML, XML, YAML)."""

    # Byte-order marks and the codec that consumes each of them. The UTF-32
    # entries come first because the UTF-32-LE mark begins with the UTF-16-LE mark.
    _BOM_CODECS = (
        (b"\xef\xbb\xbf", "utf-8-sig"),
        (b"\xff\xfe\x00\x00", "utf-32"),
        (b"\x00\x00\xfe\xff", "utf-32"),
        (b"\xff\xfe", "utf-16"),
        (b"\xfe\xff", "utf-16"),
    )

    # MIME type for every extension listed in ``supported_extensions``.
    _MIME_BY_EXTENSION = {
        ".txt": "text/plain",
        ".text": "text/plain",
        ".log": "text/plain",
        ".md": "text/markdown",
        ".markdown": "text/markdown",
        ".json": "application/json",
        ".html": "text/html",
        ".htm": "text/html",
        ".xml": "application/xml",
        ".yaml": "text/yaml",
        ".yml": "text/yaml",
    }

    @property
    def name(self) -> str:
        """Identifier reported in ``ProcessedFile.processor_used``."""
        return "text"

    @property
    def supported_types(self) -> list[FileType]:
        """File types this processor can emit."""
        return [FileType.TEXT, FileType.MARKDOWN, FileType.JSON, FileType.HTML]

    @property
    def supported_extensions(self) -> list[str]:
        """Lower-case file extensions (with leading dot) handled here."""
        return [".txt", ".text", ".log", ".md", ".markdown", ".json", ".html", ".htm", ".xml", ".yaml", ".yml"]

    @property
    def supported_mime_types(self) -> list[str]:
        """MIME types handled here."""
        return [
            "text/plain",
            "text/markdown",
            "text/html",
            "application/json",
            "text/xml",
            "application/xml",
            "text/yaml",
        ]

    async def process(
        self,
        content: bytes,
        filename: str,
        options: ProcessingOptions,
    ) -> ProcessedFile:
        """Decode ``content`` and collect basic text statistics.

        Args:
            content: Raw file bytes.
            filename: Name of the file; only its extension is inspected.
            options: Processing options (not consulted by this processor).

        Returns:
            A ``ProcessedFile`` carrying the decoded text plus line, character
            and word counts. JSON files that parse successfully additionally
            get ``json_type`` and either ``json_keys`` (first 20 keys) or
            ``json_length``.
        """
        # perf_counter is monotonic, so the reported duration cannot go
        # negative when the system clock is adjusted mid-run.
        start_time = time.perf_counter()

        text = self._decode_text(content)

        # Narrow the generic "text" type using the extension.
        ext = Path(filename).suffix.lower()
        if ext in (".md", ".markdown"):
            file_type = FileType.MARKDOWN
        elif ext == ".json":
            file_type = FileType.JSON
        elif ext in (".html", ".htm"):
            file_type = FileType.HTML
        else:
            file_type = FileType.TEXT

        metadata = {
            # Counts newline characters plus one, so empty text reports 1.
            "line_count": text.count("\n") + 1,
            "char_count": len(text),
            "word_count": len(text.split()),
        }

        # Structure info is best-effort: invalid JSON is still returned as text.
        if file_type == FileType.JSON:
            try:
                parsed = json.loads(text)
            except (ValueError, RecursionError):
                # ValueError covers json.JSONDecodeError; RecursionError is
                # raised for pathologically deep nesting.
                pass
            else:
                metadata["json_type"] = type(parsed).__name__
                if isinstance(parsed, dict):
                    metadata["json_keys"] = list(parsed)[:20]
                elif isinstance(parsed, list):
                    metadata["json_length"] = len(parsed)

        return ProcessedFile(
            filename=filename,
            file_type=file_type,
            mime_type=self._get_mime_type(filename),
            size_bytes=len(content),
            text=text,
            metadata=metadata,
            processor_used=self.name,
            processing_time_ms=(time.perf_counter() - start_time) * 1000,
        )

    def _decode_text(self, content: bytes) -> str:
        """Decode bytes to text without ever raising.

        Order of attempts:
        1. If the data starts with a byte-order mark, the matching codec
           (the mark itself is removed from the result).
        2. UTF-8.
        3. cp1252, which rejects a handful of undefined bytes.
        4. latin-1, which maps every byte and therefore always succeeds.

        UTF-16 is only attempted when a byte-order mark is present: without
        one, almost any even-length byte string decodes "successfully" into
        unrelated characters.
        """
        for bom, codec in self._BOM_CODECS:
            if content.startswith(bom):
                try:
                    return content.decode(codec)
                except UnicodeDecodeError:
                    # The mark was misleading; fall through to the generic attempts.
                    break

        for encoding in ("utf-8", "cp1252"):
            try:
                return content.decode(encoding)
            except UnicodeDecodeError:
                continue

        return content.decode("latin-1")

    def _get_mime_type(self, filename: str) -> str:
        """Return the MIME type for ``filename``'s extension, defaulting to ``text/plain``."""
        ext = Path(filename).suffix.lower()
        return self._MIME_BY_EXTENSION.get(ext, "text/plain")
123
+
124
+
125
class CsvProcessor(FileProcessor):
    """Processor for delimiter-separated files (CSV and TSV)."""

    @property
    def name(self) -> str:
        """Identifier reported in ``ProcessedFile.processor_used``."""
        return "csv"

    @property
    def supported_types(self) -> list[FileType]:
        """File types this processor can emit."""
        return [FileType.CSV]

    @property
    def supported_extensions(self) -> list[str]:
        """Lower-case file extensions (with leading dot) handled here."""
        return [".csv", ".tsv"]

    @property
    def supported_mime_types(self) -> list[str]:
        """MIME types handled here."""
        return ["text/csv", "text/tab-separated-values"]

    async def process(
        self,
        content: bytes,
        filename: str,
        options: ProcessingOptions,
    ) -> ProcessedFile:
        """Parse ``content`` as CSV/TSV and report its shape.

        Args:
            content: Raw file bytes, decoded as UTF-8 with undecodable bytes
                replaced.
            filename: Name of the file; a ``.tsv`` extension (any letter case)
                selects tab as the delimiter, anything else selects comma.
            options: Processing options (not consulted by this processor).

        Returns:
            A ``ProcessedFile`` whose text is the decoded file and whose
            metadata holds the first row as ``headers``, the number of
            remaining rows, the column count and the delimiter used.

        Raises:
            csv.Error: If the ``csv`` module cannot parse the data.
        """
        start_time = time.perf_counter()

        # Compare the extension case-insensitively so "DATA.TSV" is tab-delimited too.
        is_tsv = Path(filename).suffix.lower() == ".tsv"
        delimiter = "\t" if is_tsv else ","

        # "utf-8-sig" drops a leading byte-order mark, which would otherwise
        # become part of the first header name.
        text = content.decode("utf-8-sig", errors="replace")

        # newline="" hands line endings to the csv module untouched, as its
        # documentation requires; this keeps CR-only files parseable.
        reader = csv.reader(io.StringIO(text, newline=""), delimiter=delimiter)
        rows = list(reader)

        headers = rows[0] if rows else []
        row_count = max(len(rows) - 1, 0)  # the first row is the header

        metadata = {
            "headers": headers,
            "row_count": row_count,
            "column_count": len(headers),
            "delimiter": delimiter,
        }

        return ProcessedFile(
            filename=filename,
            file_type=FileType.CSV,
            mime_type="text/tab-separated-values" if is_tsv else "text/csv",
            size_bytes=len(content),
            text=text,
            metadata=metadata,
            processor_used=self.name,
            processing_time_ms=(time.perf_counter() - start_time) * 1000,
        )
181
+
182
+
183
class PDFProcessor(FileProcessor):
    """
    Processor for PDF files.

    Requires: pypdf (pip install pypdf)
    """

    @property
    def name(self) -> str:
        """Identifier reported in ``ProcessedFile.processor_used``."""
        return "pdf"

    @property
    def supported_types(self) -> list[FileType]:
        """File types this processor can emit."""
        return [FileType.PDF]

    @property
    def supported_extensions(self) -> list[str]:
        """Lower-case file extensions (with leading dot) handled here."""
        return [".pdf"]

    @property
    def supported_mime_types(self) -> list[str]:
        """MIME types handled here."""
        return ["application/pdf"]

    @staticmethod
    def _as_text(value) -> str:
        """Return ``value`` as a plain string, mapping ``None`` to ``""``."""
        return "" if value is None else str(value)

    async def process(
        self,
        content: bytes,
        filename: str,
        options: ProcessingOptions,
    ) -> ProcessedFile:
        """Extract page text and document information from a PDF.

        Args:
            content: Raw PDF bytes.
            filename: Name of the file, copied into the result.
            options: ``options.pdf_page_limit`` caps the number of pages read;
                a falsy or negative value means "all pages".

        Returns:
            A ``ProcessedFile`` whose text is the extracted page texts joined
            by blank lines. Pages whose extraction fails are skipped and
            reported in ``warnings``.

        Raises:
            ImportError: If pypdf is not installed.
        """
        try:
            from pypdf import PdfReader
        except ImportError as exc:
            raise ImportError("pypdf is required for PDF processing. Install with: pip install pypdf") from exc

        start_time = time.perf_counter()
        warnings = []

        reader = PdfReader(io.BytesIO(content))
        page_count = len(reader.pages)

        # A negative limit would otherwise act as a slice offset from the end
        # and silently drop trailing pages, so treat it like "no limit".
        requested = options.pdf_page_limit
        if not requested or requested < 0:
            requested = page_count
        page_limit = min(requested, page_count)

        text_parts = []
        for page_number, page in enumerate(reader.pages[:page_limit], start=1):
            try:
                text_parts.append(page.extract_text() or "")
            except Exception as e:
                # Best effort: one unreadable page must not discard the rest.
                warnings.append(f"Failed to extract text from page {page_number}: {e}")

        text = "\n\n".join(text_parts)

        # Values are coerced to plain strings so the metadata dict holds only
        # built-in types regardless of what the PDF library returns.
        info = reader.metadata or {}
        metadata = {
            "page_count": page_count,
            "pages_processed": page_limit,
            "title": self._as_text(info.get("/Title", "")),
            "author": self._as_text(info.get("/Author", "")),
            "subject": self._as_text(info.get("/Subject", "")),
            "creator": self._as_text(info.get("/Creator", "")),
            "producer": self._as_text(info.get("/Producer", "")),
        }

        return ProcessedFile(
            filename=filename,
            file_type=FileType.PDF,
            mime_type="application/pdf",
            size_bytes=len(content),
            text=text,
            metadata=metadata,
            processor_used=self.name,
            processing_time_ms=(time.perf_counter() - start_time) * 1000,
            warnings=warnings,
        )
258
+
259
+
260
class ImageProcessor(FileProcessor):
    """
    Processor for image files.

    Requires: Pillow (pip install Pillow)
    """

    # Image modes the PNG writer accepts; thumbnails in any other mode are
    # converted before saving.
    _PNG_MODES = frozenset({"1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"})

    # Modes outside ``_PNG_MODES`` that carry an alpha channel.
    _ALPHA_MODES = frozenset({"PA", "La", "RGBa"})

    @property
    def name(self) -> str:
        """Identifier reported in ``ProcessedFile.processor_used``."""
        return "image"

    @property
    def supported_types(self) -> list[FileType]:
        """File types this processor can emit."""
        return [FileType.IMAGE]

    @property
    def supported_extensions(self) -> list[str]:
        """Lower-case file extensions (with leading dot) handled here."""
        return [".png", ".jpg", ".jpeg", ".gif", ".webp", ".bmp", ".tiff", ".tif"]

    @property
    def supported_mime_types(self) -> list[str]:
        """MIME types handled here."""
        return [
            "image/png",
            "image/jpeg",
            "image/gif",
            "image/webp",
            "image/bmp",
            "image/tiff",
        ]

    async def process(
        self,
        content: bytes,
        filename: str,
        options: ProcessingOptions,
    ) -> ProcessedFile:
        """Read image properties and optionally build a PNG thumbnail.

        Args:
            content: Raw image bytes.
            filename: Name of the file, copied into the result.
            options: ``options.generate_thumbnail`` enables thumbnail creation
                and ``options.thumbnail_size`` is passed to
                ``Image.thumbnail``.

        Returns:
            A ``ProcessedFile`` with empty text (no OCR is performed),
            width/height/format/mode metadata, ``has_exif`` when EXIF data is
            present, and a base64-encoded PNG thumbnail when requested.

        Raises:
            ImportError: If Pillow is not installed.
        """
        try:
            from PIL import Image
        except ImportError as exc:
            raise ImportError("Pillow is required for image processing. Install with: pip install Pillow") from exc

        import base64

        start_time = time.perf_counter()

        # The context manager releases the decoder's resources even if
        # anything below raises.
        with Image.open(io.BytesIO(content)) as img:
            image_format = img.format

            metadata = {
                "width": img.width,
                "height": img.height,
                "format": image_format,
                "mode": img.mode,
            }

            # getexif() is the public accessor; the key is only added when
            # EXIF data is actually present.
            if img.getexif():
                metadata["has_exif"] = True

            thumbnail_base64 = None
            if options.generate_thumbnail:
                thumb = img.copy()
                thumb.thumbnail(options.thumbnail_size)
                # PNG cannot store e.g. CMYK or floating-point pixels; convert
                # after shrinking so the conversion touches fewer pixels.
                if thumb.mode not in self._PNG_MODES:
                    target_mode = "RGBA" if thumb.mode in self._ALPHA_MODES else "RGB"
                    thumb = thumb.convert(target_mode)
                thumb_buffer = io.BytesIO()
                thumb.save(thumb_buffer, format="PNG")
                thumbnail_base64 = base64.b64encode(thumb_buffer.getvalue()).decode("utf-8")

        # Images don't have text by default - OCR or vision needed
        text = ""

        return ProcessedFile(
            filename=filename,
            file_type=FileType.IMAGE,
            mime_type=f"image/{(image_format or 'png').lower()}",
            size_bytes=len(content),
            text=text,
            metadata=metadata,
            thumbnail_base64=thumbnail_base64,
            processor_used=self.name,
            processing_time_ms=(time.perf_counter() - start_time) * 1000,
        )
343
+
344
+
345
class DocxProcessor(FileProcessor):
    """
    Processor for Microsoft Word documents.

    Requires: python-docx (pip install python-docx)
    """

    @property
    def name(self) -> str:
        """Identifier reported in ``ProcessedFile.processor_used``."""
        return "docx"

    @property
    def supported_types(self) -> list[FileType]:
        """File types this processor can emit."""
        return [FileType.DOCX]

    @property
    def supported_extensions(self) -> list[str]:
        """Lower-case file extensions (with leading dot) handled here."""
        return [".docx"]

    @property
    def supported_mime_types(self) -> list[str]:
        """MIME types handled here."""
        return [
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        ]

    async def process(
        self,
        content: bytes,
        filename: str,
        options: ProcessingOptions,
    ) -> ProcessedFile:
        """Extract paragraph text, table text and core properties from a DOCX file.

        Args:
            content: Raw DOCX bytes.
            filename: Name of the file, copied into the result.
            options: Processing options (not consulted by this processor).

        Returns:
            A ``ProcessedFile`` whose text is the non-blank paragraphs joined
            by blank lines, followed by a ``--- Tables ---`` section with one
            ``" | "``-joined line per non-blank table row.

        Raises:
            ImportError: If python-docx is not installed.
        """
        try:
            import docx
        except ImportError as exc:
            raise ImportError("python-docx is required for DOCX processing. Install with: pip install python-docx") from exc

        start_time = time.perf_counter()

        doc = docx.Document(io.BytesIO(content))

        # Read these once and reuse them for both the text and the counts.
        paragraphs = doc.paragraphs
        tables = doc.tables

        paragraph_texts = [paragraph.text for paragraph in paragraphs]
        body_text = "\n\n".join(t for t in paragraph_texts if t.strip())

        table_rows = []
        for table in tables:
            for row in table.rows:
                row_text = " | ".join(cell.text for cell in row.cells)
                if row_text.strip():
                    table_rows.append(row_text)

        text = body_text
        if table_rows:
            tables_section = "--- Tables ---\n" + "\n".join(table_rows)
            # Only add the separating blank line when there is body text, so a
            # tables-only document does not start with empty lines.
            text = f"{body_text}\n\n{tables_section}" if body_text else tables_section

        core_props = doc.core_properties
        metadata = {
            "paragraph_count": len(paragraphs),
            "table_count": len(tables),
            "title": core_props.title or "",
            "author": core_props.author or "",
            "subject": core_props.subject or "",
            "created": str(core_props.created) if core_props.created else "",
            "modified": str(core_props.modified) if core_props.modified else "",
        }

        return ProcessedFile(
            filename=filename,
            file_type=FileType.DOCX,
            mime_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            size_bytes=len(content),
            text=text,
            metadata=metadata,
            processor_used=self.name,
            processing_time_ms=(time.perf_counter() - start_time) * 1000,
        )
423
+
424
+
425
class XlsxProcessor(FileProcessor):
    """
    Processor for Microsoft Excel spreadsheets.

    Requires: openpyxl (pip install openpyxl)
    """

    @property
    def name(self) -> str:
        """Identifier reported in ``ProcessedFile.processor_used``."""
        return "xlsx"

    @property
    def supported_types(self) -> list[FileType]:
        """File types this processor can emit."""
        return [FileType.XLSX]

    @property
    def supported_extensions(self) -> list[str]:
        """Lower-case file extensions (with leading dot) handled here."""
        return [".xlsx"]

    @property
    def supported_mime_types(self) -> list[str]:
        """MIME types handled here."""
        return [
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ]

    async def process(
        self,
        content: bytes,
        filename: str,
        options: ProcessingOptions,
    ) -> ProcessedFile:
        """Render every sheet of a workbook as ``" | "``-separated text lines.

        Args:
            content: Raw XLSX bytes.
            filename: Name of the file, copied into the result.
            options: Processing options (not consulted by this processor).

        Returns:
            A ``ProcessedFile`` whose text contains, per sheet, a
            ``=== Sheet: <name> ===`` header followed by one line per row that
            has at least one non-empty cell. Metadata lists the sheet names
            and the number of such rows per sheet.

        Raises:
            ImportError: If openpyxl is not installed.
        """
        try:
            import openpyxl
        except ImportError as exc:
            raise ImportError("openpyxl is required for XLSX processing. Install with: pip install openpyxl") from exc

        start_time = time.perf_counter()

        wb = openpyxl.load_workbook(io.BytesIO(content), read_only=True, data_only=True)

        # The workbook is opened read-only and has to be closed explicitly;
        # the finally clause guarantees that even when a sheet cannot be read.
        try:
            sheet_names = wb.sheetnames
            text_parts = []
            sheet_info = []

            for sheet_name in sheet_names:
                sheet = wb[sheet_name]
                text_parts.append(f"=== Sheet: {sheet_name} ===")

                row_count = 0
                for row in sheet.iter_rows(values_only=True):
                    row_text = " | ".join(str(cell) if cell is not None else "" for cell in row)
                    # Skip rows that hold nothing but separators and spaces.
                    if row_text.strip(" |"):
                        text_parts.append(row_text)
                        row_count += 1

                sheet_info.append({
                    "name": sheet_name,
                    "row_count": row_count,
                })

            metadata = {
                "sheet_count": len(sheet_names),
                "sheet_names": sheet_names,
                "sheets": sheet_info,
            }
        finally:
            wb.close()

        text = "\n".join(text_parts)

        return ProcessedFile(
            filename=filename,
            file_type=FileType.XLSX,
            mime_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            size_bytes=len(content),
            text=text,
            metadata=metadata,
            processor_used=self.name,
            processing_time_ms=(time.perf_counter() - start_time) * 1000,
        )
508
+