abstractcore-2.4.3-py3-none-any.whl → abstractcore-2.4.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,557 @@
+ """
+ Text processor for various text-based file formats.
+
+ This module provides processing capabilities for text files, CSV/TSV data,
+ Markdown documents, and other text-based formats.
+ """
+
+ import csv
+ import json
+ import re
+ from pathlib import Path
+ from typing import Dict, Any, Union
+
+ try:
+     import pandas as pd
+     PANDAS_AVAILABLE = True
+ except ImportError:
+     PANDAS_AVAILABLE = False
+     pd = None
+
+ from ..base import BaseMediaHandler, MediaProcessingError
+ from ..types import MediaContent, MediaType, ContentFormat, MediaCapabilities
+
+
+ class TextProcessor(BaseMediaHandler):
+     """
+     Text processor for various text-based file formats.
+
+     Handles plain text, CSV/TSV files, Markdown, JSON, and other text formats
+     with intelligent parsing and structure extraction.
+     """
+
+     def __init__(self, **kwargs):
+         """
+         Initialize the text processor.
+
+         Args:
+             **kwargs: Configuration parameters including:
+                 - encoding: Default text encoding ('utf-8')
+                 - csv_delimiter: Default CSV delimiter (',')
+                 - max_rows: Maximum rows to process for tabular data
+                 - preserve_structure: Whether to preserve document structure
+         """
+         super().__init__(**kwargs)
+
+         # Text processing configuration
+         self.default_encoding = kwargs.get('encoding', 'utf-8')
+         self.csv_delimiter = kwargs.get('csv_delimiter', ',')
+         self.max_rows = kwargs.get('max_rows', 10000)
+         self.preserve_structure = kwargs.get('preserve_structure', True)
+
+         # Set capabilities for text processing
+         self.capabilities = MediaCapabilities(
+             vision_support=False,
+             audio_support=False,
+             video_support=False,
+             document_support=True,
+             supported_document_formats=['txt', 'md', 'csv', 'tsv', 'json', 'xml', 'html', 'htm'],
+             max_file_size=self.max_file_size
+         )
+
+         self.logger.debug(
+             f"Initialized TextProcessor with encoding={self.default_encoding}, "
+             f"max_rows={self.max_rows}, preserve_structure={self.preserve_structure}"
+         )
+
+     def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
+         """
+         Process a text-based file and return structured content.
+
+         Args:
+             file_path: Path to the text file
+             media_type: Detected media type (should be TEXT or DOCUMENT)
+             **kwargs: Additional processing parameters:
+                 - encoding: Text encoding to use
+                 - format_output: Output format ('raw', 'structured', or 'summary')
+                 - extract_metadata: Whether to extract document metadata
+
+         Returns:
+             MediaContent with processed text content
+
+         Raises:
+             MediaProcessingError: If text processing fails
+         """
+         if media_type not in [MediaType.TEXT, MediaType.DOCUMENT]:
+             raise MediaProcessingError(f"TextProcessor only handles text/document types, got {media_type}")
+
+         try:
+             # Override defaults with kwargs
+             encoding = kwargs.get('encoding', self.default_encoding)
+             format_output = kwargs.get('format_output', 'structured')
+             extract_metadata = kwargs.get('extract_metadata', True)
+
+             # Determine the processing method based on the file extension
+             extension = file_path.suffix.lower().lstrip('.')
+
+             if extension in ['csv', 'tsv']:
+                 content, metadata = self._process_tabular_file(file_path, extension, encoding, **kwargs)
+             elif extension == 'json':
+                 content, metadata = self._process_json_file(file_path, encoding, **kwargs)
+             elif extension in ['xml', 'html', 'htm']:
+                 content, metadata = self._process_markup_file(file_path, extension, encoding, **kwargs)
+             elif extension == 'md':
+                 content, metadata = self._process_markdown_file(file_path, encoding, **kwargs)
+             else:
+                 # Plain text processing
+                 content, metadata = self._process_plain_text(file_path, encoding, **kwargs)
+
+             # Apply output formatting; 'raw' uses the content as-is
+             if format_output == 'structured':
+                 content = self._apply_structured_formatting(content, extension, metadata)
+             elif format_output == 'summary':
+                 content = self._generate_content_summary(content, extension, metadata)
+
+             # Determine the appropriate MIME type
+             mime_type = self._get_mime_type_for_extension(extension)
+
+             return self._create_media_content(
+                 content=content,
+                 file_path=file_path,
+                 media_type=media_type,
+                 content_format=ContentFormat.TEXT,
+                 mime_type=mime_type,
+                 format=extension,
+                 **metadata
+             )
+
+         except Exception as e:
+             raise MediaProcessingError(f"Failed to process text file {file_path}: {str(e)}") from e
+
+     def _process_tabular_file(self, file_path: Path, extension: str, encoding: str, **kwargs) -> tuple[str, Dict[str, Any]]:
+         """
+         Process CSV/TSV files with intelligent structure detection.
+
+         Args:
+             file_path: Path to the tabular file
+             extension: File extension ('csv' or 'tsv')
+             encoding: Text encoding
+             **kwargs: Additional parameters
+
+         Returns:
+             Tuple of (processed_content, metadata)
+         """
+         default_delimiter = '\t' if extension == 'tsv' else self.csv_delimiter
+         delimiter = kwargs.get('delimiter', default_delimiter)
+
+         try:
+             if PANDAS_AVAILABLE:
+                 # Use pandas for robust CSV parsing
+                 df = pd.read_csv(
+                     file_path,
+                     encoding=encoding,
+                     delimiter=delimiter,
+                     nrows=self.max_rows,
+                     on_bad_lines='skip'
+                 )
+
+                 # Generate structured content
+                 content_parts = []
+                 content_parts.append(f"# {file_path.name}")
+                 content_parts.append(f"Tabular data with {len(df)} rows and {len(df.columns)} columns\n")
+
+                 # Column information
+                 content_parts.append("## Columns:")
+                 for col in df.columns:
+                     dtype = str(df[col].dtype)
+                     null_count = df[col].isnull().sum()
+                     content_parts.append(f"- {col} ({dtype}, {null_count} null values)")
+
+                 content_parts.append("\n## Sample Data:")
+                 content_parts.append(df.head(10).to_string(index=False))
+
+                 if len(df) > 10:
+                     content_parts.append(f"\n... and {len(df) - 10} more rows")
+
+                 content = "\n".join(content_parts)
+
+                 metadata = {
+                     'row_count': len(df),
+                     'column_count': len(df.columns),
+                     'columns': df.columns.tolist(),
+                     'data_types': {col: str(dtype) for col, dtype in df.dtypes.items()},
+                     'delimiter': delimiter,
+                     'has_header': True,
+                     'null_values': df.isnull().sum().to_dict()
+                 }
+
+             else:
+                 # Fall back to basic CSV processing with the standard library
+                 with open(file_path, 'r', encoding=encoding) as f:
+                     reader = csv.reader(f, delimiter=delimiter)
+                     rows = list(reader)
+
+                 if not rows:
+                     content = f"Empty {extension.upper()} file"
+                     metadata = {'row_count': 0, 'column_count': 0}
+                 else:
+                     # Assume the first row is a header
+                     header = rows[0]
+                     data_rows = rows[1:self.max_rows + 1]
+
+                     content_parts = []
+                     content_parts.append(f"# {file_path.name}")
+                     content_parts.append(f"Tabular data with {len(data_rows)} rows and {len(header)} columns\n")
+
+                     content_parts.append("## Columns:")
+                     for col in header:
+                         content_parts.append(f"- {col}")
+
+                     content_parts.append("\n## Sample Data:")
+                     for i, row in enumerate(data_rows[:10]):
+                         content_parts.append(f"Row {i+1}: {', '.join(row)}")
+
+                     if len(data_rows) > 10:
+                         content_parts.append(f"... and {len(data_rows) - 10} more rows")
+
+                     content = "\n".join(content_parts)
+
+                     metadata = {
+                         'row_count': len(data_rows),
+                         'column_count': len(header),
+                         'columns': header,
+                         'delimiter': delimiter,
+                         'has_header': True
+                     }
+
+             return content, metadata
+
+         except Exception as e:
+             # Fall back to plain text if tabular parsing fails
+             with open(file_path, 'r', encoding=encoding) as f:
+                 content = f.read()
+
+             metadata = {
+                 'processing_error': str(e),
+                 'fallback_to_plain_text': True,
+                 'delimiter': delimiter
+             }
+
+             return content, metadata
+
+     def _process_json_file(self, file_path: Path, encoding: str, **kwargs) -> tuple[str, Dict[str, Any]]:
+         """
+         Process JSON files with structure analysis.
+
+         Args:
+             file_path: Path to the JSON file
+             encoding: Text encoding
+             **kwargs: Additional parameters
+
+         Returns:
+             Tuple of (processed_content, metadata)
+         """
+         try:
+             with open(file_path, 'r', encoding=encoding) as f:
+                 data = json.load(f)
+
+             # Generate structured content
+             content_parts = []
+             content_parts.append(f"# {file_path.name}")
+
+             if isinstance(data, dict):
+                 content_parts.append(f"JSON object with {len(data)} keys\n")
+                 content_parts.append("## Structure:")
+                 content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))
+             elif isinstance(data, list):
+                 content_parts.append(f"JSON array with {len(data)} items\n")
+                 content_parts.append("## Sample items:")
+                 for i, item in enumerate(data[:5]):
+                     content_parts.append(f"Item {i+1}: {json.dumps(item, ensure_ascii=False)}")
+                 if len(data) > 5:
+                     content_parts.append(f"... and {len(data) - 5} more items")
+             else:
+                 content_parts.append("JSON primitive value:")
+                 content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))
+
+             content = "\n".join(content_parts)
+
+             metadata = {
+                 'json_type': type(data).__name__,
+                 'size': len(data) if isinstance(data, (list, dict)) else 1,
+                 'keys': list(data.keys()) if isinstance(data, dict) else None
+             }
+
+             return content, metadata
+
+         except json.JSONDecodeError as e:
+             # If the JSON is invalid, treat the file as plain text
+             with open(file_path, 'r', encoding=encoding) as f:
+                 content = f.read()
+
+             metadata = {
+                 'json_error': str(e),
+                 'fallback_to_plain_text': True
+             }
+
+             return content, metadata
+
299
+
300
+ def _process_markup_file(self, file_path: Path, extension: str, encoding: str, **kwargs) -> tuple[str, Dict[str, Any]]:
301
+ """
302
+ Process markup files (XML, HTML) with basic structure extraction.
303
+
304
+ Args:
305
+ file_path: Path to the markup file
306
+ extension: File extension
307
+ encoding: Text encoding
308
+ **kwargs: Additional parameters
309
+
310
+ Returns:
311
+ Tuple of (processed_content, metadata)
312
+ """
313
+ try:
314
+ with open(file_path, 'r', encoding=encoding) as f:
315
+ content = f.read()
316
+
317
+ # Basic structure analysis
318
+ lines = content.split('\n')
319
+ non_empty_lines = [line.strip() for line in lines if line.strip()]
320
+
321
+ # Count basic markup elements
322
+ tag_count = content.count('<')
323
+
324
+ metadata = {
325
+ 'markup_type': extension,
326
+ 'line_count': len(lines),
327
+ 'non_empty_lines': len(non_empty_lines),
328
+ 'tag_count': tag_count,
329
+ 'character_count': len(content)
330
+ }
331
+
332
+ # For HTML, try to extract title
333
+ if extension in ['html', 'htm']:
334
+ import re
335
+ title_match = re.search(r'<title[^>]*>(.*?)</title>', content, re.IGNORECASE | re.DOTALL)
336
+ if title_match:
337
+ metadata['title'] = title_match.group(1).strip()
338
+
339
+ return content, metadata
340
+
341
+ except Exception as e:
342
+ metadata = {
343
+ 'processing_error': str(e),
344
+ 'markup_type': extension
345
+ }
346
+ return "", metadata
347
+
+     def _process_markdown_file(self, file_path: Path, encoding: str, **kwargs) -> tuple[str, Dict[str, Any]]:
+         """
+         Process Markdown files with structure analysis.
+
+         Args:
+             file_path: Path to the Markdown file
+             encoding: Text encoding
+             **kwargs: Additional parameters
+
+         Returns:
+             Tuple of (processed_content, metadata)
+         """
+         try:
+             with open(file_path, 'r', encoding=encoding) as f:
+                 content = f.read()
+
+             # Basic Markdown structure analysis
+             lines = content.split('\n')
+
+             # Count different elements
+             headers = [line for line in lines if line.strip().startswith('#')]
+             code_blocks = content.count('```')
+             links = content.count('[')  # includes image links, which start with '!['
+             images = content.count('![')
+
+             metadata = {
+                 'line_count': len(lines),
+                 'header_count': len(headers),
+                 'code_block_count': code_blocks // 2,  # ``` fences come in pairs
+                 'link_count': links,
+                 'image_count': images,
+                 'character_count': len(content),
+                 'word_count': len(content.split())
+             }
+
+             # Record the leading headers to capture document structure
+             if headers:
+                 metadata['headers'] = headers[:10]  # First 10 headers
+
+             return content, metadata
+
+         except Exception as e:
+             metadata = {
+                 'processing_error': str(e)
+             }
+             return "", metadata
+
+     def _process_plain_text(self, file_path: Path, encoding: str, **kwargs) -> tuple[str, Dict[str, Any]]:
+         """
+         Process plain text files with basic analysis.
+
+         Args:
+             file_path: Path to the text file
+             encoding: Text encoding
+             **kwargs: Additional parameters
+
+         Returns:
+             Tuple of (processed_content, metadata)
+         """
+         try:
+             with open(file_path, 'r', encoding=encoding) as f:
+                 content = f.read()
+
+             # Basic text analysis
+             lines = content.split('\n')
+             words = content.split()
+
+             metadata = {
+                 'line_count': len(lines),
+                 'word_count': len(words),
+                 'character_count': len(content),
+                 'non_empty_lines': len([line for line in lines if line.strip()]),
+                 'encoding_used': encoding
+             }
+
+             return content, metadata
+
+         except UnicodeDecodeError:
+             # Try common alternative encodings
+             for alt_encoding in ['latin-1', 'cp1252', 'utf-16']:
+                 try:
+                     with open(file_path, 'r', encoding=alt_encoding) as f:
+                         content = f.read()
+
+                     metadata = {
+                         'encoding_used': alt_encoding,
+                         'original_encoding_failed': encoding,
+                         'character_count': len(content)
+                     }
+
+                     return content, metadata
+                 except UnicodeDecodeError:
+                     continue
+
+             # If all encodings fail, read as binary and decode with errors='replace'
+             with open(file_path, 'rb') as f:
+                 raw_content = f.read()
+
+             content = raw_content.decode('utf-8', errors='replace')
+             metadata = {
+                 'encoding_used': 'utf-8-with-errors',
+                 'binary_fallback': True,
+                 'character_count': len(content)
+             }
+
+             return content, metadata
+
+     def _apply_structured_formatting(self, content: str, extension: str, metadata: Dict[str, Any]) -> str:
+         """Apply structured formatting to content based on file type."""
+         if extension in ['csv', 'tsv', 'json', 'md']:
+             # Tabular, JSON, and Markdown content is already structured
+             return content
+
+         # Add basic structure to plain text
+         structured_parts = [f"# {metadata.get('file_name', 'Text Content')}"]
+
+         if 'word_count' in metadata:
+             structured_parts.append(f"Document Statistics: {metadata['word_count']} words, {metadata['line_count']} lines\n")
+
+         structured_parts.append("## Content:")
+         structured_parts.append(content)
+
+         return "\n".join(structured_parts)
+
+     def _generate_content_summary(self, content: str, extension: str, metadata: Dict[str, Any]) -> str:
+         """Generate a summary of the content."""
+         summary_parts = [f"# Summary of {metadata.get('file_name', 'file')}"]
+
+         if extension in ['csv', 'tsv']:
+             summary_parts.append(f"Tabular data with {metadata.get('row_count', 0)} rows and {metadata.get('column_count', 0)} columns")
+             if 'columns' in metadata:
+                 summary_parts.append(f"Columns: {', '.join(metadata['columns'][:5])}")
+         elif extension == 'json':
+             summary_parts.append(f"JSON {metadata.get('json_type', 'data')} with {metadata.get('size', 0)} items")
+         elif extension == 'md':
+             summary_parts.append(f"Markdown document with {metadata.get('header_count', 0)} headers and {metadata.get('word_count', 0)} words")
+         else:
+             summary_parts.append(f"Text document with {metadata.get('word_count', 0)} words and {metadata.get('line_count', 0)} lines")
+
+         # Add a content preview
+         preview = (content[:500] + "...") if len(content) > 500 else content
+         summary_parts.append(f"\nContent preview:\n{preview}")
+
+         return "\n".join(summary_parts)
+
+     def _get_mime_type_for_extension(self, extension: str) -> str:
+         """Get the MIME type for a file extension."""
+         mime_map = {
+             'txt': 'text/plain',
+             'md': 'text/markdown',
+             'csv': 'text/csv',
+             'tsv': 'text/tab-separated-values',
+             'json': 'application/json',
+             'xml': 'application/xml',
+             'html': 'text/html',
+             'htm': 'text/html'
+         }
+         return mime_map.get(extension, 'text/plain')
+
+     def get_text_preview(self, file_path: Union[str, Path], max_chars: int = 1000) -> str:
+         """
+         Get a preview of text content without full processing.
+
+         Args:
+             file_path: Path to the text file
+             max_chars: Maximum characters to preview
+
+         Returns:
+             Text preview
+         """
+         file_path = Path(file_path)
+
+         try:
+             with open(file_path, 'r', encoding=self.default_encoding) as f:
+                 content = f.read(max_chars)
+             if len(content) == max_chars:
+                 content += "..."
+             return content
+         except Exception as e:
+             return f"Error reading file: {str(e)}"
+
+     def get_processing_info(self) -> Dict[str, Any]:
+         """
+         Get information about the text processor's capabilities.
+
+         Returns:
+             Dictionary with processor information
+         """
+         return {
+             'processor_type': 'TextProcessor',
+             'supported_formats': ['txt', 'md', 'csv', 'tsv', 'json', 'xml', 'html', 'htm'],
+             'capabilities': {
+                 'default_encoding': self.default_encoding,
+                 'csv_delimiter': self.csv_delimiter,
+                 'max_rows': self.max_rows,
+                 'preserve_structure': self.preserve_structure,
+                 'pandas_integration': PANDAS_AVAILABLE,
+                 'structured_formatting': True,
+                 'metadata_extraction': True
+             },
+             'dependencies': {
+                 'pandas': PANDAS_AVAILABLE
+             }
+         }
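
A minimal usage sketch for the new text processor module. `TextProcessor`, its constructor kwargs, and `get_processing_info()` appear in the diff above, but the import path and the public `process()` entry point inherited from `BaseMediaHandler` are assumptions, since the base class is not part of this diff:

    # Hypothetical usage; the module path and process() are assumptions.
    from abstractcore.media.processors.text import TextProcessor  # assumed path

    processor = TextProcessor(encoding='utf-8', csv_delimiter=';', max_rows=500)

    # get_processing_info() is defined above and reports the configuration,
    # including whether the optional pandas dependency was found.
    info = processor.get_processing_info()
    print(info['dependencies']['pandas'])

    # Assumed public entry point inherited from BaseMediaHandler.
    media = processor.process('data.csv', format_output='summary')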
@@ -0,0 +1,22 @@
+ """
+ Utility modules for AbstractCore media handling.
+
+ Provides reusable utilities for media processing, including image scaling
+ optimized for different vision models.
+ """
+
+ from .image_scaler import (
+     ModelOptimizedScaler,
+     ScalingMode,
+     get_scaler,
+     scale_image_for_model,
+     get_optimal_size_for_model
+ )
+
+ __all__ = [
+     'ModelOptimizedScaler',
+     'ScalingMode',
+     'get_scaler',
+     'scale_image_for_model',
+     'get_optimal_size_for_model'
+ ]
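
For downstream imports, only the names re-exported in `__all__` above are confirmed by this diff; the sketch below assumes the absolute package path, the call signature, and an illustrative model identifier:

    # Hypothetical import; the package path and all arguments are assumptions.
    from abstractcore.media.utils import get_optimal_size_for_model

    # Signature assumed from the function name: look up a target resolution
    # for a given vision model identifier.
    size = get_optimal_size_for_model('qwen2-vl')  # model id is illustrative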