abstractcore 2.4.4-py3-none-any.whl → 2.4.6-py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- abstractcore/cli/__init__.py +9 -0
- abstractcore/cli/main.py +759 -0
- abstractcore/cli/vision_config.py +491 -0
- abstractcore/core/interface.py +7 -0
- abstractcore/core/session.py +27 -2
- abstractcore/media/handlers/__init__.py +16 -0
- abstractcore/media/handlers/anthropic_handler.py +326 -0
- abstractcore/media/handlers/local_handler.py +541 -0
- abstractcore/media/handlers/openai_handler.py +281 -0
- abstractcore/media/processors/__init__.py +13 -0
- abstractcore/media/processors/image_processor.py +610 -0
- abstractcore/media/processors/office_processor.py +490 -0
- abstractcore/media/processors/pdf_processor.py +485 -0
- abstractcore/media/processors/text_processor.py +557 -0
- abstractcore/media/utils/__init__.py +22 -0
- abstractcore/media/utils/image_scaler.py +306 -0
- abstractcore/providers/anthropic_provider.py +14 -2
- abstractcore/providers/base.py +24 -0
- abstractcore/providers/huggingface_provider.py +23 -9
- abstractcore/providers/lmstudio_provider.py +6 -1
- abstractcore/providers/mlx_provider.py +20 -7
- abstractcore/providers/ollama_provider.py +6 -1
- abstractcore/providers/openai_provider.py +6 -2
- abstractcore/tools/common_tools.py +651 -1
- abstractcore/utils/version.py +1 -1
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/METADATA +59 -9
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/RECORD +31 -17
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/entry_points.txt +2 -0
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/WHEEL +0 -0
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.4.4.dist-info → abstractcore-2.4.6.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/abstractcore/media/processors/text_processor.py
@@ -0,0 +1,557 @@
+"""
+Text processor for various text-based file formats.
+
+This module provides processing capabilities for text files, CSV/TSV data,
+Markdown documents, and other text-based formats.
+"""
+
+import csv
+import json
+from pathlib import Path
+from typing import Optional, Dict, Any, List, Union
+
+try:
+    import pandas as pd
+    PANDAS_AVAILABLE = True
+except ImportError:
+    PANDAS_AVAILABLE = False
+    pd = None
+
+from ..base import BaseMediaHandler, MediaProcessingError
+from ..types import MediaContent, MediaType, ContentFormat
+
+
+class TextProcessor(BaseMediaHandler):
+    """
+    Text processor for various text-based file formats.
+
+    Handles plain text, CSV/TSV files, Markdown, JSON, and other text formats
+    with intelligent parsing and structure extraction.
+    """
+
+    def __init__(self, **kwargs):
+        """
+        Initialize the text processor.
+
+        Args:
+            **kwargs: Configuration parameters including:
+                - encoding: Default text encoding ('utf-8')
+                - csv_delimiter: Default CSV delimiter (',')
+                - max_rows: Maximum rows to process for tabular data
+                - preserve_structure: Whether to preserve document structure
+        """
+        super().__init__(**kwargs)
+
+        # Text processing configuration
+        self.default_encoding = kwargs.get('encoding', 'utf-8')
+        self.csv_delimiter = kwargs.get('csv_delimiter', ',')
+        self.max_rows = kwargs.get('max_rows', 10000)
+        self.preserve_structure = kwargs.get('preserve_structure', True)
+
+        # Set capabilities for text processing
+        from ..types import MediaCapabilities
+        self.capabilities = MediaCapabilities(
+            vision_support=False,
+            audio_support=False,
+            video_support=False,
+            document_support=True,
+            supported_document_formats=['txt', 'md', 'csv', 'tsv', 'json', 'xml', 'html', 'htm'],
+            max_file_size=self.max_file_size
+        )
+
+        self.logger.debug(
+            f"Initialized TextProcessor with encoding={self.default_encoding}, "
+            f"max_rows={self.max_rows}, preserve_structure={self.preserve_structure}"
+        )
+
+    def _process_internal(self, file_path: Path, media_type: MediaType, **kwargs) -> MediaContent:
+        """
+        Process a text-based file and return structured content.
+
+        Args:
+            file_path: Path to the text file
+            media_type: Detected media type (should be TEXT or DOCUMENT)
+            **kwargs: Additional processing parameters:
+                - encoding: Text encoding to use
+                - format_output: Whether to format output ('raw', 'structured', 'summary')
+                - extract_metadata: Whether to extract document metadata
+
+        Returns:
+            MediaContent with processed text content
+
+        Raises:
+            MediaProcessingError: If text processing fails
+        """
+        if media_type not in [MediaType.TEXT, MediaType.DOCUMENT]:
+            raise MediaProcessingError(f"TextProcessor only handles text/document types, got {media_type}")
+
+        try:
+            # Override defaults with kwargs
+            encoding = kwargs.get('encoding', self.default_encoding)
+            format_output = kwargs.get('format_output', 'structured')
+            extract_metadata = kwargs.get('extract_metadata', True)
+
+            # Determine processing method based on file extension
+            extension = file_path.suffix.lower().lstrip('.')
+
+            if extension in ['csv', 'tsv']:
+                content, metadata = self._process_tabular_file(file_path, extension, encoding, **kwargs)
+            elif extension == 'json':
+                content, metadata = self._process_json_file(file_path, encoding, **kwargs)
+            elif extension in ['xml', 'html', 'htm']:
+                content, metadata = self._process_markup_file(file_path, extension, encoding, **kwargs)
+            elif extension == 'md':
+                content, metadata = self._process_markdown_file(file_path, encoding, **kwargs)
+            else:
+                # Plain text processing
+                content, metadata = self._process_plain_text(file_path, encoding, **kwargs)
+
+            # Apply output formatting
+            if format_output == 'structured':
+                content = self._apply_structured_formatting(content, extension, metadata)
+            elif format_output == 'summary':
+                content = self._generate_content_summary(content, extension, metadata)
+            # 'raw' format uses content as-is
+
+            # Determine appropriate MIME type
+            mime_type = self._get_mime_type_for_extension(extension)
+
+            return self._create_media_content(
+                content=content,
+                file_path=file_path,
+                media_type=media_type,
+                content_format=ContentFormat.TEXT,
+                mime_type=mime_type,
+                format=extension,
+                **metadata
+            )
+
+        except Exception as e:
+            raise MediaProcessingError(f"Failed to process text file {file_path}: {str(e)}") from e
+
+    def _process_tabular_file(self, file_path: Path, extension: str, encoding: str, **kwargs) -> tuple[str, Dict[str, Any]]:
+        """
+        Process CSV/TSV files with intelligent structure detection.
+
+        Args:
+            file_path: Path to the tabular file
+            extension: File extension ('csv' or 'tsv')
+            encoding: Text encoding
+            **kwargs: Additional parameters
+
+        Returns:
+            Tuple of (processed_content, metadata)
+        """
+        delimiter = '\t' if extension == 'tsv' else ','
+        delimiter = kwargs.get('delimiter', delimiter)
+
+        try:
+            if PANDAS_AVAILABLE:
+                # Use pandas for robust CSV processing
+                df = pd.read_csv(
+                    file_path,
+                    encoding=encoding,
+                    delimiter=delimiter,
+                    nrows=self.max_rows,
+                    on_bad_lines='skip'
+                )
+
+                # Generate structured content
+                content_parts = []
+                content_parts.append(f"# {file_path.name}")
+                content_parts.append(f"Tabular data with {len(df)} rows and {len(df.columns)} columns\n")
+
+                # Column information
+                content_parts.append("## Columns:")
+                for col in df.columns:
+                    dtype = str(df[col].dtype)
+                    null_count = df[col].isnull().sum()
+                    content_parts.append(f"- {col} ({dtype}, {null_count} null values)")
+
+                content_parts.append("\n## Sample Data:")
+                content_parts.append(df.head(10).to_string(index=False))
+
+                if len(df) > 10:
+                    content_parts.append(f"\n... and {len(df) - 10} more rows")
+
+                content = "\n".join(content_parts)
+
+                metadata = {
+                    'row_count': len(df),
+                    'column_count': len(df.columns),
+                    'columns': df.columns.tolist(),
+                    'data_types': {col: str(dtype) for col, dtype in df.dtypes.items()},
+                    'delimiter': delimiter,
+                    'has_header': True,
+                    'null_values': df.isnull().sum().to_dict()
+                }
+
+            else:
+                # Fallback to basic CSV processing
+                with open(file_path, 'r', encoding=encoding) as f:
+                    reader = csv.reader(f, delimiter=delimiter)
+                    rows = list(reader)
+
+                if not rows:
+                    content = f"Empty {extension.upper()} file"
+                    metadata = {'row_count': 0, 'column_count': 0}
+                else:
+                    # Assume first row is header
+                    header = rows[0]
+                    data_rows = rows[1:self.max_rows + 1]
+
+                    content_parts = []
+                    content_parts.append(f"# {file_path.name}")
+                    content_parts.append(f"Tabular data with {len(data_rows)} rows and {len(header)} columns\n")
+
+                    content_parts.append("## Columns:")
+                    for col in header:
+                        content_parts.append(f"- {col}")
+
+                    content_parts.append("\n## Sample Data:")
+                    for i, row in enumerate(data_rows[:10]):
+                        content_parts.append(f"Row {i+1}: {', '.join(row)}")
+
+                    if len(data_rows) > 10:
+                        content_parts.append(f"... and {len(data_rows) - 10} more rows")
+
+                    content = "\n".join(content_parts)
+
+                    metadata = {
+                        'row_count': len(data_rows),
+                        'column_count': len(header),
+                        'columns': header,
+                        'delimiter': delimiter,
+                        'has_header': True
+                    }
+
+            return content, metadata
+
+        except Exception as e:
+            # Fallback to plain text if CSV parsing fails
+            with open(file_path, 'r', encoding=encoding) as f:
+                content = f.read()
+
+            metadata = {
+                'processing_error': str(e),
+                'fallback_to_plain_text': True,
+                'delimiter': delimiter
+            }
+
+            return content, metadata
+
+    def _process_json_file(self, file_path: Path, encoding: str, **kwargs) -> tuple[str, Dict[str, Any]]:
+        """
+        Process JSON files with structure analysis.
+
+        Args:
+            file_path: Path to the JSON file
+            encoding: Text encoding
+            **kwargs: Additional parameters
+
+        Returns:
+            Tuple of (processed_content, metadata)
+        """
+        try:
+            with open(file_path, 'r', encoding=encoding) as f:
+                data = json.load(f)
+
+            # Generate structured content
+            content_parts = []
+            content_parts.append(f"# {file_path.name}")
+
+            if isinstance(data, dict):
+                content_parts.append(f"JSON object with {len(data)} keys\n")
+                content_parts.append("## Structure:")
+                content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))
+            elif isinstance(data, list):
+                content_parts.append(f"JSON array with {len(data)} items\n")
+                content_parts.append("## Sample items:")
+                for i, item in enumerate(data[:5]):
+                    content_parts.append(f"Item {i+1}: {json.dumps(item, ensure_ascii=False)}")
+                if len(data) > 5:
+                    content_parts.append(f"... and {len(data) - 5} more items")
+            else:
+                content_parts.append("JSON primitive value:")
+                content_parts.append(json.dumps(data, indent=2, ensure_ascii=False))
+
+            content = "\n".join(content_parts)
+
+            metadata = {
+                'json_type': type(data).__name__,
+                'size': len(data) if isinstance(data, (list, dict)) else 1,
+                'keys': list(data.keys()) if isinstance(data, dict) else None
+            }
+
+            return content, metadata
+
+        except json.JSONDecodeError as e:
+            # If JSON is invalid, treat as plain text
+            with open(file_path, 'r', encoding=encoding) as f:
+                content = f.read()
+
+            metadata = {
+                'json_error': str(e),
+                'fallback_to_plain_text': True
+            }
+
+            return content, metadata
+
+    def _process_markup_file(self, file_path: Path, extension: str, encoding: str, **kwargs) -> tuple[str, Dict[str, Any]]:
+        """
+        Process markup files (XML, HTML) with basic structure extraction.
+
+        Args:
+            file_path: Path to the markup file
+            extension: File extension
+            encoding: Text encoding
+            **kwargs: Additional parameters
+
+        Returns:
+            Tuple of (processed_content, metadata)
+        """
+        try:
+            with open(file_path, 'r', encoding=encoding) as f:
+                content = f.read()
+
+            # Basic structure analysis
+            lines = content.split('\n')
+            non_empty_lines = [line.strip() for line in lines if line.strip()]
+
+            # Count basic markup elements
+            tag_count = content.count('<')
+
+            metadata = {
+                'markup_type': extension,
+                'line_count': len(lines),
+                'non_empty_lines': len(non_empty_lines),
+                'tag_count': tag_count,
+                'character_count': len(content)
+            }
+
+            # For HTML, try to extract title
+            if extension in ['html', 'htm']:
+                import re
+                title_match = re.search(r'<title[^>]*>(.*?)</title>', content, re.IGNORECASE | re.DOTALL)
+                if title_match:
+                    metadata['title'] = title_match.group(1).strip()
+
+            return content, metadata
+
+        except Exception as e:
+            metadata = {
+                'processing_error': str(e),
+                'markup_type': extension
+            }
+            return "", metadata
+
+    def _process_markdown_file(self, file_path: Path, encoding: str, **kwargs) -> tuple[str, Dict[str, Any]]:
+        """
+        Process Markdown files with structure analysis.
+
+        Args:
+            file_path: Path to the Markdown file
+            encoding: Text encoding
+            **kwargs: Additional parameters
+
+        Returns:
+            Tuple of (processed_content, metadata)
+        """
+        try:
+            with open(file_path, 'r', encoding=encoding) as f:
+                content = f.read()
+
+            # Basic Markdown structure analysis
+            lines = content.split('\n')
+
+            # Count different elements
+            headers = [line for line in lines if line.strip().startswith('#')]
+            code_blocks = content.count('```')
+            links = content.count('[')
+            images = content.count('![')
+
+            metadata = {
+                'line_count': len(lines),
+                'header_count': len(headers),
+                'code_block_count': code_blocks // 2,  # Pairs of ```
+                'link_count': links,
+                'image_count': images,
+                'character_count': len(content),
+                'word_count': len(content.split())
+            }
+
+            # Extract headers for structure
+            if headers:
+                metadata['headers'] = headers[:10]  # First 10 headers
+
+            return content, metadata
+
+        except Exception as e:
+            metadata = {
+                'processing_error': str(e)
+            }
+            return "", metadata
+
+    def _process_plain_text(self, file_path: Path, encoding: str, **kwargs) -> tuple[str, Dict[str, Any]]:
+        """
+        Process plain text files with basic analysis.
+
+        Args:
+            file_path: Path to the text file
+            encoding: Text encoding
+            **kwargs: Additional parameters
+
+        Returns:
+            Tuple of (processed_content, metadata)
+        """
+        try:
+            with open(file_path, 'r', encoding=encoding) as f:
+                content = f.read()
+
+            # Basic text analysis
+            lines = content.split('\n')
+            words = content.split()
+
+            metadata = {
+                'line_count': len(lines),
+                'word_count': len(words),
+                'character_count': len(content),
+                'non_empty_lines': len([line for line in lines if line.strip()]),
+                'encoding_used': encoding
+            }
+
+            return content, metadata
+
+        except UnicodeDecodeError:
+            # Try different encodings
+            for alt_encoding in ['latin-1', 'cp1252', 'utf-16']:
+                try:
+                    with open(file_path, 'r', encoding=alt_encoding) as f:
+                        content = f.read()
+
+                    metadata = {
+                        'encoding_used': alt_encoding,
+                        'original_encoding_failed': encoding,
+                        'character_count': len(content)
+                    }
+
+                    return content, metadata
+                except:
+                    continue
+
+            # If all encodings fail, read as binary and decode with errors='replace'
+            with open(file_path, 'rb') as f:
+                raw_content = f.read()
+
+            content = raw_content.decode('utf-8', errors='replace')
+            metadata = {
+                'encoding_used': 'utf-8-with-errors',
+                'binary_fallback': True,
+                'character_count': len(content)
+            }
+
+            return content, metadata
+
+    def _apply_structured_formatting(self, content: str, extension: str, metadata: Dict[str, Any]) -> str:
+        """Apply structured formatting to content based on file type."""
+        if extension in ['csv', 'tsv']:
+            # Content is already structured for tabular data
+            return content
+        elif extension == 'json':
+            # Content is already structured for JSON
+            return content
+        elif extension == 'md':
+            # Markdown is already structured
+            return content
+        else:
+            # Add basic structure to plain text
+            structured_parts = [f"# {metadata.get('file_name', 'Text Content')}"]
+
+            if 'word_count' in metadata:
+                structured_parts.append(f"Document Statistics: {metadata['word_count']} words, {metadata['line_count']} lines\n")
+
+            structured_parts.append("## Content:")
+            structured_parts.append(content)
+
+            return "\n".join(structured_parts)
+
+    def _generate_content_summary(self, content: str, extension: str, metadata: Dict[str, Any]) -> str:
+        """Generate a summary of the content."""
+        summary_parts = [f"# Summary of {metadata.get('file_name', 'file')}"]
+
+        if extension in ['csv', 'tsv']:
+            summary_parts.append(f"Tabular data with {metadata.get('row_count', 0)} rows and {metadata.get('column_count', 0)} columns")
+            if 'columns' in metadata:
+                summary_parts.append(f"Columns: {', '.join(metadata['columns'][:5])}")
+        elif extension == 'json':
+            summary_parts.append(f"JSON {metadata.get('json_type', 'data')} with {metadata.get('size', 0)} items")
+        elif extension == 'md':
+            summary_parts.append(f"Markdown document with {metadata.get('header_count', 0)} headers and {metadata.get('word_count', 0)} words")
+        else:
+            summary_parts.append(f"Text document with {metadata.get('word_count', 0)} words and {metadata.get('line_count', 0)} lines")
+
+        # Add content preview
+        preview = content[:500] + "..." if len(content) > 500 else content
+        summary_parts.append(f"\nContent preview:\n{preview}")
+
+        return "\n".join(summary_parts)
+
+    def _get_mime_type_for_extension(self, extension: str) -> str:
+        """Get MIME type for file extension."""
+        mime_map = {
+            'txt': 'text/plain',
+            'md': 'text/markdown',
+            'csv': 'text/csv',
+            'tsv': 'text/tab-separated-values',
+            'json': 'application/json',
+            'xml': 'application/xml',
+            'html': 'text/html',
+            'htm': 'text/html'
+        }
+        return mime_map.get(extension, 'text/plain')
+
+    def get_text_preview(self, file_path: Union[str, Path], max_chars: int = 1000) -> str:
+        """
+        Get a preview of text content without full processing.
+
+        Args:
+            file_path: Path to the text file
+            max_chars: Maximum characters to preview
+
+        Returns:
+            Text preview
+        """
+        file_path = Path(file_path)
+
+        try:
+            with open(file_path, 'r', encoding=self.default_encoding) as f:
+                content = f.read(max_chars)
+                if len(content) == max_chars:
+                    content += "..."
+                return content
+        except Exception as e:
+            return f"Error reading file: {str(e)}"
+
+    def get_processing_info(self) -> Dict[str, Any]:
+        """
+        Get information about the text processor capabilities.
+
+        Returns:
+            Dictionary with processor information
+        """
+        return {
+            'processor_type': 'TextProcessor',
+            'supported_formats': ['txt', 'md', 'csv', 'tsv', 'json', 'xml', 'html', 'htm'],
+            'capabilities': {
+                'default_encoding': self.default_encoding,
+                'csv_delimiter': self.csv_delimiter,
+                'max_rows': self.max_rows,
+                'preserve_structure': self.preserve_structure,
+                'pandas_integration': PANDAS_AVAILABLE,
+                'structured_formatting': True,
+                'metadata_extraction': True
+            },
+            'dependencies': {
+                'pandas': PANDAS_AVAILABLE
+            }
+        }
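For orientation, a minimal usage sketch of the new processor, built only from what this hunk shows: `get_text_preview` and `get_processing_info` are defined above, the import path simply mirrors the file location, and the constructor keywords rely on `BaseMediaHandler` (not expanded in this diff) accepting them via `super().__init__(**kwargs)`. The example file path is hypothetical.

```python
from abstractcore.media.processors.text_processor import TextProcessor

# Constructor kwargs match the __init__ docstring above; whether the base
# class tolerates extra kwargs is an assumption.
processor = TextProcessor(encoding='utf-8', max_rows=500)

# Cheap preview without running the full processing pipeline.
print(processor.get_text_preview('data/example.csv', max_chars=200))

# Introspect capabilities, e.g. whether pandas was importable.
info = processor.get_processing_info()
print(info['capabilities']['pandas_integration'])
```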
--- /dev/null
+++ b/abstractcore/media/utils/__init__.py
@@ -0,0 +1,22 @@
+"""
+Utility modules for AbstractCore media handling.
+
+Provides reusable utilities for media processing, including image scaling
+optimized for different vision models.
+"""
+
+from .image_scaler import (
+    ModelOptimizedScaler,
+    ScalingMode,
+    get_scaler,
+    scale_image_for_model,
+    get_optimal_size_for_model
+)
+
+__all__ = [
+    'ModelOptimizedScaler',
+    'ScalingMode',
+    'get_scaler',
+    'scale_image_for_model',
+    'get_optimal_size_for_model'
+]
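The new `utils` package only re-exports the image-scaler API, so downstream code can import from the package level rather than the `image_scaler` module. A sketch of the resulting import surface (the call signatures live in `image_scaler.py`, which this diff lists but does not expand, so the names are only imported, not called):

```python
# Names made available by the package-level re-exports above.
from abstractcore.media.utils import (
    ModelOptimizedScaler,
    ScalingMode,
    scale_image_for_model,
    get_optimal_size_for_model,
)
```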