content-core 1.10.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_core/__init__.py +216 -0
- content_core/cc_config.yaml +86 -0
- content_core/common/__init__.py +38 -0
- content_core/common/exceptions.py +70 -0
- content_core/common/retry.py +325 -0
- content_core/common/state.py +64 -0
- content_core/common/types.py +15 -0
- content_core/common/utils.py +31 -0
- content_core/config.py +575 -0
- content_core/content/__init__.py +6 -0
- content_core/content/cleanup/__init__.py +5 -0
- content_core/content/cleanup/core.py +15 -0
- content_core/content/extraction/__init__.py +13 -0
- content_core/content/extraction/graph.py +252 -0
- content_core/content/identification/__init__.py +9 -0
- content_core/content/identification/file_detector.py +505 -0
- content_core/content/summary/__init__.py +5 -0
- content_core/content/summary/core.py +15 -0
- content_core/logging.py +15 -0
- content_core/mcp/__init__.py +5 -0
- content_core/mcp/server.py +214 -0
- content_core/models.py +60 -0
- content_core/models_config.yaml +31 -0
- content_core/notebooks/run.ipynb +359 -0
- content_core/notebooks/urls.ipynb +154 -0
- content_core/processors/audio.py +272 -0
- content_core/processors/docling.py +79 -0
- content_core/processors/office.py +331 -0
- content_core/processors/pdf.py +292 -0
- content_core/processors/text.py +36 -0
- content_core/processors/url.py +324 -0
- content_core/processors/video.py +166 -0
- content_core/processors/youtube.py +262 -0
- content_core/py.typed +2 -0
- content_core/templated_message.py +70 -0
- content_core/tools/__init__.py +9 -0
- content_core/tools/cleanup.py +15 -0
- content_core/tools/extract.py +21 -0
- content_core/tools/summarize.py +17 -0
- content_core-1.10.0.dist-info/METADATA +742 -0
- content_core-1.10.0.dist-info/RECORD +44 -0
- content_core-1.10.0.dist-info/WHEEL +4 -0
- content_core-1.10.0.dist-info/entry_points.txt +5 -0
- content_core-1.10.0.dist-info/licenses/LICENSE +21 -0
content_core/content/identification/file_detector.py
ADDED
@@ -0,0 +1,505 @@
"""
Pure Python file type detection using magic bytes and content analysis.
Replaces libmagic dependency with a lightweight implementation.
"""

import zipfile
from pathlib import Path
from typing import Dict, Optional

from content_core.common.exceptions import UnsupportedTypeException
from content_core.logging import logger


class FileDetector:
    """Pure Python file type detection using magic bytes and content analysis."""

    # Configuration constants for binary/text detection
    SIGNATURE_READ_SIZE = 512  # Bytes to read for binary signature detection
    TEXT_READ_SIZE = 1024  # Bytes to read for text content analysis

    # Configuration constants for CSV detection
    CSV_MAX_FIELD_LENGTH = 100  # Maximum average field length for CSV (longer suggests prose)
    CSV_MAX_VARIANCE = 500  # Maximum variance in field lengths (higher suggests natural text)
    CSV_MIN_SCORE = 2  # Minimum score required to classify as CSV
    CSV_MIN_FIELDS = 2  # Minimum number of fields required for CSV
    CSV_MAX_HEADER_FIELD_LENGTH = 50  # Maximum length for individual header fields

    def __init__(self):
        """Initialize the FileDetector with signature mappings."""
        self.binary_signatures = self._load_binary_signatures()
        self.text_patterns = self._load_text_patterns()
        self.extension_mapping = self._load_extension_mapping()
        self.zip_content_patterns = self._load_zip_content_patterns()

    def _load_binary_signatures(self) -> Dict[bytes, str]:
        """Load binary file signatures (magic bytes) to MIME type mappings."""
        # Ordered by specificity - longer/more specific signatures first
        return {
            # PDF
            b'%PDF': 'application/pdf',  # PDF document signature (hex: 25 50 44 46)

            # Images
            b'\xff\xd8\xff\xe0': 'image/jpeg',  # JPEG with JFIF header (JPEG File Interchange Format)
            b'\xff\xd8\xff\xe1': 'image/jpeg',  # JPEG with EXIF header (Exchangeable Image File Format)
            b'\xff\xd8\xff\xe2': 'image/jpeg',  # JPEG with Adobe header (Adobe JPEG)
            b'\xff\xd8\xff\xdb': 'image/jpeg',  # JPEG with DQT (Define Quantization Table) marker
            b'\xff\xd8': 'image/jpeg',  # Generic JPEG signature (Start of Image marker, must be last)
            b'\x89PNG\r\n\x1a\n': 'image/png',  # PNG signature (hex: 89 50 4E 47 0D 0A 1A 0A)
            b'GIF87a': 'image/gif',  # GIF version 87a
            b'GIF89a': 'image/gif',  # GIF version 89a (supports animation and transparency)
            b'II*\x00': 'image/tiff',  # TIFF little-endian (Intel byte order)
            b'MM\x00*': 'image/tiff',  # TIFF big-endian (Motorola byte order)
            b'BM': 'image/bmp',  # Windows Bitmap signature

            # Audio
            b'ID3': 'audio/mpeg',  # MP3 with ID3v2 metadata tag
            b'\xff\xfb': 'audio/mpeg',  # MP3 frame sync with MPEG-1 Layer 3
            b'\xff\xf3': 'audio/mpeg',  # MP3 frame sync with MPEG-2 Layer 3
            b'\xff\xf2': 'audio/mpeg',  # MP3 frame sync with MPEG-2.5 Layer 3
            b'RIFF': None,  # Resource Interchange File Format - requires further inspection (could be WAV, AVI, WebP)
            b'fLaC': 'audio/flac',  # Free Lossless Audio Codec signature

            # Video/Audio containers - these will be handled by ftyp detection
            # MP4/M4A/MOV use ftyp box at offset 4 for identification

            # Archive formats
            b'PK\x03\x04': 'application/zip',  # ZIP archive (also used by DOCX, XLSX, PPTX, JAR, etc.)
            b'PK\x05\x06': 'application/zip',  # Empty ZIP archive (End of Central Directory)
        }

    def _load_text_patterns(self) -> Dict[str, str]:
        """Load text-based format detection patterns."""
        return {
            '<!DOCTYPE html': 'text/html',
            '<!doctype html': 'text/html',
            '<html': 'text/html',
            '<?xml': 'text/xml',
            '{': 'application/json',  # Will need more validation
            '[': 'application/json',  # Will need more validation
            '---\n': 'text/yaml',
            '---\r\n': 'text/yaml',
        }

    def _load_extension_mapping(self) -> Dict[str, str]:
        """Load file extension to MIME type mappings as fallback."""
        return {
            # Documents
            '.pdf': 'application/pdf',
            '.txt': 'text/plain',
            '.md': 'text/plain',  # Markdown treated as plain text (current behavior)
            '.markdown': 'text/plain',
            '.rst': 'text/plain',  # reStructuredText
            '.log': 'text/plain',

            # Web formats
            '.html': 'text/html',
            '.htm': 'text/html',
            '.xhtml': 'text/html',
            '.xml': 'text/xml',

            # Data formats
            '.json': 'application/json',
            '.yaml': 'text/yaml',
            '.yml': 'text/yaml',
            '.csv': 'text/csv',
            '.tsv': 'text/csv',  # Tab-separated values

            # Images
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.jpe': 'image/jpeg',
            '.png': 'image/png',
            '.gif': 'image/gif',
            '.tiff': 'image/tiff',
            '.tif': 'image/tiff',
            '.bmp': 'image/bmp',
            '.webp': 'image/webp',
            '.ico': 'image/x-icon',
            '.svg': 'image/svg+xml',

            # Audio
            '.mp3': 'audio/mpeg',
            '.wav': 'audio/wav',
            '.wave': 'audio/wav',
            '.m4a': 'audio/mp4',
            '.aac': 'audio/aac',
            '.ogg': 'audio/ogg',
            '.oga': 'audio/ogg',
            '.flac': 'audio/flac',
            '.wma': 'audio/x-ms-wma',

            # Video
            '.mp4': 'video/mp4',
            '.m4v': 'video/mp4',
            '.avi': 'video/x-msvideo',
            '.mov': 'video/quicktime',
            '.qt': 'video/quicktime',
            '.wmv': 'video/x-ms-wmv',
            '.flv': 'video/x-flv',
            '.mkv': 'video/x-matroska',
            '.webm': 'video/webm',
            '.mpg': 'video/mpeg',
            '.mpeg': 'video/mpeg',
            '.3gp': 'video/3gpp',

            # Office formats
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',

            # E-books
            '.epub': 'application/epub+zip',

            # Archives (basic detection - not expanded)
            '.zip': 'application/zip',
            '.tar': 'application/x-tar',
            '.gz': 'application/gzip',
            '.bz2': 'application/x-bzip2',
            '.7z': 'application/x-7z-compressed',
            '.rar': 'application/x-rar-compressed',
        }

    def _load_zip_content_patterns(self) -> Dict[str, str]:
        """Load patterns for identifying ZIP-based formats by their content."""
        return {
            'word/': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            'xl/': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            'ppt/': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
            'META-INF/container.xml': 'application/epub+zip',
        }

    async def detect(self, file_path: str) -> str:
        """
        Detect file type using magic bytes and content analysis.

        Args:
            file_path: Path to the file to analyze

        Returns:
            MIME type string

        Raises:
            UnsupportedTypeException: If file type cannot be determined
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        if not file_path.is_file():
            raise ValueError(f"Not a file: {file_path}")

        # Try binary signature detection first
        mime_type = await self._detect_by_signature(file_path)
        if mime_type:
            logger.debug(f"Detected {file_path} as {mime_type} by signature")
            return mime_type

        # Try text-based detection
        mime_type = await self._detect_text_format(file_path)
        if mime_type:
            logger.debug(f"Detected {file_path} as {mime_type} by text analysis")
            return mime_type

        # Fallback to extension
        mime_type = self._detect_by_extension(file_path)
        if mime_type:
            logger.debug(f"Detected {file_path} as {mime_type} by extension")
            return mime_type

        # If all detection methods fail
        raise UnsupportedTypeException(f"Unable to determine file type for: {file_path}")

    async def _detect_by_signature(self, file_path: Path) -> Optional[str]:
        """Detect file type by binary signature (magic bytes)."""
        try:
            with open(file_path, 'rb') as f:
                # Read bytes for signature detection
                header = f.read(self.SIGNATURE_READ_SIZE)

            if not header:
                return None

            # Check for exact signature matches
            for signature, mime_type in self.binary_signatures.items():
                if header.startswith(signature):
                    # Special handling for RIFF (could be WAV or AVI)
                    if signature == b'RIFF' and len(header) >= 12:
                        if header[8:12] == b'WAVE':
                            return 'audio/wav'
                        elif header[8:12] == b'AVI ':
                            return 'video/x-msvideo'

                    # Special handling for ZIP-based formats
                    if mime_type == 'application/zip':
                        zip_mime = await self._detect_zip_format(file_path)
                        if zip_mime:
                            return zip_mime

                    if mime_type:
                        return mime_type

            # Special check for MP4/MOV files with ftyp box
            if len(header) >= 12 and header[4:8] == b'ftyp':
                ftyp_brand = header[8:12]
                # Don't strip - check exact 4-byte brand
                if ftyp_brand == b'M4A ' or ftyp_brand.startswith(b'M4A'):
                    return 'audio/mp4'
                elif ftyp_brand in [b'mp41', b'mp42', b'isom', b'iso2', b'iso5', b'M4V ', b'M4VP']:
                    return 'video/mp4'
                elif ftyp_brand.startswith(b'qt'):
                    return 'video/quicktime'
                else:
                    # Generic MP4 for other ftyp brands
                    return 'video/mp4'

            return None

        except Exception as e:
            logger.debug(f"Error reading file signature: {e}")
            return None

    async def _detect_zip_format(self, file_path: Path) -> Optional[str]:
        """Detect specific ZIP-based format (DOCX, XLSX, PPTX, EPUB)."""
        try:
            with zipfile.ZipFile(file_path, 'r') as zf:
                namelist = zf.namelist()

                # Check for specific content patterns
                for pattern, mime_type in self.zip_content_patterns.items():
                    if any(name.startswith(pattern) for name in namelist):
                        return mime_type

                # If it's a valid ZIP but no specific pattern matched
                return 'application/zip'

        except zipfile.BadZipFile:
            logger.debug(f"Invalid ZIP file: {file_path}")
            return None
        except Exception as e:
            logger.debug(f"Error inspecting ZIP content: {e}")
            return None

    async def _detect_text_format(self, file_path: Path) -> Optional[str]:
        """Detect text-based formats by content analysis."""
        try:
            # Read bytes for text content analysis
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read(self.TEXT_READ_SIZE)

            if not content or len(content) < 10:
                return None

            # Strip whitespace for analysis
            content_stripped = content.strip()

            # Check for text patterns
            for pattern, mime_type in self.text_patterns.items():
                if content_stripped.lower().startswith(pattern.lower()):
                    # Special validation for JSON
                    if mime_type == 'application/json':
                        if self._is_valid_json_start(content_stripped):
                            return mime_type
                    # HTML needs to be detected for routing
                    elif mime_type == 'text/html':
                        return mime_type
                    # For other text patterns (YAML, etc), just return text/plain
                    else:
                        return 'text/plain'

            # Check for CSV pattern (multiple comma-separated values)
            if self._looks_like_csv(content):
                return 'text/csv'

            # If it's readable text but no specific format detected
            if self._is_text_file(content):
                return 'text/plain'

            return None

        except UnicodeDecodeError:
            # Not a text file
            return None
        except Exception as e:
            logger.debug(f"Error analyzing text content: {e}")
            return None

    def _detect_by_extension(self, file_path: Path) -> Optional[str]:
        """Detect file type by extension as fallback."""
        extension = file_path.suffix.lower()
        return self.extension_mapping.get(extension)

    def _is_valid_json_start(self, content: str) -> bool:
        """Check if content starts like valid JSON."""
        # More robust JSON detection
        content = content.strip()
        if not (content.startswith('{') or content.startswith('[')):
            return False

        # Strong JSON indicators that are less likely in other formats
        strong_indicators = [
            '{\n "',  # Pretty-printed JSON object
            '{\n\t"',  # Tab-indented JSON
            '{"',  # Compact JSON object
            '[\n {',  # Pretty-printed JSON array
            '[{',  # Compact JSON array
            '": {',  # Nested object
            '": ['  # Nested array
        ]

        # Check for strong indicators
        for indicator in strong_indicators:
            if indicator in content[:200]:
                return True

        # Weaker indicators - require multiple matches
        json_patterns = ['":', '": ', '",', ', "', '"]', '"}']
        pattern_count = sum(1 for pattern in json_patterns if pattern in content[:200])

        # Check for JSON keywords but not in URLs or natural text
        json_keywords = ['true', 'false', 'null']
        keyword_count = 0
        content_lower = content[:200].lower()
        for kw in json_keywords:
            # Check if keyword appears as a value (not in URL or sentence)
            if f': {kw}' in content_lower or f':{kw}' in content_lower or f', {kw}' in content_lower:
                keyword_count += 1

        # Require stronger evidence to avoid false positives
        return pattern_count >= 3 or keyword_count >= 1


    def _looks_like_csv(self, content: str) -> bool:
        """
        Check if content looks like CSV format with improved heuristics.

        Uses a multi-stage approach with performance optimization:
        1. Basic structural checks (cheap)
        2. Field length analysis (cheap, early exit)
        3. Pattern matching (moderate cost)
        4. Variance analysis (expensive, only if needed)
        """
        lines = content.split('\n', 10)[:10]  # Check first 10 lines for better accuracy
        non_empty_lines = [line for line in lines if line.strip()]

        # Stage 1: Basic structural checks
        if len(non_empty_lines) < 2:
            return False

        # Count commas in each line
        comma_counts = [line.count(',') for line in non_empty_lines]

        # Must have at least one comma per line
        if not all(count > 0 for count in comma_counts):
            return False

        # CSV should have consistent comma counts across lines
        if len(set(comma_counts)) != 1:
            return False

        num_fields = comma_counts[0] + 1  # Number of fields = commas + 1

        # Must have minimum number of fields to be CSV
        if num_fields < self.CSV_MIN_FIELDS:
            return False

        # Stage 2: Field length analysis (PERFORMANCE OPTIMIZATION: early exit)
        first_line = non_empty_lines[0]
        fields = first_line.split(',')

        # CSV fields should be relatively short (not long sentences)
        # Average field length should be reasonable (not paragraphs)
        # Early exit avoids expensive variance calculations for obvious prose
        avg_field_length = sum(len(f.strip()) for f in fields) / len(fields)
        if avg_field_length > self.CSV_MAX_FIELD_LENGTH:
            return False  # Too long to be typical CSV fields - exit early

        # Stage 3: Pattern matching
        # Check for CSV-like patterns:
        # 1. Fields that look like headers (short, alphanumeric)
        # 2. Quoted fields (common in CSV)
        # 3. Numeric fields
        has_quoted_fields = any('"' in line or "'" in line for line in non_empty_lines[:3])

        first_line_fields = [f.strip() for f in fields]
        # Check if first line looks like a header (short, no sentence-ending punctuation)
        looks_like_header = all(
            len(f) < self.CSV_MAX_HEADER_FIELD_LENGTH and not f.endswith('.') and not f.endswith('!')
            for f in first_line_fields
        )

        # Stage 4: Variance analysis (EXPENSIVE - only if we have enough data)
        # Check if subsequent lines have similar field structure
        # Real CSV tends to have consistent field lengths
        if len(non_empty_lines) >= 3:
            field_lengths_per_line = []
            for line in non_empty_lines[:5]:
                line_fields = line.split(',')
                field_lengths = [len(f.strip()) for f in line_fields]
                field_lengths_per_line.append(field_lengths)

            # Calculate variance in field positions
            # CSV data should have relatively consistent field lengths at each position
            # Natural text with commas will have much more variance
            position_variances = []
            for i in range(num_fields):
                lengths_at_position = [fl[i] if i < len(fl) else 0 for fl in field_lengths_per_line]
                if lengths_at_position:
                    avg = sum(lengths_at_position) / len(lengths_at_position)
                    variance = sum((x - avg) ** 2 for x in lengths_at_position) / len(lengths_at_position)
                    position_variances.append(variance)

            # High variance suggests natural text, not structured CSV
            if position_variances:
                avg_variance = sum(position_variances) / len(position_variances)
                if avg_variance > self.CSV_MAX_VARIANCE:
                    return False  # Very high variance = likely prose

        # Scoring: Require at least some CSV-like characteristics
        csv_score = 0
        if looks_like_header:
            csv_score += 1
        if has_quoted_fields:
            csv_score += 1
        if num_fields >= 3:  # Multiple fields is more CSV-like
            csv_score += 1

        # Need minimum score to confidently classify as CSV
        return csv_score >= self.CSV_MIN_SCORE


    def _is_text_file(self, content: str) -> bool:
        """Check if content appears to be plain text."""
        if not content or len(content) < 10:  # Need reasonable content
            return False

        # Check for high ratio of printable characters
        printable_chars = sum(1 for c in content if c.isprintable() or c.isspace())

        # Also check that it has reasonable line lengths (not binary data)
        lines = content.split('\n')
        max_line_length = max(len(line) for line in lines) if lines else 0

        # Text files typically have lines under 1000 chars and high printable ratio
        return (printable_chars / len(content) > 0.95 and
                max_line_length < 1000 and
                len(content) > 20)  # Minimum reasonable text file size


# Backward compatibility function
async def get_file_type(file_path: str) -> str:
    """
    Legacy function for compatibility with existing code.

    Args:
        file_path: Path to the file to analyze

    Returns:
        MIME type string

    Raises:
        UnsupportedTypeException: If file type cannot be determined
    """
    detector = FileDetector()
    return await detector.detect(file_path)
content_core/content/summary/core.py
ADDED
@@ -0,0 +1,15 @@
from functools import partial

from content_core.models import ModelFactory
from content_core.templated_message import TemplatedMessageInput, templated_message


async def summarize(content: str, context: str) -> str:
    templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
    response = await templated_message_fn(
        TemplatedMessageInput(
            user_prompt_template="content/summarize",
            data={"content": content, "context": context},
        )
    )
    return response
content_core/logging.py
ADDED
@@ -0,0 +1,15 @@
import sys
from loguru import logger

def configure_logging(debug=False):
    """
    Configure the global logger for the application.

    Args:
        debug (bool): If True, set logging level to DEBUG; otherwise, set to INFO.
    """
    logger.remove()  # Remove any existing handlers
    logger.add(sys.stderr, level="DEBUG" if debug else "INFO")

# Initial configuration with default level (INFO)
configure_logging(debug=False)
|