content-core 1.3.1__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/content/extraction/graph.py +4 -3
- content_core/content/identification/__init__.py +4 -3
- content_core/content/identification/file_detector.py +415 -0
- {content_core-1.3.1.dist-info → content_core-1.4.1.dist-info}/METADATA +17 -4
- {content_core-1.3.1.dist-info → content_core-1.4.1.dist-info}/RECORD +8 -7
- {content_core-1.3.1.dist-info → content_core-1.4.1.dist-info}/WHEEL +0 -0
- {content_core-1.3.1.dist-info → content_core-1.4.1.dist-info}/entry_points.txt +0 -0
- {content_core-1.3.1.dist-info → content_core-1.4.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -4,7 +4,6 @@ from typing import Any, Dict, Optional
|
|
|
4
4
|
from urllib.parse import urlparse
|
|
5
5
|
|
|
6
6
|
import aiohttp
|
|
7
|
-
import magic
|
|
8
7
|
from langgraph.graph import END, START, StateGraph
|
|
9
8
|
|
|
10
9
|
from content_core.common import (
|
|
@@ -54,12 +53,14 @@ async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
|
|
|
54
53
|
|
|
55
54
|
async def file_type(state: ProcessSourceState) -> Dict[str, Any]:
    """
    Identify the file using pure Python file detection
    """
    from content_core.content.identification import get_file_type

    result: Dict[str, Any] = {}
    file_path = state.file_path
    if file_path is None:
        # Nothing to identify without a path; return an empty update.
        return result
    result["identified_type"] = await get_file_type(file_path)
    result["title"] = os.path.basename(file_path)
    return result
|
|
65
66
|
|
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import
|
|
1
|
+
from .file_detector import FileDetector
|
|
2
2
|
|
|
3
3
|
|
|
4
4
|
async def get_file_type(file_path: str) -> str:
    """
    Identify the file using pure Python file detection
    """
    # Delegate to FileDetector, which performs signature, text, and
    # extension-based detection in order.
    return await FileDetector().detect(file_path)
|
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pure Python file type detection using magic bytes and content analysis.
|
|
3
|
+
Replaces libmagic dependency with a lightweight implementation.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import zipfile
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Dict, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
from content_core.common.exceptions import UnsupportedTypeException
|
|
12
|
+
from content_core.logging import logger
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class FileDetector:
    """Pure Python file type detection using magic bytes and content analysis.

    Detection strategy (in order):
      1. Binary signatures (magic bytes), including RIFF and ZIP sub-format
         inspection and MP4 ``ftyp`` box handling.
      2. Text content analysis (HTML, XML, JSON, YAML, CSV, plain text).
      3. File extension fallback.
    """

    # Configuration constants
    SIGNATURE_READ_SIZE = 512  # Bytes to read for binary signature detection
    TEXT_READ_SIZE = 1024  # Bytes to read for text content analysis

    def __init__(self):
        """Initialize the FileDetector with signature mappings."""
        self.binary_signatures = self._load_binary_signatures()
        self.text_patterns = self._load_text_patterns()
        self.extension_mapping = self._load_extension_mapping()
        self.zip_content_patterns = self._load_zip_content_patterns()

    def _load_binary_signatures(self) -> Dict[bytes, str]:
        """Load binary file signatures (magic bytes) to MIME type mappings."""
        # Ordered by specificity - longer/more specific signatures first
        # (dict preserves insertion order, and _detect_by_signature iterates
        # in that order, so specific JPEG variants must precede b'\xff\xd8').
        return {
            # PDF
            b'%PDF': 'application/pdf',  # PDF document signature (hex: 25 50 44 46)

            # Images
            b'\xff\xd8\xff\xe0': 'image/jpeg',  # JPEG with JFIF header (JPEG File Interchange Format)
            b'\xff\xd8\xff\xe1': 'image/jpeg',  # JPEG with EXIF header (Exchangeable Image File Format)
            b'\xff\xd8\xff\xe2': 'image/jpeg',  # JPEG with Adobe header (Adobe JPEG)
            b'\xff\xd8\xff\xdb': 'image/jpeg',  # JPEG with DQT (Define Quantization Table) marker
            b'\xff\xd8': 'image/jpeg',  # Generic JPEG signature (Start of Image marker, must be last)
            b'\x89PNG\r\n\x1a\n': 'image/png',  # PNG signature (hex: 89 50 4E 47 0D 0A 1A 0A)
            b'GIF87a': 'image/gif',  # GIF version 87a
            b'GIF89a': 'image/gif',  # GIF version 89a (supports animation and transparency)
            b'II*\x00': 'image/tiff',  # TIFF little-endian (Intel byte order)
            b'MM\x00*': 'image/tiff',  # TIFF big-endian (Motorola byte order)
            b'BM': 'image/bmp',  # Windows Bitmap signature

            # Audio
            b'ID3': 'audio/mpeg',  # MP3 with ID3v2 metadata tag
            b'\xff\xfb': 'audio/mpeg',  # MP3 frame sync with MPEG-1 Layer 3
            b'\xff\xf3': 'audio/mpeg',  # MP3 frame sync with MPEG-2 Layer 3
            b'\xff\xf2': 'audio/mpeg',  # MP3 frame sync with MPEG-2.5 Layer 3
            b'RIFF': None,  # Resource Interchange File Format - requires further inspection (could be WAV, AVI, WebP)
            b'fLaC': 'audio/flac',  # Free Lossless Audio Codec signature

            # Video/Audio containers - these will be handled by ftyp detection
            # MP4/M4A/MOV use ftyp box at offset 4 for identification

            # Archive formats
            b'PK\x03\x04': 'application/zip',  # ZIP archive (also used by DOCX, XLSX, PPTX, JAR, etc.)
            b'PK\x05\x06': 'application/zip',  # Empty ZIP archive (End of Central Directory)
        }

    def _load_text_patterns(self) -> Dict[str, str]:
        """Load text-based format detection patterns.

        Keys are content prefixes (matched case-insensitively); values are
        the MIME types they suggest.
        """
        return {
            '<!DOCTYPE html': 'text/html',
            '<!doctype html': 'text/html',
            '<html': 'text/html',
            '<?xml': 'text/xml',
            '{': 'application/json',  # Will need more validation
            '[': 'application/json',  # Will need more validation
            '---\n': 'text/yaml',
            '---\r\n': 'text/yaml',
        }

    def _load_extension_mapping(self) -> Dict[str, str]:
        """Load file extension to MIME type mappings as fallback."""
        return {
            # Documents
            '.pdf': 'application/pdf',
            '.txt': 'text/plain',
            '.md': 'text/plain',  # Markdown treated as plain text (current behavior)
            '.markdown': 'text/plain',
            '.rst': 'text/plain',  # reStructuredText
            '.log': 'text/plain',

            # Web formats
            '.html': 'text/html',
            '.htm': 'text/html',
            '.xhtml': 'text/html',
            '.xml': 'text/xml',

            # Data formats
            '.json': 'application/json',
            '.yaml': 'text/yaml',
            '.yml': 'text/yaml',
            '.csv': 'text/csv',
            '.tsv': 'text/csv',  # Tab-separated values

            # Images
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.jpe': 'image/jpeg',
            '.png': 'image/png',
            '.gif': 'image/gif',
            '.tiff': 'image/tiff',
            '.tif': 'image/tiff',
            '.bmp': 'image/bmp',
            '.webp': 'image/webp',
            '.ico': 'image/x-icon',
            '.svg': 'image/svg+xml',

            # Audio
            '.mp3': 'audio/mpeg',
            '.wav': 'audio/wav',
            '.wave': 'audio/wav',
            '.m4a': 'audio/mp4',
            '.aac': 'audio/aac',
            '.ogg': 'audio/ogg',
            '.oga': 'audio/ogg',
            '.flac': 'audio/flac',
            '.wma': 'audio/x-ms-wma',

            # Video
            '.mp4': 'video/mp4',
            '.m4v': 'video/mp4',
            '.avi': 'video/x-msvideo',
            '.mov': 'video/quicktime',
            '.qt': 'video/quicktime',
            '.wmv': 'video/x-ms-wmv',
            '.flv': 'video/x-flv',
            '.mkv': 'video/x-matroska',
            '.webm': 'video/webm',
            '.mpg': 'video/mpeg',
            '.mpeg': 'video/mpeg',
            '.3gp': 'video/3gpp',

            # Office formats
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',

            # E-books
            '.epub': 'application/epub+zip',

            # Archives (basic detection - not expanded)
            '.zip': 'application/zip',
            '.tar': 'application/x-tar',
            '.gz': 'application/gzip',
            '.bz2': 'application/x-bzip2',
            '.7z': 'application/x-7z-compressed',
            '.rar': 'application/x-rar-compressed',
        }

    def _load_zip_content_patterns(self) -> Dict[str, str]:
        """Load patterns for identifying ZIP-based formats by their content.

        Keys are archive member name prefixes that uniquely identify the
        Office/EPUB container formats built on ZIP.
        """
        return {
            'word/': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            'xl/': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            'ppt/': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
            'META-INF/container.xml': 'application/epub+zip',
        }

    async def detect(self, file_path: str) -> str:
        """
        Detect file type using magic bytes and content analysis.

        Args:
            file_path: Path to the file to analyze

        Returns:
            MIME type string

        Raises:
            FileNotFoundError: If the path does not exist
            ValueError: If the path is not a regular file
            UnsupportedTypeException: If file type cannot be determined
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        if not file_path.is_file():
            raise ValueError(f"Not a file: {file_path}")

        # Try binary signature detection first
        mime_type = await self._detect_by_signature(file_path)
        if mime_type:
            logger.debug(f"Detected {file_path} as {mime_type} by signature")
            return mime_type

        # Try text-based detection
        mime_type = await self._detect_text_format(file_path)
        if mime_type:
            logger.debug(f"Detected {file_path} as {mime_type} by text analysis")
            return mime_type

        # Fallback to extension
        mime_type = self._detect_by_extension(file_path)
        if mime_type:
            logger.debug(f"Detected {file_path} as {mime_type} by extension")
            return mime_type

        # If all detection methods fail
        raise UnsupportedTypeException(f"Unable to determine file type for: {file_path}")

    async def _detect_by_signature(self, file_path: Path) -> Optional[str]:
        """Detect file type by binary signature (magic bytes).

        Returns the MIME type, or None when no known signature matches or
        the file cannot be read.
        """
        try:
            with open(file_path, 'rb') as f:
                # Read bytes for signature detection
                header = f.read(self.SIGNATURE_READ_SIZE)

            if not header:
                return None

            # Check for exact signature matches
            for signature, mime_type in self.binary_signatures.items():
                if header.startswith(signature):
                    # Special handling for RIFF containers: the fourCC at
                    # bytes 8-12 distinguishes WAV, AVI, and WebP.
                    if signature == b'RIFF' and len(header) >= 12:
                        if header[8:12] == b'WAVE':
                            return 'audio/wav'
                        elif header[8:12] == b'AVI ':
                            return 'video/x-msvideo'
                        elif header[8:12] == b'WEBP':
                            # Fix: WebP is RIFF-based; previously it was only
                            # caught by the extension fallback, so renamed or
                            # extensionless WebP files went undetected.
                            return 'image/webp'

                    # Special handling for ZIP-based formats
                    if mime_type == 'application/zip':
                        zip_mime = await self._detect_zip_format(file_path)
                        if zip_mime:
                            return zip_mime

                    if mime_type:
                        return mime_type

            # Special check for MP4/MOV files with ftyp box
            if len(header) >= 12 and header[4:8] == b'ftyp':
                ftyp_brand = header[8:12]
                # Don't strip - check exact 4-byte brand
                if ftyp_brand == b'M4A ' or ftyp_brand.startswith(b'M4A'):
                    return 'audio/mp4'
                elif ftyp_brand in [b'mp41', b'mp42', b'isom', b'iso2', b'iso5', b'M4V ', b'M4VP']:
                    return 'video/mp4'
                elif ftyp_brand.startswith(b'qt'):
                    return 'video/quicktime'
                else:
                    # Generic MP4 for other ftyp brands
                    return 'video/mp4'

            return None

        except Exception as e:
            # Best-effort: unreadable files fall through to other detectors.
            logger.debug(f"Error reading file signature: {e}")
            return None

    async def _detect_zip_format(self, file_path: Path) -> Optional[str]:
        """Detect specific ZIP-based format (DOCX, XLSX, PPTX, EPUB).

        Returns the specific MIME type, 'application/zip' for a plain
        archive, or None when the file is not a valid ZIP.
        """
        try:
            with zipfile.ZipFile(file_path, 'r') as zf:
                namelist = zf.namelist()

                # Check for specific content patterns
                for pattern, mime_type in self.zip_content_patterns.items():
                    if any(name.startswith(pattern) for name in namelist):
                        return mime_type

                # If it's a valid ZIP but no specific pattern matched
                return 'application/zip'

        except zipfile.BadZipFile:
            logger.debug(f"Invalid ZIP file: {file_path}")
            return None
        except Exception as e:
            logger.debug(f"Error inspecting ZIP content: {e}")
            return None

    async def _detect_text_format(self, file_path: Path) -> Optional[str]:
        """Detect text-based formats by content analysis.

        Returns a MIME type for recognized text formats (HTML, JSON, CSV,
        plain text, ...), or None for binary/unrecognizable content.
        """
        try:
            # Read bytes for text content analysis
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read(self.TEXT_READ_SIZE)

            if not content or len(content) < 10:
                return None

            # Strip whitespace for analysis
            content_stripped = content.strip()

            # Check for text patterns
            for pattern, mime_type in self.text_patterns.items():
                if content_stripped.lower().startswith(pattern.lower()):
                    # Special validation for JSON
                    if mime_type == 'application/json':
                        if self._is_valid_json_start(content_stripped):
                            return mime_type
                    # HTML needs to be detected for routing
                    elif mime_type == 'text/html':
                        return mime_type
                    # For other text patterns (YAML, etc), just return text/plain
                    else:
                        return 'text/plain'

            # Check for CSV pattern (multiple comma-separated values)
            if self._looks_like_csv(content):
                return 'text/csv'

            # If it's readable text but no specific format detected
            if self._is_text_file(content):
                return 'text/plain'

            return None

        except UnicodeDecodeError:
            # Defensive: errors='replace' should prevent this, but keep the
            # guard so a decoding failure never aborts detection.
            return None
        except Exception as e:
            logger.debug(f"Error analyzing text content: {e}")
            return None

    def _detect_by_extension(self, file_path: Path) -> Optional[str]:
        """Detect file type by extension as fallback."""
        extension = file_path.suffix.lower()
        return self.extension_mapping.get(extension)

    def _is_valid_json_start(self, content: str) -> bool:
        """Check if content starts like valid JSON.

        Heuristic only: looks for structural JSON patterns near the start
        of the content rather than fully parsing it.
        """
        # More robust JSON detection
        content = content.strip()
        if not (content.startswith('{') or content.startswith('[')):
            return False

        # Strong JSON indicators that are less likely in other formats
        strong_indicators = [
            '{\n  "',  # Pretty-printed JSON object
            '{\n\t"',  # Tab-indented JSON
            '{"',  # Compact JSON object
            '[\n  {',  # Pretty-printed JSON array
            '[{',  # Compact JSON array
            '": {',  # Nested object
            '": ['  # Nested array
        ]

        # Check for strong indicators
        for indicator in strong_indicators:
            if indicator in content[:200]:
                return True

        # Weaker indicators - require multiple matches
        json_patterns = ['":', '": ', '",', ', "', '"]', '"}']
        pattern_count = sum(1 for pattern in json_patterns if pattern in content[:200])

        # Check for JSON keywords but not in URLs or natural text
        json_keywords = ['true', 'false', 'null']
        keyword_count = 0
        content_lower = content[:200].lower()
        for kw in json_keywords:
            # Check if keyword appears as a value (not in URL or sentence)
            if f': {kw}' in content_lower or f':{kw}' in content_lower or f', {kw}' in content_lower:
                keyword_count += 1

        # Require stronger evidence to avoid false positives
        return pattern_count >= 3 or keyword_count >= 1

    def _looks_like_csv(self, content: str) -> bool:
        """Check if content looks like CSV format.

        Requires at least two lines with a consistent, non-zero comma count.
        """
        lines = content.split('\n', 5)[:5]  # Check first 5 lines
        if len(lines) < 2:
            return False

        # Count commas in each line
        comma_counts = [line.count(',') for line in lines if line.strip()]
        if not comma_counts:
            return False

        # CSV should have consistent comma counts
        return len(set(comma_counts)) == 1 and comma_counts[0] > 0

    def _is_text_file(self, content: str) -> bool:
        """Check if content appears to be plain text."""
        if not content or len(content) < 10:  # Need reasonable content
            return False

        # Check for high ratio of printable characters
        printable_chars = sum(1 for c in content if c.isprintable() or c.isspace())

        # Also check that it has reasonable line lengths (not binary data)
        lines = content.split('\n')
        max_line_length = max(len(line) for line in lines) if lines else 0

        # Text files typically have lines under 1000 chars and high printable ratio
        return (printable_chars / len(content) > 0.95 and
                max_line_length < 1000 and
                len(content) > 20)  # Minimum reasonable text file size
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
# Backward compatibility function
|
|
401
|
+
async def get_file_type(file_path: str) -> str:
    """
    Legacy function for compatibility with existing code.

    Args:
        file_path: Path to the file to analyze

    Returns:
        MIME type string

    Raises:
        UnsupportedTypeException: If file type cannot be determined
    """
    # Thin wrapper: construct a detector and delegate the whole job to it.
    return await FileDetector().detect(file_path)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.4.1
|
|
4
4
|
Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -24,8 +24,6 @@ Requires-Dist: pillow>=10.4.0
|
|
|
24
24
|
Requires-Dist: pymupdf>=1.25.5
|
|
25
25
|
Requires-Dist: python-docx>=1.1.2
|
|
26
26
|
Requires-Dist: python-dotenv>=1.1.0
|
|
27
|
-
Requires-Dist: python-magic-bin==0.4.14; sys_platform == 'win32'
|
|
28
|
-
Requires-Dist: python-magic>=0.4.27
|
|
29
27
|
Requires-Dist: python-pptx>=1.0.2
|
|
30
28
|
Requires-Dist: pytubefix>=9.1.1
|
|
31
29
|
Requires-Dist: readability-lxml>=0.8.4.1
|
|
@@ -38,6 +36,14 @@ Description-Content-Type: text/markdown
|
|
|
38
36
|
# Content Core
|
|
39
37
|
|
|
40
38
|
[](https://opensource.org/licenses/MIT)
|
|
39
|
+
[](https://badge.fury.io/py/content-core)
|
|
40
|
+
[](https://pepy.tech/project/content-core)
|
|
41
|
+
[](https://pepy.tech/project/content-core)
|
|
42
|
+
[](https://github.com/lfnovo/content-core)
|
|
43
|
+
[](https://github.com/lfnovo/content-core)
|
|
44
|
+
[](https://github.com/lfnovo/content-core/issues)
|
|
45
|
+
[](https://github.com/psf/black)
|
|
46
|
+
[](https://github.com/astral-sh/ruff)
|
|
41
47
|
|
|
42
48
|
**Content Core** is a powerful, AI-powered content extraction and processing platform that transforms any source into clean, structured content. Extract text from websites, transcribe videos, process documents, and generate AI summaries—all through a unified interface with multiple integration options.
|
|
43
49
|
|
|
@@ -103,12 +109,13 @@ summary = await cc.summarize_content(result, context="explain to a child")
|
|
|
103
109
|
* **⚡ Zero-Install Options:** Use `uvx` for instant access without installation
|
|
104
110
|
* **🧠 AI-Powered Processing:** LLM integration for content cleaning and summarization
|
|
105
111
|
* **🔄 Asynchronous:** Built with `asyncio` for efficient processing
|
|
112
|
+
* **🐍 Pure Python Implementation:** No system dependencies required - simplified installation across all platforms
|
|
106
113
|
|
|
107
114
|
## Getting Started
|
|
108
115
|
|
|
109
116
|
### Installation
|
|
110
117
|
|
|
111
|
-
Install Content Core using `pip
|
|
118
|
+
Install Content Core using `pip` - **no system dependencies required!**
|
|
112
119
|
|
|
113
120
|
```bash
|
|
114
121
|
# Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
|
|
@@ -124,6 +131,8 @@ pip install content-core
|
|
|
124
131
|
pip install content-core[docling]
|
|
125
132
|
```
|
|
126
133
|
|
|
134
|
+
> **Note:** Unlike many content extraction tools, Content Core uses pure Python implementations and doesn't require system libraries like libmagic. This ensures consistent, hassle-free installation across Windows, macOS, and Linux.
|
|
135
|
+
|
|
127
136
|
Alternatively, if you’re developing locally:
|
|
128
137
|
|
|
129
138
|
```bash
|
|
@@ -264,6 +273,10 @@ For more information on how to use the Content Core library, including details o
|
|
|
264
273
|
|
|
265
274
|
Content Core includes a Model Context Protocol (MCP) server that enables seamless integration with Claude Desktop and other MCP-compatible applications. The MCP server exposes Content Core's powerful extraction capabilities through a standardized protocol.
|
|
266
275
|
|
|
276
|
+
<a href="https://glama.ai/mcp/servers/@lfnovo/content-core">
|
|
277
|
+
<img width="380" height="200" src="https://glama.ai/mcp/servers/@lfnovo/content-core/badge" />
|
|
278
|
+
</a>
|
|
279
|
+
|
|
267
280
|
### Quick Setup with Claude Desktop
|
|
268
281
|
|
|
269
282
|
```bash
|
|
@@ -15,8 +15,9 @@ content_core/content/__init__.py,sha256=7IxfLTUHKyHjoT4MfWM2PX2J3QBeYcuERzE9vFeF
|
|
|
15
15
|
content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
|
|
16
16
|
content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
|
|
17
17
|
content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
|
|
18
|
-
content_core/content/extraction/graph.py,sha256=
|
|
19
|
-
content_core/content/identification/__init__.py,sha256=
|
|
18
|
+
content_core/content/extraction/graph.py,sha256=AFi9B_hTuxqdgvogCOk4Xdqoboug7_KXtV0ZHlb8igM,8139
|
|
19
|
+
content_core/content/identification/__init__.py,sha256=DDoCi1r-6Z_pGPPi3X1ZwyRrcRtg-rAiCTK50hnO5Y0,235
|
|
20
|
+
content_core/content/identification/file_detector.py,sha256=s_10Osxv8gfVfs3UPXFzCOosvWCrf4ZCFXcW2yimUIM,17170
|
|
20
21
|
content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
|
|
21
22
|
content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
|
|
22
23
|
content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
|
|
@@ -35,8 +36,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
|
|
|
35
36
|
content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
|
|
36
37
|
content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
|
|
37
38
|
content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
|
|
38
|
-
content_core-1.
|
|
39
|
-
content_core-1.
|
|
40
|
-
content_core-1.
|
|
41
|
-
content_core-1.
|
|
42
|
-
content_core-1.
|
|
39
|
+
content_core-1.4.1.dist-info/METADATA,sha256=MQIVrPCeN9dE1JZ8UbcE4NSch7tfGEI-mauLTtbYLoE,21093
|
|
40
|
+
content_core-1.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
41
|
+
content_core-1.4.1.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
|
|
42
|
+
content_core-1.4.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
|
|
43
|
+
content_core-1.4.1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|