content-core 1.3.1__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core-1.4.0/.claude/sessions/OSS-216/architecture.md +195 -0
- content_core-1.4.0/.claude/sessions/OSS-216/context.md +54 -0
- content_core-1.4.0/.claude/sessions/OSS-216/plan.md +195 -0
- {content_core-1.3.1 → content_core-1.4.0}/.gitignore +1 -1
- content_core-1.4.0/CHANGELOG.md +31 -0
- {content_core-1.3.1 → content_core-1.4.0}/PKG-INFO +17 -4
- {content_core-1.3.1 → content_core-1.4.0}/README.md +16 -1
- {content_core-1.3.1 → content_core-1.4.0}/docs/usage.md +34 -0
- {content_core-1.3.1 → content_core-1.4.0}/pyproject.toml +1 -3
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/content/extraction/graph.py +4 -3
- content_core-1.4.0/src/content_core/content/identification/__init__.py +9 -0
- content_core-1.4.0/src/content_core/content/identification/file_detector.py +415 -0
- content_core-1.4.0/tests/unit/test_file_detector.py +60 -0
- content_core-1.4.0/tests/unit/test_file_detector_critical.py +186 -0
- content_core-1.4.0/tests/unit/test_file_detector_performance.py +126 -0
- {content_core-1.3.1 → content_core-1.4.0}/uv.lock +3086 -3017
- content_core-1.3.1/src/content_core/content/identification/__init__.py +0 -8
- {content_core-1.3.1 → content_core-1.4.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/.github/workflows/claude-code-review.yml +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/.github/workflows/claude.yml +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/.github/workflows/publish.yml +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/.python-version +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/CONTRIBUTING.md +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/LICENSE +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/Makefile +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/docs/macos.md +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/docs/mcp.md +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/docs/processors.md +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/docs/raycast.md +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/examples/main.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/prompts/content/cleanup.jinja +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/prompts/content/summarize.jinja +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/.eslintrc.json +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/CHANGELOG.md +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/README.md +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/assets/command-icon.png +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/package-lock.json +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/package.json +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/raycast-env.d.ts +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/src/extract-content.tsx +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/src/quick-extract.tsx +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/src/summarize-content.tsx +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/src/utils/content-core.ts +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/src/utils/types.ts +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/raycast-content-core/tsconfig.json +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/__init__.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/cc_config.yaml +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/common/__init__.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/common/exceptions.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/common/state.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/common/types.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/common/utils.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/config.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/content/__init__.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/content/summary/core.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/logging.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/mcp/__init__.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/mcp/server.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/models.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/models_config.yaml +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/notebooks/urls.ipynb +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/processors/audio.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/processors/docling.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/processors/office.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/processors/pdf.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/processors/text.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/processors/url.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/processors/video.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/processors/youtube.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/py.typed +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/templated_message.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/tools/__init__.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/tools/cleanup.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/tools/extract.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/src/content_core/tools/summarize.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/input_content/file.docx +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/input_content/file.epub +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/input_content/file.md +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/input_content/file.mp3 +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/input_content/file.mp4 +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/input_content/file.pdf +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/input_content/file.pptx +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/input_content/file.txt +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/input_content/file.xlsx +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/input_content/new_pdf.pdf +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/integration/test_cli.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/integration/test_extraction.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/unit/test_config.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/unit/test_docling.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/unit/test_mcp_server.py +0 -0
- {content_core-1.3.1 → content_core-1.4.0}/tests/unit/test_pymupdf_ocr.py +0 -0
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# OSS-216: Remove libmagic Dependency - Architecture
|
|
2
|
+
|
|
3
|
+
## High-Level System Overview
|
|
4
|
+
|
|
5
|
+
### Current State (Before)
|
|
6
|
+
```
|
|
7
|
+
File Input → magic.from_file() → MIME Type → Content Router → Processor
|
|
8
|
+
↑
|
|
9
|
+
libmagic (C library)
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
### Target State (After)
|
|
13
|
+
```
|
|
14
|
+
File Input → file_detector.detect() → MIME Type → Content Router → Processor
|
|
15
|
+
↑
|
|
16
|
+
Pure Python Detection
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Affected Components
|
|
20
|
+
|
|
21
|
+
### 1. `/src/content_core/content/identification/__init__.py`
|
|
22
|
+
- **Current**: Single function `get_file_type()` using `magic.from_file()`
|
|
23
|
+
- **Change**: Replace with new detection module
|
|
24
|
+
- **Dependencies**: None (isolated module)
|
|
25
|
+
|
|
26
|
+
### 2. `/src/content_core/content/extraction/graph.py`
|
|
27
|
+
- **Current**: `file_type()` function using `magic.from_file()` at line 62
|
|
28
|
+
- **Change**: Replace with new detection function call
|
|
29
|
+
- **Dependencies**: Routes to various processors based on MIME type
|
|
30
|
+
|
|
31
|
+
### 3. `/pyproject.toml`
|
|
32
|
+
- **Current**: Dependencies on `python-magic>=0.4.27` and `python-magic-bin==0.4.14`
|
|
33
|
+
- **Change**: Remove both dependencies
|
|
34
|
+
|
|
35
|
+
## New Component Design
|
|
36
|
+
|
|
37
|
+
### File Detection Module (`/src/content_core/content/identification/file_detector.py`)
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
class FileDetector:
|
|
41
|
+
"""Pure Python file type detection using magic bytes and content analysis."""
|
|
42
|
+
|
|
43
|
+
def __init__(self):
|
|
44
|
+
self.signatures = self._load_signatures()
|
|
45
|
+
self.text_patterns = self._load_text_patterns()
|
|
46
|
+
|
|
47
|
+
async def detect(self, file_path: str) -> str:
|
|
48
|
+
"""Main detection method returning MIME type."""
|
|
49
|
+
# 1. Read first 512 bytes
|
|
50
|
+
# 2. Check binary signatures
|
|
51
|
+
# 3. If no match, analyze as text
|
|
52
|
+
# 4. Fallback to extension mapping
|
|
53
|
+
# 5. Raise UnsupportedTypeException if all fail
|
|
54
|
+
|
|
55
|
+
# Backward compatibility function
|
|
56
|
+
async def get_file_type(file_path: str) -> str:
|
|
57
|
+
"""Legacy function for compatibility."""
|
|
58
|
+
detector = FileDetector()
|
|
59
|
+
return await detector.detect(file_path)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Signature Mappings
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
BINARY_SIGNATURES = {
|
|
66
|
+
# PDFs
|
|
67
|
+
b'%PDF': 'application/pdf',
|
|
68
|
+
|
|
69
|
+
# Office formats (ZIP-based)
|
|
70
|
+
b'PK\x03\x04': 'application/zip', # Will need content analysis
|
|
71
|
+
|
|
72
|
+
# Images
|
|
73
|
+
b'\xff\xd8\xff': 'image/jpeg',
|
|
74
|
+
b'\x89PNG\r\n\x1a\n': 'image/png',
|
|
75
|
+
b'GIF87a': 'image/gif',
|
|
76
|
+
b'GIF89a': 'image/gif',
|
|
77
|
+
b'II*\x00': 'image/tiff',
|
|
78
|
+
b'MM\x00*': 'image/tiff',
|
|
79
|
+
|
|
80
|
+
# Audio/Video
|
|
81
|
+
b'ID3': 'audio/mpeg',
|
|
82
|
+
b'\xff\xfb': 'audio/mpeg',
|
|
83
|
+
b'RIFF': 'audio/wav', # Also video/avi
|
|
84
|
+
b'\x00\x00\x00\x14ftypM4A': 'audio/mp4',
|
|
85
|
+
b'\x00\x00\x00\x18ftypmp4': 'video/mp4',
|
|
86
|
+
b'\x00\x00\x00\x14ftypisom': 'video/mp4',
|
|
87
|
+
|
|
88
|
+
# EPUB
|
|
89
|
+
# EPUB shares the ZIP signature b'PK\x03\x04' listed above (repeating the key here would silently override that entry); it is told apart via ZIP_CONTENT_PATTERNS
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
# For ZIP-based formats, check internal structure
|
|
93
|
+
ZIP_CONTENT_PATTERNS = {
|
|
94
|
+
'word/': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
95
|
+
'xl/': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
96
|
+
'ppt/': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
|
97
|
+
'META-INF/container.xml': 'application/epub+zip',
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
# Text-based format detection
|
|
101
|
+
TEXT_PATTERNS = {
|
|
102
|
+
'<!DOCTYPE html': 'text/html',
|
|
103
|
+
'<html': 'text/html',
|
|
104
|
+
'<?xml': 'text/xml',
|
|
105
|
+
'{"': 'application/json',
|
|
106
|
+
'[{': 'application/json',
|
|
107
|
+
'---\n': 'text/yaml',
|
|
108
|
+
'#': 'text/markdown', # Weak, needs more context
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
# Extension fallback mapping
|
|
112
|
+
EXTENSION_MAPPING = {
|
|
113
|
+
'.pdf': 'application/pdf',
|
|
114
|
+
'.txt': 'text/plain',
|
|
115
|
+
'.md': 'text/plain', # Current behavior
|
|
116
|
+
'.html': 'text/html',
|
|
117
|
+
'.json': 'application/json',
|
|
118
|
+
'.csv': 'text/csv',
|
|
119
|
+
'.mp4': 'video/mp4',
|
|
120
|
+
'.mp3': 'audio/mpeg',
|
|
121
|
+
'.wav': 'audio/wav',
|
|
122
|
+
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
123
|
+
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
124
|
+
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
|
|
125
|
+
'.epub': 'application/epub+zip',
|
|
126
|
+
}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Implementation Strategy
|
|
130
|
+
|
|
131
|
+
### Phase 1: Create Detection Module
|
|
132
|
+
1. Implement `FileDetector` class with all detection logic
|
|
133
|
+
2. Handle ZIP-based formats by checking internal structure
|
|
134
|
+
3. Implement robust text format detection
|
|
135
|
+
4. Add comprehensive logging for debugging
|
|
136
|
+
|
|
137
|
+
### Phase 2: Integration
|
|
138
|
+
1. Update `get_file_type()` to use new detector
|
|
139
|
+
2. Update `file_type()` in graph.py
|
|
140
|
+
3. Ensure all MIME type strings match expected values
|
|
141
|
+
|
|
142
|
+
### Phase 3: Cleanup
|
|
143
|
+
1. Remove magic imports
|
|
144
|
+
2. Update pyproject.toml dependencies
|
|
145
|
+
3. Run tests to ensure compatibility
|
|
146
|
+
|
|
147
|
+
## Patterns & Best Practices
|
|
148
|
+
|
|
149
|
+
### Error Handling
|
|
150
|
+
- Maintain existing `UnsupportedTypeException` behavior
|
|
151
|
+
- Add specific error messages for debugging
|
|
152
|
+
- Log detection attempts for troubleshooting
|
|
153
|
+
|
|
154
|
+
### Async Pattern
|
|
155
|
+
- Keep async interface for consistency
|
|
156
|
+
- Use `aiofiles` if needed for async file reading
|
|
157
|
+
|
|
158
|
+
### Extensibility
|
|
159
|
+
- Design for easy addition of new signatures
|
|
160
|
+
- Consider configuration file for custom mappings
|
|
161
|
+
|
|
162
|
+
## External Dependencies
|
|
163
|
+
- **None** - Pure Python implementation
|
|
164
|
+
- Uses only standard library: `os`, `pathlib`, `zipfile`
|
|
165
|
+
|
|
166
|
+
## Trade-offs & Alternatives
|
|
167
|
+
|
|
168
|
+
### Trade-offs
|
|
169
|
+
1. **Performance**: Slightly slower than libmagic C library, but acceptable per requirements
|
|
170
|
+
2. **Accuracy**: May have edge cases libmagic handles better, but covers all current use cases
|
|
171
|
+
3. **Maintenance**: More code to maintain, but removes deployment complexity
|
|
172
|
+
|
|
173
|
+
### Alternatives Considered
|
|
174
|
+
1. **python-magic-bin fork**: Still has binary dependencies
|
|
175
|
+
2. **filetype library**: Pure Python but limited format support
|
|
176
|
+
3. **Custom C extension**: Defeats purpose of removing binary dependencies
|
|
177
|
+
|
|
178
|
+
## Negative Consequences
|
|
179
|
+
1. **Potential edge cases**: Some obscure file formats might not be detected correctly
|
|
180
|
+
2. **Maintenance burden**: Need to update signatures for new formats
|
|
181
|
+
3. **Slightly larger codebase**: Adding ~200 lines of detection code
|
|
182
|
+
|
|
183
|
+
## Files to Edit/Create
|
|
184
|
+
|
|
185
|
+
### Create:
|
|
186
|
+
1. `/src/content_core/content/identification/file_detector.py` - Main detection logic
|
|
187
|
+
|
|
188
|
+
### Edit:
|
|
189
|
+
1. `/src/content_core/content/identification/__init__.py` - Update to use new detector
|
|
190
|
+
2. `/src/content_core/content/extraction/graph.py` - Replace magic.from_file() call
|
|
191
|
+
3. `/pyproject.toml` - Remove python-magic dependencies
|
|
192
|
+
|
|
193
|
+
### No Changes Needed:
|
|
194
|
+
- All processor files (they only check MIME types, don't detect them)
|
|
195
|
+
- Test files (will continue to work with same MIME types)
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# OSS-216: Remove libmagic Dependency - Context
|
|
2
|
+
|
|
3
|
+
## Why This Is Being Built
|
|
4
|
+
|
|
5
|
+
- **Deployment Friction**: libmagic requires OS-level installation which creates deployment problems
|
|
6
|
+
- **Cross-Platform Issues**: Binary dependency causes installation problems across Windows, macOS, and Linux
|
|
7
|
+
- **Simplification**: Removing system dependencies makes the package easier to install and use
|
|
8
|
+
- **Maintain Functionality**: Need to keep intelligent file detection without external dependencies
|
|
9
|
+
|
|
10
|
+
## Expected Outcome
|
|
11
|
+
|
|
12
|
+
Replace libmagic with a pure Python implementation that:
|
|
13
|
+
- Detects file types using magic bytes/file signatures (first 512 bytes)
|
|
14
|
+
- Maintains the current routing behavior to appropriate content processors
|
|
15
|
+
- Works across all platforms without OS-level dependencies
|
|
16
|
+
- Keeps the same error handling (UnsupportedTypeException for unsupported types)
|
|
17
|
+
|
|
18
|
+
## Implementation Approach
|
|
19
|
+
|
|
20
|
+
1. **File Signature Detection System**:
|
|
21
|
+
- Build comprehensive mapping of file signatures to MIME types
|
|
22
|
+
- Read first 512 bytes to identify format by magic bytes
|
|
23
|
+
- Special handling for Office formats (DOCX, XLSX, PPTX) which are ZIP-based
|
|
24
|
+
- Content structure analysis for text formats (HTML, JSON, XML)
|
|
25
|
+
|
|
26
|
+
2. **Detection Priority** (as discussed):
|
|
27
|
+
- Primary: File signature/magic bytes detection
|
|
28
|
+
- Secondary: Content analysis for text formats
|
|
29
|
+
- Tertiary: File extension as final fallback
|
|
30
|
+
- If file extension and content disagree, prioritize content analysis
|
|
31
|
+
|
|
32
|
+
3. **Replace Current Usage**:
|
|
33
|
+
- Remove imports of `magic` library
|
|
34
|
+
- Replace `magic.from_file()` calls in:
|
|
35
|
+
- `/src/content_core/content/identification/__init__.py`
|
|
36
|
+
- `/src/content_core/content/extraction/graph.py`
|
|
37
|
+
- Remove dependencies from `pyproject.toml`
|
|
38
|
+
|
|
39
|
+
## Testing Approach
|
|
40
|
+
|
|
41
|
+
- Comprehensive testing will be handled later
|
|
42
|
+
- Focus on maintaining existing functionality
|
|
43
|
+
- Ensure all currently supported file types continue to work
|
|
44
|
+
|
|
45
|
+
## Dependencies
|
|
46
|
+
|
|
47
|
+
No new dependencies - implementation should be pure Python using only standard library.
|
|
48
|
+
|
|
49
|
+
## Constraints
|
|
50
|
+
|
|
51
|
+
- 512 bytes buffer is sufficient (no need for deep ZIP inspection)
|
|
52
|
+
- Performance is not a concern (load is small)
|
|
53
|
+
- Maintain current error behavior (raise UnsupportedTypeException)
|
|
54
|
+
- MIME type strings can be adjusted as long as routing works correctly
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# OSS-216: Remove libmagic Dependency
|
|
2
|
+
|
|
3
|
+
If you are working on this feature, make sure to update this plan.md file as you go.
|
|
4
|
+
|
|
5
|
+
## PHASE 1: Create Pure Python File Detection Module [Completed ✅]
|
|
6
|
+
|
|
7
|
+
Build the core file detection system to replace libmagic with pure Python implementation.
|
|
8
|
+
|
|
9
|
+
### Create file_detector.py with basic structure [Completed ✅]
|
|
10
|
+
|
|
11
|
+
Create `/src/content_core/content/identification/file_detector.py` with:
|
|
12
|
+
- FileDetector class skeleton
|
|
13
|
+
- Basic signature mappings for binary formats (PDF, images)
|
|
14
|
+
- Simple detect() method that reads first 512 bytes
|
|
15
|
+
- Raise UnsupportedTypeException for unknown types
|
|
16
|
+
|
|
17
|
+
### Implement binary format detection [Completed ✅]
|
|
18
|
+
|
|
19
|
+
Add detection for binary formats:
|
|
20
|
+
- PDF files (magic bytes: `%PDF`)
|
|
21
|
+
- Common image formats (JPEG, PNG, GIF, TIFF, BMP)
|
|
22
|
+
- Audio formats (MP3, WAV, M4A)
|
|
23
|
+
- Video formats (MP4, AVI, MOV)
|
|
24
|
+
- Test each format with sample files
|
|
25
|
+
|
|
26
|
+
### Implement ZIP-based format detection [Completed ✅]
|
|
27
|
+
|
|
28
|
+
Handle Office and EPUB formats that use ZIP containers:
|
|
29
|
+
- Detect ZIP magic bytes (`PK\x03\x04`)
|
|
30
|
+
- Use zipfile module to inspect internal structure
|
|
31
|
+
- Differentiate DOCX (word/), XLSX (xl/), PPTX (ppt/), EPUB (META-INF/container.xml)
|
|
32
|
+
- Handle corrupted or password-protected ZIP files gracefully
|
|
33
|
+
|
|
34
|
+
### Comments:
|
|
35
|
+
- Focus on accurate detection over performance
|
|
36
|
+
- Ensure all MIME types match exactly what libmagic returns
|
|
37
|
+
- **Implementation notes from Phase 1:**
|
|
38
|
+
- Added comprehensive binary signatures with ordered checking (longer signatures first)
|
|
39
|
+
- Implemented generic ftyp box detection for MP4/MOV files for better compatibility
|
|
40
|
+
- Added FLAC audio format support
|
|
41
|
+
- Special RIFF handling differentiates between WAV and AVI
|
|
42
|
+
- Text detection requires minimum content length to avoid false positives
|
|
43
|
+
- All core file types tested and working correctly
|
|
44
|
+
|
|
45
|
+
## PHASE 2: Text Format Detection and Fallbacks [Completed ✅]
|
|
46
|
+
|
|
47
|
+
Implement text-based format detection and extension fallback mechanism.
|
|
48
|
+
|
|
49
|
+
### Add text format detection [Completed ✅]
|
|
50
|
+
|
|
51
|
+
Implement content analysis for text formats:
|
|
52
|
+
- HTML detection (`<!DOCTYPE html>`, `<html>` tags)
|
|
53
|
+
- XML detection (`<?xml` declaration)
|
|
54
|
+
- JSON detection (starts with { or [)
|
|
55
|
+
- YAML detection (--- header)
|
|
56
|
+
- Markdown detection (combine multiple indicators)
|
|
57
|
+
- CSV detection (analyze structure)
|
|
58
|
+
- Plain text as default for unrecognized text
|
|
59
|
+
|
|
60
|
+
### Implement extension fallback system [Completed ✅]
|
|
61
|
+
|
|
62
|
+
Create comprehensive extension mapping:
|
|
63
|
+
- Map common file extensions to MIME types
|
|
64
|
+
- Use as last resort when content detection fails
|
|
65
|
+
- Log when falling back to extension
|
|
66
|
+
- Maintain compatibility with current behavior
|
|
67
|
+
|
|
68
|
+
### Add detection method priority logic [Completed ✅]
|
|
69
|
+
|
|
70
|
+
Implement the agreed priority order:
|
|
71
|
+
1. Binary signature detection (most reliable)
|
|
72
|
+
2. Content analysis for text formats
|
|
73
|
+
3. File extension as final fallback
|
|
74
|
+
- Add logging at each detection stage
|
|
75
|
+
- Return appropriate MIME type or raise exception
|
|
76
|
+
|
|
77
|
+
### Comments:
|
|
78
|
+
- Text detection needs to be careful to avoid false positives
|
|
79
|
+
- Extension fallback ensures graceful degradation
|
|
80
|
+
- **Implementation notes from Phase 2:**
|
|
81
|
+
- Enhanced JSON detection with pattern matching and keyword checking
|
|
82
|
+
- Improved YAML detection to avoid conflicts with Markdown
|
|
83
|
+
- Added sophisticated Markdown scoring system (headers, lists, links, etc.)
|
|
84
|
+
- Extended extension mapping to cover more file types (70+ extensions)
|
|
85
|
+
- Fixed YAML/Markdown detection priority to avoid false positives
|
|
86
|
+
- Added minimum content requirements for text detection
|
|
87
|
+
- All text formats tested with edge cases
|
|
88
|
+
|
|
89
|
+
## PHASE 3: Integration with Existing Code [Completed ✅]
|
|
90
|
+
|
|
91
|
+
Replace libmagic usage throughout the codebase.
|
|
92
|
+
|
|
93
|
+
### Update identification module [Completed ✅]
|
|
94
|
+
|
|
95
|
+
Modify `/src/content_core/content/identification/__init__.py`:
|
|
96
|
+
- Import FileDetector
|
|
97
|
+
- Replace `magic.from_file()` call in `get_file_type()`
|
|
98
|
+
- Maintain async interface
|
|
99
|
+
- Remove magic import
|
|
100
|
+
|
|
101
|
+
### Update graph.py file type detection [Completed ✅]
|
|
102
|
+
|
|
103
|
+
Modify `/src/content_core/content/extraction/graph.py`:
|
|
104
|
+
- Replace `magic.from_file()` at line 62
|
|
105
|
+
- Import get_file_type from identification module
|
|
106
|
+
- Remove direct magic import
|
|
107
|
+
- Ensure error handling remains consistent
|
|
108
|
+
|
|
109
|
+
### Test integration thoroughly [Completed ✅]
|
|
110
|
+
|
|
111
|
+
Verify all extraction paths work:
|
|
112
|
+
- Test each supported file type through full pipeline
|
|
113
|
+
- Verify correct processor routing
|
|
114
|
+
- Check error messages for unsupported types
|
|
115
|
+
- Ensure no regression in functionality
|
|
116
|
+
|
|
117
|
+
### Comments:
|
|
118
|
+
- Must maintain exact same external behavior
|
|
119
|
+
- All existing code depending on MIME types should work unchanged
|
|
120
|
+
- **Implementation notes from Phase 3:**
|
|
121
|
+
- Successfully replaced all libmagic usage with FileDetector
|
|
122
|
+
- Integration was seamless - no changes needed to downstream processors
|
|
123
|
+
- All file types correctly detected and routed to appropriate processors
|
|
124
|
+
- Tested with PDF, DOCX, MP4, MP3, JSON, HTML, CSV, text files
|
|
125
|
+
- Only test failure was unrelated (OpenAI API issue for MP3 transcription)
|
|
126
|
+
- MIME types match exactly what libmagic returned
|
|
127
|
+
|
|
128
|
+
## PHASE 4: Cleanup and Final Validation [In Progress 🔄]
|
|
129
|
+
|
|
130
|
+
Remove dependencies and ensure production readiness.
|
|
131
|
+
|
|
132
|
+
### Remove libmagic from dependencies [Completed ✅]
|
|
133
|
+
|
|
134
|
+
Update `/pyproject.toml`:
|
|
135
|
+
- Remove `python-magic>=0.4.27`
|
|
136
|
+
- Remove `python-magic-bin==0.4.14` for Windows
|
|
137
|
+
- Update lock file with `uv sync`
|
|
138
|
+
- Verify clean installation works
|
|
139
|
+
|
|
140
|
+
**Implementation notes:**
|
|
141
|
+
- Successfully removed both python-magic dependencies from pyproject.toml
|
|
142
|
+
- Lock file updated with `uv sync`
|
|
143
|
+
- 2 packages uninstalled: python-magic and python-magic-bin
|
|
144
|
+
|
|
145
|
+
### Add comprehensive test suite [Not Started ⏳]
|
|
146
|
+
|
|
147
|
+
Create thorough tests:
|
|
148
|
+
- Unit tests for FileDetector methods
|
|
149
|
+
- Integration tests for full extraction pipeline
|
|
150
|
+
- Edge cases (empty files, malformed files)
|
|
151
|
+
- Cross-platform compatibility tests
|
|
152
|
+
- Performance benchmarks
|
|
153
|
+
|
|
154
|
+
### Documentation and release preparation [Not Started ⏳]
|
|
155
|
+
|
|
156
|
+
Final preparations:
|
|
157
|
+
- Update README if it mentions libmagic
|
|
158
|
+
- Add docstrings to all new code
|
|
159
|
+
- Update CHANGELOG
|
|
160
|
+
- Test installation on fresh environment
|
|
161
|
+
- Run full test suite: `make test`
|
|
162
|
+
- Build package: `uv build`
|
|
163
|
+
|
|
164
|
+
### Comments:
|
|
165
|
+
- This is a breaking change for anyone depending on libmagic behavior
|
|
166
|
+
- Consider adding migration guide if needed
|
|
167
|
+
|
|
168
|
+
## Key Technical Details
|
|
169
|
+
|
|
170
|
+
**Critical MIME Types** (must match exactly):
|
|
171
|
+
- `application/pdf` - PDF files
|
|
172
|
+
- `application/epub+zip` - EPUB files
|
|
173
|
+
- `application/vnd.openxmlformats-officedocument.wordprocessingml.document` - DOCX
|
|
174
|
+
- `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet` - XLSX
|
|
175
|
+
- `application/vnd.openxmlformats-officedocument.presentationml.presentation` - PPTX
|
|
176
|
+
- `text/plain` - Text and Markdown files
|
|
177
|
+
- `text/html` - HTML files
|
|
178
|
+
- `text/csv` - CSV files
|
|
179
|
+
- `application/json` - JSON files
|
|
180
|
+
- `image/*` - Various image formats
|
|
181
|
+
- `video/*` - Video files (prefix matching)
|
|
182
|
+
- `audio/*` - Audio files (prefix matching)
|
|
183
|
+
|
|
184
|
+
**Implementation Constraints**:
|
|
185
|
+
- 512-byte buffer is sufficient (no deep file inspection needed)
|
|
186
|
+
- Performance is not critical (small load expected)
|
|
187
|
+
- Must raise `UnsupportedTypeException` for unknown types
|
|
188
|
+
- Maintain async interface for consistency
|
|
189
|
+
- Pure Python only (no C extensions)
|
|
190
|
+
|
|
191
|
+
**Risk Mitigation**:
|
|
192
|
+
- Extensive testing before removing libmagic
|
|
193
|
+
- Keep detection logic modular for easy updates
|
|
194
|
+
- Log detection decisions for debugging
|
|
195
|
+
- Consider feature flag for rollback if needed
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to Content Core will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- Pure Python file type detection via the new `FileDetector` class
|
|
12
|
+
- Comprehensive file signature detection for 25+ file formats
|
|
13
|
+
- Smart detection for ZIP-based formats (DOCX, XLSX, PPTX, EPUB)
|
|
14
|
+
|
|
15
|
+
### Changed
|
|
16
|
+
- File type detection now uses pure Python implementation instead of libmagic
|
|
17
|
+
- Improved cross-platform compatibility - no system dependencies required
|
|
18
|
+
|
|
19
|
+
### Removed
|
|
20
|
+
- Dependency on `python-magic` and `python-magic-bin`
|
|
21
|
+
- System requirement for libmagic library
|
|
22
|
+
|
|
23
|
+
### Technical Details
|
|
24
|
+
- Replaced libmagic dependency with custom `FileDetector` implementation
|
|
25
|
+
- File detection based on binary signatures and content analysis
|
|
26
|
+
- Maintains same API surface - no breaking changes for users
|
|
27
|
+
- Significantly simplified installation process across all platforms
|
|
28
|
+
|
|
29
|
+
## Previous Releases
|
|
30
|
+
|
|
31
|
+
For releases prior to this changelog, please see the [GitHub releases page](https://github.com/lfnovo/content-core/releases).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.3.1
|
|
3
|
+
Version: 1.4.0
|
|
4
4
|
Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -24,8 +24,6 @@ Requires-Dist: pillow>=10.4.0
|
|
|
24
24
|
Requires-Dist: pymupdf>=1.25.5
|
|
25
25
|
Requires-Dist: python-docx>=1.1.2
|
|
26
26
|
Requires-Dist: python-dotenv>=1.1.0
|
|
27
|
-
Requires-Dist: python-magic-bin==0.4.14; sys_platform == 'win32'
|
|
28
|
-
Requires-Dist: python-magic>=0.4.27
|
|
29
27
|
Requires-Dist: python-pptx>=1.0.2
|
|
30
28
|
Requires-Dist: pytubefix>=9.1.1
|
|
31
29
|
Requires-Dist: readability-lxml>=0.8.4.1
|
|
@@ -38,6 +36,14 @@ Description-Content-Type: text/markdown
|
|
|
38
36
|
# Content Core
|
|
39
37
|
|
|
40
38
|
[License: MIT](https://opensource.org/licenses/MIT)
|
|
39
|
+
[PyPI version](https://badge.fury.io/py/content-core)
|
|
40
|
+
[Downloads](https://pepy.tech/project/content-core)
|
|
41
|
+
[](https://pepy.tech/project/content-core)
|
|
42
|
+
[GitHub repository](https://github.com/lfnovo/content-core)
|
|
43
|
+
[](https://github.com/lfnovo/content-core)
|
|
44
|
+
[GitHub issues](https://github.com/lfnovo/content-core/issues)
|
|
45
|
+
[Code style: black](https://github.com/psf/black)
|
|
46
|
+
[Ruff](https://github.com/astral-sh/ruff)
|
|
41
47
|
|
|
42
48
|
**Content Core** is a powerful, AI-powered content extraction and processing platform that transforms any source into clean, structured content. Extract text from websites, transcribe videos, process documents, and generate AI summaries—all through a unified interface with multiple integration options.
|
|
43
49
|
|
|
@@ -103,12 +109,13 @@ summary = await cc.summarize_content(result, context="explain to a child")
|
|
|
103
109
|
* **⚡ Zero-Install Options:** Use `uvx` for instant access without installation
|
|
104
110
|
* **🧠 AI-Powered Processing:** LLM integration for content cleaning and summarization
|
|
105
111
|
* **🔄 Asynchronous:** Built with `asyncio` for efficient processing
|
|
112
|
+
* **🐍 Pure Python Implementation:** No system dependencies required - simplified installation across all platforms
|
|
106
113
|
|
|
107
114
|
## Getting Started
|
|
108
115
|
|
|
109
116
|
### Installation
|
|
110
117
|
|
|
111
|
-
Install Content Core using `pip`:
|
|
118
|
+
Install Content Core using `pip` - **no system dependencies required!**
|
|
112
119
|
|
|
113
120
|
```bash
|
|
114
121
|
# Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
|
|
@@ -124,6 +131,8 @@ pip install content-core
|
|
|
124
131
|
pip install content-core[docling]
|
|
125
132
|
```
|
|
126
133
|
|
|
134
|
+
> **Note:** Unlike many content extraction tools, Content Core uses pure Python implementations and doesn't require system libraries like libmagic. This ensures consistent, hassle-free installation across Windows, macOS, and Linux.
|
|
135
|
+
|
|
127
136
|
Alternatively, if you’re developing locally:
|
|
128
137
|
|
|
129
138
|
```bash
|
|
@@ -264,6 +273,10 @@ For more information on how to use the Content Core library, including details o
|
|
|
264
273
|
|
|
265
274
|
Content Core includes a Model Context Protocol (MCP) server that enables seamless integration with Claude Desktop and other MCP-compatible applications. The MCP server exposes Content Core's powerful extraction capabilities through a standardized protocol.
|
|
266
275
|
|
|
276
|
+
<a href="https://glama.ai/mcp/servers/@lfnovo/content-core">
|
|
277
|
+
<img width="380" height="200" src="https://glama.ai/mcp/servers/@lfnovo/content-core/badge" />
|
|
278
|
+
</a>
|
|
279
|
+
|
|
267
280
|
### Quick Setup with Claude Desktop
|
|
268
281
|
|
|
269
282
|
```bash
|
|
@@ -1,6 +1,14 @@
|
|
|
1
1
|
# Content Core
|
|
2
2
|
|
|
3
3
|
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
4
|
+
[](https://badge.fury.io/py/content-core)
|
|
5
|
+
[](https://pepy.tech/project/content-core)
|
|
6
|
+
[](https://pepy.tech/project/content-core)
|
|
7
|
+
[](https://github.com/lfnovo/content-core)
|
|
8
|
+
[](https://github.com/lfnovo/content-core)
|
|
9
|
+
[](https://github.com/lfnovo/content-core/issues)
|
|
10
|
+
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
|
|
11
|
+
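[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)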
|
|
4
12
|
|
|
5
13
|
**Content Core** is a powerful, AI-powered content extraction and processing platform that transforms any source into clean, structured content. Extract text from websites, transcribe videos, process documents, and generate AI summaries—all through a unified interface with multiple integration options.
|
|
6
14
|
|
|
@@ -66,12 +74,13 @@ summary = await cc.summarize_content(result, context="explain to a child")
|
|
|
66
74
|
* **⚡ Zero-Install Options:** Use `uvx` for instant access without installation
|
|
67
75
|
* **🧠 AI-Powered Processing:** LLM integration for content cleaning and summarization
|
|
68
76
|
* **🔄 Asynchronous:** Built with `asyncio` for efficient processing
|
|
77
|
+
* **🐍 Pure Python Implementation:** No system dependencies required - simplified installation across all platforms
|
|
69
78
|
|
|
70
79
|
## Getting Started
|
|
71
80
|
|
|
72
81
|
### Installation
|
|
73
82
|
|
|
74
|
-
Install Content Core using `pip`:
|
|
83
|
+
Install Content Core using `pip` - **no system dependencies required!**
|
|
75
84
|
|
|
76
85
|
```bash
|
|
77
86
|
# Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
|
|
@@ -87,6 +96,8 @@ pip install content-core
|
|
|
87
96
|
pip install content-core[docling]
|
|
88
97
|
```
|
|
89
98
|
|
|
99
|
+
> **Note:** Unlike many content extraction tools, Content Core uses pure Python implementations and doesn't require system libraries like libmagic. This ensures consistent, hassle-free installation across Windows, macOS, and Linux.
|
|
100
|
+
|
|
90
101
|
Alternatively, if you’re developing locally:
|
|
91
102
|
|
|
92
103
|
```bash
|
|
@@ -227,6 +238,10 @@ For more information on how to use the Content Core library, including details o
|
|
|
227
238
|
|
|
228
239
|
Content Core includes a Model Context Protocol (MCP) server that enables seamless integration with Claude Desktop and other MCP-compatible applications. The MCP server exposes Content Core's powerful extraction capabilities through a standardized protocol.
|
|
229
240
|
|
|
241
|
+
<a href="https://glama.ai/mcp/servers/@lfnovo/content-core">
|
|
242
|
+
<img width="380" height="200" src="https://glama.ai/mcp/servers/@lfnovo/content-core/badge" />
|
|
243
|
+
</a>
|
|
244
|
+
|
|
230
245
|
### Quick Setup with Claude Desktop
|
|
231
246
|
|
|
232
247
|
```bash
|
|
@@ -247,6 +247,40 @@ Enable OCR enhancement for:
|
|
|
247
247
|
|
|
248
248
|
**Note**: The quality improvements (better character rendering, table detection) work automatically without requiring OCR or additional setup.
|
|
249
249
|
|
|
250
|
+
## File Type Detection
|
|
251
|
+
|
|
252
|
+
Content Core uses a pure Python implementation for file type detection, eliminating the need for system dependencies like libmagic. This ensures consistent behavior across all platforms (Windows, macOS, Linux).
|
|
253
|
+
|
|
254
|
+
### How It Works
|
|
255
|
+
|
|
256
|
+
The `FileDetector` class uses:
|
|
257
|
+
- **Binary signature matching** for formats like PDF, images, audio, and video files
|
|
258
|
+
- **Content analysis** for text-based formats (HTML, XML, JSON, YAML, CSV, Markdown)
|
|
259
|
+
- **ZIP structure detection** for modern document formats (DOCX, XLSX, PPTX, EPUB)
|
|
260
|
+
|
|
261
|
+
### Supported Formats
|
|
262
|
+
|
|
263
|
+
Content Core automatically detects and returns appropriate MIME types for:
|
|
264
|
+
- **Documents**: PDF, DOCX, XLSX, PPTX, ODT, ODS, ODP, RTF, EPUB
|
|
265
|
+
- **Images**: JPEG, PNG, GIF, BMP, WEBP, SVG, TIFF, ICO
|
|
266
|
+
- **Media**: MP4, AVI, MKV, MOV, MP3, WAV, OGG, FLAC, M4A
|
|
267
|
+
- **Text**: HTML, XML, JSON, YAML, CSV, Markdown, Plain text
|
|
268
|
+
- **Archives**: ZIP, TAR, GZ, BZ2, XZ
|
|
269
|
+
|
|
270
|
+
### Implementation Details
|
|
271
|
+
|
|
272
|
+
File detection is performed automatically when you call `extract_content()`. The detection:
|
|
273
|
+
- Reads only the necessary bytes (typically the first 8 KB) for performance
|
|
274
|
+
- Works regardless of file extension - detection is based on content
|
|
275
|
+
- Falls back to `text/plain` for unrecognized text files
|
|
276
|
+
- Returns `application/octet-stream` for binary files that don't match known signatures
|
|
277
|
+
|
|
278
|
+
This pure Python approach means:
|
|
279
|
+
- No installation headaches on different platforms
|
|
280
|
+
- Consistent behavior in all environments (local, Docker, serverless)
|
|
281
|
+
- Easy debugging and customization if needed
|
|
282
|
+
- No binary dependencies or system library conflicts
|
|
283
|
+
|
|
250
284
|
## Support
|
|
251
285
|
|
|
252
286
|
If you have questions or encounter issues while using the library, open an issue in the repository or contact the support team.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "content-core"
|
|
3
|
-
version = "1.3.1"
|
|
3
|
+
version = "1.4.0"
|
|
4
4
|
description = "Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
homepage = "https://github.com/lfnovo/content-core"
|
|
@@ -20,7 +20,6 @@ dependencies = [
|
|
|
20
20
|
"pymupdf>=1.25.5",
|
|
21
21
|
"python-docx>=1.1.2",
|
|
22
22
|
"python-dotenv>=1.1.0",
|
|
23
|
-
"python-magic>=0.4.27",
|
|
24
23
|
"python-pptx>=1.0.2",
|
|
25
24
|
"youtube-transcript-api>=1.0.3",
|
|
26
25
|
"langgraph>=0.3.29",
|
|
@@ -32,7 +31,6 @@ dependencies = [
|
|
|
32
31
|
"firecrawl-py>=2.7.0",
|
|
33
32
|
"pillow>=10.4.0",
|
|
34
33
|
"asciidoc>=10.2.1",
|
|
35
|
-
"python-magic-bin==0.4.14; sys_platform == 'win32'",
|
|
36
34
|
"pytubefix>=9.1.1",
|
|
37
35
|
"fastmcp>=2.10.0",
|
|
38
36
|
]
|