content-core 1.3.1__tar.gz → 1.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (97) hide show
  1. content_core-1.4.1/.claude/sessions/OSS-216/architecture.md +195 -0
  2. content_core-1.4.1/.claude/sessions/OSS-216/context.md +54 -0
  3. content_core-1.4.1/.claude/sessions/OSS-216/plan.md +195 -0
  4. {content_core-1.3.1 → content_core-1.4.1}/.gitignore +1 -1
  5. content_core-1.4.1/CHANGELOG.md +31 -0
  6. {content_core-1.3.1 → content_core-1.4.1}/PKG-INFO +17 -4
  7. {content_core-1.3.1 → content_core-1.4.1}/README.md +16 -1
  8. {content_core-1.3.1 → content_core-1.4.1}/docs/usage.md +34 -0
  9. {content_core-1.3.1 → content_core-1.4.1}/pyproject.toml +1 -3
  10. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/content/extraction/graph.py +4 -3
  11. content_core-1.4.1/src/content_core/content/identification/__init__.py +9 -0
  12. content_core-1.4.1/src/content_core/content/identification/file_detector.py +415 -0
  13. content_core-1.4.1/tests/unit/test_file_detector.py +60 -0
  14. content_core-1.4.1/tests/unit/test_file_detector_critical.py +186 -0
  15. content_core-1.4.1/tests/unit/test_file_detector_performance.py +126 -0
  16. {content_core-1.3.1 → content_core-1.4.1}/uv.lock +9 -30
  17. content_core-1.3.1/src/content_core/content/identification/__init__.py +0 -8
  18. {content_core-1.3.1 → content_core-1.4.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  19. {content_core-1.3.1 → content_core-1.4.1}/.github/workflows/claude-code-review.yml +0 -0
  20. {content_core-1.3.1 → content_core-1.4.1}/.github/workflows/claude.yml +0 -0
  21. {content_core-1.3.1 → content_core-1.4.1}/.github/workflows/publish.yml +0 -0
  22. {content_core-1.3.1 → content_core-1.4.1}/.python-version +0 -0
  23. {content_core-1.3.1 → content_core-1.4.1}/CONTRIBUTING.md +0 -0
  24. {content_core-1.3.1 → content_core-1.4.1}/LICENSE +0 -0
  25. {content_core-1.3.1 → content_core-1.4.1}/Makefile +0 -0
  26. {content_core-1.3.1 → content_core-1.4.1}/docs/macos.md +0 -0
  27. {content_core-1.3.1 → content_core-1.4.1}/docs/mcp.md +0 -0
  28. {content_core-1.3.1 → content_core-1.4.1}/docs/processors.md +0 -0
  29. {content_core-1.3.1 → content_core-1.4.1}/docs/raycast.md +0 -0
  30. {content_core-1.3.1 → content_core-1.4.1}/examples/main.py +0 -0
  31. {content_core-1.3.1 → content_core-1.4.1}/prompts/content/cleanup.jinja +0 -0
  32. {content_core-1.3.1 → content_core-1.4.1}/prompts/content/summarize.jinja +0 -0
  33. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/.eslintrc.json +0 -0
  34. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/CHANGELOG.md +0 -0
  35. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/README.md +0 -0
  36. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/assets/command-icon.png +0 -0
  37. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/package-lock.json +0 -0
  38. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/package.json +0 -0
  39. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/raycast-env.d.ts +0 -0
  40. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/src/extract-content.tsx +0 -0
  41. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/src/quick-extract.tsx +0 -0
  42. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/src/summarize-content.tsx +0 -0
  43. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/src/utils/content-core.ts +0 -0
  44. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/src/utils/types.ts +0 -0
  45. {content_core-1.3.1 → content_core-1.4.1}/raycast-content-core/tsconfig.json +0 -0
  46. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/__init__.py +0 -0
  47. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/cc_config.yaml +0 -0
  48. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/common/__init__.py +0 -0
  49. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/common/exceptions.py +0 -0
  50. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/common/state.py +0 -0
  51. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/common/types.py +0 -0
  52. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/common/utils.py +0 -0
  53. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/config.py +0 -0
  54. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/content/__init__.py +0 -0
  55. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/content/cleanup/__init__.py +0 -0
  56. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/content/cleanup/core.py +0 -0
  57. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/content/extraction/__init__.py +0 -0
  58. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/content/summary/__init__.py +0 -0
  59. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/content/summary/core.py +0 -0
  60. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/logging.py +0 -0
  61. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/mcp/__init__.py +0 -0
  62. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/mcp/server.py +0 -0
  63. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/models.py +0 -0
  64. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/models_config.yaml +0 -0
  65. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/notebooks/run.ipynb +0 -0
  66. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/notebooks/urls.ipynb +0 -0
  67. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/processors/audio.py +0 -0
  68. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/processors/docling.py +0 -0
  69. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/processors/office.py +0 -0
  70. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/processors/pdf.py +0 -0
  71. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/processors/text.py +0 -0
  72. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/processors/url.py +0 -0
  73. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/processors/video.py +0 -0
  74. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/processors/youtube.py +0 -0
  75. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/py.typed +0 -0
  76. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/templated_message.py +0 -0
  77. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/tools/__init__.py +0 -0
  78. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/tools/cleanup.py +0 -0
  79. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/tools/extract.py +0 -0
  80. {content_core-1.3.1 → content_core-1.4.1}/src/content_core/tools/summarize.py +0 -0
  81. {content_core-1.3.1 → content_core-1.4.1}/tests/input_content/file.docx +0 -0
  82. {content_core-1.3.1 → content_core-1.4.1}/tests/input_content/file.epub +0 -0
  83. {content_core-1.3.1 → content_core-1.4.1}/tests/input_content/file.md +0 -0
  84. {content_core-1.3.1 → content_core-1.4.1}/tests/input_content/file.mp3 +0 -0
  85. {content_core-1.3.1 → content_core-1.4.1}/tests/input_content/file.mp4 +0 -0
  86. {content_core-1.3.1 → content_core-1.4.1}/tests/input_content/file.pdf +0 -0
  87. {content_core-1.3.1 → content_core-1.4.1}/tests/input_content/file.pptx +0 -0
  88. {content_core-1.3.1 → content_core-1.4.1}/tests/input_content/file.txt +0 -0
  89. {content_core-1.3.1 → content_core-1.4.1}/tests/input_content/file.xlsx +0 -0
  90. {content_core-1.3.1 → content_core-1.4.1}/tests/input_content/file_audio.mp3 +0 -0
  91. {content_core-1.3.1 → content_core-1.4.1}/tests/input_content/new_pdf.pdf +0 -0
  92. {content_core-1.3.1 → content_core-1.4.1}/tests/integration/test_cli.py +0 -0
  93. {content_core-1.3.1 → content_core-1.4.1}/tests/integration/test_extraction.py +0 -0
  94. {content_core-1.3.1 → content_core-1.4.1}/tests/unit/test_config.py +0 -0
  95. {content_core-1.3.1 → content_core-1.4.1}/tests/unit/test_docling.py +0 -0
  96. {content_core-1.3.1 → content_core-1.4.1}/tests/unit/test_mcp_server.py +0 -0
  97. {content_core-1.3.1 → content_core-1.4.1}/tests/unit/test_pymupdf_ocr.py +0 -0
@@ -0,0 +1,195 @@
1
+ # OSS-216: Remove libmagic Dependency - Architecture
2
+
3
+ ## High-Level System Overview
4
+
5
+ ### Current State (Before)
6
+ ```
7
+ File Input → magic.from_file() → MIME Type → Content Router → Processor
8
+
9
+ libmagic (C library)
10
+ ```
11
+
12
+ ### Target State (After)
13
+ ```
14
+ File Input → file_detector.detect() → MIME Type → Content Router → Processor
15
+
16
+ Pure Python Detection
17
+ ```
18
+
19
+ ## Affected Components
20
+
21
+ ### 1. `/src/content_core/content/identification/__init__.py`
22
+ - **Current**: Single function `get_file_type()` using `magic.from_file()`
23
+ - **Change**: Replace with new detection module
24
+ - **Dependencies**: None (isolated module)
25
+
26
+ ### 2. `/src/content_core/content/extraction/graph.py`
27
+ - **Current**: `file_type()` function using `magic.from_file()` at line 62
28
+ - **Change**: Replace with new detection function call
29
+ - **Dependencies**: Routes to various processors based on MIME type
30
+
31
+ ### 3. `/pyproject.toml`
32
+ - **Current**: Dependencies on `python-magic>=0.4.27` and `python-magic-bin==0.4.14`
33
+ - **Change**: Remove both dependencies
34
+
35
+ ## New Component Design
36
+
37
+ ### File Detection Module (`/src/content_core/content/identification/file_detector.py`)
38
+
39
+ ```python
40
+ class FileDetector:
41
+ """Pure Python file type detection using magic bytes and content analysis."""
42
+
43
+ def __init__(self):
44
+ self.signatures = self._load_signatures()
45
+ self.text_patterns = self._load_text_patterns()
46
+
47
+ async def detect(self, file_path: str) -> str:
48
+ """Main detection method returning MIME type."""
49
+ # 1. Read first 512 bytes
50
+ # 2. Check binary signatures
51
+ # 3. If no match, analyze as text
52
+ # 4. Fallback to extension mapping
53
+ # 5. Raise UnsupportedTypeException if all fail
54
+
55
+ # Backward compatibility function
56
+ async def get_file_type(file_path: str) -> str:
57
+ """Legacy function for compatibility."""
58
+ detector = FileDetector()
59
+ return await detector.detect(file_path)
60
+ ```
61
+
62
+ ### Signature Mappings
63
+
64
+ ```python
65
+ BINARY_SIGNATURES = {
66
+ # PDFs
67
+ b'%PDF': 'application/pdf',
68
+
69
+ # Office formats (ZIP-based)
70
+ b'PK\x03\x04': 'application/zip', # Will need content analysis
71
+
72
+ # Images
73
+ b'\xff\xd8\xff': 'image/jpeg',
74
+ b'\x89PNG\r\n\x1a\n': 'image/png',
75
+ b'GIF87a': 'image/gif',
76
+ b'GIF89a': 'image/gif',
77
+ b'II*\x00': 'image/tiff',
78
+ b'MM\x00*': 'image/tiff',
79
+
80
+ # Audio/Video
81
+ b'ID3': 'audio/mpeg',
82
+ b'\xff\xfb': 'audio/mpeg',
83
+ b'RIFF': 'audio/wav', # Also video/avi
84
+ b'\x00\x00\x00\x14ftypM4A': 'audio/mp4',
85
+ b'\x00\x00\x00\x18ftypmp4': 'video/mp4',
86
+ b'\x00\x00\x00\x14ftypisom': 'video/mp4',
87
+
88
+ # EPUB
89
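+ # (EPUB shares the ZIP signature b'PK\x03\x04' listed above; a dict cannot hold the same key twice, so 'application/epub+zip' is resolved via ZIP_CONTENT_PATTERNS content analysis)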
90
+ }
91
+
92
+ # For ZIP-based formats, check internal structure
93
+ ZIP_CONTENT_PATTERNS = {
94
+ 'word/': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
95
+ 'xl/': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
96
+ 'ppt/': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
97
+ 'META-INF/container.xml': 'application/epub+zip',
98
+ }
99
+
100
+ # Text-based format detection
101
+ TEXT_PATTERNS = {
102
+ '<!DOCTYPE html': 'text/html',
103
+ '<html': 'text/html',
104
+ '<?xml': 'text/xml',
105
+ '{"': 'application/json',
106
+ '[{': 'application/json',
107
+ '---\n': 'text/yaml',
108
+ '#': 'text/markdown', # Weak, needs more context
109
+ }
110
+
111
+ # Extension fallback mapping
112
+ EXTENSION_MAPPING = {
113
+ '.pdf': 'application/pdf',
114
+ '.txt': 'text/plain',
115
+ '.md': 'text/plain', # Current behavior
116
+ '.html': 'text/html',
117
+ '.json': 'application/json',
118
+ '.csv': 'text/csv',
119
+ '.mp4': 'video/mp4',
120
+ '.mp3': 'audio/mpeg',
121
+ '.wav': 'audio/wav',
122
+ '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
123
+ '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
124
+ '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
125
+ '.epub': 'application/epub+zip',
126
+ }
127
+ ```
128
+
129
+ ## Implementation Strategy
130
+
131
+ ### Phase 1: Create Detection Module
132
+ 1. Implement `FileDetector` class with all detection logic
133
+ 2. Handle ZIP-based formats by checking internal structure
134
+ 3. Implement robust text format detection
135
+ 4. Add comprehensive logging for debugging
136
+
137
+ ### Phase 2: Integration
138
+ 1. Update `get_file_type()` to use new detector
139
+ 2. Update `file_type()` in graph.py
140
+ 3. Ensure all MIME type strings match expected values
141
+
142
+ ### Phase 3: Cleanup
143
+ 1. Remove magic imports
144
+ 2. Update pyproject.toml dependencies
145
+ 3. Run tests to ensure compatibility
146
+
147
+ ## Patterns & Best Practices
148
+
149
+ ### Error Handling
150
+ - Maintain existing `UnsupportedTypeException` behavior
151
+ - Add specific error messages for debugging
152
+ - Log detection attempts for troubleshooting
153
+
154
+ ### Async Pattern
155
+ - Keep async interface for consistency
156
+ - Use `aiofiles` if needed for async file reading
157
+
158
+ ### Extensibility
159
+ - Design for easy addition of new signatures
160
+ - Consider configuration file for custom mappings
161
+
162
+ ## External Dependencies
163
+ - **None** - Pure Python implementation
164
+ - Uses only standard library: `os`, `pathlib`, `zipfile`
165
+
166
+ ## Trade-offs & Alternatives
167
+
168
+ ### Trade-offs
169
+ 1. **Performance**: Slightly slower than libmagic C library, but acceptable per requirements
170
+ 2. **Accuracy**: May have edge cases libmagic handles better, but covers all current use cases
171
+ 3. **Maintenance**: More code to maintain, but removes deployment complexity
172
+
173
+ ### Alternatives Considered
174
+ 1. **python-magic-bin fork**: Still has binary dependencies
175
+ 2. **filetype library**: Pure Python but limited format support
176
+ 3. **Custom C extension**: Defeats purpose of removing binary dependencies
177
+
178
+ ## Negative Consequences
179
+ 1. **Potential edge cases**: Some obscure file formats might not be detected correctly
180
+ 2. **Maintenance burden**: Need to update signatures for new formats
181
+ 3. **Slightly larger codebase**: Adding ~200 lines of detection code
182
+
183
+ ## Files to Edit/Create
184
+
185
+ ### Create:
186
+ 1. `/src/content_core/content/identification/file_detector.py` - Main detection logic
187
+
188
+ ### Edit:
189
+ 1. `/src/content_core/content/identification/__init__.py` - Update to use new detector
190
+ 2. `/src/content_core/content/extraction/graph.py` - Replace magic.from_file() call
191
+ 3. `/pyproject.toml` - Remove python-magic dependencies
192
+
193
+ ### No Changes Needed:
194
+ - All processor files (they only check MIME types, don't detect them)
195
+ - Test files (will continue to work with same MIME types)
@@ -0,0 +1,54 @@
1
+ # OSS-216: Remove libmagic Dependency - Context
2
+
3
+ ## Why This Is Being Built
4
+
5
+ - **Deployment Friction**: libmagic requires OS-level installation which creates deployment problems
6
+ - **Cross-Platform Issues**: Binary dependency causes installation problems across Windows, macOS, and Linux
7
+ - **Simplification**: Removing system dependencies makes the package easier to install and use
8
+ - **Maintain Functionality**: Need to keep intelligent file detection without external dependencies
9
+
10
+ ## Expected Outcome
11
+
12
+ Replace libmagic with a pure Python implementation that:
13
+ - Detects file types using magic bytes/file signatures (first 512 bytes)
14
+ - Maintains the current routing behavior to appropriate content processors
15
+ - Works across all platforms without OS-level dependencies
16
+ - Keeps the same error handling (UnsupportedTypeException for unsupported types)
17
+
18
+ ## Implementation Approach
19
+
20
+ 1. **File Signature Detection System**:
21
+ - Build comprehensive mapping of file signatures to MIME types
22
+ - Read first 512 bytes to identify format by magic bytes
23
+ - Special handling for Office formats (DOCX, XLSX, PPTX) which are ZIP-based
24
+ - Content structure analysis for text formats (HTML, JSON, XML)
25
+
26
+ 2. **Detection Priority** (as discussed):
27
+ - Primary: File signature/magic bytes detection
28
+ - Secondary: Content analysis for text formats
29
+ - Tertiary: File extension as final fallback
30
+ - If file extension and content disagree, prioritize content analysis
31
+
32
+ 3. **Replace Current Usage**:
33
+ - Remove imports of `magic` library
34
+ - Replace `magic.from_file()` calls in:
35
+ - `/src/content_core/content/identification/__init__.py`
36
+ - `/src/content_core/content/extraction/graph.py`
37
+ - Remove dependencies from `pyproject.toml`
38
+
39
+ ## Testing Approach
40
+
41
+ - Comprehensive testing will be handled later
42
+ - Focus on maintaining existing functionality
43
+ - Ensure all currently supported file types continue to work
44
+
45
+ ## Dependencies
46
+
47
+ No new dependencies - implementation should be pure Python using only standard library.
48
+
49
+ ## Constraints
50
+
51
+ - 512 bytes buffer is sufficient (no need for deep ZIP inspection)
52
+ - Performance is not a concern (load is small)
53
+ - Maintain current error behavior (raise UnsupportedTypeException)
54
+ - MIME type strings can be adjusted as long as routing works correctly
@@ -0,0 +1,195 @@
1
+ # OSS-216: Remove libmagic Dependency
2
+
3
+ If you are working on this feature, make sure to update this plan.md file as you go.
4
+
5
+ ## PHASE 1: Create Pure Python File Detection Module [Completed ✅]
6
+
7
+ Build the core file detection system to replace libmagic with pure Python implementation.
8
+
9
+ ### Create file_detector.py with basic structure [Completed ✅]
10
+
11
+ Create `/src/content_core/content/identification/file_detector.py` with:
12
+ - FileDetector class skeleton
13
+ - Basic signature mappings for binary formats (PDF, images)
14
+ - Simple detect() method that reads first 512 bytes
15
+ - Raise UnsupportedTypeException for unknown types
16
+
17
+ ### Implement binary format detection [Completed ✅]
18
+
19
+ Add detection for binary formats:
20
+ - PDF files (magic bytes: `%PDF`)
21
+ - Common image formats (JPEG, PNG, GIF, TIFF, BMP)
22
+ - Audio formats (MP3, WAV, M4A)
23
+ - Video formats (MP4, AVI, MOV)
24
+ - Test each format with sample files
25
+
26
+ ### Implement ZIP-based format detection [Completed ✅]
27
+
28
+ Handle Office and EPUB formats that use ZIP containers:
29
+ - Detect ZIP magic bytes (`PK\x03\x04`)
30
+ - Use zipfile module to inspect internal structure
31
+ - Differentiate DOCX (word/), XLSX (xl/), PPTX (ppt/), EPUB (META-INF/container.xml)
32
+ - Handle corrupted or password-protected ZIP files gracefully
33
+
34
+ ### Comments:
35
+ - Focus on accurate detection over performance
36
+ - Ensure all MIME types match exactly what libmagic returns
37
+ - **Implementation notes from Phase 1:**
38
+ - Added comprehensive binary signatures with ordered checking (longer signatures first)
39
+ - Implemented generic ftyp box detection for MP4/MOV files for better compatibility
40
+ - Added FLAC audio format support
41
+ - Special RIFF handling differentiates between WAV and AVI
42
+ - Text detection requires minimum content length to avoid false positives
43
+ - All core file types tested and working correctly
44
+
45
+ ## PHASE 2: Text Format Detection and Fallbacks [Completed ✅]
46
+
47
+ Implement text-based format detection and extension fallback mechanism.
48
+
49
+ ### Add text format detection [Completed ✅]
50
+
51
+ Implement content analysis for text formats:
52
+ - HTML detection (DOCTYPE, <html tags)
53
+ - XML detection (<?xml declaration)
54
+ - JSON detection (starts with { or [)
55
+ - YAML detection (--- header)
56
+ - Markdown detection (combine multiple indicators)
57
+ - CSV detection (analyze structure)
58
+ - Plain text as default for unrecognized text
59
+
60
+ ### Implement extension fallback system [Completed ✅]
61
+
62
+ Create comprehensive extension mapping:
63
+ - Map common file extensions to MIME types
64
+ - Use as last resort when content detection fails
65
+ - Log when falling back to extension
66
+ - Maintain compatibility with current behavior
67
+
68
+ ### Add detection method priority logic [Completed ✅]
69
+
70
+ Implement the agreed priority order:
71
+ 1. Binary signature detection (most reliable)
72
+ 2. Content analysis for text formats
73
+ 3. File extension as final fallback
74
+ - Add logging at each detection stage
75
+ - Return appropriate MIME type or raise exception
76
+
77
+ ### Comments:
78
+ - Text detection needs to be careful to avoid false positives
79
+ - Extension fallback ensures graceful degradation
80
+ - **Implementation notes from Phase 2:**
81
+ - Enhanced JSON detection with pattern matching and keyword checking
82
+ - Improved YAML detection to avoid conflicts with Markdown
83
+ - Added sophisticated Markdown scoring system (headers, lists, links, etc.)
84
+ - Extended extension mapping to cover more file types (70+ extensions)
85
+ - Fixed YAML/Markdown detection priority to avoid false positives
86
+ - Added minimum content requirements for text detection
87
+ - All text formats tested with edge cases
88
+
89
+ ## PHASE 3: Integration with Existing Code [Completed ✅]
90
+
91
+ Replace libmagic usage throughout the codebase.
92
+
93
+ ### Update identification module [Completed ✅]
94
+
95
+ Modify `/src/content_core/content/identification/__init__.py`:
96
+ - Import FileDetector
97
+ - Replace `magic.from_file()` call in `get_file_type()`
98
+ - Maintain async interface
99
+ - Remove magic import
100
+
101
+ ### Update graph.py file type detection [Completed ✅]
102
+
103
+ Modify `/src/content_core/content/extraction/graph.py`:
104
+ - Replace `magic.from_file()` at line 62
105
+ - Import get_file_type from identification module
106
+ - Remove direct magic import
107
+ - Ensure error handling remains consistent
108
+
109
+ ### Test integration thoroughly [Completed ✅]
110
+
111
+ Verify all extraction paths work:
112
+ - Test each supported file type through full pipeline
113
+ - Verify correct processor routing
114
+ - Check error messages for unsupported types
115
+ - Ensure no regression in functionality
116
+
117
+ ### Comments:
118
+ - Must maintain exact same external behavior
119
+ - All existing code depending on MIME types should work unchanged
120
+ - **Implementation notes from Phase 3:**
121
+ - Successfully replaced all libmagic usage with FileDetector
122
+ - Integration was seamless - no changes needed to downstream processors
123
+ - All file types correctly detected and routed to appropriate processors
124
+ - Tested with PDF, DOCX, MP4, MP3, JSON, HTML, CSV, text files
125
+ - Only test failure was unrelated (OpenAI API issue for MP3 transcription)
126
+ - MIME types match exactly what libmagic returned
127
+
128
+ ## PHASE 4: Cleanup and Final Validation [In Progress 🔄]
129
+
130
+ Remove dependencies and ensure production readiness.
131
+
132
+ ### Remove libmagic from dependencies [Completed ✅]
133
+
134
+ Update `/pyproject.toml`:
135
+ - Remove `python-magic>=0.4.27`
136
+ - Remove `python-magic-bin==0.4.14` for Windows
137
+ - Update lock file with `uv sync`
138
+ - Verify clean installation works
139
+
140
+ **Implementation notes:**
141
+ - Successfully removed both python-magic dependencies from pyproject.toml
142
+ - Lock file updated with `uv sync`
143
+ - 2 packages uninstalled: python-magic and python-magic-bin
144
+
145
+ ### Add comprehensive test suite [Not Started ⏳]
146
+
147
+ Create thorough tests:
148
+ - Unit tests for FileDetector methods
149
+ - Integration tests for full extraction pipeline
150
+ - Edge cases (empty files, malformed files)
151
+ - Cross-platform compatibility tests
152
+ - Performance benchmarks
153
+
154
+ ### Documentation and release preparation [Not Started ⏳]
155
+
156
+ Final preparations:
157
+ - Update README if it mentions libmagic
158
+ - Add docstrings to all new code
159
+ - Update CHANGELOG
160
+ - Test installation on fresh environment
161
+ - Run full test suite: `make test`
162
+ - Build package: `uv build`
163
+
164
+ ### Comments:
165
+ - This is a breaking change for anyone depending on libmagic behavior
166
+ - Consider adding migration guide if needed
167
+
168
+ ## Key Technical Details
169
+
170
+ **Critical MIME Types** (must match exactly):
171
+ - `application/pdf` - PDF files
172
+ - `application/epub+zip` - EPUB files
173
+ - `application/vnd.openxmlformats-officedocument.wordprocessingml.document` - DOCX
174
+ - `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet` - XLSX
175
+ - `application/vnd.openxmlformats-officedocument.presentationml.presentation` - PPTX
176
+ - `text/plain` - Text and Markdown files
177
+ - `text/html` - HTML files
178
+ - `text/csv` - CSV files
179
+ - `application/json` - JSON files
180
+ - `image/*` - Various image formats
181
+ - `video/*` - Video files (prefix matching)
182
+ - `audio/*` - Audio files (prefix matching)
183
+
184
+ **Implementation Constraints**:
185
+ - 512-byte buffer is sufficient (no deep file inspection needed)
186
+ - Performance is not critical (small load expected)
187
+ - Must raise `UnsupportedTypeException` for unknown types
188
+ - Maintain async interface for consistency
189
+ - Pure Python only (no C extensions)
190
+
191
+ **Risk Mitigation**:
192
+ - Extensive testing before removing libmagic
193
+ - Keep detection logic modular for easy updates
194
+ - Log detection decisions for debugging
195
+ - Consider feature flag for rollback if needed
@@ -27,4 +27,4 @@ CLAUDE.md
27
27
  node_modules/
28
28
  **/notebooks/private
29
29
 
30
- .claude/
30
+ claude-logs
@@ -0,0 +1,31 @@
1
+ # Changelog
2
+
3
+ All notable changes to Content Core will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ### Added
11
+ - Pure Python file type detection via the new `FileDetector` class
12
+ - Comprehensive file signature detection for 25+ file formats
13
+ - Smart detection for ZIP-based formats (DOCX, XLSX, PPTX, EPUB)
14
+
15
+ ### Changed
16
+ - File type detection now uses pure Python implementation instead of libmagic
17
+ - Improved cross-platform compatibility - no system dependencies required
18
+
19
+ ### Removed
20
+ - Dependency on `python-magic` and `python-magic-bin`
21
+ - System requirement for libmagic library
22
+
23
+ ### Technical Details
24
+ - Replaced libmagic dependency with custom `FileDetector` implementation
25
+ - File detection based on binary signatures and content analysis
26
+ - Maintains the same API surface - no breaking changes for users
27
+ - Significantly simplified installation process across all platforms
28
+
29
+ ## Previous Releases
30
+
31
+ For releases prior to this changelog, please see the [GitHub releases page](https://github.com/lfnovo/content-core/releases).
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.3.1
3
+ Version: 1.4.1
4
4
  Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -24,8 +24,6 @@ Requires-Dist: pillow>=10.4.0
24
24
  Requires-Dist: pymupdf>=1.25.5
25
25
  Requires-Dist: python-docx>=1.1.2
26
26
  Requires-Dist: python-dotenv>=1.1.0
27
- Requires-Dist: python-magic-bin==0.4.14; sys_platform == 'win32'
28
- Requires-Dist: python-magic>=0.4.27
29
27
  Requires-Dist: python-pptx>=1.0.2
30
28
  Requires-Dist: pytubefix>=9.1.1
31
29
  Requires-Dist: readability-lxml>=0.8.4.1
@@ -38,6 +36,14 @@ Description-Content-Type: text/markdown
38
36
  # Content Core
39
37
 
40
38
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
39
+ [![PyPI version](https://badge.fury.io/py/content-core.svg)](https://badge.fury.io/py/content-core)
40
+ [![Downloads](https://pepy.tech/badge/content-core)](https://pepy.tech/project/content-core)
41
+ [![Downloads](https://pepy.tech/badge/content-core/month)](https://pepy.tech/project/content-core)
42
+ [![GitHub stars](https://img.shields.io/github/stars/lfnovo/content-core?style=social)](https://github.com/lfnovo/content-core)
43
+ [![GitHub forks](https://img.shields.io/github/forks/lfnovo/content-core?style=social)](https://github.com/lfnovo/content-core)
44
+ [![GitHub issues](https://img.shields.io/github/issues/lfnovo/content-core)](https://github.com/lfnovo/content-core/issues)
45
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
46
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
41
47
 
42
48
  **Content Core** is a powerful, AI-powered content extraction and processing platform that transforms any source into clean, structured content. Extract text from websites, transcribe videos, process documents, and generate AI summaries—all through a unified interface with multiple integration options.
43
49
 
@@ -103,12 +109,13 @@ summary = await cc.summarize_content(result, context="explain to a child")
103
109
  * **⚡ Zero-Install Options:** Use `uvx` for instant access without installation
104
110
  * **🧠 AI-Powered Processing:** LLM integration for content cleaning and summarization
105
111
  * **🔄 Asynchronous:** Built with `asyncio` for efficient processing
112
+ * **🐍 Pure Python Implementation:** No system dependencies required - simplified installation across all platforms
106
113
 
107
114
  ## Getting Started
108
115
 
109
116
  ### Installation
110
117
 
111
- Install Content Core using `pip`:
118
+ Install Content Core using `pip` - **no system dependencies required!**
112
119
 
113
120
  ```bash
114
121
  # Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
@@ -124,6 +131,8 @@ pip install content-core
124
131
  pip install content-core[docling]
125
132
  ```
126
133
 
134
+ > **Note:** Unlike many content extraction tools, Content Core uses pure Python implementations and doesn't require system libraries like libmagic. This ensures consistent, hassle-free installation across Windows, macOS, and Linux.
135
+
127
136
  Alternatively, if you’re developing locally:
128
137
 
129
138
  ```bash
@@ -264,6 +273,10 @@ For more information on how to use the Content Core library, including details o
264
273
 
265
274
  Content Core includes a Model Context Protocol (MCP) server that enables seamless integration with Claude Desktop and other MCP-compatible applications. The MCP server exposes Content Core's powerful extraction capabilities through a standardized protocol.
266
275
 
276
+ <a href="https://glama.ai/mcp/servers/@lfnovo/content-core">
277
+ <img width="380" height="200" src="https://glama.ai/mcp/servers/@lfnovo/content-core/badge" />
278
+ </a>
279
+
267
280
  ### Quick Setup with Claude Desktop
268
281
 
269
282
  ```bash
@@ -1,6 +1,14 @@
1
1
  # Content Core
2
2
 
3
3
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
4
+ [![PyPI version](https://badge.fury.io/py/content-core.svg)](https://badge.fury.io/py/content-core)
5
+ [![Downloads](https://pepy.tech/badge/content-core)](https://pepy.tech/project/content-core)
6
+ [![Downloads per month](https://pepy.tech/badge/content-core/month)](https://pepy.tech/project/content-core)
7
+ [![GitHub stars](https://img.shields.io/github/stars/lfnovo/content-core?style=social)](https://github.com/lfnovo/content-core)
8
+ [![GitHub forks](https://img.shields.io/github/forks/lfnovo/content-core?style=social)](https://github.com/lfnovo/content-core)
9
+ [![GitHub issues](https://img.shields.io/github/issues/lfnovo/content-core)](https://github.com/lfnovo/content-core/issues)
10
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
11
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
4
12
 
5
13
  **Content Core** is a powerful, AI-powered content extraction and processing platform that transforms any source into clean, structured content. Extract text from websites, transcribe videos, process documents, and generate AI summaries—all through a unified interface with multiple integration options.
6
14
 
@@ -66,12 +74,13 @@ summary = await cc.summarize_content(result, context="explain to a child")
66
74
  * **⚡ Zero-Install Options:** Use `uvx` for instant access without installation
67
75
  * **🧠 AI-Powered Processing:** LLM integration for content cleaning and summarization
68
76
  * **🔄 Asynchronous:** Built with `asyncio` for efficient processing
77
+ * **🐍 Pure Python Implementation:** No system dependencies required - simplified installation across all platforms
69
78
 
70
79
  ## Getting Started
71
80
 
72
81
  ### Installation
73
82
 
74
- Install Content Core using `pip`:
83
+ Install Content Core using `pip` - **no system dependencies required!**
75
84
 
76
85
  ```bash
77
86
  # Basic installation (PyMuPDF + BeautifulSoup/Jina extraction)
@@ -87,6 +96,8 @@ pip install content-core
87
96
  pip install content-core[docling]
88
97
  ```
89
98
 
99
+ > **Note:** Unlike many content extraction tools, Content Core uses pure Python implementations and doesn't require system libraries like libmagic. This ensures consistent, hassle-free installation across Windows, macOS, and Linux.
100
+
90
101
  Alternatively, if you’re developing locally:
91
102
 
92
103
  ```bash
@@ -227,6 +238,10 @@ For more information on how to use the Content Core library, including details o
227
238
 
228
239
  Content Core includes a Model Context Protocol (MCP) server that enables seamless integration with Claude Desktop and other MCP-compatible applications. The MCP server exposes Content Core's powerful extraction capabilities through a standardized protocol.
229
240
 
241
+ <a href="https://glama.ai/mcp/servers/@lfnovo/content-core">
242
+ <img width="380" height="200" src="https://glama.ai/mcp/servers/@lfnovo/content-core/badge" alt="Content Core MCP server badge on Glama" />
243
+ </a>
244
+
230
245
  ### Quick Setup with Claude Desktop
231
246
 
232
247
  ```bash
@@ -247,6 +247,40 @@ Enable OCR enhancement for:
247
247
 
248
248
  **Note**: The quality improvements (better character rendering, table detection) work automatically without requiring OCR or additional setup.
249
249
 
250
+ ## File Type Detection
251
+
252
+ Content Core uses a pure Python implementation for file type detection, eliminating the need for system dependencies like libmagic. This ensures consistent behavior across all platforms (Windows, macOS, Linux).
253
+
254
+ ### How It Works
255
+
256
+ The `FileDetector` class uses:
257
+ - **Binary signature matching** for formats like PDF, images, audio, and video files
258
+ - **Content analysis** for text-based formats (HTML, XML, JSON, YAML, CSV, Markdown)
259
+ - **ZIP structure detection** for modern document formats (DOCX, XLSX, PPTX, EPUB)
260
+
261
+ ### Supported Formats
262
+
263
+ Content Core automatically detects and returns appropriate MIME types for:
264
+ - **Documents**: PDF, DOCX, XLSX, PPTX, ODT, ODS, ODP, RTF, EPUB
265
+ - **Images**: JPEG, PNG, GIF, BMP, WEBP, SVG, TIFF, ICO
266
+ - **Media**: MP4, AVI, MKV, MOV, MP3, WAV, OGG, FLAC, M4A
267
+ - **Text**: HTML, XML, JSON, YAML, CSV, Markdown, Plain text
268
+ - **Archives**: ZIP, TAR, GZ, BZ2, XZ
269
+
270
+ ### Implementation Details
271
+
272
+ File detection is performed automatically when you call `extract_content()`. The detection:
273
+ - Reads only the necessary bytes (typically the first 8 KB) for performance
274
+ - Works regardless of file extension; detection is based on content
275
+ - Falls back to `text/plain` for unrecognized text files
276
+ - Returns `application/octet-stream` for binary files that don't match known signatures
277
+
278
+ This pure Python approach means:
279
+ - No installation headaches on different platforms
280
+ - Consistent behavior in all environments (local, Docker, serverless)
281
+ - Easy debugging and customization if needed
282
+ - No binary dependencies or system library conflicts
283
+
250
284
  ## Support
251
285
 
252
286
  If you have questions or encounter issues while using the library, open an issue in the repository or contact the support team.
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "1.3.1"
3
+ version = "1.4.1"
4
4
  description = "Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -20,7 +20,6 @@ dependencies = [
20
20
  "pymupdf>=1.25.5",
21
21
  "python-docx>=1.1.2",
22
22
  "python-dotenv>=1.1.0",
23
- "python-magic>=0.4.27",
24
23
  "python-pptx>=1.0.2",
25
24
  "youtube-transcript-api>=1.0.3",
26
25
  "langgraph>=0.3.29",
@@ -32,7 +31,6 @@ dependencies = [
32
31
  "firecrawl-py>=2.7.0",
33
32
  "pillow>=10.4.0",
34
33
  "asciidoc>=10.2.1",
35
- "python-magic-bin==0.4.14; sys_platform == 'win32'",
36
34
  "pytubefix>=9.1.1",
37
35
  "fastmcp>=2.10.0",
38
36
  ]