content-core 1.5.0__tar.gz → 1.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core-1.7.0/.github/workflows/create-tag.yml +56 -0
- {content_core-1.5.0 → content_core-1.7.0}/CHANGELOG.md +5 -0
- {content_core-1.5.0 → content_core-1.7.0}/PKG-INFO +9 -1
- {content_core-1.5.0 → content_core-1.7.0}/README.md +8 -0
- {content_core-1.5.0 → content_core-1.7.0}/docs/usage.md +197 -0
- {content_core-1.5.0 → content_core-1.7.0}/pyproject.toml +1 -1
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/__init__.py +0 -2
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/common/state.py +10 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/identification/file_detector.py +102 -11
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/audio.py +40 -5
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/templated_message.py +0 -1
- {content_core-1.5.0 → content_core-1.7.0}/tests/integration/conftest.py +0 -1
- content_core-1.7.0/tests/unit/test_audio_model_override.py +451 -0
- content_core-1.7.0/uv.lock +5708 -0
- content_core-1.5.0/notebooks/extraction.ipynb +0 -194
- content_core-1.5.0/uv.lock +0 -5180
- {content_core-1.5.0 → content_core-1.7.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/.github/workflows/claude-code-review.yml +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/.github/workflows/claude.yml +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/.github/workflows/publish.yml +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/.gitignore +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/.python-version +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/CONTRIBUTING.md +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/LICENSE +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/Makefile +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/docs/macos.md +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/docs/mcp.md +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/docs/processors.md +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/docs/raycast.md +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/examples/main.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/prompts/content/cleanup.jinja +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/prompts/content/summarize.jinja +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/.eslintrc.json +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/CHANGELOG.md +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/README.md +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/assets/command-icon.png +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/package-lock.json +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/package.json +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/raycast-env.d.ts +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/src/extract-content.tsx +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/src/quick-extract.tsx +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/src/summarize-content.tsx +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/src/utils/content-core.ts +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/src/utils/types.ts +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/tsconfig.json +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/cc_config.yaml +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/common/__init__.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/common/exceptions.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/common/types.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/common/utils.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/config.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/__init__.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/extraction/graph.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/identification/__init__.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/summary/core.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/logging.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/mcp/__init__.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/mcp/server.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/models.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/models_config.yaml +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/notebooks/run.ipynb +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/notebooks/urls.ipynb +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/docling.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/office.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/pdf.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/text.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/url.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/video.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/youtube.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/py.typed +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/tools/__init__.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/tools/cleanup.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/tools/extract.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/src/content_core/tools/summarize.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/test_coverage_branch_report.md +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.docx +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.epub +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.md +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.mp3 +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.mp4 +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.pdf +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.pptx +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.txt +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.xlsx +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/new_pdf.pdf +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/integration/test_cli.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/integration/test_extraction.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_audio_concurrency.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_config.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_docling.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_file_detector.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_file_detector_critical.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_file_detector_performance.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_mcp_server.py +0 -0
- {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_pymupdf_ocr.py +0 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
name: Create Tag
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
inputs:
|
|
6
|
+
confirm:
|
|
7
|
+
description: 'Type "yes" to confirm tag creation'
|
|
8
|
+
required: true
|
|
9
|
+
default: 'no'
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
create-tag:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
if: github.event.inputs.confirm == 'yes'
|
|
15
|
+
permissions:
|
|
16
|
+
contents: write
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
with:
|
|
21
|
+
fetch-depth: 0
|
|
22
|
+
|
|
23
|
+
- name: Extract version from pyproject.toml
|
|
24
|
+
id: version
|
|
25
|
+
run: |
|
|
26
|
+
VERSION=$(grep '^version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')
|
|
27
|
+
echo "version=$VERSION" >> $GITHUB_OUTPUT
|
|
28
|
+
echo "tag=v$VERSION" >> $GITHUB_OUTPUT
|
|
29
|
+
echo "📦 Version found: $VERSION"
|
|
30
|
+
|
|
31
|
+
- name: Check if tag already exists
|
|
32
|
+
id: check
|
|
33
|
+
run: |
|
|
34
|
+
if git rev-parse "v${{ steps.version.outputs.version }}" >/dev/null 2>&1; then
|
|
35
|
+
echo "exists=true" >> $GITHUB_OUTPUT
|
|
36
|
+
echo "❌ Tag v${{ steps.version.outputs.version }} already exists!"
|
|
37
|
+
exit 1
|
|
38
|
+
else
|
|
39
|
+
echo "exists=false" >> $GITHUB_OUTPUT
|
|
40
|
+
echo "✅ Tag v${{ steps.version.outputs.version }} does not exist yet"
|
|
41
|
+
fi
|
|
42
|
+
|
|
43
|
+
- name: Create and push tag
|
|
44
|
+
if: steps.check.outputs.exists == 'false'
|
|
45
|
+
run: |
|
|
46
|
+
git config user.name "github-actions[bot]"
|
|
47
|
+
git config user.email "github-actions[bot]@users.noreply.github.com"
|
|
48
|
+
git tag "v${{ steps.version.outputs.version }}"
|
|
49
|
+
git push origin "v${{ steps.version.outputs.version }}"
|
|
50
|
+
echo "🏷️ Created and pushed tag: v${{ steps.version.outputs.version }}"
|
|
51
|
+
|
|
52
|
+
- name: Tag created successfully
|
|
53
|
+
if: steps.check.outputs.exists == 'false'
|
|
54
|
+
run: |
|
|
55
|
+
echo "✨ Tag v${{ steps.version.outputs.version }} has been created!"
|
|
56
|
+
echo "🚀 This will trigger the publish workflow automatically."
|
|
@@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
11
11
|
- Pure Python file type detection via the new `FileDetector` class
|
|
12
12
|
- Comprehensive file signature detection for 25+ file formats
|
|
13
13
|
- Smart detection for ZIP-based formats (DOCX, XLSX, PPTX, EPUB)
|
|
14
|
+
- Custom audio model configuration - override speech-to-text provider and model at runtime
|
|
15
|
+
- Pass `audio_provider` and `audio_model` parameters through `extract_content()` API
|
|
16
|
+
- Supports any provider/model combination available through Esperanto library
|
|
17
|
+
- Maintains full backward compatibility - existing code works unchanged
|
|
18
|
+
- Includes validation with helpful warnings and error messages
|
|
14
19
|
|
|
15
20
|
### Changed
|
|
16
21
|
- File type detection now uses pure Python implementation instead of libmagic
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.5.0
|
|
3
|
+
Version: 1.7.0
|
|
4
4
|
Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -263,6 +263,14 @@ cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...
|
|
|
263
263
|
|
|
264
264
|
# Summarize content with optional context
|
|
265
265
|
summary = await cc.summarize_content("long article text", context="explain to a child")
|
|
266
|
+
|
|
267
|
+
# Extract audio with custom speech-to-text model
|
|
268
|
+
from content_core.common import ProcessSourceInput
|
|
269
|
+
result = await cc.extract(ProcessSourceInput(
|
|
270
|
+
file_path="interview.mp3",
|
|
271
|
+
audio_provider="openai",
|
|
272
|
+
audio_model="whisper-1"
|
|
273
|
+
))
|
|
266
274
|
```
|
|
267
275
|
|
|
268
276
|
## Documentation
|
|
@@ -228,6 +228,14 @@ cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...
|
|
|
228
228
|
|
|
229
229
|
# Summarize content with optional context
|
|
230
230
|
summary = await cc.summarize_content("long article text", context="explain to a child")
|
|
231
|
+
|
|
232
|
+
# Extract audio with custom speech-to-text model
|
|
233
|
+
from content_core.common import ProcessSourceInput
|
|
234
|
+
result = await cc.extract(ProcessSourceInput(
|
|
235
|
+
file_path="interview.mp3",
|
|
236
|
+
audio_provider="openai",
|
|
237
|
+
audio_model="whisper-1"
|
|
238
|
+
))
|
|
231
239
|
```
|
|
232
240
|
|
|
233
241
|
## Documentation
|
|
@@ -362,6 +362,203 @@ result = await cc.extract({"file_path": "conference_talk.mp4"})
|
|
|
362
362
|
print(result.content) # Full transcript
|
|
363
363
|
```
|
|
364
364
|
|
|
365
|
+
## Custom Audio Model Configuration
|
|
366
|
+
|
|
367
|
+
Content Core allows you to override the default speech-to-text model at runtime, enabling you to choose different AI providers and models based on your specific needs (language support, cost, accuracy, etc.).
|
|
368
|
+
|
|
369
|
+
### Overview
|
|
370
|
+
|
|
371
|
+
By default, audio and video files are transcribed using the model configured in `models_config.yaml` (typically OpenAI Whisper-1). You can override this on a per-call basis by specifying both `audio_provider` and `audio_model` parameters.
|
|
372
|
+
|
|
373
|
+
**Key Features:**
|
|
374
|
+
- ✅ **Runtime flexibility**: Choose different models for different use cases
|
|
375
|
+
- ✅ **Backward compatible**: Existing code works unchanged
|
|
376
|
+
- ✅ **Multiple providers**: Support for any provider supported by Esperanto
|
|
377
|
+
- ✅ **Automatic fallback**: Graceful handling of invalid configurations
|
|
378
|
+
|
|
379
|
+
### Basic Usage
|
|
380
|
+
|
|
381
|
+
```python
|
|
382
|
+
from content_core.common import ProcessSourceInput
|
|
383
|
+
import content_core as cc
|
|
384
|
+
|
|
385
|
+
# Use custom audio model for transcription
|
|
386
|
+
result = await cc.extract(ProcessSourceInput(
|
|
387
|
+
file_path="interview.mp3",
|
|
388
|
+
audio_provider="openai",
|
|
389
|
+
audio_model="whisper-1"
|
|
390
|
+
))
|
|
391
|
+
|
|
392
|
+
print(result.content) # Transcribed text using specified model
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
### Supported Providers
|
|
396
|
+
|
|
397
|
+
Content Core uses the Esperanto library for AI model abstraction, which supports multiple providers:
|
|
398
|
+
|
|
399
|
+
- **OpenAI**: `provider="openai"`, models: `whisper-1`
|
|
400
|
+
- **Google**: `provider="google"`, models: `chirp` (if available)
|
|
401
|
+
- **Other providers**: Any provider supported by Esperanto
|
|
402
|
+
|
|
403
|
+
Check the [Esperanto documentation](https://github.com/lfnovo/esperanto) for the full list of supported providers and models.
|
|
404
|
+
|
|
405
|
+
### Use Cases
|
|
406
|
+
|
|
407
|
+
**Multilingual Transcription:**
|
|
408
|
+
```python
|
|
409
|
+
from content_core.common import ProcessSourceInput
|
|
410
|
+
import content_core as cc
|
|
411
|
+
|
|
412
|
+
# Use a model optimized for a specific language
|
|
413
|
+
result = await cc.extract(ProcessSourceInput(
|
|
414
|
+
file_path="spanish_interview.mp3",
|
|
415
|
+
audio_provider="openai",
|
|
416
|
+
audio_model="whisper-1" # Whisper supports 99 languages
|
|
417
|
+
))
|
|
418
|
+
```
|
|
419
|
+
|
|
420
|
+
**Cost Optimization:**
|
|
421
|
+
```python
|
|
422
|
+
from content_core.common import ProcessSourceInput
|
|
423
|
+
import content_core as cc
|
|
424
|
+
|
|
425
|
+
# Use different models based on quality requirements
|
|
426
|
+
# For high-value content, use premium model
|
|
427
|
+
premium_result = await cc.extract(ProcessSourceInput(
|
|
428
|
+
file_path="important_meeting.mp3",
|
|
429
|
+
audio_provider="openai",
|
|
430
|
+
audio_model="whisper-1"
|
|
431
|
+
))
|
|
432
|
+
|
|
433
|
+
# For casual content, use default or cost-effective model
|
|
434
|
+
casual_result = await cc.extract(ProcessSourceInput(
|
|
435
|
+
file_path="casual_recording.mp3"
|
|
436
|
+
# No custom params = uses default configured model
|
|
437
|
+
))
|
|
438
|
+
```
|
|
439
|
+
|
|
440
|
+
**Video Transcription with Custom Model:**
|
|
441
|
+
```python
|
|
442
|
+
from content_core.common import ProcessSourceInput
|
|
443
|
+
import content_core as cc
|
|
444
|
+
|
|
445
|
+
# Custom model works for video files too (audio is extracted automatically)
|
|
446
|
+
result = await cc.extract(ProcessSourceInput(
|
|
447
|
+
file_path="conference_presentation.mp4",
|
|
448
|
+
audio_provider="openai",
|
|
449
|
+
audio_model="whisper-1"
|
|
450
|
+
))
|
|
451
|
+
```
|
|
452
|
+
|
|
453
|
+
### Parameter Requirements
|
|
454
|
+
|
|
455
|
+
Both `audio_provider` and `audio_model` must be specified together:
|
|
456
|
+
|
|
457
|
+
```python
|
|
458
|
+
# ✅ CORRECT: Both parameters provided
|
|
459
|
+
result = await cc.extract(ProcessSourceInput(
|
|
460
|
+
file_path="audio.mp3",
|
|
461
|
+
audio_provider="openai",
|
|
462
|
+
audio_model="whisper-1"
|
|
463
|
+
))
|
|
464
|
+
|
|
465
|
+
# ✅ CORRECT: Neither parameter (uses default)
|
|
466
|
+
result = await cc.extract(ProcessSourceInput(
|
|
467
|
+
file_path="audio.mp3"
|
|
468
|
+
))
|
|
469
|
+
|
|
470
|
+
# ⚠️ WARNING: Only one parameter (logs warning, uses default)
|
|
471
|
+
result = await cc.extract(ProcessSourceInput(
|
|
472
|
+
file_path="audio.mp3",
|
|
473
|
+
audio_provider="openai" # Missing audio_model
|
|
474
|
+
))
|
|
475
|
+
# Logs: "audio_provider provided without audio_model. Both audio_provider and audio_model must be specified together. Falling back to default model."
|
|
476
|
+
```
|
|
477
|
+
|
|
478
|
+
### Error Handling
|
|
479
|
+
|
|
480
|
+
Content Core gracefully handles invalid model configurations:
|
|
481
|
+
|
|
482
|
+
**Invalid Provider:**
|
|
483
|
+
```python
|
|
484
|
+
result = await cc.extract(ProcessSourceInput(
|
|
485
|
+
file_path="audio.mp3",
|
|
486
|
+
audio_provider="invalid_provider",
|
|
487
|
+
audio_model="whisper-1"
|
|
488
|
+
))
|
|
489
|
+
# Logs error and falls back to default model
|
|
490
|
+
# Transcription continues successfully
|
|
491
|
+
```
|
|
492
|
+
|
|
493
|
+
**Invalid Model Name:**
|
|
494
|
+
```python
|
|
495
|
+
result = await cc.extract(ProcessSourceInput(
|
|
496
|
+
file_path="audio.mp3",
|
|
497
|
+
audio_provider="openai",
|
|
498
|
+
audio_model="nonexistent-model"
|
|
499
|
+
))
|
|
500
|
+
# Logs error and falls back to default model
|
|
501
|
+
# Transcription continues successfully
|
|
502
|
+
```
|
|
503
|
+
|
|
504
|
+
**Error Message Example:**
|
|
505
|
+
```
|
|
506
|
+
ERROR: Failed to create custom audio model 'invalid_provider/whisper-1': Unsupported provider.
|
|
507
|
+
Check that the provider and model are supported by Esperanto. Falling back to default model.
|
|
508
|
+
```
|
|
509
|
+
|
|
510
|
+
### Concurrency Control
|
|
511
|
+
|
|
512
|
+
Custom audio models respect the same concurrency limits as the default model (configured via `CCORE_AUDIO_CONCURRENCY` or `set_audio_concurrency()`). This ensures consistent API rate limit handling regardless of which model you use.
|
|
513
|
+
|
|
514
|
+
```python
|
|
515
|
+
from content_core.config import set_audio_concurrency
|
|
516
|
+
from content_core.common import ProcessSourceInput
|
|
517
|
+
import content_core as cc
|
|
518
|
+
|
|
519
|
+
# Set concurrency for all transcriptions (default and custom models)
|
|
520
|
+
set_audio_concurrency(5)
|
|
521
|
+
|
|
522
|
+
# Both use the same concurrency limit
|
|
523
|
+
default_result = await cc.extract(ProcessSourceInput(file_path="audio1.mp3"))
|
|
524
|
+
custom_result = await cc.extract(ProcessSourceInput(
|
|
525
|
+
file_path="audio2.mp3",
|
|
526
|
+
audio_provider="openai",
|
|
527
|
+
audio_model="whisper-1"
|
|
528
|
+
))
|
|
529
|
+
```
|
|
530
|
+
|
|
531
|
+
### Backward Compatibility
|
|
532
|
+
|
|
533
|
+
All existing code continues to work without any changes:
|
|
534
|
+
|
|
535
|
+
```python
|
|
536
|
+
import content_core as cc
|
|
537
|
+
|
|
538
|
+
# Old code (no custom params) - still works perfectly
|
|
539
|
+
result = await cc.extract("audio.mp3")
|
|
540
|
+
result = await cc.extract({"file_path": "audio.mp3"})
|
|
541
|
+
|
|
542
|
+
# New capability (optional custom params)
|
|
543
|
+
from content_core.common import ProcessSourceInput
|
|
544
|
+
result = await cc.extract(ProcessSourceInput(
|
|
545
|
+
file_path="audio.mp3",
|
|
546
|
+
audio_provider="openai",
|
|
547
|
+
audio_model="whisper-1"
|
|
548
|
+
))
|
|
549
|
+
```
|
|
550
|
+
|
|
551
|
+
### Troubleshooting
|
|
552
|
+
|
|
553
|
+
**Issue**: "Both audio_provider and audio_model must be specified together"
|
|
554
|
+
- **Solution**: Provide both parameters or neither. Don't specify just one.
|
|
555
|
+
|
|
556
|
+
**Issue**: "Failed to create custom audio model"
|
|
557
|
+
- **Solution**: Verify the provider and model are supported by Esperanto. Check your API keys are configured correctly.
|
|
558
|
+
|
|
559
|
+
**Issue**: Custom model seems to be ignored
|
|
560
|
+
- **Solution**: Ensure you're using `ProcessSourceInput` class (not plain dict) when passing custom parameters.
|
|
561
|
+
|
|
365
562
|
## File Type Detection
|
|
366
563
|
|
|
367
564
|
Content Core uses a pure Python implementation for file type detection, eliminating the need for system dependencies like libmagic. This ensures consistent behavior across all platforms (Windows, macOS, Linux).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "content-core"
|
|
3
|
-
version = "1.5.0"
|
|
3
|
+
version = "1.7.0"
|
|
4
4
|
description = "Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
homepage = "https://github.com/lfnovo/content-core"
|
|
@@ -27,6 +27,14 @@ class ProcessSourceState(BaseModel):
|
|
|
27
27
|
default=None,
|
|
28
28
|
description="Override Docling output format: 'markdown', 'html', or 'json'",
|
|
29
29
|
)
|
|
30
|
+
audio_provider: Optional[str] = Field(
|
|
31
|
+
default=None,
|
|
32
|
+
description="Override speech-to-text provider (e.g., 'openai', 'google')",
|
|
33
|
+
)
|
|
34
|
+
audio_model: Optional[str] = Field(
|
|
35
|
+
default=None,
|
|
36
|
+
description="Override speech-to-text model name (e.g., 'whisper-1', 'chirp')",
|
|
37
|
+
)
|
|
30
38
|
|
|
31
39
|
|
|
32
40
|
class ProcessSourceInput(BaseModel):
|
|
@@ -36,6 +44,8 @@ class ProcessSourceInput(BaseModel):
|
|
|
36
44
|
document_engine: Optional[str] = None
|
|
37
45
|
url_engine: Optional[str] = None
|
|
38
46
|
output_format: Optional[str] = None
|
|
47
|
+
audio_provider: Optional[str] = None
|
|
48
|
+
audio_model: Optional[str] = None
|
|
39
49
|
|
|
40
50
|
|
|
41
51
|
class ProcessSourceOutput(BaseModel):
|
{content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/identification/file_detector.py
RENAMED
|
@@ -13,10 +13,17 @@ from content_core.logging import logger
|
|
|
13
13
|
|
|
14
14
|
class FileDetector:
|
|
15
15
|
"""Pure Python file type detection using magic bytes and content analysis."""
|
|
16
|
-
|
|
17
|
-
# Configuration constants
|
|
16
|
+
|
|
17
|
+
# Configuration constants for binary/text detection
|
|
18
18
|
SIGNATURE_READ_SIZE = 512 # Bytes to read for binary signature detection
|
|
19
19
|
TEXT_READ_SIZE = 1024 # Bytes to read for text content analysis
|
|
20
|
+
|
|
21
|
+
# Configuration constants for CSV detection
|
|
22
|
+
CSV_MAX_FIELD_LENGTH = 100 # Maximum average field length for CSV (longer suggests prose)
|
|
23
|
+
CSV_MAX_VARIANCE = 500 # Maximum variance in field lengths (higher suggests natural text)
|
|
24
|
+
CSV_MIN_SCORE = 2 # Minimum score required to classify as CSV
|
|
25
|
+
CSV_MIN_FIELDS = 2 # Minimum number of fields required for CSV
|
|
26
|
+
CSV_MAX_HEADER_FIELD_LENGTH = 50 # Maximum length for individual header fields
|
|
20
27
|
|
|
21
28
|
def __init__(self):
|
|
22
29
|
"""Initialize the FileDetector with signature mappings."""
|
|
@@ -364,18 +371,102 @@ class FileDetector:
|
|
|
364
371
|
|
|
365
372
|
|
|
366
373
|
def _looks_like_csv(self, content: str) -> bool:
|
|
367
|
-
"""
|
|
368
|
-
|
|
369
|
-
|
|
374
|
+
"""
|
|
375
|
+
Check if content looks like CSV format with improved heuristics.
|
|
376
|
+
|
|
377
|
+
Uses a multi-stage approach with performance optimization:
|
|
378
|
+
1. Basic structural checks (cheap)
|
|
379
|
+
2. Field length analysis (cheap, early exit)
|
|
380
|
+
3. Pattern matching (moderate cost)
|
|
381
|
+
4. Variance analysis (expensive, only if needed)
|
|
382
|
+
"""
|
|
383
|
+
lines = content.split('\n', 10)[:10] # Check first 10 lines for better accuracy
|
|
384
|
+
non_empty_lines = [line for line in lines if line.strip()]
|
|
385
|
+
|
|
386
|
+
# Stage 1: Basic structural checks
|
|
387
|
+
if len(non_empty_lines) < 2:
|
|
370
388
|
return False
|
|
371
|
-
|
|
389
|
+
|
|
372
390
|
# Count commas in each line
|
|
373
|
-
comma_counts = [line.count(',') for line in
|
|
374
|
-
|
|
391
|
+
comma_counts = [line.count(',') for line in non_empty_lines]
|
|
392
|
+
|
|
393
|
+
# Must have at least one comma per line
|
|
394
|
+
if not all(count > 0 for count in comma_counts):
|
|
375
395
|
return False
|
|
376
|
-
|
|
377
|
-
# CSV should have consistent comma counts
|
|
378
|
-
|
|
396
|
+
|
|
397
|
+
# CSV should have consistent comma counts across lines
|
|
398
|
+
if len(set(comma_counts)) != 1:
|
|
399
|
+
return False
|
|
400
|
+
|
|
401
|
+
num_fields = comma_counts[0] + 1 # Number of fields = commas + 1
|
|
402
|
+
|
|
403
|
+
# Must have minimum number of fields to be CSV
|
|
404
|
+
if num_fields < self.CSV_MIN_FIELDS:
|
|
405
|
+
return False
|
|
406
|
+
|
|
407
|
+
# Stage 2: Field length analysis (PERFORMANCE OPTIMIZATION: early exit)
|
|
408
|
+
first_line = non_empty_lines[0]
|
|
409
|
+
fields = first_line.split(',')
|
|
410
|
+
|
|
411
|
+
# CSV fields should be relatively short (not long sentences)
|
|
412
|
+
# Average field length should be reasonable (not paragraphs)
|
|
413
|
+
# Early exit avoids expensive variance calculations for obvious prose
|
|
414
|
+
avg_field_length = sum(len(f.strip()) for f in fields) / len(fields)
|
|
415
|
+
if avg_field_length > self.CSV_MAX_FIELD_LENGTH:
|
|
416
|
+
return False # Too long to be typical CSV fields - exit early
|
|
417
|
+
|
|
418
|
+
# Stage 3: Pattern matching
|
|
419
|
+
# Check for CSV-like patterns:
|
|
420
|
+
# 1. Fields that look like headers (short, alphanumeric)
|
|
421
|
+
# 2. Quoted fields (common in CSV)
|
|
422
|
+
# 3. Numeric fields
|
|
423
|
+
has_quoted_fields = any('"' in line or "'" in line for line in non_empty_lines[:3])
|
|
424
|
+
|
|
425
|
+
first_line_fields = [f.strip() for f in fields]
|
|
426
|
+
# Check if first line looks like a header (short, no sentence-ending punctuation)
|
|
427
|
+
looks_like_header = all(
|
|
428
|
+
len(f) < self.CSV_MAX_HEADER_FIELD_LENGTH and not f.endswith('.') and not f.endswith('!')
|
|
429
|
+
for f in first_line_fields
|
|
430
|
+
)
|
|
431
|
+
|
|
432
|
+
# Stage 4: Variance analysis (EXPENSIVE - only if we have enough data)
|
|
433
|
+
# Check if subsequent lines have similar field structure
|
|
434
|
+
# Real CSV tends to have consistent field lengths
|
|
435
|
+
if len(non_empty_lines) >= 3:
|
|
436
|
+
field_lengths_per_line = []
|
|
437
|
+
for line in non_empty_lines[:5]:
|
|
438
|
+
line_fields = line.split(',')
|
|
439
|
+
field_lengths = [len(f.strip()) for f in line_fields]
|
|
440
|
+
field_lengths_per_line.append(field_lengths)
|
|
441
|
+
|
|
442
|
+
# Calculate variance in field positions
|
|
443
|
+
# CSV data should have relatively consistent field lengths at each position
|
|
444
|
+
# Natural text with commas will have much more variance
|
|
445
|
+
position_variances = []
|
|
446
|
+
for i in range(num_fields):
|
|
447
|
+
lengths_at_position = [fl[i] if i < len(fl) else 0 for fl in field_lengths_per_line]
|
|
448
|
+
if lengths_at_position:
|
|
449
|
+
avg = sum(lengths_at_position) / len(lengths_at_position)
|
|
450
|
+
variance = sum((x - avg) ** 2 for x in lengths_at_position) / len(lengths_at_position)
|
|
451
|
+
position_variances.append(variance)
|
|
452
|
+
|
|
453
|
+
# High variance suggests natural text, not structured CSV
|
|
454
|
+
if position_variances:
|
|
455
|
+
avg_variance = sum(position_variances) / len(position_variances)
|
|
456
|
+
if avg_variance > self.CSV_MAX_VARIANCE:
|
|
457
|
+
return False # Very high variance = likely prose
|
|
458
|
+
|
|
459
|
+
# Scoring: Require at least some CSV-like characteristics
|
|
460
|
+
csv_score = 0
|
|
461
|
+
if looks_like_header:
|
|
462
|
+
csv_score += 1
|
|
463
|
+
if has_quoted_fields:
|
|
464
|
+
csv_score += 1
|
|
465
|
+
if num_fields >= 3: # Multiple fields is more CSV-like
|
|
466
|
+
csv_score += 1
|
|
467
|
+
|
|
468
|
+
# Need minimum score to confidently classify as CSV
|
|
469
|
+
return csv_score >= self.CSV_MIN_SCORE
|
|
379
470
|
|
|
380
471
|
|
|
381
472
|
def _is_text_file(self, content: str) -> bool:
|
|
@@ -45,7 +45,7 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
|
|
|
45
45
|
end_time = min((i + 1) * segment_length_s, audio.duration)
|
|
46
46
|
|
|
47
47
|
# Extract segment
|
|
48
|
-
output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
|
|
48
|
+
output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
|
|
49
49
|
output_path = os.path.join(output_dir, output_filename)
|
|
50
50
|
|
|
51
51
|
# Export segment
|
|
@@ -53,7 +53,9 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
|
|
|
53
53
|
|
|
54
54
|
output_files.append(output_path)
|
|
55
55
|
|
|
56
|
-
logger.debug(
|
|
56
|
+
logger.debug(
|
|
57
|
+
f"Exported segment {i + 1}/{total_segments}: {output_filename}"
|
|
58
|
+
)
|
|
57
59
|
|
|
58
60
|
return output_files
|
|
59
61
|
|
|
@@ -172,7 +174,7 @@ async def extract_audio_data(data: ProcessSourceState):
|
|
|
172
174
|
end_time = min((i + 1) * segment_length_s, audio.duration)
|
|
173
175
|
|
|
174
176
|
# Extract segment
|
|
175
|
-
output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
|
|
177
|
+
output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
|
|
176
178
|
output_path = os.path.join(output_dir, output_filename)
|
|
177
179
|
|
|
178
180
|
extract_audio(input_audio_path, output_path, start_time, end_time)
|
|
@@ -188,12 +190,45 @@ async def extract_audio_data(data: ProcessSourceState):
|
|
|
188
190
|
|
|
189
191
|
# Transcribe audio files in parallel with concurrency limit
|
|
190
192
|
from content_core.models import ModelFactory
|
|
193
|
+
from esperanto import AIFactory
|
|
194
|
+
|
|
195
|
+
# Determine which model to use based on state parameters
|
|
196
|
+
if data.audio_provider and data.audio_model:
|
|
197
|
+
# Custom model provided - create new instance
|
|
198
|
+
try:
|
|
199
|
+
logger.info(
|
|
200
|
+
f"Using custom audio model: {data.audio_provider}/{data.audio_model}"
|
|
201
|
+
)
|
|
202
|
+
speech_to_text_model = AIFactory.create_speech_to_text(
|
|
203
|
+
data.audio_provider, data.audio_model
|
|
204
|
+
)
|
|
205
|
+
except Exception as e:
|
|
206
|
+
logger.error(
|
|
207
|
+
f"Failed to create custom audio model '{data.audio_provider}/{data.audio_model}': {e}. "
|
|
208
|
+
f"Check that the provider and model are supported by Esperanto. "
|
|
209
|
+
f"Falling back to default model."
|
|
210
|
+
)
|
|
211
|
+
speech_to_text_model = ModelFactory.get_model("speech_to_text")
|
|
212
|
+
elif data.audio_provider or data.audio_model:
|
|
213
|
+
# Only one parameter provided - log warning and use default
|
|
214
|
+
missing = "audio_model" if data.audio_provider else "audio_provider"
|
|
215
|
+
provided = "audio_provider" if data.audio_provider else "audio_model"
|
|
216
|
+
logger.warning(
|
|
217
|
+
f"{provided} provided without {missing}. "
|
|
218
|
+
f"Both audio_provider and audio_model must be specified together. "
|
|
219
|
+
f"Falling back to default model."
|
|
220
|
+
)
|
|
221
|
+
speech_to_text_model = ModelFactory.get_model("speech_to_text")
|
|
222
|
+
else:
|
|
223
|
+
# No custom parameters - use default (backward compatible)
|
|
224
|
+
speech_to_text_model = ModelFactory.get_model("speech_to_text")
|
|
191
225
|
|
|
192
|
-
speech_to_text_model = ModelFactory.get_model("speech_to_text")
|
|
193
226
|
concurrency = get_audio_concurrency()
|
|
194
227
|
semaphore = asyncio.Semaphore(concurrency)
|
|
195
228
|
|
|
196
|
-
logger.debug(
|
|
229
|
+
logger.debug(
|
|
230
|
+
f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}"
|
|
231
|
+
)
|
|
197
232
|
|
|
198
233
|
# Create tasks for parallel transcription
|
|
199
234
|
transcription_tasks = [
|