content-core 1.5.0__tar.gz → 1.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (100) hide show
  1. content_core-1.7.0/.github/workflows/create-tag.yml +56 -0
  2. {content_core-1.5.0 → content_core-1.7.0}/CHANGELOG.md +5 -0
  3. {content_core-1.5.0 → content_core-1.7.0}/PKG-INFO +9 -1
  4. {content_core-1.5.0 → content_core-1.7.0}/README.md +8 -0
  5. {content_core-1.5.0 → content_core-1.7.0}/docs/usage.md +197 -0
  6. {content_core-1.5.0 → content_core-1.7.0}/pyproject.toml +1 -1
  7. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/__init__.py +0 -2
  8. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/common/state.py +10 -0
  9. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/identification/file_detector.py +102 -11
  10. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/audio.py +40 -5
  11. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/templated_message.py +0 -1
  12. {content_core-1.5.0 → content_core-1.7.0}/tests/integration/conftest.py +0 -1
  13. content_core-1.7.0/tests/unit/test_audio_model_override.py +451 -0
  14. content_core-1.7.0/uv.lock +5708 -0
  15. content_core-1.5.0/notebooks/extraction.ipynb +0 -194
  16. content_core-1.5.0/uv.lock +0 -5180
  17. {content_core-1.5.0 → content_core-1.7.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  18. {content_core-1.5.0 → content_core-1.7.0}/.github/workflows/claude-code-review.yml +0 -0
  19. {content_core-1.5.0 → content_core-1.7.0}/.github/workflows/claude.yml +0 -0
  20. {content_core-1.5.0 → content_core-1.7.0}/.github/workflows/publish.yml +0 -0
  21. {content_core-1.5.0 → content_core-1.7.0}/.gitignore +0 -0
  22. {content_core-1.5.0 → content_core-1.7.0}/.python-version +0 -0
  23. {content_core-1.5.0 → content_core-1.7.0}/CONTRIBUTING.md +0 -0
  24. {content_core-1.5.0 → content_core-1.7.0}/LICENSE +0 -0
  25. {content_core-1.5.0 → content_core-1.7.0}/Makefile +0 -0
  26. {content_core-1.5.0 → content_core-1.7.0}/docs/macos.md +0 -0
  27. {content_core-1.5.0 → content_core-1.7.0}/docs/mcp.md +0 -0
  28. {content_core-1.5.0 → content_core-1.7.0}/docs/processors.md +0 -0
  29. {content_core-1.5.0 → content_core-1.7.0}/docs/raycast.md +0 -0
  30. {content_core-1.5.0 → content_core-1.7.0}/examples/main.py +0 -0
  31. {content_core-1.5.0 → content_core-1.7.0}/prompts/content/cleanup.jinja +0 -0
  32. {content_core-1.5.0 → content_core-1.7.0}/prompts/content/summarize.jinja +0 -0
  33. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/.eslintrc.json +0 -0
  34. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/CHANGELOG.md +0 -0
  35. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/README.md +0 -0
  36. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/assets/command-icon.png +0 -0
  37. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/package-lock.json +0 -0
  38. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/package.json +0 -0
  39. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/raycast-env.d.ts +0 -0
  40. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/src/extract-content.tsx +0 -0
  41. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/src/quick-extract.tsx +0 -0
  42. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/src/summarize-content.tsx +0 -0
  43. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/src/utils/content-core.ts +0 -0
  44. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/src/utils/types.ts +0 -0
  45. {content_core-1.5.0 → content_core-1.7.0}/raycast-content-core/tsconfig.json +0 -0
  46. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/cc_config.yaml +0 -0
  47. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/common/__init__.py +0 -0
  48. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/common/exceptions.py +0 -0
  49. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/common/types.py +0 -0
  50. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/common/utils.py +0 -0
  51. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/config.py +0 -0
  52. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/__init__.py +0 -0
  53. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/cleanup/__init__.py +0 -0
  54. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/cleanup/core.py +0 -0
  55. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/extraction/__init__.py +0 -0
  56. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/extraction/graph.py +0 -0
  57. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/identification/__init__.py +0 -0
  58. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/summary/__init__.py +0 -0
  59. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/content/summary/core.py +0 -0
  60. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/logging.py +0 -0
  61. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/mcp/__init__.py +0 -0
  62. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/mcp/server.py +0 -0
  63. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/models.py +0 -0
  64. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/models_config.yaml +0 -0
  65. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/notebooks/run.ipynb +0 -0
  66. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/notebooks/urls.ipynb +0 -0
  67. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/docling.py +0 -0
  68. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/office.py +0 -0
  69. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/pdf.py +0 -0
  70. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/text.py +0 -0
  71. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/url.py +0 -0
  72. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/video.py +0 -0
  73. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/processors/youtube.py +0 -0
  74. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/py.typed +0 -0
  75. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/tools/__init__.py +0 -0
  76. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/tools/cleanup.py +0 -0
  77. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/tools/extract.py +0 -0
  78. {content_core-1.5.0 → content_core-1.7.0}/src/content_core/tools/summarize.py +0 -0
  79. {content_core-1.5.0 → content_core-1.7.0}/test_coverage_branch_report.md +0 -0
  80. {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.docx +0 -0
  81. {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.epub +0 -0
  82. {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.md +0 -0
  83. {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.mp3 +0 -0
  84. {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.mp4 +0 -0
  85. {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.pdf +0 -0
  86. {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.pptx +0 -0
  87. {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.txt +0 -0
  88. {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file.xlsx +0 -0
  89. {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/file_audio.mp3 +0 -0
  90. {content_core-1.5.0 → content_core-1.7.0}/tests/input_content/new_pdf.pdf +0 -0
  91. {content_core-1.5.0 → content_core-1.7.0}/tests/integration/test_cli.py +0 -0
  92. {content_core-1.5.0 → content_core-1.7.0}/tests/integration/test_extraction.py +0 -0
  93. {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_audio_concurrency.py +0 -0
  94. {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_config.py +0 -0
  95. {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_docling.py +0 -0
  96. {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_file_detector.py +0 -0
  97. {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_file_detector_critical.py +0 -0
  98. {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_file_detector_performance.py +0 -0
  99. {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_mcp_server.py +0 -0
  100. {content_core-1.5.0 → content_core-1.7.0}/tests/unit/test_pymupdf_ocr.py +0 -0
@@ -0,0 +1,56 @@
1
+ name: Create Tag
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ inputs:
6
+ confirm:
7
+ description: 'Type "yes" to confirm tag creation'
8
+ required: true
9
+ default: 'no'
10
+
11
+ jobs:
12
+ create-tag:
13
+ runs-on: ubuntu-latest
14
+ if: github.event.inputs.confirm == 'yes'
15
+ permissions:
16
+ contents: write
17
+
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ with:
21
+ fetch-depth: 0
22
+
23
+ - name: Extract version from pyproject.toml
24
+ id: version
25
+ run: |
26
+ VERSION=$(grep '^version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')
27
+ echo "version=$VERSION" >> $GITHUB_OUTPUT
28
+ echo "tag=v$VERSION" >> $GITHUB_OUTPUT
29
+ echo "📦 Version found: $VERSION"
30
+
31
+ - name: Check if tag already exists
32
+ id: check
33
+ run: |
34
+ if git rev-parse "v${{ steps.version.outputs.version }}" >/dev/null 2>&1; then
35
+ echo "exists=true" >> $GITHUB_OUTPUT
36
+ echo "❌ Tag v${{ steps.version.outputs.version }} already exists!"
37
+ exit 1
38
+ else
39
+ echo "exists=false" >> $GITHUB_OUTPUT
40
+ echo "✅ Tag v${{ steps.version.outputs.version }} does not exist yet"
41
+ fi
42
+
43
+ - name: Create and push tag
44
+ if: steps.check.outputs.exists == 'false'
45
+ run: |
46
+ git config user.name "github-actions[bot]"
47
+ git config user.email "github-actions[bot]@users.noreply.github.com"
48
+ git tag "v${{ steps.version.outputs.version }}"
49
+ git push origin "v${{ steps.version.outputs.version }}"
50
+ echo "🏷️ Created and pushed tag: v${{ steps.version.outputs.version }}"
51
+
52
+ - name: Tag created successfully
53
+ if: steps.check.outputs.exists == 'false'
54
+ run: |
55
+ echo "✨ Tag v${{ steps.version.outputs.version }} has been created!"
56
+ echo "🚀 This will trigger the publish workflow automatically."
@@ -11,6 +11,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
11
11
  - Pure Python file type detection via the new `FileDetector` class
12
12
  - Comprehensive file signature detection for 25+ file formats
13
13
  - Smart detection for ZIP-based formats (DOCX, XLSX, PPTX, EPUB)
14
+ - Custom audio model configuration - override speech-to-text provider and model at runtime
15
+ - Pass `audio_provider` and `audio_model` parameters through `extract_content()` API
16
+ - Supports any provider/model combination available through the Esperanto library
17
+ - Maintains full backward compatibility - existing code works unchanged
18
+ - Includes validation with helpful warnings and error messages
14
19
 
15
20
  ### Changed
16
21
  - File type detection now uses pure Python implementation instead of libmagic
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.5.0
3
+ Version: 1.7.0
4
4
  Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -263,6 +263,14 @@ cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...
263
263
 
264
264
  # Summarize content with optional context
265
265
  summary = await cc.summarize_content("long article text", context="explain to a child")
266
+
267
+ # Extract audio with custom speech-to-text model
268
+ from content_core.common import ProcessSourceInput
269
+ result = await cc.extract(ProcessSourceInput(
270
+ file_path="interview.mp3",
271
+ audio_provider="openai",
272
+ audio_model="whisper-1"
273
+ ))
266
274
  ```
267
275
 
268
276
  ## Documentation
@@ -228,6 +228,14 @@ cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...
228
228
 
229
229
  # Summarize content with optional context
230
230
  summary = await cc.summarize_content("long article text", context="explain to a child")
231
+
232
+ # Extract audio with custom speech-to-text model
233
+ from content_core.common import ProcessSourceInput
234
+ result = await cc.extract(ProcessSourceInput(
235
+ file_path="interview.mp3",
236
+ audio_provider="openai",
237
+ audio_model="whisper-1"
238
+ ))
231
239
  ```
232
240
 
233
241
  ## Documentation
@@ -362,6 +362,203 @@ result = await cc.extract({"file_path": "conference_talk.mp4"})
362
362
  print(result.content) # Full transcript
363
363
  ```
364
364
 
365
+ ## Custom Audio Model Configuration
366
+
367
+ Content Core allows you to override the default speech-to-text model at runtime, enabling you to choose different AI providers and models based on your specific needs (language support, cost, accuracy, etc.).
368
+
369
+ ### Overview
370
+
371
+ By default, audio and video files are transcribed using the model configured in `models_config.yaml` (typically OpenAI Whisper-1). You can override this on a per-call basis by specifying both `audio_provider` and `audio_model` parameters.
372
+
373
+ **Key Features:**
374
+ - ✅ **Runtime flexibility**: Choose different models for different use cases
375
+ - ✅ **Backward compatible**: Existing code works unchanged
376
+ - ✅ **Multiple providers**: Support for any provider supported by Esperanto
377
+ - ✅ **Automatic fallback**: Graceful handling of invalid configurations
378
+
379
+ ### Basic Usage
380
+
381
+ ```python
382
+ from content_core.common import ProcessSourceInput
383
+ import content_core as cc
384
+
385
+ # Use custom audio model for transcription
386
+ result = await cc.extract(ProcessSourceInput(
387
+ file_path="interview.mp3",
388
+ audio_provider="openai",
389
+ audio_model="whisper-1"
390
+ ))
391
+
392
+ print(result.content) # Transcribed text using specified model
393
+ ```
394
+
395
+ ### Supported Providers
396
+
397
+ Content Core uses the Esperanto library for AI model abstraction, which supports multiple providers:
398
+
399
+ - **OpenAI**: `provider="openai"`, models: `whisper-1`
400
+ - **Google**: `provider="google"`, models: `chirp` (if available)
401
+ - **Other providers**: Any provider supported by Esperanto
402
+
403
+ Check the [Esperanto documentation](https://github.com/lfnovo/esperanto) for the full list of supported providers and models.
404
+
405
+ ### Use Cases
406
+
407
+ **Multilingual Transcription:**
408
+ ```python
409
+ from content_core.common import ProcessSourceInput
410
+ import content_core as cc
411
+
412
+ # Use a model optimized for a specific language
413
+ result = await cc.extract(ProcessSourceInput(
414
+ file_path="spanish_interview.mp3",
415
+ audio_provider="openai",
416
+ audio_model="whisper-1" # Whisper supports 99 languages
417
+ ))
418
+ ```
419
+
420
+ **Cost Optimization:**
421
+ ```python
422
+ from content_core.common import ProcessSourceInput
423
+ import content_core as cc
424
+
425
+ # Use different models based on quality requirements
426
+ # For high-value content, use premium model
427
+ premium_result = await cc.extract(ProcessSourceInput(
428
+ file_path="important_meeting.mp3",
429
+ audio_provider="openai",
430
+ audio_model="whisper-1"
431
+ ))
432
+
433
+ # For casual content, use default or cost-effective model
434
+ casual_result = await cc.extract(ProcessSourceInput(
435
+ file_path="casual_recording.mp3"
436
+ # No custom params = uses default configured model
437
+ ))
438
+ ```
439
+
440
+ **Video Transcription with Custom Model:**
441
+ ```python
442
+ from content_core.common import ProcessSourceInput
443
+ import content_core as cc
444
+
445
+ # Custom model works for video files too (audio is extracted automatically)
446
+ result = await cc.extract(ProcessSourceInput(
447
+ file_path="conference_presentation.mp4",
448
+ audio_provider="openai",
449
+ audio_model="whisper-1"
450
+ ))
451
+ ```
452
+
453
+ ### Parameter Requirements
454
+
455
+ Both `audio_provider` and `audio_model` must be specified together:
456
+
457
+ ```python
458
+ # ✅ CORRECT: Both parameters provided
459
+ result = await cc.extract(ProcessSourceInput(
460
+ file_path="audio.mp3",
461
+ audio_provider="openai",
462
+ audio_model="whisper-1"
463
+ ))
464
+
465
+ # ✅ CORRECT: Neither parameter (uses default)
466
+ result = await cc.extract(ProcessSourceInput(
467
+ file_path="audio.mp3"
468
+ ))
469
+
470
+ # ⚠️ WARNING: Only one parameter (logs warning, uses default)
471
+ result = await cc.extract(ProcessSourceInput(
472
+ file_path="audio.mp3",
473
+ audio_provider="openai" # Missing audio_model
474
+ ))
475
+ # Logs: "audio_provider provided without audio_model. Both audio_provider and audio_model must be specified together. Falling back to default model."
476
+ ```
477
+
478
+ ### Error Handling
479
+
480
+ Content Core gracefully handles invalid model configurations:
481
+
482
+ **Invalid Provider:**
483
+ ```python
484
+ result = await cc.extract(ProcessSourceInput(
485
+ file_path="audio.mp3",
486
+ audio_provider="invalid_provider",
487
+ audio_model="whisper-1"
488
+ ))
489
+ # Logs error and falls back to default model
490
+ # Transcription continues successfully
491
+ ```
492
+
493
+ **Invalid Model Name:**
494
+ ```python
495
+ result = await cc.extract(ProcessSourceInput(
496
+ file_path="audio.mp3",
497
+ audio_provider="openai",
498
+ audio_model="nonexistent-model"
499
+ ))
500
+ # Logs error and falls back to default model
501
+ # Transcription continues successfully
502
+ ```
503
+
504
+ **Error Message Example:**
505
+ ```
506
+ ERROR: Failed to create custom audio model 'invalid_provider/whisper-1': Unsupported provider.
507
+ Check that the provider and model are supported by Esperanto. Falling back to default model.
508
+ ```
509
+
510
+ ### Concurrency Control
511
+
512
+ Custom audio models respect the same concurrency limits as the default model (configured via `CCORE_AUDIO_CONCURRENCY` or `set_audio_concurrency()`). This ensures consistent API rate limit handling regardless of which model you use.
513
+
514
+ ```python
515
+ from content_core.config import set_audio_concurrency
516
+ from content_core.common import ProcessSourceInput
517
+ import content_core as cc
518
+
519
+ # Set concurrency for all transcriptions (default and custom models)
520
+ set_audio_concurrency(5)
521
+
522
+ # Both use the same concurrency limit
523
+ default_result = await cc.extract(ProcessSourceInput(file_path="audio1.mp3"))
524
+ custom_result = await cc.extract(ProcessSourceInput(
525
+ file_path="audio2.mp3",
526
+ audio_provider="openai",
527
+ audio_model="whisper-1"
528
+ ))
529
+ ```
530
+
531
+ ### Backward Compatibility
532
+
533
+ All existing code continues to work without any changes:
534
+
535
+ ```python
536
+ import content_core as cc
537
+
538
+ # Old code (no custom params) - still works perfectly
539
+ result = await cc.extract("audio.mp3")
540
+ result = await cc.extract({"file_path": "audio.mp3"})
541
+
542
+ # New capability (optional custom params)
543
+ from content_core.common import ProcessSourceInput
544
+ result = await cc.extract(ProcessSourceInput(
545
+ file_path="audio.mp3",
546
+ audio_provider="openai",
547
+ audio_model="whisper-1"
548
+ ))
549
+ ```
550
+
551
+ ### Troubleshooting
552
+
553
+ **Issue**: "Both audio_provider and audio_model must be specified together"
554
+ - **Solution**: Provide both parameters or neither. Don't specify just one.
555
+
556
+ **Issue**: "Failed to create custom audio model"
557
+ - **Solution**: Verify the provider and model are supported by Esperanto. Check your API keys are configured correctly.
558
+
559
+ **Issue**: Custom model seems to be ignored
560
+ - **Solution**: Ensure you're using the `ProcessSourceInput` class (not a plain dict) when passing custom parameters.
561
+
365
562
  ## File Type Detection
366
563
 
367
564
  Content Core uses a pure Python implementation for file type detection, eliminating the need for system dependencies like libmagic. This ensures consistent behavior across all platforms (Windows, macOS, Linux).
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "1.5.0"
3
+ version = "1.7.0"
4
4
  description = "Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -214,5 +214,3 @@ def csum():
214
214
 
215
215
  if __name__ == "__main__":
216
216
  ccore()
217
- if __name__ == "__main__":
218
- ccore()
@@ -27,6 +27,14 @@ class ProcessSourceState(BaseModel):
27
27
  default=None,
28
28
  description="Override Docling output format: 'markdown', 'html', or 'json'",
29
29
  )
30
+ audio_provider: Optional[str] = Field(
31
+ default=None,
32
+ description="Override speech-to-text provider (e.g., 'openai', 'google')",
33
+ )
34
+ audio_model: Optional[str] = Field(
35
+ default=None,
36
+ description="Override speech-to-text model name (e.g., 'whisper-1', 'chirp')",
37
+ )
30
38
 
31
39
 
32
40
  class ProcessSourceInput(BaseModel):
@@ -36,6 +44,8 @@ class ProcessSourceInput(BaseModel):
36
44
  document_engine: Optional[str] = None
37
45
  url_engine: Optional[str] = None
38
46
  output_format: Optional[str] = None
47
+ audio_provider: Optional[str] = None
48
+ audio_model: Optional[str] = None
39
49
 
40
50
 
41
51
  class ProcessSourceOutput(BaseModel):
@@ -13,10 +13,17 @@ from content_core.logging import logger
13
13
 
14
14
  class FileDetector:
15
15
  """Pure Python file type detection using magic bytes and content analysis."""
16
-
17
- # Configuration constants
16
+
17
+ # Configuration constants for binary/text detection
18
18
  SIGNATURE_READ_SIZE = 512 # Bytes to read for binary signature detection
19
19
  TEXT_READ_SIZE = 1024 # Bytes to read for text content analysis
20
+
21
+ # Configuration constants for CSV detection
22
+ CSV_MAX_FIELD_LENGTH = 100 # Maximum average field length for CSV (longer suggests prose)
23
+ CSV_MAX_VARIANCE = 500 # Maximum variance in field lengths (higher suggests natural text)
24
+ CSV_MIN_SCORE = 2 # Minimum score required to classify as CSV
25
+ CSV_MIN_FIELDS = 2 # Minimum number of fields required for CSV
26
+ CSV_MAX_HEADER_FIELD_LENGTH = 50 # Maximum length for individual header fields
20
27
 
21
28
  def __init__(self):
22
29
  """Initialize the FileDetector with signature mappings."""
@@ -364,18 +371,102 @@ class FileDetector:
364
371
 
365
372
 
366
373
  def _looks_like_csv(self, content: str) -> bool:
367
- """Check if content looks like CSV format."""
368
- lines = content.split('\n', 5)[:5] # Check first 5 lines
369
- if len(lines) < 2:
374
+ """
375
+ Check if content looks like CSV format with improved heuristics.
376
+
377
+ Uses a multi-stage approach with performance optimization:
378
+ 1. Basic structural checks (cheap)
379
+ 2. Field length analysis (cheap, early exit)
380
+ 3. Pattern matching (moderate cost)
381
+ 4. Variance analysis (expensive, only if needed)
382
+ """
383
+ lines = content.split('\n', 10)[:10] # Check first 10 lines for better accuracy
384
+ non_empty_lines = [line for line in lines if line.strip()]
385
+
386
+ # Stage 1: Basic structural checks
387
+ if len(non_empty_lines) < 2:
370
388
  return False
371
-
389
+
372
390
  # Count commas in each line
373
- comma_counts = [line.count(',') for line in lines if line.strip()]
374
- if not comma_counts:
391
+ comma_counts = [line.count(',') for line in non_empty_lines]
392
+
393
+ # Must have at least one comma per line
394
+ if not all(count > 0 for count in comma_counts):
375
395
  return False
376
-
377
- # CSV should have consistent comma counts
378
- return len(set(comma_counts)) == 1 and comma_counts[0] > 0
396
+
397
+ # CSV should have consistent comma counts across lines
398
+ if len(set(comma_counts)) != 1:
399
+ return False
400
+
401
+ num_fields = comma_counts[0] + 1 # Number of fields = commas + 1
402
+
403
+ # Must have minimum number of fields to be CSV
404
+ if num_fields < self.CSV_MIN_FIELDS:
405
+ return False
406
+
407
+ # Stage 2: Field length analysis (PERFORMANCE OPTIMIZATION: early exit)
408
+ first_line = non_empty_lines[0]
409
+ fields = first_line.split(',')
410
+
411
+ # CSV fields should be relatively short (not long sentences)
412
+ # Average field length should be reasonable (not paragraphs)
413
+ # Early exit avoids expensive variance calculations for obvious prose
414
+ avg_field_length = sum(len(f.strip()) for f in fields) / len(fields)
415
+ if avg_field_length > self.CSV_MAX_FIELD_LENGTH:
416
+ return False # Too long to be typical CSV fields - exit early
417
+
418
+ # Stage 3: Pattern matching
419
+ # Check for CSV-like patterns:
420
+ # 1. Fields that look like headers (short, alphanumeric)
421
+ # 2. Quoted fields (common in CSV)
422
+ # 3. Numeric fields
423
+ has_quoted_fields = any('"' in line or "'" in line for line in non_empty_lines[:3])
424
+
425
+ first_line_fields = [f.strip() for f in fields]
426
+ # Check if first line looks like a header (short, no sentence-ending punctuation)
427
+ looks_like_header = all(
428
+ len(f) < self.CSV_MAX_HEADER_FIELD_LENGTH and not f.endswith('.') and not f.endswith('!')
429
+ for f in first_line_fields
430
+ )
431
+
432
+ # Stage 4: Variance analysis (EXPENSIVE - only if we have enough data)
433
+ # Check if subsequent lines have similar field structure
434
+ # Real CSV tends to have consistent field lengths
435
+ if len(non_empty_lines) >= 3:
436
+ field_lengths_per_line = []
437
+ for line in non_empty_lines[:5]:
438
+ line_fields = line.split(',')
439
+ field_lengths = [len(f.strip()) for f in line_fields]
440
+ field_lengths_per_line.append(field_lengths)
441
+
442
+ # Calculate variance in field positions
443
+ # CSV data should have relatively consistent field lengths at each position
444
+ # Natural text with commas will have much more variance
445
+ position_variances = []
446
+ for i in range(num_fields):
447
+ lengths_at_position = [fl[i] if i < len(fl) else 0 for fl in field_lengths_per_line]
448
+ if lengths_at_position:
449
+ avg = sum(lengths_at_position) / len(lengths_at_position)
450
+ variance = sum((x - avg) ** 2 for x in lengths_at_position) / len(lengths_at_position)
451
+ position_variances.append(variance)
452
+
453
+ # High variance suggests natural text, not structured CSV
454
+ if position_variances:
455
+ avg_variance = sum(position_variances) / len(position_variances)
456
+ if avg_variance > self.CSV_MAX_VARIANCE:
457
+ return False # Very high variance = likely prose
458
+
459
+ # Scoring: Require at least some CSV-like characteristics
460
+ csv_score = 0
461
+ if looks_like_header:
462
+ csv_score += 1
463
+ if has_quoted_fields:
464
+ csv_score += 1
465
+ if num_fields >= 3: # Multiple fields is more CSV-like
466
+ csv_score += 1
467
+
468
+ # Need minimum score to confidently classify as CSV
469
+ return csv_score >= self.CSV_MIN_SCORE
379
470
 
380
471
 
381
472
  def _is_text_file(self, content: str) -> bool:
@@ -45,7 +45,7 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
45
45
  end_time = min((i + 1) * segment_length_s, audio.duration)
46
46
 
47
47
  # Extract segment
48
- output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
48
+ output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
49
49
  output_path = os.path.join(output_dir, output_filename)
50
50
 
51
51
  # Export segment
@@ -53,7 +53,9 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
53
53
 
54
54
  output_files.append(output_path)
55
55
 
56
- logger.debug(f"Exported segment {i+1}/{total_segments}: {output_filename}")
56
+ logger.debug(
57
+ f"Exported segment {i + 1}/{total_segments}: {output_filename}"
58
+ )
57
59
 
58
60
  return output_files
59
61
 
@@ -172,7 +174,7 @@ async def extract_audio_data(data: ProcessSourceState):
172
174
  end_time = min((i + 1) * segment_length_s, audio.duration)
173
175
 
174
176
  # Extract segment
175
- output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
177
+ output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
176
178
  output_path = os.path.join(output_dir, output_filename)
177
179
 
178
180
  extract_audio(input_audio_path, output_path, start_time, end_time)
@@ -188,12 +190,45 @@ async def extract_audio_data(data: ProcessSourceState):
188
190
 
189
191
  # Transcribe audio files in parallel with concurrency limit
190
192
  from content_core.models import ModelFactory
193
+ from esperanto import AIFactory
194
+
195
+ # Determine which model to use based on state parameters
196
+ if data.audio_provider and data.audio_model:
197
+ # Custom model provided - create new instance
198
+ try:
199
+ logger.info(
200
+ f"Using custom audio model: {data.audio_provider}/{data.audio_model}"
201
+ )
202
+ speech_to_text_model = AIFactory.create_speech_to_text(
203
+ data.audio_provider, data.audio_model
204
+ )
205
+ except Exception as e:
206
+ logger.error(
207
+ f"Failed to create custom audio model '{data.audio_provider}/{data.audio_model}': {e}. "
208
+ f"Check that the provider and model are supported by Esperanto. "
209
+ f"Falling back to default model."
210
+ )
211
+ speech_to_text_model = ModelFactory.get_model("speech_to_text")
212
+ elif data.audio_provider or data.audio_model:
213
+ # Only one parameter provided - log warning and use default
214
+ missing = "audio_model" if data.audio_provider else "audio_provider"
215
+ provided = "audio_provider" if data.audio_provider else "audio_model"
216
+ logger.warning(
217
+ f"{provided} provided without {missing}. "
218
+ f"Both audio_provider and audio_model must be specified together. "
219
+ f"Falling back to default model."
220
+ )
221
+ speech_to_text_model = ModelFactory.get_model("speech_to_text")
222
+ else:
223
+ # No custom parameters - use default (backward compatible)
224
+ speech_to_text_model = ModelFactory.get_model("speech_to_text")
191
225
 
192
- speech_to_text_model = ModelFactory.get_model("speech_to_text")
193
226
  concurrency = get_audio_concurrency()
194
227
  semaphore = asyncio.Semaphore(concurrency)
195
228
 
196
- logger.debug(f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}")
229
+ logger.debug(
230
+ f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}"
231
+ )
197
232
 
198
233
  # Create tasks for parallel transcription
199
234
  transcription_tasks = [
@@ -2,7 +2,6 @@ from typing import Dict, Optional, Union
2
2
 
3
3
  from ai_prompter import Prompter
4
4
  from esperanto import LanguageModel
5
- from esperanto.common_types import Message
6
5
  from pydantic import BaseModel, Field
7
6
 
8
7
  from content_core.models import ModelFactory
@@ -1,7 +1,6 @@
1
1
  """Pytest configuration for integration tests."""
2
2
  import asyncio
3
3
  import gc
4
- import warnings
5
4
 
6
5
  import pytest
7
6