content-core 1.4.2__tar.gz → 1.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- {content_core-1.4.2 → content_core-1.5.0}/.gitignore +3 -1
- {content_core-1.4.2 → content_core-1.5.0}/PKG-INFO +15 -1
- {content_core-1.4.2 → content_core-1.5.0}/README.md +14 -0
- {content_core-1.4.2 → content_core-1.5.0}/docs/processors.md +21 -5
- {content_core-1.4.2 → content_core-1.5.0}/docs/usage.md +115 -0
- {content_core-1.4.2 → content_core-1.5.0}/examples/main.py +1 -2
- content_core-1.5.0/notebooks/extraction.ipynb +194 -0
- {content_core-1.4.2 → content_core-1.5.0}/pyproject.toml +2 -1
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/cc_config.yaml +2 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/config.py +71 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/content/identification/file_detector.py +1 -2
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/content/summary/core.py +1 -1
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/notebooks/run.ipynb +0 -2
- content_core-1.5.0/src/content_core/processors/audio.py +221 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/processors/url.py +3 -3
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/templated_message.py +2 -2
- content_core-1.5.0/test_coverage_branch_report.md +480 -0
- content_core-1.5.0/tests/integration/conftest.py +39 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/integration/test_extraction.py +12 -6
- content_core-1.5.0/tests/unit/test_audio_concurrency.py +225 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/unit/test_config.py +1 -2
- {content_core-1.4.2 → content_core-1.5.0}/tests/unit/test_file_detector_critical.py +0 -2
- {content_core-1.4.2 → content_core-1.5.0}/tests/unit/test_file_detector_performance.py +0 -1
- {content_core-1.4.2 → content_core-1.5.0}/uv.lock +3 -3
- content_core-1.4.2/.claude/sessions/OSS-216/architecture.md +0 -195
- content_core-1.4.2/.claude/sessions/OSS-216/context.md +0 -54
- content_core-1.4.2/.claude/sessions/OSS-216/plan.md +0 -195
- content_core-1.4.2/src/content_core/processors/audio.py +0 -158
- {content_core-1.4.2 → content_core-1.5.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/.github/workflows/claude-code-review.yml +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/.github/workflows/claude.yml +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/.github/workflows/publish.yml +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/.python-version +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/CHANGELOG.md +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/CONTRIBUTING.md +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/LICENSE +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/Makefile +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/docs/macos.md +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/docs/mcp.md +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/docs/raycast.md +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/prompts/content/cleanup.jinja +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/prompts/content/summarize.jinja +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/.eslintrc.json +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/CHANGELOG.md +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/README.md +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/assets/command-icon.png +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/package-lock.json +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/package.json +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/raycast-env.d.ts +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/src/extract-content.tsx +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/src/quick-extract.tsx +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/src/summarize-content.tsx +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/src/utils/content-core.ts +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/src/utils/types.ts +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/raycast-content-core/tsconfig.json +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/__init__.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/common/__init__.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/common/exceptions.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/common/state.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/common/types.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/common/utils.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/content/__init__.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/content/cleanup/core.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/content/extraction/graph.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/content/identification/__init__.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/logging.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/mcp/__init__.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/mcp/server.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/models.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/models_config.yaml +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/notebooks/urls.ipynb +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/processors/docling.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/processors/office.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/processors/pdf.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/processors/text.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/processors/video.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/processors/youtube.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/py.typed +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/tools/__init__.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/tools/cleanup.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/tools/extract.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/src/content_core/tools/summarize.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/input_content/file.docx +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/input_content/file.epub +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/input_content/file.md +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/input_content/file.mp3 +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/input_content/file.mp4 +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/input_content/file.pdf +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/input_content/file.pptx +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/input_content/file.txt +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/input_content/file.xlsx +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/input_content/new_pdf.pdf +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/integration/test_cli.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/unit/test_docling.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/unit/test_file_detector.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/unit/test_mcp_server.py +0 -0
- {content_core-1.4.2 → content_core-1.5.0}/tests/unit/test_pymupdf_ocr.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.4.2
|
|
3
|
+
Version: 1.5.0
|
|
4
4
|
Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -548,6 +548,9 @@ GOOGLE_API_KEY=your-key-here
|
|
|
548
548
|
# Engine Selection (optional)
|
|
549
549
|
CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
|
|
550
550
|
CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
|
|
551
|
+
|
|
552
|
+
# Audio Processing (optional)
|
|
553
|
+
CCORE_AUDIO_CONCURRENCY=3 # Number of concurrent audio transcriptions (1-10, default: 3)
|
|
551
554
|
```
|
|
552
555
|
|
|
553
556
|
### Engine Selection via Environment Variables
|
|
@@ -556,9 +559,20 @@ For deployment scenarios like MCP servers or Raycast extensions, you can overrid
|
|
|
556
559
|
|
|
557
560
|
- **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
|
|
558
561
|
- **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
|
|
562
|
+
- **`CCORE_AUDIO_CONCURRENCY`**: Number of concurrent audio transcriptions (1-10, default: 3)
|
|
559
563
|
|
|
560
564
|
These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
|
|
561
565
|
|
|
566
|
+
### Audio Processing Configuration
|
|
567
|
+
|
|
568
|
+
Content Core processes long audio files by splitting them into segments and transcribing them in parallel for improved performance. You can control the concurrency level to balance speed with API rate limits:
|
|
569
|
+
|
|
570
|
+
- **Default**: 3 concurrent transcriptions
|
|
571
|
+
- **Range**: 1-10 concurrent transcriptions
|
|
572
|
+
- **Configuration**: Set via `CCORE_AUDIO_CONCURRENCY` environment variable or `extraction.audio.concurrency` in `cc_config.yaml`
|
|
573
|
+
|
|
574
|
+
Higher concurrency values can speed up processing of long audio/video files but may hit API rate limits. Lower values are more conservative and suitable for accounts with lower API quotas.
|
|
575
|
+
|
|
562
576
|
### Custom Prompt Templates
|
|
563
577
|
|
|
564
578
|
Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
|
|
@@ -513,6 +513,9 @@ GOOGLE_API_KEY=your-key-here
|
|
|
513
513
|
# Engine Selection (optional)
|
|
514
514
|
CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
|
|
515
515
|
CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
|
|
516
|
+
|
|
517
|
+
# Audio Processing (optional)
|
|
518
|
+
CCORE_AUDIO_CONCURRENCY=3 # Number of concurrent audio transcriptions (1-10, default: 3)
|
|
516
519
|
```
|
|
517
520
|
|
|
518
521
|
### Engine Selection via Environment Variables
|
|
@@ -521,9 +524,20 @@ For deployment scenarios like MCP servers or Raycast extensions, you can overrid
|
|
|
521
524
|
|
|
522
525
|
- **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
|
|
523
526
|
- **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
|
|
527
|
+
- **`CCORE_AUDIO_CONCURRENCY`**: Number of concurrent audio transcriptions (1-10, default: 3)
|
|
524
528
|
|
|
525
529
|
These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
|
|
526
530
|
|
|
531
|
+
### Audio Processing Configuration
|
|
532
|
+
|
|
533
|
+
Content Core processes long audio files by splitting them into segments and transcribing them in parallel for improved performance. You can control the concurrency level to balance speed with API rate limits:
|
|
534
|
+
|
|
535
|
+
- **Default**: 3 concurrent transcriptions
|
|
536
|
+
- **Range**: 1-10 concurrent transcriptions
|
|
537
|
+
- **Configuration**: Set via `CCORE_AUDIO_CONCURRENCY` environment variable or `extraction.audio.concurrency` in `cc_config.yaml`
|
|
538
|
+
|
|
539
|
+
Higher concurrency values can speed up processing of long audio/video files but may hit API rate limits. Lower values are more conservative and suitable for accounts with lower API quotas.
|
|
540
|
+
|
|
527
541
|
### Custom Prompt Templates
|
|
528
542
|
|
|
529
543
|
Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
|
|
@@ -36,11 +36,27 @@ Content Core uses a modular approach to process content from different sources.
|
|
|
36
36
|
- **Returned Data**: Extracted text content or transcriptions (for media files), structured according to Content Core's schema.
|
|
37
37
|
- **Location**: `src/content_core/processors/file.py`
|
|
38
38
|
|
|
39
|
-
### 4. **Media Transcription Processor**
|
|
40
|
-
- **Purpose**:
|
|
41
|
-
- **Supported Input**: Audio and video files (
|
|
42
|
-
- **Returned Data**: Transcribed text from the media content
|
|
43
|
-
- **Location**: `src/content_core/processors/
|
|
39
|
+
### 4. **Media Transcription Processor (Audio/Video)**
|
|
40
|
+
- **Purpose**: Handles transcription of audio and video files using OpenAI Whisper API with parallel processing for improved performance
|
|
41
|
+
- **Supported Input**: Audio files (`.mp3`, `.wav`, `.m4a`, `.flac`, `.ogg`) and video files (`.mp4`, `.avi`, `.mov`, `.mkv`)
|
|
42
|
+
- **Returned Data**: Transcribed text from the media content, with metadata about processed segments
|
|
43
|
+
- **Location**: `src/content_core/processors/audio.py`
|
|
44
|
+
- **Key Features**:
|
|
45
|
+
- **Automatic Segmentation**: Files longer than 10 minutes are automatically split into segments
|
|
46
|
+
- **Parallel Processing**: Multiple segments are transcribed concurrently using `asyncio.gather()` with semaphore-based concurrency control
|
|
47
|
+
- **Configurable Concurrency**: Control the number of simultaneous transcriptions (1-10, default: 3) via `CCORE_AUDIO_CONCURRENCY` environment variable or `extraction.audio.concurrency` in YAML config
|
|
48
|
+
- **Order Preservation**: Results are assembled in correct order regardless of completion time
|
|
49
|
+
- **Efficient Resource Usage**: Semaphore prevents API rate limiting while maximizing throughput
|
|
50
|
+
- **Configuration**:
|
|
51
|
+
```yaml
|
|
52
|
+
extraction:
|
|
53
|
+
audio:
|
|
54
|
+
concurrency: 3 # Number of concurrent transcriptions (1-10)
|
|
55
|
+
```
|
|
56
|
+
- **Performance**:
|
|
57
|
+
- Short files (<10 min): Processed as single segment, no splitting overhead
|
|
58
|
+
- Long files (>10 min): Processing time decreases as concurrency increases, though the speedup is somewhat less than linear
|
|
59
|
+
- Example: 60-minute file with concurrency=3 takes ~5-7 minutes vs ~15-20 minutes with concurrency=1
|
|
44
60
|
|
|
45
61
|
### 5. **Enhanced PyMuPDF Processor (Simple Engine)**
|
|
46
62
|
- **Purpose**: Optimized PDF extraction using PyMuPDF with enhanced quality flags, table detection, and optional OCR
|
|
@@ -247,6 +247,121 @@ Enable OCR enhancement for:
|
|
|
247
247
|
|
|
248
248
|
**Note**: The quality improvements (better character rendering, table detection) work automatically without requiring OCR or additional setup.
|
|
249
249
|
|
|
250
|
+
## Audio Processing Configuration
|
|
251
|
+
|
|
252
|
+
Content Core optimizes audio and video file processing by using parallel transcription of audio segments. This feature is particularly beneficial for long-form content like podcasts, lectures, or long videos.
|
|
253
|
+
|
|
254
|
+
### How It Works
|
|
255
|
+
|
|
256
|
+
1. **Automatic Segmentation**: Audio files longer than 10 minutes are automatically split into segments
|
|
257
|
+
2. **Parallel Transcription**: Multiple segments are transcribed concurrently using OpenAI Whisper
|
|
258
|
+
3. **Concurrency Control**: A semaphore limits the number of simultaneous API calls to prevent rate limiting
|
|
259
|
+
4. **Result Assembly**: Transcriptions are joined in the correct order to produce the complete transcript
|
|
260
|
+
|
|
261
|
+
### Configuration
|
|
262
|
+
|
|
263
|
+
#### Via YAML Configuration
|
|
264
|
+
|
|
265
|
+
Add to your `cc_config.yaml` or custom configuration file:
|
|
266
|
+
|
|
267
|
+
```yaml
|
|
268
|
+
extraction:
|
|
269
|
+
audio:
|
|
270
|
+
concurrency: 3 # Number of concurrent transcriptions (1-10, default: 3)
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
#### Via Environment Variable
|
|
274
|
+
|
|
275
|
+
Set in your `.env` file or system environment:
|
|
276
|
+
|
|
277
|
+
```plaintext
|
|
278
|
+
CCORE_AUDIO_CONCURRENCY=5 # Process 5 segments simultaneously
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
The environment variable takes precedence over the YAML configuration.
|
|
282
|
+
|
|
283
|
+
#### Programmatically in Python
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
from content_core.config import set_audio_concurrency
|
|
287
|
+
|
|
288
|
+
# Override audio concurrency for the current session
|
|
289
|
+
set_audio_concurrency(5)
|
|
290
|
+
|
|
291
|
+
# Now process audio with the new setting
|
|
292
|
+
result = await cc.extract({"file_path": "long_podcast.mp3"})
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
### Performance Considerations
|
|
296
|
+
|
|
297
|
+
**Choosing the Right Concurrency Level:**
|
|
298
|
+
|
|
299
|
+
- **1-2 concurrent**: Conservative approach
|
|
300
|
+
- Best for: API rate limits, cost management, batch processing
|
|
301
|
+
- Processing time: Slower, but more reliable
|
|
302
|
+
|
|
303
|
+
- **3-5 concurrent** (recommended): Balanced approach
|
|
304
|
+
- Best for: Most use cases, moderate file lengths
|
|
305
|
+
- Processing time: Good balance between speed and stability
|
|
306
|
+
|
|
307
|
+
- **6-10 concurrent**: Aggressive approach
|
|
308
|
+
- Best for: Very long files (>1 hour), premium API tiers
|
|
309
|
+
- Processing time: Fastest, but higher risk of rate limits
|
|
310
|
+
- Note: May result in higher API costs
|
|
311
|
+
|
|
312
|
+
**Example Processing Times** (approximate, for a 60-minute audio file):
|
|
313
|
+
- Concurrency 1: ~15-20 minutes
|
|
314
|
+
- Concurrency 3: ~5-7 minutes
|
|
315
|
+
- Concurrency 10: ~2-3 minutes
|
|
316
|
+
|
|
317
|
+
### Validation and Error Handling
|
|
318
|
+
|
|
319
|
+
Content Core validates the concurrency setting and provides safe defaults:
|
|
320
|
+
|
|
321
|
+
- **Valid range**: 1-10 concurrent transcriptions
|
|
322
|
+
- **Invalid values**: Automatically fall back to default (3) with a warning logged
|
|
323
|
+
- **Invalid types**: Non-integer values are rejected with a warning
|
|
324
|
+
|
|
325
|
+
Example warning when using invalid value:
|
|
326
|
+
```
|
|
327
|
+
WARNING: Invalid CCORE_AUDIO_CONCURRENCY: '15'. Must be between 1 and 10. Using default from config.
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
### Use Cases
|
|
331
|
+
|
|
332
|
+
**Podcasts and Long Interviews:**
|
|
333
|
+
```python
|
|
334
|
+
from content_core.config import set_audio_concurrency
|
|
335
|
+
import content_core as cc
|
|
336
|
+
|
|
337
|
+
# For a 2-hour podcast, use higher concurrency
|
|
338
|
+
set_audio_concurrency(7)
|
|
339
|
+
result = await cc.extract({"file_path": "podcast_episode_120min.mp3"})
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
**Batch Processing:**
|
|
343
|
+
```python
|
|
344
|
+
from content_core.config import set_audio_concurrency
|
|
345
|
+
import content_core as cc
|
|
346
|
+
|
|
347
|
+
# For processing multiple files sequentially, use lower concurrency
|
|
348
|
+
# to avoid rate limits across all files
|
|
349
|
+
set_audio_concurrency(2)
|
|
350
|
+
|
|
351
|
+
for audio_file in audio_files:
|
|
352
|
+
result = await cc.extract({"file_path": audio_file})
|
|
353
|
+
# Process result...
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
**Video Transcription:**
|
|
357
|
+
```python
|
|
358
|
+
import content_core as cc
|
|
359
|
+
|
|
360
|
+
# Videos are processed the same way - audio is extracted first, then transcribed
|
|
361
|
+
result = await cc.extract({"file_path": "conference_talk.mp4"})
|
|
362
|
+
print(result.content) # Full transcript
|
|
363
|
+
```
|
|
364
|
+
|
|
250
365
|
## File Type Detection
|
|
251
366
|
|
|
252
367
|
Content Core uses a pure Python implementation for file type detection, eliminating the need for system dependencies like libmagic. This ensures consistent behavior across all platforms (Windows, macOS, Linux).
|
|
@@ -6,10 +6,9 @@ This script processes all files in the input_content/ directory and URLs from ur
|
|
|
6
6
|
converting them to Markdown format and saving the results to separate files.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
import os
|
|
10
9
|
import sys
|
|
11
10
|
from pathlib import Path
|
|
12
|
-
from typing import List
|
|
11
|
+
from typing import List
|
|
13
12
|
from urllib.parse import urlparse
|
|
14
13
|
|
|
15
14
|
from loguru import logger
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "code",
|
|
5
|
+
"execution_count": 1,
|
|
6
|
+
"id": "444fcc1f",
|
|
7
|
+
"metadata": {},
|
|
8
|
+
"outputs": [
|
|
9
|
+
{
|
|
10
|
+
"name": "stderr",
|
|
11
|
+
"output_type": "stream",
|
|
12
|
+
"text": [
|
|
13
|
+
"\u001b[32m2025-09-26 16:49:15.616\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcontent_core.processors.audio\u001b[0m:\u001b[36mextract_audio_data\u001b[0m:\u001b[36m123\u001b[0m - \u001b[1mAudio is longer than 10 minutes (675.31s), splitting into 2 segments\u001b[0m\n"
|
|
14
|
+
]
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"name": "stdout",
|
|
18
|
+
"output_type": "stream",
|
|
19
|
+
"text": [
|
|
20
|
+
"MoviePy - Writing audio in /var/folders/cl/346yd2sd3vz399s0m2c_ynvr0000gn/T/tmp6zx9axd4/OAT - Aula 2 v2 - Levi Rezende_audio_001.mp3\n"
|
|
21
|
+
]
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"name": "stderr",
|
|
25
|
+
"output_type": "stream",
|
|
26
|
+
"text": [
|
|
27
|
+
" \r"
|
|
28
|
+
]
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"name": "stdout",
|
|
32
|
+
"output_type": "stream",
|
|
33
|
+
"text": [
|
|
34
|
+
"MoviePy - Done.\n",
|
|
35
|
+
"MoviePy - Writing audio in /var/folders/cl/346yd2sd3vz399s0m2c_ynvr0000gn/T/tmp6zx9axd4/OAT - Aula 2 v2 - Levi Rezende_audio_002.mp3\n"
|
|
36
|
+
]
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"name": "stderr",
|
|
40
|
+
"output_type": "stream",
|
|
41
|
+
"text": [
|
|
42
|
+
" \r"
|
|
43
|
+
]
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"name": "stdout",
|
|
47
|
+
"output_type": "stream",
|
|
48
|
+
"text": [
|
|
49
|
+
"MoviePy - Done.\n"
|
|
50
|
+
]
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
"name": "stderr",
|
|
54
|
+
"output_type": "stream",
|
|
55
|
+
"text": [
|
|
56
|
+
"\u001b[32m2025-09-26 16:49:54.566\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mcontent_core.processors.audio\u001b[0m:\u001b[36mextract_audio_data\u001b[0m:\u001b[36m156\u001b[0m - \u001b[31m\u001b[1mError processing audio: \u001b[0m\n",
|
|
57
|
+
"\u001b[32m2025-09-26 16:49:54.587\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mcontent_core.processors.audio\u001b[0m:\u001b[36mextract_audio_data\u001b[0m:\u001b[36m157\u001b[0m - \u001b[31m\u001b[1mTraceback (most recent call last):\n",
|
|
58
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py\", line 101, in map_httpcore_exceptions\n",
|
|
59
|
+
" yield\n",
|
|
60
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py\", line 394, in handle_async_request\n",
|
|
61
|
+
" resp = await self._pool.handle_async_request(req)\n",
|
|
62
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py\", line 256, in handle_async_request\n",
|
|
63
|
+
" raise exc from None\n",
|
|
64
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py\", line 236, in handle_async_request\n",
|
|
65
|
+
" response = await connection.handle_async_request(\n",
|
|
66
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection.py\", line 103, in handle_async_request\n",
|
|
67
|
+
" return await self._connection.handle_async_request(request)\n",
|
|
68
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py\", line 136, in handle_async_request\n",
|
|
69
|
+
" raise exc\n",
|
|
70
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py\", line 106, in handle_async_request\n",
|
|
71
|
+
" ) = await self._receive_response_headers(**kwargs)\n",
|
|
72
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py\", line 177, in _receive_response_headers\n",
|
|
73
|
+
" event = await self._receive_event(timeout=timeout)\n",
|
|
74
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py\", line 217, in _receive_event\n",
|
|
75
|
+
" data = await self._network_stream.read(\n",
|
|
76
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_backends/anyio.py\", line 32, in read\n",
|
|
77
|
+
" with map_exceptions(exc_map):\n",
|
|
78
|
+
" File \"/Users/luisnovo/.local/share/uv/python/cpython-3.10.6-macos-aarch64-none/lib/python3.10/contextlib.py\", line 153, in __exit__\n",
|
|
79
|
+
" self.gen.throw(typ, value, traceback)\n",
|
|
80
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_exceptions.py\", line 14, in map_exceptions\n",
|
|
81
|
+
" raise to_exc(exc) from exc\n",
|
|
82
|
+
"httpcore.ReadTimeout\n",
|
|
83
|
+
"\n",
|
|
84
|
+
"The above exception was the direct cause of the following exception:\n",
|
|
85
|
+
"\n",
|
|
86
|
+
"Traceback (most recent call last):\n",
|
|
87
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/src/content_core/processors/audio.py\", line 146, in extract_audio_data\n",
|
|
88
|
+
" transcription = await transcribe_audio_segment(\n",
|
|
89
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/src/content_core/processors/audio.py\", line 103, in transcribe_audio_segment\n",
|
|
90
|
+
" return (await model.atranscribe(audio_file)).text\n",
|
|
91
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/esperanto/providers/stt/openai.py\", line 158, in atranscribe\n",
|
|
92
|
+
" response = await self.async_client.post(\n",
|
|
93
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1859, in post\n",
|
|
94
|
+
" return await self.request(\n",
|
|
95
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1540, in request\n",
|
|
96
|
+
" return await self.send(request, auth=auth, follow_redirects=follow_redirects)\n",
|
|
97
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1629, in send\n",
|
|
98
|
+
" response = await self._send_handling_auth(\n",
|
|
99
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1657, in _send_handling_auth\n",
|
|
100
|
+
" response = await self._send_handling_redirects(\n",
|
|
101
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1694, in _send_handling_redirects\n",
|
|
102
|
+
" response = await self._send_single_request(request)\n",
|
|
103
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1730, in _send_single_request\n",
|
|
104
|
+
" response = await transport.handle_async_request(request)\n",
|
|
105
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py\", line 393, in handle_async_request\n",
|
|
106
|
+
" with map_httpcore_exceptions():\n",
|
|
107
|
+
" File \"/Users/luisnovo/.local/share/uv/python/cpython-3.10.6-macos-aarch64-none/lib/python3.10/contextlib.py\", line 153, in __exit__\n",
|
|
108
|
+
" self.gen.throw(typ, value, traceback)\n",
|
|
109
|
+
" File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py\", line 118, in map_httpcore_exceptions\n",
|
|
110
|
+
" raise mapped_exc(message) from exc\n",
|
|
111
|
+
"httpx.ReadTimeout\n",
|
|
112
|
+
"\u001b[0m\n"
|
|
113
|
+
]
|
|
114
|
+
},
|
|
115
|
+
{
|
|
116
|
+
"ename": "ReadTimeout",
|
|
117
|
+
"evalue": "",
|
|
118
|
+
"output_type": "error",
|
|
119
|
+
"traceback": [
|
|
120
|
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
121
|
+
"\u001b[0;31mReadTimeout\u001b[0m Traceback (most recent call last)",
|
|
122
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py:101\u001b[0m, in \u001b[0;36mmap_httpcore_exceptions\u001b[0;34m()\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 101\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n",
|
|
123
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py:394\u001b[0m, in \u001b[0;36mAsyncHTTPTransport.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 393\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[0;32m--> 394\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pool\u001b[38;5;241m.\u001b[39mhandle_async_request(req)\n\u001b[1;32m 396\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(resp\u001b[38;5;241m.\u001b[39mstream, typing\u001b[38;5;241m.\u001b[39mAsyncIterable)\n",
|
|
124
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py:256\u001b[0m, in \u001b[0;36mAsyncConnectionPool.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_close_connections(closing)\n\u001b[0;32m--> 256\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 258\u001b[0m \u001b[38;5;66;03m# Return the response. Note that in this case we still have to manage\u001b[39;00m\n\u001b[1;32m 259\u001b[0m \u001b[38;5;66;03m# the point at which the response is closed.\u001b[39;00m\n",
|
|
125
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py:236\u001b[0m, in \u001b[0;36mAsyncConnectionPool.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 235\u001b[0m \u001b[38;5;66;03m# Send the request on the assigned connection.\u001b[39;00m\n\u001b[0;32m--> 236\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m connection\u001b[38;5;241m.\u001b[39mhandle_async_request(\n\u001b[1;32m 237\u001b[0m pool_request\u001b[38;5;241m.\u001b[39mrequest\n\u001b[1;32m 238\u001b[0m )\n\u001b[1;32m 239\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ConnectionNotAvailable:\n\u001b[1;32m 240\u001b[0m \u001b[38;5;66;03m# In some cases a connection may initially be available to\u001b[39;00m\n\u001b[1;32m 241\u001b[0m \u001b[38;5;66;03m# handle a request, but then become unavailable.\u001b[39;00m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 243\u001b[0m \u001b[38;5;66;03m# In this case we clear the connection and try again.\u001b[39;00m\n",
|
|
126
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection.py:103\u001b[0m, in \u001b[0;36mAsyncHTTPConnection.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_connection\u001b[38;5;241m.\u001b[39mhandle_async_request(request)\n",
|
|
127
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py:136\u001b[0m, in \u001b[0;36mAsyncHTTP11Connection.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_response_closed()\n\u001b[0;32m--> 136\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\n",
|
|
128
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py:106\u001b[0m, in \u001b[0;36mAsyncHTTP11Connection.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mwith\u001b[39;00m Trace(\n\u001b[1;32m 98\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreceive_response_headers\u001b[39m\u001b[38;5;124m\"\u001b[39m, logger, request, kwargs\n\u001b[1;32m 99\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m trace:\n\u001b[1;32m 100\u001b[0m (\n\u001b[1;32m 101\u001b[0m http_version,\n\u001b[1;32m 102\u001b[0m status,\n\u001b[1;32m 103\u001b[0m reason_phrase,\n\u001b[1;32m 104\u001b[0m headers,\n\u001b[1;32m 105\u001b[0m trailing_data,\n\u001b[0;32m--> 106\u001b[0m ) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_receive_response_headers(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 107\u001b[0m trace\u001b[38;5;241m.\u001b[39mreturn_value \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 108\u001b[0m http_version,\n\u001b[1;32m 109\u001b[0m status,\n\u001b[1;32m 110\u001b[0m reason_phrase,\n\u001b[1;32m 111\u001b[0m headers,\n\u001b[1;32m 112\u001b[0m )\n",
|
|
129
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py:177\u001b[0m, in \u001b[0;36mAsyncHTTP11Connection._receive_response_headers\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 177\u001b[0m event \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_receive_event(timeout\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[1;32m 178\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(event, h11\u001b[38;5;241m.\u001b[39mResponse):\n",
|
|
130
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py:217\u001b[0m, in \u001b[0;36mAsyncHTTP11Connection._receive_event\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m event \u001b[38;5;129;01mis\u001b[39;00m h11\u001b[38;5;241m.\u001b[39mNEED_DATA:\n\u001b[0;32m--> 217\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_network_stream\u001b[38;5;241m.\u001b[39mread(\n\u001b[1;32m 218\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mREAD_NUM_BYTES, timeout\u001b[38;5;241m=\u001b[39mtimeout\n\u001b[1;32m 219\u001b[0m )\n\u001b[1;32m 221\u001b[0m \u001b[38;5;66;03m# If we feed this case through h11 we'll raise an exception like:\u001b[39;00m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;66;03m# httpcore.RemoteProtocolError: can't handle event type\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 227\u001b[0m \u001b[38;5;66;03m# perspective. Instead we handle this case distinctly and treat\u001b[39;00m\n\u001b[1;32m 228\u001b[0m \u001b[38;5;66;03m# it as a ConnectError.\u001b[39;00m\n",
|
|
131
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_backends/anyio.py:32\u001b[0m, in \u001b[0;36mAnyIOStream.read\u001b[0;34m(self, max_bytes, timeout)\u001b[0m\n\u001b[1;32m 26\u001b[0m exc_map \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 27\u001b[0m \u001b[38;5;167;01mTimeoutError\u001b[39;00m: ReadTimeout,\n\u001b[1;32m 28\u001b[0m anyio\u001b[38;5;241m.\u001b[39mBrokenResourceError: ReadError,\n\u001b[1;32m 29\u001b[0m anyio\u001b[38;5;241m.\u001b[39mClosedResourceError: ReadError,\n\u001b[1;32m 30\u001b[0m anyio\u001b[38;5;241m.\u001b[39mEndOfStream: ReadError,\n\u001b[1;32m 31\u001b[0m }\n\u001b[0;32m---> 32\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_exceptions(exc_map):\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m anyio\u001b[38;5;241m.\u001b[39mfail_after(timeout):\n",
|
|
132
|
+
"File \u001b[0;32m~/.local/share/uv/python/cpython-3.10.6-macos-aarch64-none/lib/python3.10/contextlib.py:153\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[0;34m(self, typ, value, traceback)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mthrow\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtyp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraceback\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 155\u001b[0m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[1;32m 156\u001b[0m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n",
|
|
133
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_exceptions.py:14\u001b[0m, in \u001b[0;36mmap_exceptions\u001b[0;34m(map)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(exc, from_exc):\n\u001b[0;32m---> 14\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m to_exc(exc) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mexc\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n",
|
|
134
|
+
"\u001b[0;31mReadTimeout\u001b[0m: ",
|
|
135
|
+
"\nThe above exception was the direct cause of the following exception:\n",
|
|
136
|
+
"\u001b[0;31mReadTimeout\u001b[0m Traceback (most recent call last)",
|
|
137
|
+
"Cell \u001b[0;32mIn[1], line 8\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mcontent_core\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m extract_content\n\u001b[1;32m 4\u001b[0m lesson_source \u001b[38;5;241m=\u001b[39m ProcessSourceInput(\n\u001b[1;32m 5\u001b[0m file_path\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/Users/luisnovo/dev/projetos/snl/adaptive-learning-novo/data/input/Oportunidade ao Alcance de Todos/OAT - Aula 2 v2 - Levi Rezende.mp4\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m )\n\u001b[0;32m----> 8\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m extract_content(lesson_source)\n",
|
|
138
|
+
"File \u001b[0;32m~/dev/projetos/content-core/src/content_core/content/extraction/__init__.py:12\u001b[0m, in \u001b[0;36mextract_content\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 11\u001b[0m data \u001b[38;5;241m=\u001b[39m ProcessSourceInput(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mdata)\n\u001b[0;32m---> 12\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m graph\u001b[38;5;241m.\u001b[39mainvoke(data)\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ProcessSourceOutput(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mresult)\n",
|
|
139
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/pregel/__init__.py:2920\u001b[0m, in \u001b[0;36mPregel.ainvoke\u001b[0;34m(self, input, config, stream_mode, print_mode, output_keys, interrupt_before, interrupt_after, **kwargs)\u001b[0m\n\u001b[1;32m 2917\u001b[0m chunks: \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mdict\u001b[39m[\u001b[38;5;28mstr\u001b[39m, Any] \u001b[38;5;241m|\u001b[39m Any] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 2918\u001b[0m interrupts: \u001b[38;5;28mlist\u001b[39m[Interrupt] \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m-> 2920\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mastream(\n\u001b[1;32m 2921\u001b[0m \u001b[38;5;28minput\u001b[39m,\n\u001b[1;32m 2922\u001b[0m config,\n\u001b[1;32m 2923\u001b[0m stream_mode\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mupdates\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalues\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 2924\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m stream_mode \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalues\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2925\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m stream_mode,\n\u001b[1;32m 2926\u001b[0m print_mode\u001b[38;5;241m=\u001b[39mprint_mode,\n\u001b[1;32m 2927\u001b[0m output_keys\u001b[38;5;241m=\u001b[39moutput_keys,\n\u001b[1;32m 2928\u001b[0m interrupt_before\u001b[38;5;241m=\u001b[39minterrupt_before,\n\u001b[1;32m 2929\u001b[0m interrupt_after\u001b[38;5;241m=\u001b[39minterrupt_after,\n\u001b[1;32m 2930\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 2931\u001b[0m ):\n\u001b[1;32m 2932\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m stream_mode \u001b[38;5;241m==\u001b[39m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalues\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 2933\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(chunk) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m:\n",
|
|
140
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/pregel/__init__.py:2768\u001b[0m, in \u001b[0;36mPregel.astream\u001b[0;34m(self, input, config, stream_mode, print_mode, output_keys, interrupt_before, interrupt_after, checkpoint_during, debug, subgraphs)\u001b[0m\n\u001b[1;32m 2766\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m task \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m loop\u001b[38;5;241m.\u001b[39mamatch_cached_writes():\n\u001b[1;32m 2767\u001b[0m loop\u001b[38;5;241m.\u001b[39moutput_writes(task\u001b[38;5;241m.\u001b[39mid, task\u001b[38;5;241m.\u001b[39mwrites, cached\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m-> 2768\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m runner\u001b[38;5;241m.\u001b[39matick(\n\u001b[1;32m 2769\u001b[0m [t \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m loop\u001b[38;5;241m.\u001b[39mtasks\u001b[38;5;241m.\u001b[39mvalues() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m t\u001b[38;5;241m.\u001b[39mwrites],\n\u001b[1;32m 2770\u001b[0m timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstep_timeout,\n\u001b[1;32m 2771\u001b[0m get_waiter\u001b[38;5;241m=\u001b[39mget_waiter,\n\u001b[1;32m 2772\u001b[0m schedule_task\u001b[38;5;241m=\u001b[39mloop\u001b[38;5;241m.\u001b[39maaccept_push,\n\u001b[1;32m 2773\u001b[0m ):\n\u001b[1;32m 2774\u001b[0m \u001b[38;5;66;03m# emit output\u001b[39;00m\n\u001b[1;32m 2775\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m o \u001b[38;5;129;01min\u001b[39;00m _output(\n\u001b[1;32m 2776\u001b[0m stream_mode,\n\u001b[1;32m 2777\u001b[0m print_mode,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2780\u001b[0m asyncio\u001b[38;5;241m.\u001b[39mQueueEmpty,\n\u001b[1;32m 2781\u001b[0m ):\n\u001b[1;32m 2782\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m o\n",
|
|
141
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/pregel/runner.py:295\u001b[0m, in \u001b[0;36mPregelRunner.atick\u001b[0;34m(self, tasks, reraise, timeout, retry_policy, get_waiter, schedule_task)\u001b[0m\n\u001b[1;32m 293\u001b[0m t \u001b[38;5;241m=\u001b[39m tasks[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 295\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m arun_with_retry(\n\u001b[1;32m 296\u001b[0m t,\n\u001b[1;32m 297\u001b[0m retry_policy,\n\u001b[1;32m 298\u001b[0m stream\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_astream,\n\u001b[1;32m 299\u001b[0m configurable\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m 300\u001b[0m CONFIG_KEY_CALL: partial(\n\u001b[1;32m 301\u001b[0m _acall,\n\u001b[1;32m 302\u001b[0m weakref\u001b[38;5;241m.\u001b[39mref(t),\n\u001b[1;32m 303\u001b[0m stream\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_astream,\n\u001b[1;32m 304\u001b[0m retry_policy\u001b[38;5;241m=\u001b[39mretry_policy,\n\u001b[1;32m 305\u001b[0m futures\u001b[38;5;241m=\u001b[39mweakref\u001b[38;5;241m.\u001b[39mref(futures),\n\u001b[1;32m 306\u001b[0m schedule_task\u001b[38;5;241m=\u001b[39mschedule_task,\n\u001b[1;32m 307\u001b[0m submit\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msubmit,\n\u001b[1;32m 308\u001b[0m loop\u001b[38;5;241m=\u001b[39mloop,\n\u001b[1;32m 309\u001b[0m ),\n\u001b[1;32m 310\u001b[0m },\n\u001b[1;32m 311\u001b[0m )\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommit(t, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 313\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n",
|
|
142
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/pregel/retry.py:137\u001b[0m, in \u001b[0;36marun_with_retry\u001b[0;34m(task, retry_policy, stream, match_cached_writes, configurable)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m task\u001b[38;5;241m.\u001b[39mproc\u001b[38;5;241m.\u001b[39mainvoke(task\u001b[38;5;241m.\u001b[39minput, config)\n\u001b[1;32m 138\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ParentCommand \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 139\u001b[0m ns: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m config[CONF][CONFIG_KEY_CHECKPOINT_NS]\n",
|
|
143
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/utils/runnable.py:676\u001b[0m, in \u001b[0;36mRunnableSeq.ainvoke\u001b[0;34m(self, input, config, **kwargs)\u001b[0m\n\u001b[1;32m 672\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio\u001b[38;5;241m.\u001b[39mcreate_task(\n\u001b[1;32m 673\u001b[0m step\u001b[38;5;241m.\u001b[39mainvoke(\u001b[38;5;28minput\u001b[39m, config, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs), context\u001b[38;5;241m=\u001b[39mcontext\n\u001b[1;32m 674\u001b[0m )\n\u001b[1;32m 675\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 676\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m step\u001b[38;5;241m.\u001b[39mainvoke(\u001b[38;5;28minput\u001b[39m, config, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 677\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 678\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m step\u001b[38;5;241m.\u001b[39mainvoke(\u001b[38;5;28minput\u001b[39m, config)\n",
|
|
144
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/utils/runnable.py:440\u001b[0m, in \u001b[0;36mRunnableCallable.ainvoke\u001b[0;34m(self, input, config, **kwargs)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m run_manager\u001b[38;5;241m.\u001b[39mon_chain_end(ret)\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 440\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mafunc(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 441\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrecurse \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(ret, Runnable):\n\u001b[1;32m 442\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m ret\u001b[38;5;241m.\u001b[39mainvoke(\u001b[38;5;28minput\u001b[39m, config)\n",
|
|
145
|
+
"File \u001b[0;32m~/dev/projetos/content-core/src/content_core/processors/audio.py:146\u001b[0m, in \u001b[0;36mextract_audio_data\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 144\u001b[0m transcriptions \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m audio_file \u001b[38;5;129;01min\u001b[39;00m output_files:\n\u001b[0;32m--> 146\u001b[0m transcription \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m transcribe_audio_segment(\n\u001b[1;32m 147\u001b[0m audio_file, speech_to_text_model\n\u001b[1;32m 148\u001b[0m )\n\u001b[1;32m 149\u001b[0m transcriptions\u001b[38;5;241m.\u001b[39mappend(transcription)\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata\u001b[39m\u001b[38;5;124m\"\u001b[39m: {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maudio_files\u001b[39m\u001b[38;5;124m\"\u001b[39m: output_files},\n\u001b[1;32m 153\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontent\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(transcriptions),\n\u001b[1;32m 154\u001b[0m }\n",
|
|
146
|
+
"File \u001b[0;32m~/dev/projetos/content-core/src/content_core/processors/audio.py:103\u001b[0m, in \u001b[0;36mtranscribe_audio_segment\u001b[0;34m(audio_file, model)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mtranscribe_audio_segment\u001b[39m(audio_file, model):\n\u001b[1;32m 102\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Transcribe a single audio segment asynchronously\"\"\"\u001b[39;00m\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (\u001b[38;5;28;01mawait\u001b[39;00m model\u001b[38;5;241m.\u001b[39matranscribe(audio_file))\u001b[38;5;241m.\u001b[39mtext\n",
|
|
147
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/esperanto/providers/stt/openai.py:158\u001b[0m, in \u001b[0;36mOpenAISpeechToTextModel.atranscribe\u001b[0;34m(self, audio_file, language, prompt)\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(audio_file, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 157\u001b[0m files \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfile\u001b[39m\u001b[38;5;124m\"\u001b[39m: (audio_file, f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maudio/mpeg\u001b[39m\u001b[38;5;124m\"\u001b[39m)}\n\u001b[0;32m--> 158\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39masync_client\u001b[38;5;241m.\u001b[39mpost(\n\u001b[1;32m 159\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbase_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/audio/transcriptions\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 160\u001b[0m headers\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_headers(),\n\u001b[1;32m 161\u001b[0m files\u001b[38;5;241m=\u001b[39mfiles,\n\u001b[1;32m 162\u001b[0m data\u001b[38;5;241m=\u001b[39mkwargs\n\u001b[1;32m 163\u001b[0m )\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 165\u001b[0m \u001b[38;5;66;03m# For BinaryIO, send the file object directly\u001b[39;00m\n\u001b[1;32m 166\u001b[0m filename \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(audio_file, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maudio.mp3\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
|
|
148
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1859\u001b[0m, in \u001b[0;36mAsyncClient.post\u001b[0;34m(self, url, content, data, files, json, params, headers, cookies, auth, follow_redirects, timeout, extensions)\u001b[0m\n\u001b[1;32m 1838\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mpost\u001b[39m(\n\u001b[1;32m 1839\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1840\u001b[0m url: URL \u001b[38;5;241m|\u001b[39m \u001b[38;5;28mstr\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1852\u001b[0m extensions: RequestExtensions \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1853\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Response:\n\u001b[1;32m 1854\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1855\u001b[0m \u001b[38;5;124;03m Send a `POST` request.\u001b[39;00m\n\u001b[1;32m 1856\u001b[0m \n\u001b[1;32m 1857\u001b[0m \u001b[38;5;124;03m **Parameters**: See `httpx.request`.\u001b[39;00m\n\u001b[1;32m 1858\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1859\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(\n\u001b[1;32m 1860\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPOST\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 1861\u001b[0m url,\n\u001b[1;32m 1862\u001b[0m content\u001b[38;5;241m=\u001b[39mcontent,\n\u001b[1;32m 1863\u001b[0m data\u001b[38;5;241m=\u001b[39mdata,\n\u001b[1;32m 1864\u001b[0m files\u001b[38;5;241m=\u001b[39mfiles,\n\u001b[1;32m 1865\u001b[0m json\u001b[38;5;241m=\u001b[39mjson,\n\u001b[1;32m 1866\u001b[0m params\u001b[38;5;241m=\u001b[39mparams,\n\u001b[1;32m 1867\u001b[0m 
headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[1;32m 1868\u001b[0m cookies\u001b[38;5;241m=\u001b[39mcookies,\n\u001b[1;32m 1869\u001b[0m auth\u001b[38;5;241m=\u001b[39mauth,\n\u001b[1;32m 1870\u001b[0m follow_redirects\u001b[38;5;241m=\u001b[39mfollow_redirects,\n\u001b[1;32m 1871\u001b[0m timeout\u001b[38;5;241m=\u001b[39mtimeout,\n\u001b[1;32m 1872\u001b[0m extensions\u001b[38;5;241m=\u001b[39mextensions,\n\u001b[1;32m 1873\u001b[0m )\n",
|
|
149
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1540\u001b[0m, in \u001b[0;36mAsyncClient.request\u001b[0;34m(self, method, url, content, data, files, json, params, headers, cookies, auth, follow_redirects, timeout, extensions)\u001b[0m\n\u001b[1;32m 1525\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(message, \u001b[38;5;167;01mDeprecationWarning\u001b[39;00m, stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m 1527\u001b[0m request \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuild_request(\n\u001b[1;32m 1528\u001b[0m method\u001b[38;5;241m=\u001b[39mmethod,\n\u001b[1;32m 1529\u001b[0m url\u001b[38;5;241m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1538\u001b[0m extensions\u001b[38;5;241m=\u001b[39mextensions,\n\u001b[1;32m 1539\u001b[0m )\n\u001b[0;32m-> 1540\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msend(request, auth\u001b[38;5;241m=\u001b[39mauth, follow_redirects\u001b[38;5;241m=\u001b[39mfollow_redirects)\n",
|
|
150
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1629\u001b[0m, in \u001b[0;36mAsyncClient.send\u001b[0;34m(self, request, stream, auth, follow_redirects)\u001b[0m\n\u001b[1;32m 1625\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_set_timeout(request)\n\u001b[1;32m 1627\u001b[0m auth \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_build_request_auth(request, auth)\n\u001b[0;32m-> 1629\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_send_handling_auth(\n\u001b[1;32m 1630\u001b[0m request,\n\u001b[1;32m 1631\u001b[0m auth\u001b[38;5;241m=\u001b[39mauth,\n\u001b[1;32m 1632\u001b[0m follow_redirects\u001b[38;5;241m=\u001b[39mfollow_redirects,\n\u001b[1;32m 1633\u001b[0m history\u001b[38;5;241m=\u001b[39m[],\n\u001b[1;32m 1634\u001b[0m )\n\u001b[1;32m 1635\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1636\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m stream:\n",
|
|
151
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1657\u001b[0m, in \u001b[0;36mAsyncClient._send_handling_auth\u001b[0;34m(self, request, auth, follow_redirects, history)\u001b[0m\n\u001b[1;32m 1654\u001b[0m request \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m auth_flow\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__anext__\u001b[39m()\n\u001b[1;32m 1656\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m-> 1657\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_send_handling_redirects(\n\u001b[1;32m 1658\u001b[0m request,\n\u001b[1;32m 1659\u001b[0m follow_redirects\u001b[38;5;241m=\u001b[39mfollow_redirects,\n\u001b[1;32m 1660\u001b[0m history\u001b[38;5;241m=\u001b[39mhistory,\n\u001b[1;32m 1661\u001b[0m )\n\u001b[1;32m 1662\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1663\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
|
|
152
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1694\u001b[0m, in \u001b[0;36mAsyncClient._send_handling_redirects\u001b[0;34m(self, request, follow_redirects, history)\u001b[0m\n\u001b[1;32m 1691\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m hook \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_event_hooks[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrequest\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m 1692\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m hook(request)\n\u001b[0;32m-> 1694\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_send_single_request(request)\n\u001b[1;32m 1695\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1696\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m hook \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_event_hooks[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresponse\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n",
|
|
153
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1730\u001b[0m, in \u001b[0;36mAsyncClient._send_single_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 1725\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 1726\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAttempted to send an sync request with an AsyncClient instance.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1727\u001b[0m )\n\u001b[1;32m 1729\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request\u001b[38;5;241m=\u001b[39mrequest):\n\u001b[0;32m-> 1730\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m transport\u001b[38;5;241m.\u001b[39mhandle_async_request(request)\n\u001b[1;32m 1732\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response\u001b[38;5;241m.\u001b[39mstream, AsyncByteStream)\n\u001b[1;32m 1733\u001b[0m response\u001b[38;5;241m.\u001b[39mrequest \u001b[38;5;241m=\u001b[39m request\n",
|
|
154
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py:393\u001b[0m, in \u001b[0;36mAsyncHTTPTransport.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 379\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mhttpcore\u001b[39;00m\n\u001b[1;32m 381\u001b[0m req \u001b[38;5;241m=\u001b[39m httpcore\u001b[38;5;241m.\u001b[39mRequest(\n\u001b[1;32m 382\u001b[0m method\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mmethod,\n\u001b[1;32m 383\u001b[0m url\u001b[38;5;241m=\u001b[39mhttpcore\u001b[38;5;241m.\u001b[39mURL(\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 391\u001b[0m extensions\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mextensions,\n\u001b[1;32m 392\u001b[0m )\n\u001b[0;32m--> 393\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[1;32m 394\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pool\u001b[38;5;241m.\u001b[39mhandle_async_request(req)\n\u001b[1;32m 396\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(resp\u001b[38;5;241m.\u001b[39mstream, typing\u001b[38;5;241m.\u001b[39mAsyncIterable)\n",
|
|
155
|
+
"File \u001b[0;32m~/.local/share/uv/python/cpython-3.10.6-macos-aarch64-none/lib/python3.10/contextlib.py:153\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[0;34m(self, typ, value, traceback)\u001b[0m\n\u001b[1;32m 151\u001b[0m value \u001b[38;5;241m=\u001b[39m typ()\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mthrow\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtyp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraceback\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 155\u001b[0m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[1;32m 156\u001b[0m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m exc \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m value\n",
|
|
156
|
+
"File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py:118\u001b[0m, in \u001b[0;36mmap_httpcore_exceptions\u001b[0;34m()\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 117\u001b[0m message \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(exc)\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m mapped_exc(message) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mexc\u001b[39;00m\n",
|
|
157
|
+
"\u001b[0;31mReadTimeout\u001b[0m: "
|
|
158
|
+
]
|
|
159
|
+
}
|
|
160
|
+
],
|
|
161
|
+
"source": [
|
|
162
|
+
"from content_core.common import ProcessSourceInput\n",
|
|
163
|
+
"from content_core import extract_content\n",
|
|
164
|
+
"\n",
|
|
165
|
+
"lesson_source = ProcessSourceInput(\n",
|
|
166
|
+
" file_path=\"/Users/luisnovo/dev/projetos/snl/adaptive-learning-novo/data/input/Oportunidade ao Alcance de Todos/OAT - Aula 2 v2 - Levi Rezende.mp4\",\n",
|
|
167
|
+
")\n",
|
|
168
|
+
" \n",
|
|
169
|
+
"result = await extract_content(lesson_source)"
|
|
170
|
+
]
|
|
171
|
+
}
|
|
172
|
+
],
|
|
173
|
+
"metadata": {
|
|
174
|
+
"kernelspec": {
|
|
175
|
+
"display_name": ".venv",
|
|
176
|
+
"language": "python",
|
|
177
|
+
"name": "python3"
|
|
178
|
+
},
|
|
179
|
+
"language_info": {
|
|
180
|
+
"codemirror_mode": {
|
|
181
|
+
"name": "ipython",
|
|
182
|
+
"version": 3
|
|
183
|
+
},
|
|
184
|
+
"file_extension": ".py",
|
|
185
|
+
"mimetype": "text/x-python",
|
|
186
|
+
"name": "python",
|
|
187
|
+
"nbconvert_exporter": "python",
|
|
188
|
+
"pygments_lexer": "ipython3",
|
|
189
|
+
"version": "3.10.6"
|
|
190
|
+
}
|
|
191
|
+
},
|
|
192
|
+
"nbformat": 4,
|
|
193
|
+
"nbformat_minor": 5
|
|
194
|
+
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "content-core"
|
|
3
|
-
version = "1.4.2"
|
|
3
|
+
version = "1.5.0"
|
|
4
4
|
description = "Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
homepage = "https://github.com/lfnovo/content-core"
|
|
@@ -70,3 +70,4 @@ dev = [
|
|
|
70
70
|
pythonpath = ["src"]
|
|
71
71
|
asyncio_mode = "auto"
|
|
72
72
|
asyncio_default_fixture_loop_scope = "function"
|
|
73
|
+
asyncio_default_test_loop_scope = "function"
|
|
@@ -32,6 +32,8 @@ summary_model:
|
|
|
32
32
|
extraction:
|
|
33
33
|
document_engine: auto # auto | simple | docling - for files/documents
|
|
34
34
|
url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
|
|
35
|
+
audio:
|
|
36
|
+
concurrency: 3 # Number of concurrent audio transcriptions (1-10)
|
|
35
37
|
docling:
|
|
36
38
|
output_format: markdown # markdown | html | json
|
|
37
39
|
pymupdf:
|
|
@@ -70,6 +70,61 @@ def get_url_engine():
|
|
|
70
70
|
return env_engine
|
|
71
71
|
return CONFIG.get("extraction", {}).get("url_engine", "auto")
|
|
72
72
|
|
|
73
|
+
def get_audio_concurrency():
    """
    Get audio concurrency with environment variable override and validation.

    Returns the configured number of concurrent audio transcriptions. Both the
    environment variable and the config value are validated, so the result is
    always an integer between 1 and 10.

    Configuration priority (highest to lowest):
        1. CCORE_AUDIO_CONCURRENCY environment variable
        2. extraction.audio.concurrency in YAML config
        3. Default value: 3

    Returns:
        int: Number of concurrent transcriptions (1-10)

    Validation:
        - Values must be integers between 1 and 10 (inclusive)
        - An invalid environment variable is ignored and the config value is used
        - An invalid config value (out of range, non-integer, bool, missing)
          is replaced by the default of 3
        - A warning is logged whenever an invalid value is detected

    Examples:
        >>> import os
        >>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "5"
        >>> get_audio_concurrency()
        5

        >>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "20"  # Too high
        >>> get_audio_concurrency()  # Falls back to the config value / default
        3
    """
    default_concurrency = 3

    def _warn(message):
        # Import logger here to avoid circular imports
        from content_core.logging import logger

        logger.warning(message)

    env_concurrency = os.environ.get("CCORE_AUDIO_CONCURRENCY")
    if env_concurrency:
        try:
            concurrency = int(env_concurrency)
        except ValueError:
            _warn(
                f"Invalid CCORE_AUDIO_CONCURRENCY: '{env_concurrency}'. "
                f"Must be a valid integer. "
                f"Using default from config."
            )
        else:
            if 1 <= concurrency <= 10:
                # A valid override wins without consulting the config at all.
                return concurrency
            _warn(
                f"Invalid CCORE_AUDIO_CONCURRENCY: '{env_concurrency}'. "
                f"Must be between 1 and 10. "
                f"Using default from config."
            )

    # `or {}` guards against sections that are present but empty (null) in YAML.
    extraction_cfg = CONFIG.get("extraction") or {}
    audio_cfg = extraction_cfg.get("audio") or {}
    configured = audio_cfg.get("concurrency", default_concurrency)

    # bool is a subclass of int, so it has to be excluded explicitly.
    is_valid = (
        isinstance(configured, int)
        and not isinstance(configured, bool)
        and 1 <= configured <= 10
    )
    if not is_valid:
        _warn(
            f"Invalid extraction.audio.concurrency in config: '{configured}'. "
            f"Must be an integer between 1 and 10. "
            f"Using default: {default_concurrency}."
        )
        return default_concurrency
    return configured
|
|
127
|
+
|
|
73
128
|
# Programmatic config overrides: use in notebooks or scripts
|
|
74
129
|
def set_document_engine(engine: str):
|
|
75
130
|
"""Override the document extraction engine ('auto', 'simple', or 'docling')."""
|
|
@@ -102,3 +157,19 @@ def set_pymupdf_ocr_fallback(enabled: bool):
|
|
|
102
157
|
extraction = CONFIG.setdefault("extraction", {})
|
|
103
158
|
pymupdf_cfg = extraction.setdefault("pymupdf", {})
|
|
104
159
|
pymupdf_cfg["ocr_fallback"] = enabled
|
|
160
|
+
|
|
161
|
+
def set_audio_concurrency(concurrency: int):
    """
    Override the audio concurrency setting (1-10).

    Stores the value under CONFIG["extraction"]["audio"]["concurrency"],
    creating the intermediate sections if they do not exist yet.

    Args:
        concurrency (int): Number of concurrent audio transcriptions (1-10)

    Raises:
        ValueError: If concurrency is not an integer between 1 and 10
            (booleans are rejected as well)
    """
    # bool is a subclass of int: without the explicit exclusion, True would
    # pass the range check and be stored as the concurrency value.
    is_valid = (
        isinstance(concurrency, int)
        and not isinstance(concurrency, bool)
        and 1 <= concurrency <= 10
    )
    if not is_valid:
        raise ValueError(f"Audio concurrency must be an integer between 1 and 10, got: {concurrency}")
    extraction = CONFIG.setdefault("extraction", {})
    audio_cfg = extraction.setdefault("audio", {})
    audio_cfg["concurrency"] = concurrency
|
{content_core-1.4.2 → content_core-1.5.0}/src/content_core/content/identification/file_detector.py
RENAMED
|
@@ -3,10 +3,9 @@ Pure Python file type detection using magic bytes and content analysis.
|
|
|
3
3
|
Replaces libmagic dependency with a lightweight implementation.
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
import os
|
|
7
6
|
import zipfile
|
|
8
7
|
from pathlib import Path
|
|
9
|
-
from typing import Dict, Optional
|
|
8
|
+
from typing import Dict, Optional
|
|
10
9
|
|
|
11
10
|
from content_core.common.exceptions import UnsupportedTypeException
|
|
12
11
|
from content_core.logging import logger
|