content-core 1.4.2__tar.gz → 1.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (103)
  1. content_core-1.6.0/.github/workflows/create-tag.yml +56 -0
  2. {content_core-1.4.2 → content_core-1.6.0}/.gitignore +3 -1
  3. {content_core-1.4.2 → content_core-1.6.0}/PKG-INFO +15 -1
  4. {content_core-1.4.2 → content_core-1.6.0}/README.md +14 -0
  5. {content_core-1.4.2 → content_core-1.6.0}/docs/processors.md +21 -5
  6. {content_core-1.4.2 → content_core-1.6.0}/docs/usage.md +115 -0
  7. {content_core-1.4.2 → content_core-1.6.0}/examples/main.py +1 -2
  8. content_core-1.6.0/notebooks/extraction.ipynb +194 -0
  9. {content_core-1.4.2 → content_core-1.6.0}/pyproject.toml +2 -1
  10. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/__init__.py +0 -2
  11. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/cc_config.yaml +2 -0
  12. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/config.py +71 -0
  13. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/content/identification/file_detector.py +103 -13
  14. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/content/summary/core.py +1 -1
  15. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/notebooks/run.ipynb +0 -2
  16. content_core-1.6.0/src/content_core/processors/audio.py +225 -0
  17. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/processors/url.py +3 -3
  18. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/templated_message.py +2 -2
  19. content_core-1.6.0/test_coverage_branch_report.md +480 -0
  20. content_core-1.6.0/tests/integration/conftest.py +39 -0
  21. {content_core-1.4.2 → content_core-1.6.0}/tests/integration/test_extraction.py +12 -6
  22. content_core-1.6.0/tests/unit/test_audio_concurrency.py +225 -0
  23. {content_core-1.4.2 → content_core-1.6.0}/tests/unit/test_config.py +1 -2
  24. {content_core-1.4.2 → content_core-1.6.0}/tests/unit/test_file_detector_critical.py +0 -2
  25. {content_core-1.4.2 → content_core-1.6.0}/tests/unit/test_file_detector_performance.py +0 -1
  26. content_core-1.6.0/uv.lock +5708 -0
  27. content_core-1.4.2/.claude/sessions/OSS-216/architecture.md +0 -195
  28. content_core-1.4.2/.claude/sessions/OSS-216/context.md +0 -54
  29. content_core-1.4.2/.claude/sessions/OSS-216/plan.md +0 -195
  30. content_core-1.4.2/src/content_core/processors/audio.py +0 -158
  31. content_core-1.4.2/uv.lock +0 -5180
  32. {content_core-1.4.2 → content_core-1.6.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  33. {content_core-1.4.2 → content_core-1.6.0}/.github/workflows/claude-code-review.yml +0 -0
  34. {content_core-1.4.2 → content_core-1.6.0}/.github/workflows/claude.yml +0 -0
  35. {content_core-1.4.2 → content_core-1.6.0}/.github/workflows/publish.yml +0 -0
  36. {content_core-1.4.2 → content_core-1.6.0}/.python-version +0 -0
  37. {content_core-1.4.2 → content_core-1.6.0}/CHANGELOG.md +0 -0
  38. {content_core-1.4.2 → content_core-1.6.0}/CONTRIBUTING.md +0 -0
  39. {content_core-1.4.2 → content_core-1.6.0}/LICENSE +0 -0
  40. {content_core-1.4.2 → content_core-1.6.0}/Makefile +0 -0
  41. {content_core-1.4.2 → content_core-1.6.0}/docs/macos.md +0 -0
  42. {content_core-1.4.2 → content_core-1.6.0}/docs/mcp.md +0 -0
  43. {content_core-1.4.2 → content_core-1.6.0}/docs/raycast.md +0 -0
  44. {content_core-1.4.2 → content_core-1.6.0}/prompts/content/cleanup.jinja +0 -0
  45. {content_core-1.4.2 → content_core-1.6.0}/prompts/content/summarize.jinja +0 -0
  46. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/.eslintrc.json +0 -0
  47. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/CHANGELOG.md +0 -0
  48. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/README.md +0 -0
  49. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/assets/command-icon.png +0 -0
  50. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/package-lock.json +0 -0
  51. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/package.json +0 -0
  52. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/raycast-env.d.ts +0 -0
  53. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/src/extract-content.tsx +0 -0
  54. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/src/quick-extract.tsx +0 -0
  55. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/src/summarize-content.tsx +0 -0
  56. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/src/utils/content-core.ts +0 -0
  57. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/src/utils/types.ts +0 -0
  58. {content_core-1.4.2 → content_core-1.6.0}/raycast-content-core/tsconfig.json +0 -0
  59. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/common/__init__.py +0 -0
  60. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/common/exceptions.py +0 -0
  61. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/common/state.py +0 -0
  62. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/common/types.py +0 -0
  63. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/common/utils.py +0 -0
  64. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/content/__init__.py +0 -0
  65. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/content/cleanup/__init__.py +0 -0
  66. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/content/cleanup/core.py +0 -0
  67. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/content/extraction/__init__.py +0 -0
  68. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/content/extraction/graph.py +0 -0
  69. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/content/identification/__init__.py +0 -0
  70. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/content/summary/__init__.py +0 -0
  71. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/logging.py +0 -0
  72. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/mcp/__init__.py +0 -0
  73. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/mcp/server.py +0 -0
  74. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/models.py +0 -0
  75. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/models_config.yaml +0 -0
  76. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/notebooks/urls.ipynb +0 -0
  77. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/processors/docling.py +0 -0
  78. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/processors/office.py +0 -0
  79. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/processors/pdf.py +0 -0
  80. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/processors/text.py +0 -0
  81. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/processors/video.py +0 -0
  82. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/processors/youtube.py +0 -0
  83. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/py.typed +0 -0
  84. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/tools/__init__.py +0 -0
  85. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/tools/cleanup.py +0 -0
  86. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/tools/extract.py +0 -0
  87. {content_core-1.4.2 → content_core-1.6.0}/src/content_core/tools/summarize.py +0 -0
  88. {content_core-1.4.2 → content_core-1.6.0}/tests/input_content/file.docx +0 -0
  89. {content_core-1.4.2 → content_core-1.6.0}/tests/input_content/file.epub +0 -0
  90. {content_core-1.4.2 → content_core-1.6.0}/tests/input_content/file.md +0 -0
  91. {content_core-1.4.2 → content_core-1.6.0}/tests/input_content/file.mp3 +0 -0
  92. {content_core-1.4.2 → content_core-1.6.0}/tests/input_content/file.mp4 +0 -0
  93. {content_core-1.4.2 → content_core-1.6.0}/tests/input_content/file.pdf +0 -0
  94. {content_core-1.4.2 → content_core-1.6.0}/tests/input_content/file.pptx +0 -0
  95. {content_core-1.4.2 → content_core-1.6.0}/tests/input_content/file.txt +0 -0
  96. {content_core-1.4.2 → content_core-1.6.0}/tests/input_content/file.xlsx +0 -0
  97. {content_core-1.4.2 → content_core-1.6.0}/tests/input_content/file_audio.mp3 +0 -0
  98. {content_core-1.4.2 → content_core-1.6.0}/tests/input_content/new_pdf.pdf +0 -0
  99. {content_core-1.4.2 → content_core-1.6.0}/tests/integration/test_cli.py +0 -0
  100. {content_core-1.4.2 → content_core-1.6.0}/tests/unit/test_docling.py +0 -0
  101. {content_core-1.4.2 → content_core-1.6.0}/tests/unit/test_file_detector.py +0 -0
  102. {content_core-1.4.2 → content_core-1.6.0}/tests/unit/test_mcp_server.py +0 -0
  103. {content_core-1.4.2 → content_core-1.6.0}/tests/unit/test_pymupdf_ocr.py +0 -0
@@ -0,0 +1,56 @@
1
+ name: Create Tag
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ inputs:
6
+ confirm:
7
+ description: 'Type "yes" to confirm tag creation'
8
+ required: true
9
+ default: 'no'
10
+
11
+ jobs:
12
+ create-tag:
13
+ runs-on: ubuntu-latest
14
+ if: github.event.inputs.confirm == 'yes'
15
+ permissions:
16
+ contents: write
17
+
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+ with:
21
+ fetch-depth: 0
22
+
23
+ - name: Extract version from pyproject.toml
24
+ id: version
25
+ run: |
26
+ VERSION=$(grep '^version = ' pyproject.toml | sed 's/version = "\(.*\)"/\1/')
27
+ echo "version=$VERSION" >> $GITHUB_OUTPUT
28
+ echo "tag=v$VERSION" >> $GITHUB_OUTPUT
29
+ echo "📦 Version found: $VERSION"
30
+
31
+ - name: Check if tag already exists
32
+ id: check
33
+ run: |
34
+ if git rev-parse "v${{ steps.version.outputs.version }}" >/dev/null 2>&1; then
35
+ echo "exists=true" >> $GITHUB_OUTPUT
36
+ echo "❌ Tag v${{ steps.version.outputs.version }} already exists!"
37
+ exit 1
38
+ else
39
+ echo "exists=false" >> $GITHUB_OUTPUT
40
+ echo "✅ Tag v${{ steps.version.outputs.version }} does not exist yet"
41
+ fi
42
+
43
+ - name: Create and push tag
44
+ if: steps.check.outputs.exists == 'false'
45
+ run: |
46
+ git config user.name "github-actions[bot]"
47
+ git config user.email "github-actions[bot]@users.noreply.github.com"
48
+ git tag "v${{ steps.version.outputs.version }}"
49
+ git push origin "v${{ steps.version.outputs.version }}"
50
+ echo "🏷️ Created and pushed tag: v${{ steps.version.outputs.version }}"
51
+
52
+ - name: Tag created successfully
53
+ if: steps.check.outputs.exists == 'false'
54
+ run: |
55
+ echo "✨ Tag v${{ steps.version.outputs.version }} has been created!"
56
+ echo "🚀 This will trigger the publish workflow automatically."
@@ -27,4 +27,6 @@ CLAUDE.md
27
27
  node_modules/
28
28
  **/notebooks/private
29
29
 
30
- claude-logs
30
+ claude-logs
31
+ specs/
32
+ .claude/sessions
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.4.2
3
+ Version: 1.6.0
4
4
  Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -548,6 +548,9 @@ GOOGLE_API_KEY=your-key-here
548
548
  # Engine Selection (optional)
549
549
  CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
550
550
  CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
551
+
552
+ # Audio Processing (optional)
553
+ CCORE_AUDIO_CONCURRENCY=3 # Number of concurrent audio transcriptions (1-10, default: 3)
551
554
  ```
552
555
 
553
556
  ### Engine Selection via Environment Variables
@@ -556,9 +559,20 @@ For deployment scenarios like MCP servers or Raycast extensions, you can overrid
556
559
 
557
560
  - **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
558
561
  - **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
562
+ - **`CCORE_AUDIO_CONCURRENCY`**: Number of concurrent audio transcriptions (1-10, default: 3)
559
563
 
560
564
  These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
561
565
 
566
+ ### Audio Processing Configuration
567
+
568
+ Content Core processes long audio files by splitting them into segments and transcribing them in parallel for improved performance. You can control the concurrency level to balance speed with API rate limits:
569
+
570
+ - **Default**: 3 concurrent transcriptions
571
+ - **Range**: 1-10 concurrent transcriptions
572
+ - **Configuration**: Set via `CCORE_AUDIO_CONCURRENCY` environment variable or `extraction.audio.concurrency` in `cc_config.yaml`
573
+
574
+ Higher concurrency values can speed up processing of long audio/video files but may hit API rate limits. Lower values are more conservative and suitable for accounts with lower API quotas.
575
+
562
576
  ### Custom Prompt Templates
563
577
 
564
578
  Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
@@ -513,6 +513,9 @@ GOOGLE_API_KEY=your-key-here
513
513
  # Engine Selection (optional)
514
514
  CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
515
515
  CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
516
+
517
+ # Audio Processing (optional)
518
+ CCORE_AUDIO_CONCURRENCY=3 # Number of concurrent audio transcriptions (1-10, default: 3)
516
519
  ```
517
520
 
518
521
  ### Engine Selection via Environment Variables
@@ -521,9 +524,20 @@ For deployment scenarios like MCP servers or Raycast extensions, you can overrid
521
524
 
522
525
  - **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
523
526
  - **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
527
+ - **`CCORE_AUDIO_CONCURRENCY`**: Number of concurrent audio transcriptions (1-10, default: 3)
524
528
 
525
529
  These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
526
530
 
531
+ ### Audio Processing Configuration
532
+
533
+ Content Core processes long audio files by splitting them into segments and transcribing them in parallel for improved performance. You can control the concurrency level to balance speed with API rate limits:
534
+
535
+ - **Default**: 3 concurrent transcriptions
536
+ - **Range**: 1-10 concurrent transcriptions
537
+ - **Configuration**: Set via `CCORE_AUDIO_CONCURRENCY` environment variable or `extraction.audio.concurrency` in `cc_config.yaml`
538
+
539
+ Higher concurrency values can speed up processing of long audio/video files but may hit API rate limits. Lower values are more conservative and suitable for accounts with lower API quotas.
540
+
527
541
  ### Custom Prompt Templates
528
542
 
529
543
  Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
@@ -36,11 +36,27 @@ Content Core uses a modular approach to process content from different sources.
36
36
  - **Returned Data**: Extracted text content or transcriptions (for media files), structured according to Content Core's schema.
37
37
  - **Location**: `src/content_core/processors/file.py`
38
38
 
39
- ### 4. **Media Transcription Processor**
40
- - **Purpose**: Specifically handles transcription of audio and video files using external services or libraries.
41
- - **Supported Input**: Audio and video files (e.g., `.mp3`, `.mp4`).
42
- - **Returned Data**: Transcribed text from the media content.
43
- - **Location**: `src/content_core/processors/transcription.py`
39
+ ### 4. **Media Transcription Processor (Audio/Video)**
40
+ - **Purpose**: Handles transcription of audio and video files using OpenAI Whisper API with parallel processing for improved performance
41
+ - **Supported Input**: Audio files (`.mp3`, `.wav`, `.m4a`, `.flac`, `.ogg`) and video files (`.mp4`, `.avi`, `.mov`, `.mkv`)
42
+ - **Returned Data**: Transcribed text from the media content, with metadata about processed segments
43
+ - **Location**: `src/content_core/processors/audio.py`
44
+ - **Key Features**:
45
+ - **Automatic Segmentation**: Files longer than 10 minutes are automatically split into segments
46
+ - **Parallel Processing**: Multiple segments are transcribed concurrently using `asyncio.gather()` with semaphore-based concurrency control
47
+ - **Configurable Concurrency**: Control the number of simultaneous transcriptions (1-10, default: 3) via `CCORE_AUDIO_CONCURRENCY` environment variable or `extraction.audio.concurrency` in YAML config
48
+ - **Order Preservation**: Results are assembled in correct order regardless of completion time
49
+ - **Efficient Resource Usage**: A semaphore caps the number of simultaneous API calls, reducing the risk of rate limiting while keeping throughput high
50
+ - **Configuration**:
51
+ ```yaml
52
+ extraction:
53
+ audio:
54
+ concurrency: 3 # Number of concurrent transcriptions (1-10)
55
+ ```
56
+ - **Performance**:
57
+ - Short files (<10 min): Processed as single segment, no splitting overhead
58
+ - Long files (>10 min): Segments are transcribed in parallel, so processing time drops as concurrency increases (see the example below)
59
+ - Example: 60-minute file with concurrency=3 takes ~5-7 minutes vs ~15-20 minutes with concurrency=1
44
60
 
45
61
  ### 5. **Enhanced PyMuPDF Processor (Simple Engine)**
46
62
  - **Purpose**: Optimized PDF extraction using PyMuPDF with enhanced quality flags, table detection, and optional OCR
@@ -247,6 +247,121 @@ Enable OCR enhancement for:
247
247
 
248
248
  **Note**: The quality improvements (better character rendering, table detection) work automatically without requiring OCR or additional setup.
249
249
 
250
+ ## Audio Processing Configuration
251
+
252
+ Content Core optimizes audio and video file processing by using parallel transcription of audio segments. This feature is particularly beneficial for long-form content like podcasts, lectures, or long videos.
253
+
254
+ ### How It Works
255
+
256
+ 1. **Automatic Segmentation**: Audio files longer than 10 minutes are automatically split into segments
257
+ 2. **Parallel Transcription**: Multiple segments are transcribed concurrently using OpenAI Whisper
258
+ 3. **Concurrency Control**: A semaphore limits the number of simultaneous API calls to prevent rate limiting
259
+ 4. **Result Assembly**: Transcriptions are joined in the correct order to produce the complete transcript
260
+
261
+ ### Configuration
262
+
263
+ #### Via YAML Configuration
264
+
265
+ Add to your `cc_config.yaml` or custom configuration file:
266
+
267
+ ```yaml
268
+ extraction:
269
+ audio:
270
+ concurrency: 3 # Number of concurrent transcriptions (1-10, default: 3)
271
+ ```
272
+
273
+ #### Via Environment Variable
274
+
275
+ Set in your `.env` file or system environment:
276
+
277
+ ```plaintext
278
+ CCORE_AUDIO_CONCURRENCY=5 # Process 5 segments simultaneously
279
+ ```
280
+
281
+ The environment variable takes precedence over the YAML configuration.
282
+
283
+ #### Programmatically in Python
284
+
285
+ ```python
286
+ from content_core.config import set_audio_concurrency
287
+
288
+ # Override audio concurrency for the current session
289
+ set_audio_concurrency(5)
290
+
291
+ # Now process audio with the new setting (assumes `import content_core as cc`)
292
+ result = await cc.extract({"file_path": "long_podcast.mp3"})
293
+ ```
294
+
295
+ ### Performance Considerations
296
+
297
+ **Choosing the Right Concurrency Level:**
298
+
299
+ - **1-2 concurrent**: Conservative approach
300
+ - Best for: API rate limits, cost management, batch processing
301
+ - Processing time: Slower, but more reliable
302
+
303
+ - **3-5 concurrent** (recommended): Balanced approach
304
+ - Best for: Most use cases, moderate file lengths
305
+ - Processing time: Good balance between speed and stability
306
+
307
+ - **6-10 concurrent**: Aggressive approach
308
+ - Best for: Very long files (>1 hour), premium API tiers
309
+ - Processing time: Fastest, but higher risk of rate limits
310
+ - Note: May result in higher API costs
311
+
312
+ **Example Processing Times** (approximate, for a 60-minute audio file):
313
+ - Concurrency 1: ~15-20 minutes
314
+ - Concurrency 3: ~5-7 minutes
315
+ - Concurrency 10: ~2-3 minutes
316
+
317
+ ### Validation and Error Handling
318
+
319
+ Content Core validates the concurrency setting and provides safe defaults:
320
+
321
+ - **Valid range**: 1-10 concurrent transcriptions
322
+ - **Invalid values**: Automatically fall back to the default from the configuration (3 unless overridden) with a warning logged
323
+ - **Invalid types**: Non-integer values are rejected with a warning
324
+
325
+ Example warning when using invalid value:
326
+ ```
327
+ WARNING: Invalid CCORE_AUDIO_CONCURRENCY: '15'. Must be between 1 and 10. Using default from config.
328
+ ```
329
+
330
+ ### Use Cases
331
+
332
+ **Podcasts and Long Interviews:**
333
+ ```python
334
+ from content_core.config import set_audio_concurrency
335
+ import content_core as cc
336
+
337
+ # For a 2-hour podcast, use higher concurrency
338
+ set_audio_concurrency(7)
339
+ result = await cc.extract({"file_path": "podcast_episode_120min.mp3"})
340
+ ```
341
+
342
+ **Batch Processing:**
343
+ ```python
344
+ from content_core.config import set_audio_concurrency
345
+ import content_core as cc
346
+
347
+ # For processing multiple files sequentially, use lower concurrency
348
+ # to avoid rate limits across all files
349
+ set_audio_concurrency(2)
350
+
351
+ for audio_file in audio_files:
352
+ result = await cc.extract({"file_path": audio_file})
353
+ # Process result...
354
+ ```
355
+
356
+ **Video Transcription:**
357
+ ```python
358
+ import content_core as cc
359
+
360
+ # Videos are processed the same way - audio is extracted first, then transcribed
361
+ result = await cc.extract({"file_path": "conference_talk.mp4"})
362
+ print(result.content) # Full transcript
363
+ ```
364
+
250
365
  ## File Type Detection
251
366
 
252
367
  Content Core uses a pure Python implementation for file type detection, eliminating the need for system dependencies like libmagic. This ensures consistent behavior across all platforms (Windows, macOS, Linux).
@@ -6,10 +6,9 @@ This script processes all files in the input_content/ directory and URLs from ur
6
6
  converting them to Markdown format and saving the results to separate files.
7
7
  """
8
8
 
9
- import os
10
9
  import sys
11
10
  from pathlib import Path
12
- from typing import List, Tuple
11
+ from typing import List
13
12
  from urllib.parse import urlparse
14
13
 
15
14
  from loguru import logger
@@ -0,0 +1,194 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "444fcc1f",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stderr",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "\u001b[32m2025-09-26 16:49:15.616\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mcontent_core.processors.audio\u001b[0m:\u001b[36mextract_audio_data\u001b[0m:\u001b[36m123\u001b[0m - \u001b[1mAudio is longer than 10 minutes (675.31s), splitting into 2 segments\u001b[0m\n"
14
+ ]
15
+ },
16
+ {
17
+ "name": "stdout",
18
+ "output_type": "stream",
19
+ "text": [
20
+ "MoviePy - Writing audio in /var/folders/cl/346yd2sd3vz399s0m2c_ynvr0000gn/T/tmp6zx9axd4/OAT - Aula 2 v2 - Levi Rezende_audio_001.mp3\n"
21
+ ]
22
+ },
23
+ {
24
+ "name": "stderr",
25
+ "output_type": "stream",
26
+ "text": [
27
+ " \r"
28
+ ]
29
+ },
30
+ {
31
+ "name": "stdout",
32
+ "output_type": "stream",
33
+ "text": [
34
+ "MoviePy - Done.\n",
35
+ "MoviePy - Writing audio in /var/folders/cl/346yd2sd3vz399s0m2c_ynvr0000gn/T/tmp6zx9axd4/OAT - Aula 2 v2 - Levi Rezende_audio_002.mp3\n"
36
+ ]
37
+ },
38
+ {
39
+ "name": "stderr",
40
+ "output_type": "stream",
41
+ "text": [
42
+ " \r"
43
+ ]
44
+ },
45
+ {
46
+ "name": "stdout",
47
+ "output_type": "stream",
48
+ "text": [
49
+ "MoviePy - Done.\n"
50
+ ]
51
+ },
52
+ {
53
+ "name": "stderr",
54
+ "output_type": "stream",
55
+ "text": [
56
+ "\u001b[32m2025-09-26 16:49:54.566\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mcontent_core.processors.audio\u001b[0m:\u001b[36mextract_audio_data\u001b[0m:\u001b[36m156\u001b[0m - \u001b[31m\u001b[1mError processing audio: \u001b[0m\n",
57
+ "\u001b[32m2025-09-26 16:49:54.587\u001b[0m | \u001b[31m\u001b[1mERROR \u001b[0m | \u001b[36mcontent_core.processors.audio\u001b[0m:\u001b[36mextract_audio_data\u001b[0m:\u001b[36m157\u001b[0m - \u001b[31m\u001b[1mTraceback (most recent call last):\n",
58
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py\", line 101, in map_httpcore_exceptions\n",
59
+ " yield\n",
60
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py\", line 394, in handle_async_request\n",
61
+ " resp = await self._pool.handle_async_request(req)\n",
62
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py\", line 256, in handle_async_request\n",
63
+ " raise exc from None\n",
64
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py\", line 236, in handle_async_request\n",
65
+ " response = await connection.handle_async_request(\n",
66
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection.py\", line 103, in handle_async_request\n",
67
+ " return await self._connection.handle_async_request(request)\n",
68
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py\", line 136, in handle_async_request\n",
69
+ " raise exc\n",
70
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py\", line 106, in handle_async_request\n",
71
+ " ) = await self._receive_response_headers(**kwargs)\n",
72
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py\", line 177, in _receive_response_headers\n",
73
+ " event = await self._receive_event(timeout=timeout)\n",
74
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py\", line 217, in _receive_event\n",
75
+ " data = await self._network_stream.read(\n",
76
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_backends/anyio.py\", line 32, in read\n",
77
+ " with map_exceptions(exc_map):\n",
78
+ " File \"/Users/luisnovo/.local/share/uv/python/cpython-3.10.6-macos-aarch64-none/lib/python3.10/contextlib.py\", line 153, in __exit__\n",
79
+ " self.gen.throw(typ, value, traceback)\n",
80
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_exceptions.py\", line 14, in map_exceptions\n",
81
+ " raise to_exc(exc) from exc\n",
82
+ "httpcore.ReadTimeout\n",
83
+ "\n",
84
+ "The above exception was the direct cause of the following exception:\n",
85
+ "\n",
86
+ "Traceback (most recent call last):\n",
87
+ " File \"/Users/luisnovo/dev/projetos/content-core/src/content_core/processors/audio.py\", line 146, in extract_audio_data\n",
88
+ " transcription = await transcribe_audio_segment(\n",
89
+ " File \"/Users/luisnovo/dev/projetos/content-core/src/content_core/processors/audio.py\", line 103, in transcribe_audio_segment\n",
90
+ " return (await model.atranscribe(audio_file)).text\n",
91
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/esperanto/providers/stt/openai.py\", line 158, in atranscribe\n",
92
+ " response = await self.async_client.post(\n",
93
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1859, in post\n",
94
+ " return await self.request(\n",
95
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1540, in request\n",
96
+ " return await self.send(request, auth=auth, follow_redirects=follow_redirects)\n",
97
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1629, in send\n",
98
+ " response = await self._send_handling_auth(\n",
99
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1657, in _send_handling_auth\n",
100
+ " response = await self._send_handling_redirects(\n",
101
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1694, in _send_handling_redirects\n",
102
+ " response = await self._send_single_request(request)\n",
103
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py\", line 1730, in _send_single_request\n",
104
+ " response = await transport.handle_async_request(request)\n",
105
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py\", line 393, in handle_async_request\n",
106
+ " with map_httpcore_exceptions():\n",
107
+ " File \"/Users/luisnovo/.local/share/uv/python/cpython-3.10.6-macos-aarch64-none/lib/python3.10/contextlib.py\", line 153, in __exit__\n",
108
+ " self.gen.throw(typ, value, traceback)\n",
109
+ " File \"/Users/luisnovo/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py\", line 118, in map_httpcore_exceptions\n",
110
+ " raise mapped_exc(message) from exc\n",
111
+ "httpx.ReadTimeout\n",
112
+ "\u001b[0m\n"
113
+ ]
114
+ },
115
+ {
116
+ "ename": "ReadTimeout",
117
+ "evalue": "",
118
+ "output_type": "error",
119
+ "traceback": [
120
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
121
+ "\u001b[0;31mReadTimeout\u001b[0m Traceback (most recent call last)",
122
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py:101\u001b[0m, in \u001b[0;36mmap_httpcore_exceptions\u001b[0;34m()\u001b[0m\n\u001b[1;32m 100\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 101\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m\n\u001b[1;32m 102\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n",
123
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py:394\u001b[0m, in \u001b[0;36mAsyncHTTPTransport.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 393\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[0;32m--> 394\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pool\u001b[38;5;241m.\u001b[39mhandle_async_request(req)\n\u001b[1;32m 396\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(resp\u001b[38;5;241m.\u001b[39mstream, typing\u001b[38;5;241m.\u001b[39mAsyncIterable)\n",
124
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py:256\u001b[0m, in \u001b[0;36mAsyncConnectionPool.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_close_connections(closing)\n\u001b[0;32m--> 256\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 258\u001b[0m \u001b[38;5;66;03m# Return the response. Note that in this case we still have to manage\u001b[39;00m\n\u001b[1;32m 259\u001b[0m \u001b[38;5;66;03m# the point at which the response is closed.\u001b[39;00m\n",
125
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection_pool.py:236\u001b[0m, in \u001b[0;36mAsyncConnectionPool.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 235\u001b[0m \u001b[38;5;66;03m# Send the request on the assigned connection.\u001b[39;00m\n\u001b[0;32m--> 236\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m connection\u001b[38;5;241m.\u001b[39mhandle_async_request(\n\u001b[1;32m 237\u001b[0m pool_request\u001b[38;5;241m.\u001b[39mrequest\n\u001b[1;32m 238\u001b[0m )\n\u001b[1;32m 239\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ConnectionNotAvailable:\n\u001b[1;32m 240\u001b[0m \u001b[38;5;66;03m# In some cases a connection may initially be available to\u001b[39;00m\n\u001b[1;32m 241\u001b[0m \u001b[38;5;66;03m# handle a request, but then become unavailable.\u001b[39;00m\n\u001b[1;32m 242\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 243\u001b[0m \u001b[38;5;66;03m# In this case we clear the connection and try again.\u001b[39;00m\n",
126
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/connection.py:103\u001b[0m, in \u001b[0;36mAsyncHTTPConnection.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_connection\u001b[38;5;241m.\u001b[39mhandle_async_request(request)\n",
127
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py:136\u001b[0m, in \u001b[0;36mAsyncHTTP11Connection.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_response_closed()\n\u001b[0;32m--> 136\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exc\n",
128
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py:106\u001b[0m, in \u001b[0;36mAsyncHTTP11Connection.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 97\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mwith\u001b[39;00m Trace(\n\u001b[1;32m 98\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreceive_response_headers\u001b[39m\u001b[38;5;124m\"\u001b[39m, logger, request, kwargs\n\u001b[1;32m 99\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m trace:\n\u001b[1;32m 100\u001b[0m (\n\u001b[1;32m 101\u001b[0m http_version,\n\u001b[1;32m 102\u001b[0m status,\n\u001b[1;32m 103\u001b[0m reason_phrase,\n\u001b[1;32m 104\u001b[0m headers,\n\u001b[1;32m 105\u001b[0m trailing_data,\n\u001b[0;32m--> 106\u001b[0m ) \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_receive_response_headers(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 107\u001b[0m trace\u001b[38;5;241m.\u001b[39mreturn_value \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 108\u001b[0m http_version,\n\u001b[1;32m 109\u001b[0m status,\n\u001b[1;32m 110\u001b[0m reason_phrase,\n\u001b[1;32m 111\u001b[0m headers,\n\u001b[1;32m 112\u001b[0m )\n",
129
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py:177\u001b[0m, in \u001b[0;36mAsyncHTTP11Connection._receive_response_headers\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 176\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m--> 177\u001b[0m event \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_receive_event(timeout\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[1;32m 178\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(event, h11\u001b[38;5;241m.\u001b[39mResponse):\n",
130
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_async/http11.py:217\u001b[0m, in \u001b[0;36mAsyncHTTP11Connection._receive_event\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 216\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m event \u001b[38;5;129;01mis\u001b[39;00m h11\u001b[38;5;241m.\u001b[39mNEED_DATA:\n\u001b[0;32m--> 217\u001b[0m data \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_network_stream\u001b[38;5;241m.\u001b[39mread(\n\u001b[1;32m 218\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mREAD_NUM_BYTES, timeout\u001b[38;5;241m=\u001b[39mtimeout\n\u001b[1;32m 219\u001b[0m )\n\u001b[1;32m 221\u001b[0m \u001b[38;5;66;03m# If we feed this case through h11 we'll raise an exception like:\u001b[39;00m\n\u001b[1;32m 222\u001b[0m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m 223\u001b[0m \u001b[38;5;66;03m# httpcore.RemoteProtocolError: can't handle event type\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 227\u001b[0m \u001b[38;5;66;03m# perspective. Instead we handle this case distinctly and treat\u001b[39;00m\n\u001b[1;32m 228\u001b[0m \u001b[38;5;66;03m# it as a ConnectError.\u001b[39;00m\n",
131
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_backends/anyio.py:32\u001b[0m, in \u001b[0;36mAnyIOStream.read\u001b[0;34m(self, max_bytes, timeout)\u001b[0m\n\u001b[1;32m 26\u001b[0m exc_map \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 27\u001b[0m \u001b[38;5;167;01mTimeoutError\u001b[39;00m: ReadTimeout,\n\u001b[1;32m 28\u001b[0m anyio\u001b[38;5;241m.\u001b[39mBrokenResourceError: ReadError,\n\u001b[1;32m 29\u001b[0m anyio\u001b[38;5;241m.\u001b[39mClosedResourceError: ReadError,\n\u001b[1;32m 30\u001b[0m anyio\u001b[38;5;241m.\u001b[39mEndOfStream: ReadError,\n\u001b[1;32m 31\u001b[0m }\n\u001b[0;32m---> 32\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_exceptions(exc_map):\n\u001b[1;32m 33\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m anyio\u001b[38;5;241m.\u001b[39mfail_after(timeout):\n",
132
+ "File \u001b[0;32m~/.local/share/uv/python/cpython-3.10.6-macos-aarch64-none/lib/python3.10/contextlib.py:153\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[0;34m(self, typ, value, traceback)\u001b[0m\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mthrow\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtyp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraceback\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 155\u001b[0m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[1;32m 156\u001b[0m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n",
133
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpcore/_exceptions.py:14\u001b[0m, in \u001b[0;36mmap_exceptions\u001b[0;34m(map)\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(exc, from_exc):\n\u001b[0;32m---> 14\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m to_exc(exc) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mexc\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n",
134
+ "\u001b[0;31mReadTimeout\u001b[0m: ",
135
+ "\nThe above exception was the direct cause of the following exception:\n",
136
+ "\u001b[0;31mReadTimeout\u001b[0m Traceback (most recent call last)",
137
+ "Cell \u001b[0;32mIn[1], line 8\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mcontent_core\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m extract_content\n\u001b[1;32m 4\u001b[0m lesson_source \u001b[38;5;241m=\u001b[39m ProcessSourceInput(\n\u001b[1;32m 5\u001b[0m file_path\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/Users/luisnovo/dev/projetos/snl/adaptive-learning-novo/data/input/Oportunidade ao Alcance de Todos/OAT - Aula 2 v2 - Levi Rezende.mp4\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 6\u001b[0m )\n\u001b[0;32m----> 8\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m extract_content(lesson_source)\n",
138
+ "File \u001b[0;32m~/dev/projetos/content-core/src/content_core/content/extraction/__init__.py:12\u001b[0m, in \u001b[0;36mextract_content\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(data, \u001b[38;5;28mdict\u001b[39m):\n\u001b[1;32m 11\u001b[0m data \u001b[38;5;241m=\u001b[39m ProcessSourceInput(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mdata)\n\u001b[0;32m---> 12\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m graph\u001b[38;5;241m.\u001b[39mainvoke(data)\n\u001b[1;32m 13\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ProcessSourceOutput(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mresult)\n",
139
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/pregel/__init__.py:2920\u001b[0m, in \u001b[0;36mPregel.ainvoke\u001b[0;34m(self, input, config, stream_mode, print_mode, output_keys, interrupt_before, interrupt_after, **kwargs)\u001b[0m\n\u001b[1;32m 2917\u001b[0m chunks: \u001b[38;5;28mlist\u001b[39m[\u001b[38;5;28mdict\u001b[39m[\u001b[38;5;28mstr\u001b[39m, Any] \u001b[38;5;241m|\u001b[39m Any] \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 2918\u001b[0m interrupts: \u001b[38;5;28mlist\u001b[39m[Interrupt] \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m-> 2920\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mfor\u001b[39;00m chunk \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mastream(\n\u001b[1;32m 2921\u001b[0m \u001b[38;5;28minput\u001b[39m,\n\u001b[1;32m 2922\u001b[0m config,\n\u001b[1;32m 2923\u001b[0m stream_mode\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mupdates\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalues\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 2924\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m stream_mode \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalues\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 2925\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m stream_mode,\n\u001b[1;32m 2926\u001b[0m print_mode\u001b[38;5;241m=\u001b[39mprint_mode,\n\u001b[1;32m 2927\u001b[0m output_keys\u001b[38;5;241m=\u001b[39moutput_keys,\n\u001b[1;32m 2928\u001b[0m interrupt_before\u001b[38;5;241m=\u001b[39minterrupt_before,\n\u001b[1;32m 2929\u001b[0m interrupt_after\u001b[38;5;241m=\u001b[39minterrupt_after,\n\u001b[1;32m 2930\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 2931\u001b[0m ):\n\u001b[1;32m 2932\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m stream_mode \u001b[38;5;241m==\u001b[39m 
\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalues\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 2933\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(chunk) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m2\u001b[39m:\n",
140
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/pregel/__init__.py:2768\u001b[0m, in \u001b[0;36mPregel.astream\u001b[0;34m(self, input, config, stream_mode, print_mode, output_keys, interrupt_before, interrupt_after, checkpoint_during, debug, subgraphs)\u001b[0m\n\u001b[1;32m 2766\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m task \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m loop\u001b[38;5;241m.\u001b[39mamatch_cached_writes():\n\u001b[1;32m 2767\u001b[0m loop\u001b[38;5;241m.\u001b[39moutput_writes(task\u001b[38;5;241m.\u001b[39mid, task\u001b[38;5;241m.\u001b[39mwrites, cached\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m-> 2768\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mfor\u001b[39;00m _ \u001b[38;5;129;01min\u001b[39;00m runner\u001b[38;5;241m.\u001b[39matick(\n\u001b[1;32m 2769\u001b[0m [t \u001b[38;5;28;01mfor\u001b[39;00m t \u001b[38;5;129;01min\u001b[39;00m loop\u001b[38;5;241m.\u001b[39mtasks\u001b[38;5;241m.\u001b[39mvalues() \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m t\u001b[38;5;241m.\u001b[39mwrites],\n\u001b[1;32m 2770\u001b[0m timeout\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstep_timeout,\n\u001b[1;32m 2771\u001b[0m get_waiter\u001b[38;5;241m=\u001b[39mget_waiter,\n\u001b[1;32m 2772\u001b[0m schedule_task\u001b[38;5;241m=\u001b[39mloop\u001b[38;5;241m.\u001b[39maaccept_push,\n\u001b[1;32m 2773\u001b[0m ):\n\u001b[1;32m 2774\u001b[0m \u001b[38;5;66;03m# emit output\u001b[39;00m\n\u001b[1;32m 2775\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m o \u001b[38;5;129;01min\u001b[39;00m _output(\n\u001b[1;32m 2776\u001b[0m stream_mode,\n\u001b[1;32m 2777\u001b[0m print_mode,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 2780\u001b[0m asyncio\u001b[38;5;241m.\u001b[39mQueueEmpty,\n\u001b[1;32m 2781\u001b[0m ):\n\u001b[1;32m 2782\u001b[0m \u001b[38;5;28;01myield\u001b[39;00m 
o\n",
141
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/pregel/runner.py:295\u001b[0m, in \u001b[0;36mPregelRunner.atick\u001b[0;34m(self, tasks, reraise, timeout, retry_policy, get_waiter, schedule_task)\u001b[0m\n\u001b[1;32m 293\u001b[0m t \u001b[38;5;241m=\u001b[39m tasks[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 294\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 295\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m arun_with_retry(\n\u001b[1;32m 296\u001b[0m t,\n\u001b[1;32m 297\u001b[0m retry_policy,\n\u001b[1;32m 298\u001b[0m stream\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_astream,\n\u001b[1;32m 299\u001b[0m configurable\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m 300\u001b[0m CONFIG_KEY_CALL: partial(\n\u001b[1;32m 301\u001b[0m _acall,\n\u001b[1;32m 302\u001b[0m weakref\u001b[38;5;241m.\u001b[39mref(t),\n\u001b[1;32m 303\u001b[0m stream\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39muse_astream,\n\u001b[1;32m 304\u001b[0m retry_policy\u001b[38;5;241m=\u001b[39mretry_policy,\n\u001b[1;32m 305\u001b[0m futures\u001b[38;5;241m=\u001b[39mweakref\u001b[38;5;241m.\u001b[39mref(futures),\n\u001b[1;32m 306\u001b[0m schedule_task\u001b[38;5;241m=\u001b[39mschedule_task,\n\u001b[1;32m 307\u001b[0m submit\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msubmit,\n\u001b[1;32m 308\u001b[0m loop\u001b[38;5;241m=\u001b[39mloop,\n\u001b[1;32m 309\u001b[0m ),\n\u001b[1;32m 310\u001b[0m },\n\u001b[1;32m 311\u001b[0m )\n\u001b[1;32m 312\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommit(t, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[1;32m 313\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n",
142
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/pregel/retry.py:137\u001b[0m, in \u001b[0;36marun_with_retry\u001b[0;34m(task, retry_policy, stream, match_cached_writes, configurable)\u001b[0m\n\u001b[1;32m 135\u001b[0m \u001b[38;5;28;01mbreak\u001b[39;00m\n\u001b[1;32m 136\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 137\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m task\u001b[38;5;241m.\u001b[39mproc\u001b[38;5;241m.\u001b[39mainvoke(task\u001b[38;5;241m.\u001b[39minput, config)\n\u001b[1;32m 138\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m ParentCommand \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 139\u001b[0m ns: \u001b[38;5;28mstr\u001b[39m \u001b[38;5;241m=\u001b[39m config[CONF][CONFIG_KEY_CHECKPOINT_NS]\n",
143
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/utils/runnable.py:676\u001b[0m, in \u001b[0;36mRunnableSeq.ainvoke\u001b[0;34m(self, input, config, **kwargs)\u001b[0m\n\u001b[1;32m 672\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio\u001b[38;5;241m.\u001b[39mcreate_task(\n\u001b[1;32m 673\u001b[0m step\u001b[38;5;241m.\u001b[39mainvoke(\u001b[38;5;28minput\u001b[39m, config, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs), context\u001b[38;5;241m=\u001b[39mcontext\n\u001b[1;32m 674\u001b[0m )\n\u001b[1;32m 675\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 676\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m step\u001b[38;5;241m.\u001b[39mainvoke(\u001b[38;5;28minput\u001b[39m, config, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 677\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 678\u001b[0m \u001b[38;5;28minput\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m step\u001b[38;5;241m.\u001b[39mainvoke(\u001b[38;5;28minput\u001b[39m, config)\n",
144
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/langgraph/utils/runnable.py:440\u001b[0m, in \u001b[0;36mRunnableCallable.ainvoke\u001b[0;34m(self, input, config, **kwargs)\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m run_manager\u001b[38;5;241m.\u001b[39mon_chain_end(ret)\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 440\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mafunc(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 441\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrecurse \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(ret, Runnable):\n\u001b[1;32m 442\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m ret\u001b[38;5;241m.\u001b[39mainvoke(\u001b[38;5;28minput\u001b[39m, config)\n",
145
+ "File \u001b[0;32m~/dev/projetos/content-core/src/content_core/processors/audio.py:146\u001b[0m, in \u001b[0;36mextract_audio_data\u001b[0;34m(data)\u001b[0m\n\u001b[1;32m 144\u001b[0m transcriptions \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m audio_file \u001b[38;5;129;01min\u001b[39;00m output_files:\n\u001b[0;32m--> 146\u001b[0m transcription \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m transcribe_audio_segment(\n\u001b[1;32m 147\u001b[0m audio_file, speech_to_text_model\n\u001b[1;32m 148\u001b[0m )\n\u001b[1;32m 149\u001b[0m transcriptions\u001b[38;5;241m.\u001b[39mappend(transcription)\n\u001b[1;32m 151\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m {\n\u001b[1;32m 152\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmetadata\u001b[39m\u001b[38;5;124m\"\u001b[39m: {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maudio_files\u001b[39m\u001b[38;5;124m\"\u001b[39m: output_files},\n\u001b[1;32m 153\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcontent\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(transcriptions),\n\u001b[1;32m 154\u001b[0m }\n",
146
+ "File \u001b[0;32m~/dev/projetos/content-core/src/content_core/processors/audio.py:103\u001b[0m, in \u001b[0;36mtranscribe_audio_segment\u001b[0;34m(audio_file, model)\u001b[0m\n\u001b[1;32m 101\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mtranscribe_audio_segment\u001b[39m(audio_file, model):\n\u001b[1;32m 102\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Transcribe a single audio segment asynchronously\"\"\"\u001b[39;00m\n\u001b[0;32m--> 103\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m (\u001b[38;5;28;01mawait\u001b[39;00m model\u001b[38;5;241m.\u001b[39matranscribe(audio_file))\u001b[38;5;241m.\u001b[39mtext\n",
147
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/esperanto/providers/stt/openai.py:158\u001b[0m, in \u001b[0;36mOpenAISpeechToTextModel.atranscribe\u001b[0;34m(self, audio_file, language, prompt)\u001b[0m\n\u001b[1;32m 156\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mopen\u001b[39m(audio_file, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrb\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;28;01mas\u001b[39;00m f:\n\u001b[1;32m 157\u001b[0m files \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mfile\u001b[39m\u001b[38;5;124m\"\u001b[39m: (audio_file, f, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maudio/mpeg\u001b[39m\u001b[38;5;124m\"\u001b[39m)}\n\u001b[0;32m--> 158\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39masync_client\u001b[38;5;241m.\u001b[39mpost(\n\u001b[1;32m 159\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbase_url\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/audio/transcriptions\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 160\u001b[0m headers\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_headers(),\n\u001b[1;32m 161\u001b[0m files\u001b[38;5;241m=\u001b[39mfiles,\n\u001b[1;32m 162\u001b[0m data\u001b[38;5;241m=\u001b[39mkwargs\n\u001b[1;32m 163\u001b[0m )\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 165\u001b[0m \u001b[38;5;66;03m# For BinaryIO, send the file object directly\u001b[39;00m\n\u001b[1;32m 166\u001b[0m filename \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mgetattr\u001b[39m(audio_file, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mname\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124maudio.mp3\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
148
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1859\u001b[0m, in \u001b[0;36mAsyncClient.post\u001b[0;34m(self, url, content, data, files, json, params, headers, cookies, auth, follow_redirects, timeout, extensions)\u001b[0m\n\u001b[1;32m 1838\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21mpost\u001b[39m(\n\u001b[1;32m 1839\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 1840\u001b[0m url: URL \u001b[38;5;241m|\u001b[39m \u001b[38;5;28mstr\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1852\u001b[0m extensions: RequestExtensions \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 1853\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Response:\n\u001b[1;32m 1854\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1855\u001b[0m \u001b[38;5;124;03m Send a `POST` request.\u001b[39;00m\n\u001b[1;32m 1856\u001b[0m \n\u001b[1;32m 1857\u001b[0m \u001b[38;5;124;03m **Parameters**: See `httpx.request`.\u001b[39;00m\n\u001b[1;32m 1858\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1859\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mrequest(\n\u001b[1;32m 1860\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPOST\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 1861\u001b[0m url,\n\u001b[1;32m 1862\u001b[0m content\u001b[38;5;241m=\u001b[39mcontent,\n\u001b[1;32m 1863\u001b[0m data\u001b[38;5;241m=\u001b[39mdata,\n\u001b[1;32m 1864\u001b[0m files\u001b[38;5;241m=\u001b[39mfiles,\n\u001b[1;32m 1865\u001b[0m json\u001b[38;5;241m=\u001b[39mjson,\n\u001b[1;32m 1866\u001b[0m params\u001b[38;5;241m=\u001b[39mparams,\n\u001b[1;32m 1867\u001b[0m 
headers\u001b[38;5;241m=\u001b[39mheaders,\n\u001b[1;32m 1868\u001b[0m cookies\u001b[38;5;241m=\u001b[39mcookies,\n\u001b[1;32m 1869\u001b[0m auth\u001b[38;5;241m=\u001b[39mauth,\n\u001b[1;32m 1870\u001b[0m follow_redirects\u001b[38;5;241m=\u001b[39mfollow_redirects,\n\u001b[1;32m 1871\u001b[0m timeout\u001b[38;5;241m=\u001b[39mtimeout,\n\u001b[1;32m 1872\u001b[0m extensions\u001b[38;5;241m=\u001b[39mextensions,\n\u001b[1;32m 1873\u001b[0m )\n",
149
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1540\u001b[0m, in \u001b[0;36mAsyncClient.request\u001b[0;34m(self, method, url, content, data, files, json, params, headers, cookies, auth, follow_redirects, timeout, extensions)\u001b[0m\n\u001b[1;32m 1525\u001b[0m warnings\u001b[38;5;241m.\u001b[39mwarn(message, \u001b[38;5;167;01mDeprecationWarning\u001b[39;00m, stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m2\u001b[39m)\n\u001b[1;32m 1527\u001b[0m request \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mbuild_request(\n\u001b[1;32m 1528\u001b[0m method\u001b[38;5;241m=\u001b[39mmethod,\n\u001b[1;32m 1529\u001b[0m url\u001b[38;5;241m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1538\u001b[0m extensions\u001b[38;5;241m=\u001b[39mextensions,\n\u001b[1;32m 1539\u001b[0m )\n\u001b[0;32m-> 1540\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msend(request, auth\u001b[38;5;241m=\u001b[39mauth, follow_redirects\u001b[38;5;241m=\u001b[39mfollow_redirects)\n",
150
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1629\u001b[0m, in \u001b[0;36mAsyncClient.send\u001b[0;34m(self, request, stream, auth, follow_redirects)\u001b[0m\n\u001b[1;32m 1625\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_set_timeout(request)\n\u001b[1;32m 1627\u001b[0m auth \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_build_request_auth(request, auth)\n\u001b[0;32m-> 1629\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_send_handling_auth(\n\u001b[1;32m 1630\u001b[0m request,\n\u001b[1;32m 1631\u001b[0m auth\u001b[38;5;241m=\u001b[39mauth,\n\u001b[1;32m 1632\u001b[0m follow_redirects\u001b[38;5;241m=\u001b[39mfollow_redirects,\n\u001b[1;32m 1633\u001b[0m history\u001b[38;5;241m=\u001b[39m[],\n\u001b[1;32m 1634\u001b[0m )\n\u001b[1;32m 1635\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1636\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m stream:\n",
151
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1657\u001b[0m, in \u001b[0;36mAsyncClient._send_handling_auth\u001b[0;34m(self, request, auth, follow_redirects, history)\u001b[0m\n\u001b[1;32m 1654\u001b[0m request \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m auth_flow\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__anext__\u001b[39m()\n\u001b[1;32m 1656\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m:\n\u001b[0;32m-> 1657\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_send_handling_redirects(\n\u001b[1;32m 1658\u001b[0m request,\n\u001b[1;32m 1659\u001b[0m follow_redirects\u001b[38;5;241m=\u001b[39mfollow_redirects,\n\u001b[1;32m 1660\u001b[0m history\u001b[38;5;241m=\u001b[39mhistory,\n\u001b[1;32m 1661\u001b[0m )\n\u001b[1;32m 1662\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1663\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
152
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1694\u001b[0m, in \u001b[0;36mAsyncClient._send_handling_redirects\u001b[0;34m(self, request, follow_redirects, history)\u001b[0m\n\u001b[1;32m 1691\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m hook \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_event_hooks[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrequest\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n\u001b[1;32m 1692\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m hook(request)\n\u001b[0;32m-> 1694\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_send_single_request(request)\n\u001b[1;32m 1695\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1696\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m hook \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_event_hooks[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mresponse\u001b[39m\u001b[38;5;124m\"\u001b[39m]:\n",
153
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_client.py:1730\u001b[0m, in \u001b[0;36mAsyncClient._send_single_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 1725\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 1726\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mAttempted to send an sync request with an AsyncClient instance.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1727\u001b[0m )\n\u001b[1;32m 1729\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m request_context(request\u001b[38;5;241m=\u001b[39mrequest):\n\u001b[0;32m-> 1730\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m transport\u001b[38;5;241m.\u001b[39mhandle_async_request(request)\n\u001b[1;32m 1732\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(response\u001b[38;5;241m.\u001b[39mstream, AsyncByteStream)\n\u001b[1;32m 1733\u001b[0m response\u001b[38;5;241m.\u001b[39mrequest \u001b[38;5;241m=\u001b[39m request\n",
154
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py:393\u001b[0m, in \u001b[0;36mAsyncHTTPTransport.handle_async_request\u001b[0;34m(self, request)\u001b[0m\n\u001b[1;32m 379\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mhttpcore\u001b[39;00m\n\u001b[1;32m 381\u001b[0m req \u001b[38;5;241m=\u001b[39m httpcore\u001b[38;5;241m.\u001b[39mRequest(\n\u001b[1;32m 382\u001b[0m method\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mmethod,\n\u001b[1;32m 383\u001b[0m url\u001b[38;5;241m=\u001b[39mhttpcore\u001b[38;5;241m.\u001b[39mURL(\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 391\u001b[0m extensions\u001b[38;5;241m=\u001b[39mrequest\u001b[38;5;241m.\u001b[39mextensions,\n\u001b[1;32m 392\u001b[0m )\n\u001b[0;32m--> 393\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[1;32m 394\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_pool\u001b[38;5;241m.\u001b[39mhandle_async_request(req)\n\u001b[1;32m 396\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(resp\u001b[38;5;241m.\u001b[39mstream, typing\u001b[38;5;241m.\u001b[39mAsyncIterable)\n",
155
+ "File \u001b[0;32m~/.local/share/uv/python/cpython-3.10.6-macos-aarch64-none/lib/python3.10/contextlib.py:153\u001b[0m, in \u001b[0;36m_GeneratorContextManager.__exit__\u001b[0;34m(self, typ, value, traceback)\u001b[0m\n\u001b[1;32m 151\u001b[0m value \u001b[38;5;241m=\u001b[39m typ()\n\u001b[1;32m 152\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 153\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgen\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mthrow\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtyp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtraceback\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 154\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m exc:\n\u001b[1;32m 155\u001b[0m \u001b[38;5;66;03m# Suppress StopIteration *unless* it's the same exception that\u001b[39;00m\n\u001b[1;32m 156\u001b[0m \u001b[38;5;66;03m# was passed to throw(). This prevents a StopIteration\u001b[39;00m\n\u001b[1;32m 157\u001b[0m \u001b[38;5;66;03m# raised inside the \"with\" statement from being suppressed.\u001b[39;00m\n\u001b[1;32m 158\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m exc \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m value\n",
156
+ "File \u001b[0;32m~/dev/projetos/content-core/.venv/lib/python3.10/site-packages/httpx/_transports/default.py:118\u001b[0m, in \u001b[0;36mmap_httpcore_exceptions\u001b[0;34m()\u001b[0m\n\u001b[1;32m 115\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[1;32m 117\u001b[0m message \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mstr\u001b[39m(exc)\n\u001b[0;32m--> 118\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m mapped_exc(message) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mexc\u001b[39;00m\n",
157
+ "\u001b[0;31mReadTimeout\u001b[0m: "
158
+ ]
159
+ }
160
+ ],
161
+ "source": [
162
+ "from content_core.common import ProcessSourceInput\n",
163
+ "from content_core import extract_content\n",
164
+ "\n",
165
+ "lesson_source = ProcessSourceInput(\n",
166
+ " file_path=\"/Users/luisnovo/dev/projetos/snl/adaptive-learning-novo/data/input/Oportunidade ao Alcance de Todos/OAT - Aula 2 v2 - Levi Rezende.mp4\",\n",
167
+ ")\n",
168
+ " \n",
169
+ "result = await extract_content(lesson_source)"
170
+ ]
171
+ }
172
+ ],
173
+ "metadata": {
174
+ "kernelspec": {
175
+ "display_name": ".venv",
176
+ "language": "python",
177
+ "name": "python3"
178
+ },
179
+ "language_info": {
180
+ "codemirror_mode": {
181
+ "name": "ipython",
182
+ "version": 3
183
+ },
184
+ "file_extension": ".py",
185
+ "mimetype": "text/x-python",
186
+ "name": "python",
187
+ "nbconvert_exporter": "python",
188
+ "pygments_lexer": "ipython3",
189
+ "version": "3.10.6"
190
+ }
191
+ },
192
+ "nbformat": 4,
193
+ "nbformat_minor": 5
194
+ }
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "1.4.2"
3
+ version = "1.6.0"
4
4
  description = "Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server"
5
5
  readme = "README.md"
6
6
  homepage = "https://github.com/lfnovo/content-core"
@@ -70,3 +70,4 @@ dev = [
70
70
  pythonpath = ["src"]
71
71
  asyncio_mode = "auto"
72
72
  asyncio_default_fixture_loop_scope = "function"
73
+ asyncio_default_test_loop_scope = "function"
@@ -214,5 +214,3 @@ def csum():
214
214
 
215
215
  if __name__ == "__main__":
216
216
  ccore()
217
- if __name__ == "__main__":
218
- ccore()
@@ -32,6 +32,8 @@ summary_model:
32
32
  extraction:
33
33
  document_engine: auto # auto | simple | docling - for files/documents
34
34
  url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
35
+ audio:
36
+ concurrency: 3 # Number of concurrent audio transcriptions (1-10)
35
37
  docling:
36
38
  output_format: markdown # markdown | html | json
37
39
  pymupdf: