content-core 1.4.2__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/__init__.py +0 -2
- content_core/cc_config.yaml +2 -0
- content_core/config.py +71 -0
- content_core/content/identification/file_detector.py +103 -13
- content_core/content/summary/core.py +1 -1
- content_core/notebooks/run.ipynb +0 -2
- content_core/processors/audio.py +114 -47
- content_core/processors/url.py +3 -3
- content_core/templated_message.py +2 -2
- {content_core-1.4.2.dist-info → content_core-1.6.0.dist-info}/METADATA +15 -1
- {content_core-1.4.2.dist-info → content_core-1.6.0.dist-info}/RECORD +14 -14
- {content_core-1.4.2.dist-info → content_core-1.6.0.dist-info}/WHEEL +0 -0
- {content_core-1.4.2.dist-info → content_core-1.6.0.dist-info}/entry_points.txt +0 -0
- {content_core-1.4.2.dist-info → content_core-1.6.0.dist-info}/licenses/LICENSE +0 -0
content_core/__init__.py
CHANGED
content_core/cc_config.yaml
CHANGED
|
@@ -32,6 +32,8 @@ summary_model:
|
|
|
32
32
|
extraction:
|
|
33
33
|
document_engine: auto # auto | simple | docling - for files/documents
|
|
34
34
|
url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
|
|
35
|
+
audio:
|
|
36
|
+
concurrency: 3 # Number of concurrent audio transcriptions (1-10)
|
|
35
37
|
docling:
|
|
36
38
|
output_format: markdown # markdown | html | json
|
|
37
39
|
pymupdf:
|
content_core/config.py
CHANGED
|
@@ -70,6 +70,61 @@ def get_url_engine():
|
|
|
70
70
|
return env_engine
|
|
71
71
|
return CONFIG.get("extraction", {}).get("url_engine", "auto")
|
|
72
72
|
|
|
73
|
+
def get_audio_concurrency():
|
|
74
|
+
"""
|
|
75
|
+
Get audio concurrency with environment variable override and validation.
|
|
76
|
+
|
|
77
|
+
Returns the configured number of concurrent audio transcriptions, with automatic
|
|
78
|
+
validation and fallback to safe defaults.
|
|
79
|
+
|
|
80
|
+
Configuration priority (highest to lowest):
|
|
81
|
+
1. CCORE_AUDIO_CONCURRENCY environment variable
|
|
82
|
+
2. extraction.audio.concurrency in YAML config
|
|
83
|
+
3. Default value: 3
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
int: Number of concurrent transcriptions (1-10)
|
|
87
|
+
|
|
88
|
+
Validation:
|
|
89
|
+
- Values must be integers between 1 and 10 (inclusive)
|
|
90
|
+
- Invalid environment values (out of range, non-integer, etc.) fall back to the YAML config value (3 if not set there)
|
|
91
|
+
- A warning is logged when invalid values are detected
|
|
92
|
+
|
|
93
|
+
Examples:
|
|
94
|
+
>>> import os
|
|
95
|
+
>>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "5"
|
|
96
|
+
>>> get_audio_concurrency()
|
|
97
|
+
5
|
|
98
|
+
|
|
99
|
+
>>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "20" # Too high
|
|
100
|
+
>>> get_audio_concurrency() # Falls back to the config value (3 unless overridden in YAML)
|
|
101
|
+
3
|
|
102
|
+
"""
|
|
103
|
+
env_concurrency = os.environ.get("CCORE_AUDIO_CONCURRENCY")
|
|
104
|
+
if env_concurrency:
|
|
105
|
+
try:
|
|
106
|
+
concurrency = int(env_concurrency)
|
|
107
|
+
if concurrency < 1 or concurrency > 10:
|
|
108
|
+
# Import logger here to avoid circular imports
|
|
109
|
+
from content_core.logging import logger
|
|
110
|
+
logger.warning(
|
|
111
|
+
f"Invalid CCORE_AUDIO_CONCURRENCY: '{env_concurrency}'. "
|
|
112
|
+
f"Must be between 1 and 10. "
|
|
113
|
+
f"Using default from config."
|
|
114
|
+
)
|
|
115
|
+
return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
|
|
116
|
+
return concurrency
|
|
117
|
+
except ValueError:
|
|
118
|
+
# Import logger here to avoid circular imports
|
|
119
|
+
from content_core.logging import logger
|
|
120
|
+
logger.warning(
|
|
121
|
+
f"Invalid CCORE_AUDIO_CONCURRENCY: '{env_concurrency}'. "
|
|
122
|
+
f"Must be a valid integer. "
|
|
123
|
+
f"Using default from config."
|
|
124
|
+
)
|
|
125
|
+
return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
|
|
126
|
+
return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
|
|
127
|
+
|
|
73
128
|
# Programmatic config overrides: use in notebooks or scripts
|
|
74
129
|
def set_document_engine(engine: str):
|
|
75
130
|
"""Override the document extraction engine ('auto', 'simple', or 'docling')."""
|
|
@@ -102,3 +157,19 @@ def set_pymupdf_ocr_fallback(enabled: bool):
|
|
|
102
157
|
extraction = CONFIG.setdefault("extraction", {})
|
|
103
158
|
pymupdf_cfg = extraction.setdefault("pymupdf", {})
|
|
104
159
|
pymupdf_cfg["ocr_fallback"] = enabled
|
|
160
|
+
|
|
161
|
+
def set_audio_concurrency(concurrency: int):
|
|
162
|
+
"""
|
|
163
|
+
Override the audio concurrency setting (1-10).
|
|
164
|
+
|
|
165
|
+
Args:
|
|
166
|
+
concurrency (int): Number of concurrent audio transcriptions (1-10)
|
|
167
|
+
|
|
168
|
+
Raises:
|
|
169
|
+
ValueError: If concurrency is not an integer between 1 and 10 (inclusive)
|
|
170
|
+
"""
|
|
171
|
+
if not isinstance(concurrency, int) or concurrency < 1 or concurrency > 10:
|
|
172
|
+
raise ValueError(f"Audio concurrency must be an integer between 1 and 10, got: {concurrency}")
|
|
173
|
+
extraction = CONFIG.setdefault("extraction", {})
|
|
174
|
+
audio_cfg = extraction.setdefault("audio", {})
|
|
175
|
+
audio_cfg["concurrency"] = concurrency
|
|
@@ -3,10 +3,9 @@ Pure Python file type detection using magic bytes and content analysis.
|
|
|
3
3
|
Replaces libmagic dependency with a lightweight implementation.
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
import os
|
|
7
6
|
import zipfile
|
|
8
7
|
from pathlib import Path
|
|
9
|
-
from typing import Dict, Optional
|
|
8
|
+
from typing import Dict, Optional
|
|
10
9
|
|
|
11
10
|
from content_core.common.exceptions import UnsupportedTypeException
|
|
12
11
|
from content_core.logging import logger
|
|
@@ -14,10 +13,17 @@ from content_core.logging import logger
|
|
|
14
13
|
|
|
15
14
|
class FileDetector:
|
|
16
15
|
"""Pure Python file type detection using magic bytes and content analysis."""
|
|
17
|
-
|
|
18
|
-
# Configuration constants
|
|
16
|
+
|
|
17
|
+
# Configuration constants for binary/text detection
|
|
19
18
|
SIGNATURE_READ_SIZE = 512 # Bytes to read for binary signature detection
|
|
20
19
|
TEXT_READ_SIZE = 1024 # Bytes to read for text content analysis
|
|
20
|
+
|
|
21
|
+
# Configuration constants for CSV detection
|
|
22
|
+
CSV_MAX_FIELD_LENGTH = 100 # Maximum average field length for CSV (longer suggests prose)
|
|
23
|
+
CSV_MAX_VARIANCE = 500 # Maximum variance in field lengths (higher suggests natural text)
|
|
24
|
+
CSV_MIN_SCORE = 2 # Minimum score required to classify as CSV
|
|
25
|
+
CSV_MIN_FIELDS = 2 # Minimum number of fields required for CSV
|
|
26
|
+
CSV_MAX_HEADER_FIELD_LENGTH = 50 # Maximum length for individual header fields
|
|
21
27
|
|
|
22
28
|
def __init__(self):
|
|
23
29
|
"""Initialize the FileDetector with signature mappings."""
|
|
@@ -365,18 +371,102 @@ class FileDetector:
|
|
|
365
371
|
|
|
366
372
|
|
|
367
373
|
def _looks_like_csv(self, content: str) -> bool:
|
|
368
|
-
"""
|
|
369
|
-
|
|
370
|
-
|
|
374
|
+
"""
|
|
375
|
+
Check if content looks like CSV format with improved heuristics.
|
|
376
|
+
|
|
377
|
+
Uses a multi-stage approach with performance optimization:
|
|
378
|
+
1. Basic structural checks (cheap)
|
|
379
|
+
2. Field length analysis (cheap, early exit)
|
|
380
|
+
3. Pattern matching (moderate cost)
|
|
381
|
+
4. Variance analysis (expensive, only if needed)
|
|
382
|
+
"""
|
|
383
|
+
lines = content.split('\n', 10)[:10] # Check first 10 lines for better accuracy
|
|
384
|
+
non_empty_lines = [line for line in lines if line.strip()]
|
|
385
|
+
|
|
386
|
+
# Stage 1: Basic structural checks
|
|
387
|
+
if len(non_empty_lines) < 2:
|
|
371
388
|
return False
|
|
372
|
-
|
|
389
|
+
|
|
373
390
|
# Count commas in each line
|
|
374
|
-
comma_counts = [line.count(',') for line in
|
|
375
|
-
|
|
391
|
+
comma_counts = [line.count(',') for line in non_empty_lines]
|
|
392
|
+
|
|
393
|
+
# Must have at least one comma per line
|
|
394
|
+
if not all(count > 0 for count in comma_counts):
|
|
376
395
|
return False
|
|
377
|
-
|
|
378
|
-
# CSV should have consistent comma counts
|
|
379
|
-
|
|
396
|
+
|
|
397
|
+
# CSV should have consistent comma counts across lines
|
|
398
|
+
if len(set(comma_counts)) != 1:
|
|
399
|
+
return False
|
|
400
|
+
|
|
401
|
+
num_fields = comma_counts[0] + 1 # Number of fields = commas + 1
|
|
402
|
+
|
|
403
|
+
# Must have minimum number of fields to be CSV
|
|
404
|
+
if num_fields < self.CSV_MIN_FIELDS:
|
|
405
|
+
return False
|
|
406
|
+
|
|
407
|
+
# Stage 2: Field length analysis (PERFORMANCE OPTIMIZATION: early exit)
|
|
408
|
+
first_line = non_empty_lines[0]
|
|
409
|
+
fields = first_line.split(',')
|
|
410
|
+
|
|
411
|
+
# CSV fields should be relatively short (not long sentences)
|
|
412
|
+
# Average field length should be reasonable (not paragraphs)
|
|
413
|
+
# Early exit avoids expensive variance calculations for obvious prose
|
|
414
|
+
avg_field_length = sum(len(f.strip()) for f in fields) / len(fields)
|
|
415
|
+
if avg_field_length > self.CSV_MAX_FIELD_LENGTH:
|
|
416
|
+
return False # Too long to be typical CSV fields - exit early
|
|
417
|
+
|
|
418
|
+
# Stage 3: Pattern matching
|
|
419
|
+
# Check for CSV-like patterns:
|
|
420
|
+
# 1. Fields that look like headers (short, no sentence-ending punctuation)
|
|
421
|
+
# 2. Quoted fields (common in CSV)
|
|
422
|
+
# 3. Three or more fields per line
|
|
423
|
+
has_quoted_fields = any('"' in line or "'" in line for line in non_empty_lines[:3])
|
|
424
|
+
|
|
425
|
+
first_line_fields = [f.strip() for f in fields]
|
|
426
|
+
# Check if first line looks like a header (short, no sentence-ending punctuation)
|
|
427
|
+
looks_like_header = all(
|
|
428
|
+
len(f) < self.CSV_MAX_HEADER_FIELD_LENGTH and not f.endswith('.') and not f.endswith('!')
|
|
429
|
+
for f in first_line_fields
|
|
430
|
+
)
|
|
431
|
+
|
|
432
|
+
# Stage 4: Variance analysis (EXPENSIVE - only if we have enough data)
|
|
433
|
+
# Check if subsequent lines have similar field structure
|
|
434
|
+
# Real CSV tends to have consistent field lengths
|
|
435
|
+
if len(non_empty_lines) >= 3:
|
|
436
|
+
field_lengths_per_line = []
|
|
437
|
+
for line in non_empty_lines[:5]:
|
|
438
|
+
line_fields = line.split(',')
|
|
439
|
+
field_lengths = [len(f.strip()) for f in line_fields]
|
|
440
|
+
field_lengths_per_line.append(field_lengths)
|
|
441
|
+
|
|
442
|
+
# Calculate variance in field positions
|
|
443
|
+
# CSV data should have relatively consistent field lengths at each position
|
|
444
|
+
# Natural text with commas will have much more variance
|
|
445
|
+
position_variances = []
|
|
446
|
+
for i in range(num_fields):
|
|
447
|
+
lengths_at_position = [fl[i] if i < len(fl) else 0 for fl in field_lengths_per_line]
|
|
448
|
+
if lengths_at_position:
|
|
449
|
+
avg = sum(lengths_at_position) / len(lengths_at_position)
|
|
450
|
+
variance = sum((x - avg) ** 2 for x in lengths_at_position) / len(lengths_at_position)
|
|
451
|
+
position_variances.append(variance)
|
|
452
|
+
|
|
453
|
+
# High variance suggests natural text, not structured CSV
|
|
454
|
+
if position_variances:
|
|
455
|
+
avg_variance = sum(position_variances) / len(position_variances)
|
|
456
|
+
if avg_variance > self.CSV_MAX_VARIANCE:
|
|
457
|
+
return False # Very high variance = likely prose
|
|
458
|
+
|
|
459
|
+
# Scoring: Require at least some CSV-like characteristics
|
|
460
|
+
csv_score = 0
|
|
461
|
+
if looks_like_header:
|
|
462
|
+
csv_score += 1
|
|
463
|
+
if has_quoted_fields:
|
|
464
|
+
csv_score += 1
|
|
465
|
+
if num_fields >= 3: # Multiple fields is more CSV-like
|
|
466
|
+
csv_score += 1
|
|
467
|
+
|
|
468
|
+
# Need minimum score to confidently classify as CSV
|
|
469
|
+
return csv_score >= self.CSV_MIN_SCORE
|
|
380
470
|
|
|
381
471
|
|
|
382
472
|
def _is_text_file(self, content: str) -> bool:
|
|
@@ -8,7 +8,7 @@ async def summarize(content: str, context: str) -> str:
|
|
|
8
8
|
templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
|
|
9
9
|
response = await templated_message_fn(
|
|
10
10
|
TemplatedMessageInput(
|
|
11
|
-
user_prompt_template="
|
|
11
|
+
user_prompt_template="content/summarize",
|
|
12
12
|
data={"content": content, "context": context},
|
|
13
13
|
)
|
|
14
14
|
)
|
content_core/notebooks/run.ipynb
CHANGED
|
@@ -63,8 +63,6 @@
|
|
|
63
63
|
"source": [
|
|
64
64
|
"from content_core.content.extraction import extract_content\n",
|
|
65
65
|
"\n",
|
|
66
|
-
"from content_core.content.cleanup import cleanup_content\n",
|
|
67
|
-
"from content_core.content.summary import summarize\n",
|
|
68
66
|
"\n",
|
|
69
67
|
"\n",
|
|
70
68
|
"yt = await extract_content(dict(url=\"https://www.youtube.com/watch?v=lLprprtHfts\"))\n",
|
content_core/processors/audio.py
CHANGED
|
@@ -8,11 +8,9 @@ from functools import partial
|
|
|
8
8
|
from moviepy import AudioFileClip
|
|
9
9
|
|
|
10
10
|
from content_core.common import ProcessSourceState
|
|
11
|
+
from content_core.config import get_audio_concurrency
|
|
11
12
|
from content_core.logging import logger
|
|
12
13
|
|
|
13
|
-
# todo: remove reference to model_manager
|
|
14
|
-
# future: parallelize the transcription process
|
|
15
|
-
|
|
16
14
|
|
|
17
15
|
async def split_audio(input_file, segment_length_minutes=15, output_prefix=None):
|
|
18
16
|
"""
|
|
@@ -47,7 +45,7 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
|
|
|
47
45
|
end_time = min((i + 1) * segment_length_s, audio.duration)
|
|
48
46
|
|
|
49
47
|
# Extract segment
|
|
50
|
-
output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
|
|
48
|
+
output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
|
|
51
49
|
output_path = os.path.join(output_dir, output_filename)
|
|
52
50
|
|
|
53
51
|
# Export segment
|
|
@@ -55,7 +53,9 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
|
|
|
55
53
|
|
|
56
54
|
output_files.append(output_path)
|
|
57
55
|
|
|
58
|
-
logger.debug(
|
|
56
|
+
logger.debug(
|
|
57
|
+
f"Exported segment {i + 1}/{total_segments}: {output_filename}"
|
|
58
|
+
)
|
|
59
59
|
|
|
60
60
|
return output_files
|
|
61
61
|
|
|
@@ -98,61 +98,128 @@ def extract_audio(
|
|
|
98
98
|
raise
|
|
99
99
|
|
|
100
100
|
|
|
101
|
-
async def transcribe_audio_segment(audio_file, model):
|
|
102
|
-
"""
|
|
103
|
-
|
|
101
|
+
async def transcribe_audio_segment(audio_file, model, semaphore):
|
|
102
|
+
"""
|
|
103
|
+
Transcribe a single audio segment asynchronously with concurrency control.
|
|
104
104
|
|
|
105
|
+
This function uses a semaphore to limit the number of concurrent transcriptions,
|
|
106
|
+
helping to avoid API rate-limit errors while still allowing parallel processing for improved performance.
|
|
105
107
|
|
|
106
|
-
|
|
107
|
-
|
|
108
|
+
Args:
|
|
109
|
+
audio_file (str): Path to the audio file segment to transcribe
|
|
110
|
+
model: Speech-to-text model instance with atranscribe() method
|
|
111
|
+
semaphore (asyncio.Semaphore): Semaphore to control concurrency
|
|
108
112
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
temp_dir = tempfile.mkdtemp()
|
|
112
|
-
output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
|
|
113
|
-
output_dir = temp_dir
|
|
114
|
-
os.makedirs(output_dir, exist_ok=True)
|
|
113
|
+
Returns:
|
|
114
|
+
str: Transcribed text from the audio segment
|
|
115
115
|
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
116
|
+
Note:
|
|
117
|
+
Multiple instances of this function can run concurrently, but the semaphore
|
|
118
|
+
ensures that no more than N transcriptions happen simultaneously, where N
|
|
119
|
+
is configured via get_audio_concurrency() (default: 3, range: 1-10).
|
|
120
|
+
"""
|
|
121
|
+
async with semaphore:
|
|
122
|
+
return (await model.atranscribe(audio_file)).text
|
|
121
123
|
|
|
122
|
-
if duration_s > segment_length_s:
|
|
123
|
-
logger.info(
|
|
124
|
-
f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments"
|
|
125
|
-
)
|
|
126
|
-
for i in range(math.ceil(duration_s / segment_length_s)):
|
|
127
|
-
start_time = i * segment_length_s
|
|
128
|
-
end_time = min((i + 1) * segment_length_s, audio.duration)
|
|
129
124
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
125
|
+
async def extract_audio_data(data: ProcessSourceState):
|
|
126
|
+
"""
|
|
127
|
+
Extract and transcribe audio from a file with automatic segmentation and parallel processing.
|
|
128
|
+
|
|
129
|
+
This function handles the complete audio processing pipeline:
|
|
130
|
+
1. Splits long audio files (>10 minutes) into segments
|
|
131
|
+
2. Transcribes segments in parallel using configurable concurrency
|
|
132
|
+
3. Joins transcriptions in correct order
|
|
133
133
|
|
|
134
|
-
|
|
134
|
+
For files longer than 10 minutes, segments are processed concurrently with a
|
|
135
|
+
configurable concurrency limit to balance performance and API rate limits.
|
|
135
136
|
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
137
|
+
Args:
|
|
138
|
+
data (ProcessSourceState): State object containing file_path to audio/video file
|
|
139
|
+
|
|
140
|
+
Returns:
|
|
141
|
+
dict: Dictionary containing:
|
|
142
|
+
- metadata: Information about processed segments count
|
|
143
|
+
- content: Complete transcribed text
|
|
139
144
|
|
|
140
|
-
|
|
141
|
-
|
|
145
|
+
Configuration:
|
|
146
|
+
Concurrency is controlled via:
|
|
147
|
+
- Environment variable: CCORE_AUDIO_CONCURRENCY (1-10, default: 3)
|
|
148
|
+
- YAML config: extraction.audio.concurrency
|
|
142
149
|
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
150
|
+
Raises:
|
|
151
|
+
Exception: If audio extraction or transcription fails
|
|
152
|
+
"""
|
|
153
|
+
input_audio_path = data.file_path
|
|
154
|
+
audio = None
|
|
155
|
+
|
|
156
|
+
try:
|
|
157
|
+
# Use TemporaryDirectory context manager for automatic cleanup
|
|
158
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
159
|
+
output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
|
|
160
|
+
output_dir = temp_dir
|
|
161
|
+
|
|
162
|
+
# Split audio into segments if longer than 10 minutes
|
|
163
|
+
audio = AudioFileClip(input_audio_path)
|
|
164
|
+
duration_s = audio.duration
|
|
165
|
+
segment_length_s = 10 * 60 # 10 minutes in seconds
|
|
166
|
+
output_files = []
|
|
167
|
+
|
|
168
|
+
if duration_s > segment_length_s:
|
|
169
|
+
logger.info(
|
|
170
|
+
f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments"
|
|
171
|
+
)
|
|
172
|
+
for i in range(math.ceil(duration_s / segment_length_s)):
|
|
173
|
+
start_time = i * segment_length_s
|
|
174
|
+
end_time = min((i + 1) * segment_length_s, audio.duration)
|
|
175
|
+
|
|
176
|
+
# Extract segment
|
|
177
|
+
output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
|
|
178
|
+
output_path = os.path.join(output_dir, output_filename)
|
|
179
|
+
|
|
180
|
+
extract_audio(input_audio_path, output_path, start_time, end_time)
|
|
181
|
+
|
|
182
|
+
output_files.append(output_path)
|
|
183
|
+
else:
|
|
184
|
+
output_files = [input_audio_path]
|
|
185
|
+
|
|
186
|
+
# Close audio clip after determining segments
|
|
187
|
+
if audio:
|
|
188
|
+
audio.close()
|
|
189
|
+
audio = None
|
|
190
|
+
|
|
191
|
+
# Transcribe audio files in parallel with concurrency limit
|
|
192
|
+
from content_core.models import ModelFactory
|
|
193
|
+
|
|
194
|
+
speech_to_text_model = ModelFactory.get_model("speech_to_text")
|
|
195
|
+
concurrency = get_audio_concurrency()
|
|
196
|
+
semaphore = asyncio.Semaphore(concurrency)
|
|
197
|
+
|
|
198
|
+
logger.debug(
|
|
199
|
+
f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}"
|
|
148
200
|
)
|
|
149
|
-
transcriptions.append(transcription)
|
|
150
201
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
202
|
+
# Create tasks for parallel transcription
|
|
203
|
+
transcription_tasks = [
|
|
204
|
+
transcribe_audio_segment(audio_file, speech_to_text_model, semaphore)
|
|
205
|
+
for audio_file in output_files
|
|
206
|
+
]
|
|
207
|
+
|
|
208
|
+
# Execute all transcriptions concurrently (limited by semaphore)
|
|
209
|
+
transcriptions = await asyncio.gather(*transcription_tasks)
|
|
210
|
+
|
|
211
|
+
return {
|
|
212
|
+
"metadata": {"segments_count": len(output_files)},
|
|
213
|
+
"content": " ".join(transcriptions),
|
|
214
|
+
}
|
|
155
215
|
except Exception as e:
|
|
156
216
|
logger.error(f"Error processing audio: {str(e)}")
|
|
157
217
|
logger.error(traceback.format_exc())
|
|
158
218
|
raise
|
|
219
|
+
finally:
|
|
220
|
+
# Ensure audio clip is closed even if an error occurs
|
|
221
|
+
if audio:
|
|
222
|
+
try:
|
|
223
|
+
audio.close()
|
|
224
|
+
except Exception:
|
|
225
|
+
pass
|
content_core/processors/url.py
CHANGED
|
@@ -147,10 +147,10 @@ async def extract_url_firecrawl(url: str):
|
|
|
147
147
|
from firecrawl import AsyncFirecrawlApp
|
|
148
148
|
|
|
149
149
|
app = AsyncFirecrawlApp(api_key=os.environ.get("FIRECRAWL_API_KEY"))
|
|
150
|
-
scrape_result = await app.
|
|
150
|
+
scrape_result = await app.scrape(url, formats=["markdown", "html"])
|
|
151
151
|
return {
|
|
152
|
-
"title": scrape_result.metadata
|
|
153
|
-
"content": scrape_result.markdown,
|
|
152
|
+
"title": scrape_result.metadata.title or "",
|
|
153
|
+
"content": scrape_result.markdown or "",
|
|
154
154
|
}
|
|
155
155
|
|
|
156
156
|
except Exception as e:
|
|
@@ -36,14 +36,14 @@ async def templated_message(
|
|
|
36
36
|
prompt_template=input.system_prompt_template,
|
|
37
37
|
template_text=input.system_prompt_text,
|
|
38
38
|
).render(data=input.data)
|
|
39
|
-
msgs.append(
|
|
39
|
+
msgs.append({"role": "system", "content": system_prompt})
|
|
40
40
|
|
|
41
41
|
if input.user_prompt_template or input.user_prompt_text:
|
|
42
42
|
user_prompt = Prompter(
|
|
43
43
|
prompt_template=input.user_prompt_template,
|
|
44
44
|
template_text=input.user_prompt_text,
|
|
45
45
|
).render(data=input.data)
|
|
46
|
-
msgs.append(
|
|
46
|
+
msgs.append({"role": "user", "content": user_prompt})
|
|
47
47
|
|
|
48
48
|
result = await model.achat_complete(msgs)
|
|
49
49
|
return result.content
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.6.0
|
|
4
4
|
Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -548,6 +548,9 @@ GOOGLE_API_KEY=your-key-here
|
|
|
548
548
|
# Engine Selection (optional)
|
|
549
549
|
CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
|
|
550
550
|
CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
|
|
551
|
+
|
|
552
|
+
# Audio Processing (optional)
|
|
553
|
+
CCORE_AUDIO_CONCURRENCY=3 # Number of concurrent audio transcriptions (1-10, default: 3)
|
|
551
554
|
```
|
|
552
555
|
|
|
553
556
|
### Engine Selection via Environment Variables
|
|
@@ -556,9 +559,20 @@ For deployment scenarios like MCP servers or Raycast extensions, you can overrid
|
|
|
556
559
|
|
|
557
560
|
- **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
|
|
558
561
|
- **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
|
|
562
|
+
- **`CCORE_AUDIO_CONCURRENCY`**: Number of concurrent audio transcriptions (1-10, default: 3)
|
|
559
563
|
|
|
560
564
|
These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
|
|
561
565
|
|
|
566
|
+
### Audio Processing Configuration
|
|
567
|
+
|
|
568
|
+
Content Core processes long audio files by splitting them into segments and transcribing them in parallel for improved performance. You can control the concurrency level to balance speed with API rate limits:
|
|
569
|
+
|
|
570
|
+
- **Default**: 3 concurrent transcriptions
|
|
571
|
+
- **Range**: 1-10 concurrent transcriptions
|
|
572
|
+
- **Configuration**: Set via `CCORE_AUDIO_CONCURRENCY` environment variable or `extraction.audio.concurrency` in `cc_config.yaml`
|
|
573
|
+
|
|
574
|
+
Higher concurrency values can speed up processing of long audio/video files but may hit API rate limits. Lower values are more conservative and suitable for accounts with lower API quotas.
|
|
575
|
+
|
|
562
576
|
### Custom Prompt Templates
|
|
563
577
|
|
|
564
578
|
Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
content_core/__init__.py,sha256=
|
|
2
|
-
content_core/cc_config.yaml,sha256=
|
|
3
|
-
content_core/config.py,sha256=
|
|
1
|
+
content_core/__init__.py,sha256=c1qw2HyV8DqBnoB9fbXNIZtda60MrxGUvWG5EQFtr-8,6492
|
|
2
|
+
content_core/cc_config.yaml,sha256=3Ot5u-YSBx2k3JXWnCP7s7OVBbGpGebBy_CWj3we-u4,1211
|
|
3
|
+
content_core/config.py,sha256=Mao6AZZoiSiX7uZwOGgk759LlV0j6NdfYGgWgX6vhAs,7112
|
|
4
4
|
content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
|
|
5
5
|
content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
|
|
6
6
|
content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
|
|
7
7
|
content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
|
|
8
|
-
content_core/templated_message.py,sha256=
|
|
8
|
+
content_core/templated_message.py,sha256=jsjGqD-zf__pV4P0eo9cffTK2C90-VggL64qNYejFo0,1615
|
|
9
9
|
content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
|
|
10
10
|
content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
|
|
11
11
|
content_core/common/state.py,sha256=K5jsDg4l2GSaoGyFYzdd1GW14vLaAxdxes8vUrPNVkE,1622
|
|
@@ -17,27 +17,27 @@ content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnR
|
|
|
17
17
|
content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
|
|
18
18
|
content_core/content/extraction/graph.py,sha256=AFi9B_hTuxqdgvogCOk4Xdqoboug7_KXtV0ZHlb8igM,8139
|
|
19
19
|
content_core/content/identification/__init__.py,sha256=DDoCi1r-6Z_pGPPi3X1ZwyRrcRtg-rAiCTK50hnO5Y0,235
|
|
20
|
-
content_core/content/identification/file_detector.py,sha256=
|
|
20
|
+
content_core/content/identification/file_detector.py,sha256=GBP1cCNc1qnt5_HkE5ALQDja2f8WgqCC5h1r8xF0P7A,21480
|
|
21
21
|
content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
|
|
22
|
-
content_core/content/summary/core.py,sha256=
|
|
22
|
+
content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
|
|
23
23
|
content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
|
|
24
24
|
content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
|
|
25
|
-
content_core/notebooks/run.ipynb,sha256=
|
|
25
|
+
content_core/notebooks/run.ipynb,sha256=8gbFln9WLrli_qWJB8SKQKcSNbAv25DvN5Cu4EAAeBQ,370952
|
|
26
26
|
content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
|
|
27
|
-
content_core/processors/audio.py,sha256=
|
|
27
|
+
content_core/processors/audio.py,sha256=CYwoTDPsVUDALHuz_EHcnjVfsKF8XjQmvmX8c-OmMNU,8462
|
|
28
28
|
content_core/processors/docling.py,sha256=lf_NHh255gn4d2EymJYqyH2QiAgQDiJCY3t6Ne7R9rU,2507
|
|
29
29
|
content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
|
|
30
30
|
content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
|
|
31
31
|
content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
|
|
32
|
-
content_core/processors/url.py,sha256=
|
|
32
|
+
content_core/processors/url.py,sha256=RhBOyqfSWFaf8Dhpxlo9xbsF5yuP5FhXfhbvbi4CQPc,7514
|
|
33
33
|
content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
|
|
34
34
|
content_core/processors/youtube.py,sha256=_qvxI9qTdxu3l1fKLuJARFt8KtZVFJ3JJBLkq1hAAXo,7868
|
|
35
35
|
content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
|
|
36
36
|
content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
|
|
37
37
|
content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
|
|
38
38
|
content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
|
|
39
|
-
content_core-1.
|
|
40
|
-
content_core-1.
|
|
41
|
-
content_core-1.
|
|
42
|
-
content_core-1.
|
|
43
|
-
content_core-1.
|
|
39
|
+
content_core-1.6.0.dist-info/METADATA,sha256=bBxEINm9h2ppJIia11flDRDH7UshzamVrHKHGxHrmjs,21963
|
|
40
|
+
content_core-1.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
41
|
+
content_core-1.6.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
|
|
42
|
+
content_core-1.6.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
|
|
43
|
+
content_core-1.6.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|