content-core 1.4.2__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

content_core/__init__.py CHANGED
@@ -214,5 +214,3 @@ def csum():
214
214
 
215
215
  if __name__ == "__main__":
216
216
  ccore()
217
- if __name__ == "__main__":
218
- ccore()
@@ -32,6 +32,8 @@ summary_model:
32
32
  extraction:
33
33
  document_engine: auto # auto | simple | docling - for files/documents
34
34
  url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
35
+ audio:
36
+ concurrency: 3 # Number of concurrent audio transcriptions (1-10)
35
37
  docling:
36
38
  output_format: markdown # markdown | html | json
37
39
  pymupdf:
content_core/config.py CHANGED
@@ -70,6 +70,61 @@ def get_url_engine():
70
70
  return env_engine
71
71
  return CONFIG.get("extraction", {}).get("url_engine", "auto")
72
72
 
73
+ def get_audio_concurrency():
74
+ """
75
+ Get audio concurrency with environment variable override and validation.
76
+
77
+ Returns the configured number of concurrent audio transcriptions, with automatic
78
+ validation and fallback to safe defaults.
79
+
80
+ Configuration priority (highest to lowest):
81
+ 1. CCORE_AUDIO_CONCURRENCY environment variable
82
+ 2. extraction.audio.concurrency in YAML config
83
+ 3. Default value: 3
84
+
85
+ Returns:
86
+ int: Number of concurrent transcriptions (1-10)
87
+
88
+ Validation:
89
+ - Values must be integers between 1 and 10 (inclusive)
90
+ - Invalid values (out of range, non-integer, etc.) automatically fall back to the YAML config value (or 3 if unset)
91
+ - A warning is logged when invalid values are detected
92
+
93
+ Examples:
94
+ >>> import os
95
+ >>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "5"
96
+ >>> get_audio_concurrency()
97
+ 5
98
+
99
+ >>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "20" # Too high
100
+ >>> get_audio_concurrency() # Falls back to default
101
+ 3
102
+ """
103
+ env_concurrency = os.environ.get("CCORE_AUDIO_CONCURRENCY")
104
+ if env_concurrency:
105
+ try:
106
+ concurrency = int(env_concurrency)
107
+ if concurrency < 1 or concurrency > 10:
108
+ # Import logger here to avoid circular imports
109
+ from content_core.logging import logger
110
+ logger.warning(
111
+ f"Invalid CCORE_AUDIO_CONCURRENCY: '{env_concurrency}'. "
112
+ f"Must be between 1 and 10. "
113
+ f"Using default from config."
114
+ )
115
+ return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
116
+ return concurrency
117
+ except ValueError:
118
+ # Import logger here to avoid circular imports
119
+ from content_core.logging import logger
120
+ logger.warning(
121
+ f"Invalid CCORE_AUDIO_CONCURRENCY: '{env_concurrency}'. "
122
+ f"Must be a valid integer. "
123
+ f"Using default from config."
124
+ )
125
+ return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
126
+ return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
127
+
73
128
  # Programmatic config overrides: use in notebooks or scripts
74
129
  def set_document_engine(engine: str):
75
130
  """Override the document extraction engine ('auto', 'simple', or 'docling')."""
@@ -102,3 +157,19 @@ def set_pymupdf_ocr_fallback(enabled: bool):
102
157
  extraction = CONFIG.setdefault("extraction", {})
103
158
  pymupdf_cfg = extraction.setdefault("pymupdf", {})
104
159
  pymupdf_cfg["ocr_fallback"] = enabled
160
+
161
+ def set_audio_concurrency(concurrency: int):
162
+ """
163
+ Override the audio concurrency setting (1-10).
164
+
165
+ Args:
166
+ concurrency (int): Number of concurrent audio transcriptions (1-10)
167
+
168
+ Raises:
169
+ ValueError: If concurrency is not between 1 and 10
170
+ """
171
+ if not isinstance(concurrency, int) or concurrency < 1 or concurrency > 10:
172
+ raise ValueError(f"Audio concurrency must be an integer between 1 and 10, got: {concurrency}")
173
+ extraction = CONFIG.setdefault("extraction", {})
174
+ audio_cfg = extraction.setdefault("audio", {})
175
+ audio_cfg["concurrency"] = concurrency
@@ -3,10 +3,9 @@ Pure Python file type detection using magic bytes and content analysis.
3
3
  Replaces libmagic dependency with a lightweight implementation.
4
4
  """
5
5
 
6
- import os
7
6
  import zipfile
8
7
  from pathlib import Path
9
- from typing import Dict, Optional, Tuple
8
+ from typing import Dict, Optional
10
9
 
11
10
  from content_core.common.exceptions import UnsupportedTypeException
12
11
  from content_core.logging import logger
@@ -14,10 +13,17 @@ from content_core.logging import logger
14
13
 
15
14
  class FileDetector:
16
15
  """Pure Python file type detection using magic bytes and content analysis."""
17
-
18
- # Configuration constants
16
+
17
+ # Configuration constants for binary/text detection
19
18
  SIGNATURE_READ_SIZE = 512 # Bytes to read for binary signature detection
20
19
  TEXT_READ_SIZE = 1024 # Bytes to read for text content analysis
20
+
21
+ # Configuration constants for CSV detection
22
+ CSV_MAX_FIELD_LENGTH = 100 # Maximum average field length for CSV (longer suggests prose)
23
+ CSV_MAX_VARIANCE = 500 # Maximum variance in field lengths (higher suggests natural text)
24
+ CSV_MIN_SCORE = 2 # Minimum score required to classify as CSV
25
+ CSV_MIN_FIELDS = 2 # Minimum number of fields required for CSV
26
+ CSV_MAX_HEADER_FIELD_LENGTH = 50 # Maximum length for individual header fields
21
27
 
22
28
  def __init__(self):
23
29
  """Initialize the FileDetector with signature mappings."""
@@ -365,18 +371,102 @@ class FileDetector:
365
371
 
366
372
 
367
373
  def _looks_like_csv(self, content: str) -> bool:
368
- """Check if content looks like CSV format."""
369
- lines = content.split('\n', 5)[:5] # Check first 5 lines
370
- if len(lines) < 2:
374
+ """
375
+ Check if content looks like CSV format with improved heuristics.
376
+
377
+ Uses a multi-stage approach with performance optimization:
378
+ 1. Basic structural checks (cheap)
379
+ 2. Field length analysis (cheap, early exit)
380
+ 3. Pattern matching (moderate cost)
381
+ 4. Variance analysis (expensive, only if needed)
382
+ """
383
+ lines = content.split('\n', 10)[:10] # Check first 10 lines for better accuracy
384
+ non_empty_lines = [line for line in lines if line.strip()]
385
+
386
+ # Stage 1: Basic structural checks
387
+ if len(non_empty_lines) < 2:
371
388
  return False
372
-
389
+
373
390
  # Count commas in each line
374
- comma_counts = [line.count(',') for line in lines if line.strip()]
375
- if not comma_counts:
391
+ comma_counts = [line.count(',') for line in non_empty_lines]
392
+
393
+ # Must have at least one comma per line
394
+ if not all(count > 0 for count in comma_counts):
376
395
  return False
377
-
378
- # CSV should have consistent comma counts
379
- return len(set(comma_counts)) == 1 and comma_counts[0] > 0
396
+
397
+ # CSV should have consistent comma counts across lines
398
+ if len(set(comma_counts)) != 1:
399
+ return False
400
+
401
+ num_fields = comma_counts[0] + 1 # Number of fields = commas + 1
402
+
403
+ # Must have minimum number of fields to be CSV
404
+ if num_fields < self.CSV_MIN_FIELDS:
405
+ return False
406
+
407
+ # Stage 2: Field length analysis (PERFORMANCE OPTIMIZATION: early exit)
408
+ first_line = non_empty_lines[0]
409
+ fields = first_line.split(',')
410
+
411
+ # CSV fields should be relatively short (not long sentences)
412
+ # Average field length should be reasonable (not paragraphs)
413
+ # Early exit avoids expensive variance calculations for obvious prose
414
+ avg_field_length = sum(len(f.strip()) for f in fields) / len(fields)
415
+ if avg_field_length > self.CSV_MAX_FIELD_LENGTH:
416
+ return False # Too long to be typical CSV fields - exit early
417
+
418
+ # Stage 3: Pattern matching
419
+ # Check for CSV-like patterns:
420
+ # 1. Fields that look like headers (short, alphanumeric)
421
+ # 2. Quoted fields (common in CSV)
422
+ # 3. Numeric fields
423
+ has_quoted_fields = any('"' in line or "'" in line for line in non_empty_lines[:3])
424
+
425
+ first_line_fields = [f.strip() for f in fields]
426
+ # Check if first line looks like a header (short, no sentence-ending punctuation)
427
+ looks_like_header = all(
428
+ len(f) < self.CSV_MAX_HEADER_FIELD_LENGTH and not f.endswith('.') and not f.endswith('!')
429
+ for f in first_line_fields
430
+ )
431
+
432
+ # Stage 4: Variance analysis (EXPENSIVE - only if we have enough data)
433
+ # Check if subsequent lines have similar field structure
434
+ # Real CSV tends to have consistent field lengths
435
+ if len(non_empty_lines) >= 3:
436
+ field_lengths_per_line = []
437
+ for line in non_empty_lines[:5]:
438
+ line_fields = line.split(',')
439
+ field_lengths = [len(f.strip()) for f in line_fields]
440
+ field_lengths_per_line.append(field_lengths)
441
+
442
+ # Calculate the variance of field lengths at each field position
443
+ # CSV data should have relatively consistent field lengths at each position
444
+ # Natural text with commas will have much more variance
445
+ position_variances = []
446
+ for i in range(num_fields):
447
+ lengths_at_position = [fl[i] if i < len(fl) else 0 for fl in field_lengths_per_line]
448
+ if lengths_at_position:
449
+ avg = sum(lengths_at_position) / len(lengths_at_position)
450
+ variance = sum((x - avg) ** 2 for x in lengths_at_position) / len(lengths_at_position)
451
+ position_variances.append(variance)
452
+
453
+ # High variance suggests natural text, not structured CSV
454
+ if position_variances:
455
+ avg_variance = sum(position_variances) / len(position_variances)
456
+ if avg_variance > self.CSV_MAX_VARIANCE:
457
+ return False # Very high variance = likely prose
458
+
459
+ # Scoring: Require at least some CSV-like characteristics
460
+ csv_score = 0
461
+ if looks_like_header:
462
+ csv_score += 1
463
+ if has_quoted_fields:
464
+ csv_score += 1
465
+ if num_fields >= 3: # Multiple fields is more CSV-like
466
+ csv_score += 1
467
+
468
+ # Need minimum score to confidently classify as CSV
469
+ return csv_score >= self.CSV_MIN_SCORE
380
470
 
381
471
 
382
472
  def _is_text_file(self, content: str) -> bool:
@@ -8,7 +8,7 @@ async def summarize(content: str, context: str) -> str:
8
8
  templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
9
9
  response = await templated_message_fn(
10
10
  TemplatedMessageInput(
11
- user_prompt_template="prompts/content/summarize",
11
+ user_prompt_template="content/summarize",
12
12
  data={"content": content, "context": context},
13
13
  )
14
14
  )
@@ -63,8 +63,6 @@
63
63
  "source": [
64
64
  "from content_core.content.extraction import extract_content\n",
65
65
  "\n",
66
- "from content_core.content.cleanup import cleanup_content\n",
67
- "from content_core.content.summary import summarize\n",
68
66
  "\n",
69
67
  "\n",
70
68
  "yt = await extract_content(dict(url=\"https://www.youtube.com/watch?v=lLprprtHfts\"))\n",
@@ -8,11 +8,9 @@ from functools import partial
8
8
  from moviepy import AudioFileClip
9
9
 
10
10
  from content_core.common import ProcessSourceState
11
+ from content_core.config import get_audio_concurrency
11
12
  from content_core.logging import logger
12
13
 
13
- # todo: remove reference to model_manager
14
- # future: parallelize the transcription process
15
-
16
14
 
17
15
  async def split_audio(input_file, segment_length_minutes=15, output_prefix=None):
18
16
  """
@@ -47,7 +45,7 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
47
45
  end_time = min((i + 1) * segment_length_s, audio.duration)
48
46
 
49
47
  # Extract segment
50
- output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
48
+ output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
51
49
  output_path = os.path.join(output_dir, output_filename)
52
50
 
53
51
  # Export segment
@@ -55,7 +53,9 @@ async def split_audio(input_file, segment_length_minutes=15, output_prefix=None)
55
53
 
56
54
  output_files.append(output_path)
57
55
 
58
- logger.debug(f"Exported segment {i+1}/{total_segments}: {output_filename}")
56
+ logger.debug(
57
+ f"Exported segment {i + 1}/{total_segments}: {output_filename}"
58
+ )
59
59
 
60
60
  return output_files
61
61
 
@@ -98,61 +98,128 @@ def extract_audio(
98
98
  raise
99
99
 
100
100
 
101
- async def transcribe_audio_segment(audio_file, model):
102
- """Transcribe a single audio segment asynchronously"""
103
- return (await model.atranscribe(audio_file)).text
101
+ async def transcribe_audio_segment(audio_file, model, semaphore):
102
+ """
103
+ Transcribe a single audio segment asynchronously with concurrency control.
104
104
 
105
+ This function uses a semaphore to limit the number of concurrent transcriptions,
106
+ helping avoid API rate-limit errors while still allowing parallel processing for improved performance.
105
107
 
106
- async def extract_audio_data(data: ProcessSourceState):
107
- input_audio_path = data.file_path
108
+ Args:
109
+ audio_file (str): Path to the audio file segment to transcribe
110
+ model: Speech-to-text model instance with atranscribe() method
111
+ semaphore (asyncio.Semaphore): Semaphore to control concurrency
108
112
 
109
- try:
110
- # Create a temporary directory for audio segments
111
- temp_dir = tempfile.mkdtemp()
112
- output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
113
- output_dir = temp_dir
114
- os.makedirs(output_dir, exist_ok=True)
113
+ Returns:
114
+ str: Transcribed text from the audio segment
115
115
 
116
- # Split audio into segments if longer than 10 minutes
117
- audio = AudioFileClip(input_audio_path)
118
- duration_s = audio.duration
119
- segment_length_s = 10 * 60 # 10 minutes in seconds
120
- output_files = []
116
+ Note:
117
+ Multiple instances of this function can run concurrently, but the semaphore
118
+ ensures that no more than N transcriptions happen simultaneously, where N
119
+ is configured via get_audio_concurrency() (default: 3, range: 1-10).
120
+ """
121
+ async with semaphore:
122
+ return (await model.atranscribe(audio_file)).text
121
123
 
122
- if duration_s > segment_length_s:
123
- logger.info(
124
- f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments"
125
- )
126
- for i in range(math.ceil(duration_s / segment_length_s)):
127
- start_time = i * segment_length_s
128
- end_time = min((i + 1) * segment_length_s, audio.duration)
129
124
 
130
- # Extract segment
131
- output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
132
- output_path = os.path.join(output_dir, output_filename)
125
+ async def extract_audio_data(data: ProcessSourceState):
126
+ """
127
+ Extract and transcribe audio from a file with automatic segmentation and parallel processing.
128
+
129
+ This function handles the complete audio processing pipeline:
130
+ 1. Splits long audio files (>10 minutes) into segments
131
+ 2. Transcribes segments in parallel using configurable concurrency
132
+ 3. Joins transcriptions in correct order
133
133
 
134
- extract_audio(input_audio_path, output_path, start_time, end_time)
134
+ For files longer than 10 minutes, segments are processed concurrently with a
135
+ configurable concurrency limit to balance performance and API rate limits.
135
136
 
136
- output_files.append(output_path)
137
- else:
138
- output_files = [input_audio_path]
137
+ Args:
138
+ data (ProcessSourceState): State object containing file_path to audio/video file
139
+
140
+ Returns:
141
+ dict: Dictionary containing:
142
+ - metadata: Dict holding the number of processed segments (segments_count)
143
+ - content: Complete transcribed text
139
144
 
140
- # Transcribe audio files
141
- from content_core.models import ModelFactory
145
+ Configuration:
146
+ Concurrency is controlled via:
147
+ - Environment variable: CCORE_AUDIO_CONCURRENCY (1-10, default: 3)
148
+ - YAML config: extraction.audio.concurrency
142
149
 
143
- speech_to_text_model = ModelFactory.get_model("speech_to_text")
144
- transcriptions = []
145
- for audio_file in output_files:
146
- transcription = await transcribe_audio_segment(
147
- audio_file, speech_to_text_model
150
+ Raises:
151
+ Exception: If audio extraction or transcription fails
152
+ """
153
+ input_audio_path = data.file_path
154
+ audio = None
155
+
156
+ try:
157
+ # Use TemporaryDirectory context manager for automatic cleanup
158
+ with tempfile.TemporaryDirectory() as temp_dir:
159
+ output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
160
+ output_dir = temp_dir
161
+
162
+ # Split audio into segments if longer than 10 minutes
163
+ audio = AudioFileClip(input_audio_path)
164
+ duration_s = audio.duration
165
+ segment_length_s = 10 * 60 # 10 minutes in seconds
166
+ output_files = []
167
+
168
+ if duration_s > segment_length_s:
169
+ logger.info(
170
+ f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments"
171
+ )
172
+ for i in range(math.ceil(duration_s / segment_length_s)):
173
+ start_time = i * segment_length_s
174
+ end_time = min((i + 1) * segment_length_s, audio.duration)
175
+
176
+ # Extract segment
177
+ output_filename = f"{output_prefix}_{str(i + 1).zfill(3)}.mp3"
178
+ output_path = os.path.join(output_dir, output_filename)
179
+
180
+ extract_audio(input_audio_path, output_path, start_time, end_time)
181
+
182
+ output_files.append(output_path)
183
+ else:
184
+ output_files = [input_audio_path]
185
+
186
+ # Close audio clip after determining segments
187
+ if audio:
188
+ audio.close()
189
+ audio = None
190
+
191
+ # Transcribe audio files in parallel with concurrency limit
192
+ from content_core.models import ModelFactory
193
+
194
+ speech_to_text_model = ModelFactory.get_model("speech_to_text")
195
+ concurrency = get_audio_concurrency()
196
+ semaphore = asyncio.Semaphore(concurrency)
197
+
198
+ logger.debug(
199
+ f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}"
148
200
  )
149
- transcriptions.append(transcription)
150
201
 
151
- return {
152
- "metadata": {"audio_files": output_files},
153
- "content": " ".join(transcriptions),
154
- }
202
+ # Create tasks for parallel transcription
203
+ transcription_tasks = [
204
+ transcribe_audio_segment(audio_file, speech_to_text_model, semaphore)
205
+ for audio_file in output_files
206
+ ]
207
+
208
+ # Execute all transcriptions concurrently (limited by semaphore)
209
+ transcriptions = await asyncio.gather(*transcription_tasks)
210
+
211
+ return {
212
+ "metadata": {"segments_count": len(output_files)},
213
+ "content": " ".join(transcriptions),
214
+ }
155
215
  except Exception as e:
156
216
  logger.error(f"Error processing audio: {str(e)}")
157
217
  logger.error(traceback.format_exc())
158
218
  raise
219
+ finally:
220
+ # Ensure audio clip is closed even if an error occurs
221
+ if audio:
222
+ try:
223
+ audio.close()
224
+ except Exception:
225
+ pass
@@ -147,10 +147,10 @@ async def extract_url_firecrawl(url: str):
147
147
  from firecrawl import AsyncFirecrawlApp
148
148
 
149
149
  app = AsyncFirecrawlApp(api_key=os.environ.get("FIRECRAWL_API_KEY"))
150
- scrape_result = await app.scrape_url(url, formats=["markdown", "html"])
150
+ scrape_result = await app.scrape(url, formats=["markdown", "html"])
151
151
  return {
152
- "title": scrape_result.metadata["title"] or scrape_result.title,
153
- "content": scrape_result.markdown,
152
+ "title": scrape_result.metadata.title or "",
153
+ "content": scrape_result.markdown or "",
154
154
  }
155
155
 
156
156
  except Exception as e:
@@ -36,14 +36,14 @@ async def templated_message(
36
36
  prompt_template=input.system_prompt_template,
37
37
  template_text=input.system_prompt_text,
38
38
  ).render(data=input.data)
39
- msgs.append(Message(role="system", content=system_prompt))
39
+ msgs.append({"role": "system", "content": system_prompt})
40
40
 
41
41
  if input.user_prompt_template or input.user_prompt_text:
42
42
  user_prompt = Prompter(
43
43
  prompt_template=input.user_prompt_template,
44
44
  template_text=input.user_prompt_text,
45
45
  ).render(data=input.data)
46
- msgs.append(Message(role="user", content=user_prompt))
46
+ msgs.append({"role": "user", "content": user_prompt})
47
47
 
48
48
  result = await model.achat_complete(msgs)
49
49
  return result.content
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 1.4.2
3
+ Version: 1.6.0
4
4
  Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -548,6 +548,9 @@ GOOGLE_API_KEY=your-key-here
548
548
  # Engine Selection (optional)
549
549
  CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
550
550
  CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
551
+
552
+ # Audio Processing (optional)
553
+ CCORE_AUDIO_CONCURRENCY=3 # Number of concurrent audio transcriptions (1-10, default: 3)
551
554
  ```
552
555
 
553
556
  ### Engine Selection via Environment Variables
@@ -556,9 +559,20 @@ For deployment scenarios like MCP servers or Raycast extensions, you can overrid
556
559
 
557
560
  - **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
558
561
  - **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
562
+ - **`CCORE_AUDIO_CONCURRENCY`**: Number of concurrent audio transcriptions (1-10, default: 3)
559
563
 
560
564
  These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
561
565
 
566
+ ### Audio Processing Configuration
567
+
568
+ Content Core processes long audio files by splitting them into segments and transcribing them in parallel for improved performance. You can control the concurrency level to balance speed with API rate limits:
569
+
570
+ - **Default**: 3 concurrent transcriptions
571
+ - **Range**: 1-10 concurrent transcriptions
572
+ - **Configuration**: Set via `CCORE_AUDIO_CONCURRENCY` environment variable or `extraction.audio.concurrency` in `cc_config.yaml`
573
+
574
+ Higher concurrency values can speed up processing of long audio/video files but may hit API rate limits. Lower values are more conservative and suitable for accounts with lower API quotas.
575
+
562
576
  ### Custom Prompt Templates
563
577
 
564
578
  Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
@@ -1,11 +1,11 @@
1
- content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
2
- content_core/cc_config.yaml,sha256=hjTt5z1Z9b5LShVIqNT3OiAnTAdmr0LB5y8RTyH-fNA,1119
3
- content_core/config.py,sha256=3XAsMF3EhDJ6aCpzk1UZG_m3-SFdYe3cHiDPH7eVGwQ,4312
1
+ content_core/__init__.py,sha256=c1qw2HyV8DqBnoB9fbXNIZtda60MrxGUvWG5EQFtr-8,6492
2
+ content_core/cc_config.yaml,sha256=3Ot5u-YSBx2k3JXWnCP7s7OVBbGpGebBy_CWj3we-u4,1211
3
+ content_core/config.py,sha256=Mao6AZZoiSiX7uZwOGgk759LlV0j6NdfYGgWgX6vhAs,7112
4
4
  content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
5
5
  content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
6
6
  content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
7
7
  content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
8
- content_core/templated_message.py,sha256=KbI2rcvgGM5oRIcsG68zAZfgNsC97fR16D61683ZSnY,1617
8
+ content_core/templated_message.py,sha256=jsjGqD-zf__pV4P0eo9cffTK2C90-VggL64qNYejFo0,1615
9
9
  content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
10
10
  content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
11
11
  content_core/common/state.py,sha256=K5jsDg4l2GSaoGyFYzdd1GW14vLaAxdxes8vUrPNVkE,1622
@@ -17,27 +17,27 @@ content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnR
17
17
  content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
18
18
  content_core/content/extraction/graph.py,sha256=AFi9B_hTuxqdgvogCOk4Xdqoboug7_KXtV0ZHlb8igM,8139
19
19
  content_core/content/identification/__init__.py,sha256=DDoCi1r-6Z_pGPPi3X1ZwyRrcRtg-rAiCTK50hnO5Y0,235
20
- content_core/content/identification/file_detector.py,sha256=s_10Osxv8gfVfs3UPXFzCOosvWCrf4ZCFXcW2yimUIM,17170
20
+ content_core/content/identification/file_detector.py,sha256=GBP1cCNc1qnt5_HkE5ALQDja2f8WgqCC5h1r8xF0P7A,21480
21
21
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
22
- content_core/content/summary/core.py,sha256=kEabpETljzUb-yf0NcVWTOuCtayESo74gGBVDX7YTFs,550
22
+ content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
23
23
  content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
24
24
  content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
25
- content_core/notebooks/run.ipynb,sha256=WPBNcQUNXR5MldNMghVcU4vE4ibrVmlANa80baQn8TA,371078
25
+ content_core/notebooks/run.ipynb,sha256=8gbFln9WLrli_qWJB8SKQKcSNbAv25DvN5Cu4EAAeBQ,370952
26
26
  content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
27
- content_core/processors/audio.py,sha256=Mie20g_2Akhw6BHBVo3sHMpDRYUkqBI72lEDakscx3s,5729
27
+ content_core/processors/audio.py,sha256=CYwoTDPsVUDALHuz_EHcnjVfsKF8XjQmvmX8c-OmMNU,8462
28
28
  content_core/processors/docling.py,sha256=lf_NHh255gn4d2EymJYqyH2QiAgQDiJCY3t6Ne7R9rU,2507
29
29
  content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
30
30
  content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
31
31
  content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
32
- content_core/processors/url.py,sha256=To0LTtMVNN3M83CdodQaZFuU7-IMM5w9QOHRKNV8PVI,7532
32
+ content_core/processors/url.py,sha256=RhBOyqfSWFaf8Dhpxlo9xbsF5yuP5FhXfhbvbi4CQPc,7514
33
33
  content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
34
34
  content_core/processors/youtube.py,sha256=_qvxI9qTdxu3l1fKLuJARFt8KtZVFJ3JJBLkq1hAAXo,7868
35
35
  content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
36
36
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
37
37
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
38
38
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
39
- content_core-1.4.2.dist-info/METADATA,sha256=E8l57dOkDGx8_GUnk4BsaLbFKD560wKjQLlydqar1jQ,21093
40
- content_core-1.4.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
- content_core-1.4.2.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
42
- content_core-1.4.2.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
43
- content_core-1.4.2.dist-info/RECORD,,
39
+ content_core-1.6.0.dist-info/METADATA,sha256=bBxEINm9h2ppJIia11flDRDH7UshzamVrHKHGxHrmjs,21963
40
+ content_core-1.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
41
+ content_core-1.6.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
42
+ content_core-1.6.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
43
+ content_core-1.6.0.dist-info/RECORD,,