content-core 1.4.1-py3-none-any.whl → 1.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- content_core/cc_config.yaml +2 -0
- content_core/config.py +71 -0
- content_core/content/identification/file_detector.py +1 -2
- content_core/content/summary/core.py +1 -1
- content_core/notebooks/run.ipynb +0 -2
- content_core/processors/audio.py +114 -51
- content_core/processors/url.py +3 -3
- content_core/templated_message.py +2 -2
- {content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/METADATA +15 -1
- {content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/RECORD +13 -13
- {content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/WHEEL +0 -0
- {content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/entry_points.txt +0 -0
- {content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/licenses/LICENSE +0 -0
content_core/cc_config.yaml
CHANGED
@@ -32,6 +32,8 @@ summary_model:
 extraction:
   document_engine: auto # auto | simple | docling - for files/documents
   url_engine: auto # auto | simple | firecrawl | jina | docling - for URLs
+  audio:
+    concurrency: 3 # Number of concurrent audio transcriptions (1-10)
   docling:
     output_format: markdown # markdown | html | json
   pymupdf:
content_core/config.py
CHANGED
@@ -70,6 +70,61 @@ def get_url_engine():
         return env_engine
     return CONFIG.get("extraction", {}).get("url_engine", "auto")
 
+def get_audio_concurrency():
+    """
+    Get audio concurrency with environment variable override and validation.
+
+    Returns the configured number of concurrent audio transcriptions, with automatic
+    validation and fallback to safe defaults.
+
+    Configuration priority (highest to lowest):
+    1. CCORE_AUDIO_CONCURRENCY environment variable
+    2. extraction.audio.concurrency in YAML config
+    3. Default value: 3
+
+    Returns:
+        int: Number of concurrent transcriptions (1-10)
+
+    Validation:
+    - Values must be integers between 1 and 10 (inclusive)
+    - Invalid values (out of range, non-integer, etc.) automatically fall back to default
+    - A warning is logged when invalid values are detected
+
+    Examples:
+        >>> import os
+        >>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "5"
+        >>> get_audio_concurrency()
+        5
+
+        >>> os.environ["CCORE_AUDIO_CONCURRENCY"] = "20" # Too high
+        >>> get_audio_concurrency() # Falls back to default
+        3
+    """
+    env_concurrency = os.environ.get("CCORE_AUDIO_CONCURRENCY")
+    if env_concurrency:
+        try:
+            concurrency = int(env_concurrency)
+            if concurrency < 1 or concurrency > 10:
+                # Import logger here to avoid circular imports
+                from content_core.logging import logger
+                logger.warning(
+                    f"Invalid CCORE_AUDIO_CONCURRENCY: '{env_concurrency}'. "
+                    f"Must be between 1 and 10. "
+                    f"Using default from config."
+                )
+                return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
+            return concurrency
+        except ValueError:
+            # Import logger here to avoid circular imports
+            from content_core.logging import logger
+            logger.warning(
+                f"Invalid CCORE_AUDIO_CONCURRENCY: '{env_concurrency}'. "
+                f"Must be a valid integer. "
+                f"Using default from config."
+            )
+            return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
+    return CONFIG.get("extraction", {}).get("audio", {}).get("concurrency", 3)
+
 # Programmatic config overrides: use in notebooks or scripts
 def set_document_engine(engine: str):
     """Override the document extraction engine ('auto', 'simple', or 'docling')."""
@@ -102,3 +157,19 @@ def set_pymupdf_ocr_fallback(enabled: bool):
     extraction = CONFIG.setdefault("extraction", {})
     pymupdf_cfg = extraction.setdefault("pymupdf", {})
     pymupdf_cfg["ocr_fallback"] = enabled
+
+def set_audio_concurrency(concurrency: int):
+    """
+    Override the audio concurrency setting (1-10).
+
+    Args:
+        concurrency (int): Number of concurrent audio transcriptions (1-10)
+
+    Raises:
+        ValueError: If concurrency is not between 1 and 10
+    """
+    if not isinstance(concurrency, int) or concurrency < 1 or concurrency > 10:
+        raise ValueError(f"Audio concurrency must be an integer between 1 and 10, got: {concurrency}")
+    extraction = CONFIG.setdefault("extraction", {})
+    audio_cfg = extraction.setdefault("audio", {})
+    audio_cfg["concurrency"] = concurrency
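For reference, the two additions above combine like this; a minimal sketch with illustrative values, using only the functions and the environment variable introduced in this diff:

```python
import os

from content_core.config import get_audio_concurrency, set_audio_concurrency

# The environment variable takes highest precedence and is re-read on each call
os.environ["CCORE_AUDIO_CONCURRENCY"] = "5"
print(get_audio_concurrency())  # 5

# Programmatic override, used when no valid env var is set
del os.environ["CCORE_AUDIO_CONCURRENCY"]
set_audio_concurrency(8)
print(get_audio_concurrency())  # 8

# The setter validates its input; out-of-range values raise ValueError,
# whereas an out-of-range env var only logs a warning and falls back to the config default
try:
    set_audio_concurrency(20)
except ValueError as exc:
    print(exc)
```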
content_core/content/identification/file_detector.py
CHANGED
@@ -3,10 +3,9 @@ Pure Python file type detection using magic bytes and content analysis.
 Replaces libmagic dependency with a lightweight implementation.
 """
 
-import os
 import zipfile
 from pathlib import Path
-from typing import Dict, Optional
+from typing import Dict, Optional
 
 from content_core.common.exceptions import UnsupportedTypeException
 from content_core.logging import logger
content_core/content/summary/core.py
CHANGED
@@ -8,7 +8,7 @@ async def summarize(content: str, context: str) -> str:
     templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
     response = await templated_message_fn(
         TemplatedMessageInput(
-            user_prompt_template="
+            user_prompt_template="content/summarize",
             data={"content": content, "context": context},
         )
     )
content_core/notebooks/run.ipynb
CHANGED
@@ -63,8 +63,6 @@
 "source": [
  "from content_core.content.extraction import extract_content\n",
  "\n",
- "from content_core.content.cleanup import cleanup_content\n",
- "from content_core.content.summary import summarize\n",
  "\n",
  "\n",
  "yt = await extract_content(dict(url=\"https://www.youtube.com/watch?v=lLprprtHfts\"))\n",
content_core/processors/audio.py
CHANGED
@@ -8,11 +8,9 @@ from functools import partial
 from moviepy import AudioFileClip
 
 from content_core.common import ProcessSourceState
+from content_core.config import get_audio_concurrency
 from content_core.logging import logger
 
-# todo: remove reference to model_manager
-# future: parallelize the transcription process
-
 
 async def split_audio(input_file, segment_length_minutes=15, output_prefix=None):
     """
@@ -98,61 +96,126 @@ def extract_audio(
         raise
 
 
-async def transcribe_audio_segment(audio_file, model):
-    """
-    …
+async def transcribe_audio_segment(audio_file, model, semaphore):
+    """
+    Transcribe a single audio segment asynchronously with concurrency control.
+
+    This function uses a semaphore to limit the number of concurrent transcriptions,
+    preventing API rate limits while allowing parallel processing for improved performance.
+
+    Args:
+        audio_file (str): Path to the audio file segment to transcribe
+        model: Speech-to-text model instance with atranscribe() method
+        semaphore (asyncio.Semaphore): Semaphore to control concurrency
+
+    Returns:
+        str: Transcribed text from the audio segment
+
+    Note:
+        Multiple instances of this function can run concurrently, but the semaphore
+        ensures that no more than N transcriptions happen simultaneously, where N
+        is configured via get_audio_concurrency() (default: 3, range: 1-10).
+    """
+    async with semaphore:
+        return (await model.atranscribe(audio_file)).text
 
 
 async def extract_audio_data(data: ProcessSourceState):
-    …
-    output_dir = temp_dir
-    os.makedirs(output_dir, exist_ok=True)
-    …
+    """
+    Extract and transcribe audio from a file with automatic segmentation and parallel processing.
+
+    This function handles the complete audio processing pipeline:
+    1. Splits long audio files (>10 minutes) into segments
+    2. Transcribes segments in parallel using configurable concurrency
+    3. Joins transcriptions in correct order
+
+    For files longer than 10 minutes, segments are processed concurrently with a
+    configurable concurrency limit to balance performance and API rate limits.
+
+    Args:
+        data (ProcessSourceState): State object containing file_path to audio/video file
+
+    Returns:
+        dict: Dictionary containing:
+            - metadata: Information about processed segments count
+            - content: Complete transcribed text
+
+    Configuration:
+        Concurrency is controlled via:
+        - Environment variable: CCORE_AUDIO_CONCURRENCY (1-10, default: 3)
+        - YAML config: extraction.audio.concurrency
+
+    Raises:
+        Exception: If audio extraction or transcription fails
+    """
+    input_audio_path = data.file_path
+    audio = None
+
+    try:
+        # Use TemporaryDirectory context manager for automatic cleanup
+        with tempfile.TemporaryDirectory() as temp_dir:
+            output_prefix = os.path.splitext(os.path.basename(input_audio_path))[0]
+            output_dir = temp_dir
+
+            # Split audio into segments if longer than 10 minutes
+            audio = AudioFileClip(input_audio_path)
+            duration_s = audio.duration
+            segment_length_s = 10 * 60 # 10 minutes in seconds
+            output_files = []
+
+            if duration_s > segment_length_s:
+                logger.info(
+                    f"Audio is longer than 10 minutes ({duration_s}s), splitting into {math.ceil(duration_s / segment_length_s)} segments"
+                )
+                for i in range(math.ceil(duration_s / segment_length_s)):
+                    start_time = i * segment_length_s
+                    end_time = min((i + 1) * segment_length_s, audio.duration)
+
+                    # Extract segment
+                    output_filename = f"{output_prefix}_{str(i+1).zfill(3)}.mp3"
+                    output_path = os.path.join(output_dir, output_filename)
+
+                    extract_audio(input_audio_path, output_path, start_time, end_time)
+
+                    output_files.append(output_path)
+            else:
+                output_files = [input_audio_path]
+
+            # Close audio clip after determining segments
+            if audio:
+                audio.close()
+                audio = None
+
+            # Transcribe audio files in parallel with concurrency limit
+            from content_core.models import ModelFactory
+
+            speech_to_text_model = ModelFactory.get_model("speech_to_text")
+            concurrency = get_audio_concurrency()
+            semaphore = asyncio.Semaphore(concurrency)
+
+            logger.debug(f"Transcribing {len(output_files)} audio segments with concurrency limit of {concurrency}")
+
+            # Create tasks for parallel transcription
+            transcription_tasks = [
+                transcribe_audio_segment(audio_file, speech_to_text_model, semaphore)
+                for audio_file in output_files
+            ]
+
+            # Execute all transcriptions concurrently (limited by semaphore)
+            transcriptions = await asyncio.gather(*transcription_tasks)
+
+            return {
+                "metadata": {"segments_count": len(output_files)},
+                "content": " ".join(transcriptions),
+            }
     except Exception as e:
         logger.error(f"Error processing audio: {str(e)}")
         logger.error(traceback.format_exc())
         raise
+    finally:
+        # Ensure audio clip is closed even if an error occurs
+        if audio:
+            try:
+                audio.close()
+            except Exception:
+                pass
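The heart of the change is the semaphore-bounded fan-out. Stripped of the package specifics, the pattern looks like the sketch below; `model.atranscribe()` returning an object with a `.text` attribute is the only interface assumed, as used in the code above:

```python
import asyncio


async def transcribe_segments(segments, model, limit=3):
    """Transcribe segments concurrently, never more than `limit` at a time."""
    semaphore = asyncio.Semaphore(limit)

    async def transcribe_one(path):
        async with semaphore:  # waits while `limit` transcriptions are already in flight
            return (await model.atranscribe(path)).text

    # gather() returns results in input order, so the joined transcript stays in sequence
    texts = await asyncio.gather(*(transcribe_one(p) for p in segments))
    return " ".join(texts)
```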
content_core/processors/url.py
CHANGED
@@ -147,10 +147,10 @@ async def extract_url_firecrawl(url: str):
         from firecrawl import AsyncFirecrawlApp
 
         app = AsyncFirecrawlApp(api_key=os.environ.get("FIRECRAWL_API_KEY"))
-        scrape_result = await app.
+        scrape_result = await app.scrape(url, formats=["markdown", "html"])
         return {
-            "title": scrape_result.metadata
-            "content": scrape_result.markdown,
+            "title": scrape_result.metadata.title or "",
+            "content": scrape_result.markdown or "",
         }
 
     except Exception as e:
content_core/templated_message.py
CHANGED
@@ -36,14 +36,14 @@ async def templated_message(
             prompt_template=input.system_prompt_template,
             template_text=input.system_prompt_text,
         ).render(data=input.data)
-        msgs.append(
+        msgs.append({"role": "system", "content": system_prompt})
 
     if input.user_prompt_template or input.user_prompt_text:
         user_prompt = Prompter(
             prompt_template=input.user_prompt_template,
             template_text=input.user_prompt_text,
         ).render(data=input.data)
-        msgs.append(
+        msgs.append({"role": "user", "content": user_prompt})
 
     result = await model.achat_complete(msgs)
     return result.content
{content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 1.4.1
+Version: 1.5.0
 Summary: Extract what matters from any media source. Available as Python Library, macOS Service, CLI and MCP Server
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -548,6 +548,9 @@ GOOGLE_API_KEY=your-key-here
 # Engine Selection (optional)
 CCORE_DOCUMENT_ENGINE=auto # auto, simple, docling
 CCORE_URL_ENGINE=auto # auto, simple, firecrawl, jina
+
+# Audio Processing (optional)
+CCORE_AUDIO_CONCURRENCY=3 # Number of concurrent audio transcriptions (1-10, default: 3)
 ```
 
 ### Engine Selection via Environment Variables
@@ -556,9 +559,20 @@ For deployment scenarios like MCP servers or Raycast extensions, you can override
 
 - **`CCORE_DOCUMENT_ENGINE`**: Force document engine (`auto`, `simple`, `docling`)
 - **`CCORE_URL_ENGINE`**: Force URL engine (`auto`, `simple`, `firecrawl`, `jina`)
+- **`CCORE_AUDIO_CONCURRENCY`**: Number of concurrent audio transcriptions (1-10, default: 3)
 
 These variables take precedence over config file settings and provide explicit control for different deployment scenarios.
 
+### Audio Processing Configuration
+
+Content Core processes long audio files by splitting them into segments and transcribing them in parallel for improved performance. You can control the concurrency level to balance speed with API rate limits:
+
+- **Default**: 3 concurrent transcriptions
+- **Range**: 1-10 concurrent transcriptions
+- **Configuration**: Set via `CCORE_AUDIO_CONCURRENCY` environment variable or `extraction.audio.concurrency` in `cc_config.yaml`
+
+Higher concurrency values can speed up processing of long audio/video files but may hit API rate limits. Lower values are more conservative and suitable for accounts with lower API quotas.
+
 ### Custom Prompt Templates
 
 Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
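A usage sketch of the documented variable follows. The file path and concurrency value are illustrative, and the `file_path` key mirrors the `ProcessSourceState` field used by the audio processor rather than a documented call signature:

```python
import asyncio
import os

from content_core.content.extraction import extract_content

# Allow up to six transcriptions in flight for a long recording (illustrative value)
os.environ["CCORE_AUDIO_CONCURRENCY"] = "6"


async def main():
    # "podcast_episode.mp3" is a placeholder path
    result = await extract_content(dict(file_path="podcast_episode.mp3"))
    print(result)


asyncio.run(main())
```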
{content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 content_core/__init__.py,sha256=t4xFo9f3uB2FD1tdR-7ruhMW9_ciJawQReK6iFXWfR0,6531
-content_core/cc_config.yaml,sha256=
-content_core/config.py,sha256=
+content_core/cc_config.yaml,sha256=3Ot5u-YSBx2k3JXWnCP7s7OVBbGpGebBy_CWj3we-u4,1211
+content_core/config.py,sha256=Mao6AZZoiSiX7uZwOGgk759LlV0j6NdfYGgWgX6vhAs,7112
 content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
 content_core/models.py,sha256=Kt6tWdAX87eQ2tL6eTwcHU7_NIRnN4exP4RzV2WrMig,881
 content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
 content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
-content_core/templated_message.py,sha256=
+content_core/templated_message.py,sha256=jsjGqD-zf__pV4P0eo9cffTK2C90-VggL64qNYejFo0,1615
 content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
 content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
 content_core/common/state.py,sha256=K5jsDg4l2GSaoGyFYzdd1GW14vLaAxdxes8vUrPNVkE,1622
@@ -17,27 +17,27 @@ content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnR
 content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
 content_core/content/extraction/graph.py,sha256=AFi9B_hTuxqdgvogCOk4Xdqoboug7_KXtV0ZHlb8igM,8139
 content_core/content/identification/__init__.py,sha256=DDoCi1r-6Z_pGPPi3X1ZwyRrcRtg-rAiCTK50hnO5Y0,235
-content_core/content/identification/file_detector.py,sha256=
+content_core/content/identification/file_detector.py,sha256=JTfGK28BQg_SGYqLzGVT4OGBfWx8HtEPA-3kfW5o3oE,17153
 content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
-content_core/content/summary/core.py,sha256=
+content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
 content_core/mcp/__init__.py,sha256=KNZYH4F9AoW1Orw1BtO3n92Cn-127hI7iF9gnGadueU,95
 content_core/mcp/server.py,sha256=ql0uXHkIbZlHQUhUQ4CaRnj19xT6t8ErydWntFgmtUg,7021
-content_core/notebooks/run.ipynb,sha256=
+content_core/notebooks/run.ipynb,sha256=8gbFln9WLrli_qWJB8SKQKcSNbAv25DvN5Cu4EAAeBQ,370952
 content_core/notebooks/urls.ipynb,sha256=gSmiSzmbol_Li36w8tpUsy5QgRbrnBx94Ry2zHwMvwY,7107
-content_core/processors/audio.py,sha256=
+content_core/processors/audio.py,sha256=fdR_KcLRG3jSwY3t_eVDoMgUHQQyXmAAlmfETMtomq0,8396
 content_core/processors/docling.py,sha256=lf_NHh255gn4d2EymJYqyH2QiAgQDiJCY3t6Ne7R9rU,2507
 content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
 content_core/processors/pdf.py,sha256=TTDhfV2INtXumFDjLJFNMRfpbJ_tqwIcSBDzuThKxJI,10617
 content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
-content_core/processors/url.py,sha256=
+content_core/processors/url.py,sha256=RhBOyqfSWFaf8Dhpxlo9xbsF5yuP5FhXfhbvbi4CQPc,7514
 content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
 content_core/processors/youtube.py,sha256=_qvxI9qTdxu3l1fKLuJARFt8KtZVFJ3JJBLkq1hAAXo,7868
 content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
 content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
 content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
 content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-1.
-content_core-1.
-content_core-1.
-content_core-1.
-content_core-1.
+content_core-1.5.0.dist-info/METADATA,sha256=D3Cuy_zwW7u6jeuDVxYCwSEzJt8yrIjEFi9bJhJPqLQ,21963
+content_core-1.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-1.5.0.dist-info/entry_points.txt,sha256=ifbBxw37b7gAxZXoduS15KtqHuMHuU58STRkEmgM2zA,147
+content_core-1.5.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-1.5.0.dist-info/RECORD,,
{content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/WHEEL
File without changes

{content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/entry_points.txt
File without changes

{content_core-1.4.1.dist-info → content_core-1.5.0.dist-info}/licenses/LICENSE
File without changes