content-core 0.1.2__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of content-core might be problematic.
- {content_core-0.1.2 → content_core-0.3.0}/.gitignore +3 -1
- content_core-0.3.0/.windsurfrules +13 -0
- {content_core-0.1.2 → content_core-0.3.0}/PKG-INFO +37 -2
- {content_core-0.1.2 → content_core-0.3.0}/README.md +36 -1
- content_core-0.3.0/docs/processors.md +53 -0
- content_core-0.3.0/docs/usage.md +81 -0
- {content_core-0.1.2 → content_core-0.3.0}/pyproject.toml +1 -1
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/__init__.py +13 -13
- content_core-0.3.0/src/content_core/config.py +27 -0
- content_core-0.3.0/src/content_core/content/__init__.py +5 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/content/cleanup/core.py +2 -2
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/content/extraction/graph.py +1 -1
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/content/summary/core.py +2 -2
- content_core-0.3.0/src/content_core/logging.py +15 -0
- content_core-0.3.0/src/content_core/models.py +24 -0
- content_core-0.3.0/src/content_core/models_config.yaml +27 -0
- content_core-0.3.0/src/content_core/notebooks/run.ipynb +353 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/processors/audio.py +5 -3
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/processors/office.py +1 -1
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/processors/pdf.py +2 -4
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/processors/text.py +1 -2
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/processors/url.py +1 -1
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/processors/video.py +1 -2
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/processors/youtube.py +1 -1
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/prompter.py +53 -9
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/templated_message.py +2 -2
- {content_core-0.1.2 → content_core-0.3.0}/uv.lock +1 -1
- content_core-0.1.2/.windsurfrules +0 -1
- content_core-0.1.2/src/content_core/config.py +0 -37
- content_core-0.1.2/src/content_core/content/__init__.py +0 -0
- content_core-0.1.2/src/content_core/notebooks/run.ipynb +0 -558
- {content_core-0.1.2 → content_core-0.3.0}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/.github/workflows/publish.yml +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/.python-version +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/CONTRIBUTING.md +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/LICENSE +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/Makefile +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/common/__init__.py +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/common/exceptions.py +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/common/state.py +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/common/utils.py +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/content/cleanup/__init__.py +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/content/extraction/__init__.py +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/content/summary/__init__.py +0 -0
- {content_core-0.1.2 → content_core-0.3.0/src/content_core}/prompts/content/cleanup.jinja +0 -0
- {content_core-0.1.2 → content_core-0.3.0/src/content_core}/prompts/content/summarize.jinja +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/py.typed +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/tools/__init__.py +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/tools/cleanup.py +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/tools/extract.py +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/src/content_core/tools/summarize.py +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/tests/input_content/file.docx +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/tests/input_content/file.epub +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/tests/input_content/file.md +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/tests/input_content/file.mp3 +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/tests/input_content/file.mp4 +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/tests/input_content/file.pdf +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/tests/input_content/file.pptx +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/tests/input_content/file.txt +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/tests/input_content/file.xlsx +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/tests/input_content/file_audio.mp3 +0 -0
- {content_core-0.1.2 → content_core-0.3.0}/tests/integration/test_extraction.py +0 -0
content_core-0.3.0/.windsurfrules (new file)
@@ -0,0 +1,13 @@
+Also use uv as the package manager: uv run, uv sync, uv add.
+
+All documentation (code or readmes) must be in english.
+Whenever I ask you to tag and release, make sure to run `make test` as part of the process.
+
+The full release process is:
+- Run `make test` to make sure everything is working
+- Update version on pyproject.toml
+- Run `uv sync` to update the lock file
+- Commit all that's needed
+- Merge to main
+- Tag the release
+- Push to GitHub
{content_core-0.1.2 → content_core-0.3.0}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.1.2
+Version: 0.3.0
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
{content_core-0.1.2 → content_core-0.3.0}/PKG-INFO
@@ -43,7 +43,7 @@ The primary goal of Content Core is to simplify the process of ingesting content
 * Direct text strings.
 * Web URLs (using robust extraction methods).
 * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
-* **Intelligent Processing:** Applies appropriate extraction techniques based on the source type.
+* **Intelligent Processing:** Applies appropriate extraction techniques based on the source type. See the [Processors Documentation](./docs/processors.md) for detailed information on how different content types are handled.
 * **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
 * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
 
{content_core-0.1.2 → content_core-0.3.0}/PKG-INFO
@@ -161,6 +161,27 @@ csum https://example.com
 csum document.txt
 ```
 
+## Quick Start
+
+You can quickly integrate `content-core` into your Python projects to extract, clean, and summarize content from various sources.
+
+```python
+import content_core as cc
+
+# Extract content from a URL, file, or text
+result = await cc.extract("https://example.com/article")
+
+# Clean messy content
+cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...")
+
+# Summarize content with optional context
+summary = await cc.summarize_content("long article text", context="explain to a child")
+```
+
+## Documentation
+
+For more information on how to use the Content Core library, including details on AI model configuration and customization, refer to our [Usage Documentation](docs/usage.md).
+
 ## Using with Langchain
 
 For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
{content_core-0.1.2 → content_core-0.3.0}/PKG-INFO
@@ -220,6 +241,20 @@ OPENAI_API_KEY=your-key-here
 GOOGLE_API_KEY=your-key-here
 ```
 
+### Custom Prompt Templates
+
+Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
+
+Example `.env` with custom prompt path:
+
+```plaintext
+OPENAI_API_KEY=your-key-here
+GOOGLE_API_KEY=your-key-here
+PROMPT_PATH=/path/to/your/custom/prompts
+```
+
+When a prompt template is requested, Content Core will first look in the custom directory specified by `PROMPT_PATH` (if set and exists). If the template is not found there, it will fall back to the default built-in prompts. This allows you to override specific prompts while still using the default ones for others.
+
 ## Development
 
 To set up a development environment:
{content_core-0.1.2 → content_core-0.3.0}/README.md
@@ -14,7 +14,7 @@ The primary goal of Content Core is to simplify the process of ingesting content
 * Direct text strings.
 * Web URLs (using robust extraction methods).
 * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
-* **Intelligent Processing:** Applies appropriate extraction techniques based on the source type.
+* **Intelligent Processing:** Applies appropriate extraction techniques based on the source type. See the [Processors Documentation](./docs/processors.md) for detailed information on how different content types are handled.
 * **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
 * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
 
{content_core-0.1.2 → content_core-0.3.0}/README.md
@@ -132,6 +132,27 @@ csum https://example.com
 csum document.txt
 ```
 
+## Quick Start
+
+You can quickly integrate `content-core` into your Python projects to extract, clean, and summarize content from various sources.
+
+```python
+import content_core as cc
+
+# Extract content from a URL, file, or text
+result = await cc.extract("https://example.com/article")
+
+# Clean messy content
+cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...")
+
+# Summarize content with optional context
+summary = await cc.summarize_content("long article text", context="explain to a child")
+```
+
+## Documentation
+
+For more information on how to use the Content Core library, including details on AI model configuration and customization, refer to our [Usage Documentation](docs/usage.md).
+
 ## Using with Langchain
 
 For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
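The Quick Start block above uses bare `await`, which only runs inside an async context (a notebook or an existing event loop). A minimal runnable variant of the same calls, assuming content-core 0.3.0 is installed and the relevant API keys are configured, wraps them in `asyncio.run`; the same pattern applies to `cc.summarize_content`:

```python
import asyncio

import content_core as cc

async def main():
    # extract and clean are the module-level aliases added in __init__.py below
    result = await cc.extract("https://example.com/article")
    cleaned = await cc.clean("...messy text with [brackets] and extra spaces...")
    print(result, cleaned, sep="\n")

asyncio.run(main())
```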
{content_core-0.1.2 → content_core-0.3.0}/README.md
@@ -191,6 +212,20 @@ OPENAI_API_KEY=your-key-here
 GOOGLE_API_KEY=your-key-here
 ```
 
+### Custom Prompt Templates
+
+Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
+
+Example `.env` with custom prompt path:
+
+```plaintext
+OPENAI_API_KEY=your-key-here
+GOOGLE_API_KEY=your-key-here
+PROMPT_PATH=/path/to/your/custom/prompts
+```
+
+When a prompt template is requested, Content Core will first look in the custom directory specified by `PROMPT_PATH` (if set and exists). If the template is not found there, it will fall back to the default built-in prompts. This allows you to override specific prompts while still using the default ones for others.
+
 ## Development
 
 To set up a development environment:
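The `PROMPT_PATH` lookup order described above (custom directory first, packaged prompts as fallback) maps naturally onto Jinja2 loader chaining. This is only a sketch of the described behavior, not the actual `prompter.py` implementation, whose changes this diff does not show:

```python
import os

from jinja2 import ChoiceLoader, Environment, FileSystemLoader, PackageLoader

loaders = []
prompt_path = os.environ.get("PROMPT_PATH")
if prompt_path and os.path.isdir(prompt_path):
    loaders.append(FileSystemLoader(prompt_path))  # custom prompts take precedence
loaders.append(PackageLoader("content_core", "prompts"))  # built-in defaults

env = Environment(loader=ChoiceLoader(loaders))
template = env.get_template("content/cleanup.jinja")  # falls back if not overridden
```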
content_core-0.3.0/docs/processors.md (new file)
@@ -0,0 +1,53 @@
+# Content Core Processors
+
+This document provides an overview of the content processors available in Content Core. These processors are responsible for extracting and handling content from various sources and file types.
+
+## Overview
+
+Content Core uses a modular approach to process content from different sources. Each processor is designed to handle specific types of input, such as web URLs, local files, or direct text input. Below, you'll find detailed information about each processor, including supported file types, returned data formats, and their purpose.
+
+## Processors
+
+### 1. **Text Processor**
+- **Purpose**: Handles direct text input provided by the user.
+- **Supported Input**: Raw text strings.
+- **Returned Data**: The input text as-is, wrapped in a structured format compatible with Content Core's output schema.
+- **Location**: `src/content_core/processors/text.py`
+
+### 2. **Web Processor**
+- **Purpose**: Extracts content from web URLs, focusing on meaningful text while ignoring boilerplate (ads, navigation, etc.).
+- **Supported Input**: URLs (web pages).
+- **Returned Data**: Extracted text content from the web page, often in a cleaned format.
+- **Location**: `src/content_core/processors/web.py`
+
+### 3. **File Processor**
+- **Purpose**: Processes local files of various types, extracting content based on file format.
+- **Supported Input**: Local files including:
+  - Text-based formats: `.txt`, `.md` (Markdown), `.html`, etc.
+  - Document formats: `.pdf`, `.docx`, etc.
+  - Media files: `.mp4`, `.mp3` (audio/video, via transcription).
+- **Returned Data**: Extracted text content or transcriptions (for media files), structured according to Content Core's schema.
+- **Location**: `src/content_core/processors/file.py`
+
+### 4. **Media Transcription Processor**
+- **Purpose**: Specifically handles transcription of audio and video files using external services or libraries.
+- **Supported Input**: Audio and video files (e.g., `.mp3`, `.mp4`).
+- **Returned Data**: Transcribed text from the media content.
+- **Location**: `src/content_core/processors/transcription.py`
+
+## How Processors Work
+
+Content Core automatically selects the appropriate processor based on the input type:
+- If a URL is provided, the Web Processor is used.
+- If a file path is provided, the File Processor determines the file type and delegates to specialized handlers (like the Media Transcription Processor for audio/video).
+- If raw text is provided, the Text Processor handles it directly.
+
+Each processor returns data in a consistent format, allowing seamless integration with other components of Content Core for further processing (like cleaning or summarization).
+
+## Custom Processors
+
+Developers can extend Content Core by creating custom processors for unsupported file types or specialized extraction needs. To do so, create a new processor module in `src/content_core/processors/` and ensure it adheres to the expected interface for integration with the content extraction pipeline.
+
+## Contributing
+
+If you have suggestions for improving existing processors or adding support for new file types, please contribute to the project by submitting a pull request or opening an issue on the GitHub repository.
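The "Custom Processors" section mentions an expected interface without defining it, and that interface is not part of this diff. Purely as a hypothetical illustration of the pattern the document describes (take a source, return extracted text in a structured form):

```python
# Hypothetical sketch only: the real processor interface is not shown in this diff.
from pathlib import Path

async def extract_csv(file_path: str) -> dict:
    """Toy processor for .csv files: returns extracted text plus assumed metadata."""
    text = Path(file_path).read_text(encoding="utf-8")
    return {
        "source_type": "csv",                 # assumed metadata field
        "content": text.replace(",", " | "),  # naive text "extraction"
    }
```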
content_core-0.3.0/docs/usage.md (new file)
@@ -0,0 +1,81 @@
+# Using the Content Core Library
+
+This documentation explains how to configure and use the **Content Core** library in your projects. The library allows customization of AI model settings through a YAML file and environment variables.
+
+## Environment Variable for Configuration
+
+The library uses the `CCORE_MODEL_CONFIG_PATH` environment variable to locate the custom YAML configuration file. If this variable is not set or the specified file is not found, the library will fall back to internal default settings.
+
+To set the environment variable, add the following line to your `.env` file or set it directly in your environment:
+
+```
+CCORE_MODEL_CONFIG_PATH=/path/to/your/models_config.yaml
+```
+
+## YAML File Schema
+
+The YAML configuration file defines the AI models that the library will use. The structure of the file is as follows:
+
+- **speech_to_text**: Configuration for the speech-to-text model.
+  - **provider**: Model provider (example: `openai`).
+  - **model_name**: Model name (example: `whisper-1`).
+- **default_model**: Configuration for the default language model.
+  - **provider**: Model provider.
+  - **model_name**: Model name.
+  - **config**: Additional parameters like `temperature`, `top_p`, `max_tokens`.
+- **cleanup_model**: Configuration for the content cleanup model.
+  - **provider**: Model provider.
+  - **model_name**: Model name.
+  - **config**: Additional parameters.
+- **summary_model**: Configuration for the summary model.
+  - **provider**: Model provider.
+  - **model_name**: Model name.
+  - **config**: Additional parameters.
+
+### Default YAML File
+
+Here is the content of the default YAML file used by the library:
+
+```yaml
+speech_to_text:
+  provider: openai
+  model_name: whisper-1
+
+default_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0.5
+    top_p: 1
+    max_tokens: 2000
+
+cleanup_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0
+    max_tokens: 8000
+    output_format: json
+
+summary_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0
+    top_p: 1
+    max_tokens: 2000
+```
+
+## Customization
+
+You can customize any aspect of the YAML file to suit your needs. Change the providers, model names, or configuration parameters as desired.
+
+To simplify setup, we suggest copying the provided sample files:
+- Copy `.env.sample` to `.env` and adjust the environment variables, including `CCORE_MODEL_CONFIG_PATH`.
+- Copy `models_config.yaml.sample` to your desired location and modify it as needed.
+
+This will allow you to quickly start with customized settings without needing to create the files from scratch.
+
+## Support
+
+If you have questions or encounter issues while using the library, open an issue in the repository or contact the support team.
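Since `CONFIG` is built when `content_core.config` is first imported (see the new `config.py` later in this diff), `CCORE_MODEL_CONFIG_PATH` must be set before that import happens. A small sketch with a placeholder path:

```python
import os

# Must be set before importing content_core, since CONFIG is built at import time.
os.environ["CCORE_MODEL_CONFIG_PATH"] = "/path/to/your/models_config.yaml"

from content_core.config import CONFIG  # noqa: E402

print(CONFIG["summary_model"]["model_name"])  # gpt-4o-mini under the defaults above
```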
{content_core-0.1.2 → content_core-0.3.0}/src/content_core/__init__.py
@@ -7,18 +7,21 @@ from xml.etree import ElementTree as ET
 
 from dicttoxml import dicttoxml  # type: ignore
 from dotenv import load_dotenv
-from loguru import logger
 
 from content_core.common import ProcessSourceInput
 from content_core.content.cleanup import cleanup_content
 from content_core.content.extraction import extract_content
 from content_core.content.summary import summarize
+from content_core.logging import configure_logging, logger
+
+# Exposing functions for direct access when importing content_core as cc
+extract = extract_content
+clean = cleanup_content
 
 load_dotenv()
 
-# Configure loguru logger
-
-logger.add(sys.stderr, level="INFO")  # Default to INFO level
+# Configure loguru logger using centralized configuration
+configure_logging(debug=False)
 
 
 def parse_content_format(content: str) -> str:
{content_core-0.1.2 → content_core-0.3.0}/src/content_core/__init__.py
@@ -94,10 +97,9 @@ async def ccore_main():
 
     args = parser.parse_args()
 
-    # Adjust logging level based on debug flag
+    # Adjust logging level based on debug flag using centralized configuration
+    configure_logging(debug=args.debug)
     if args.debug:
-        logger.remove()
-        logger.add(sys.stderr, level="DEBUG")
         logger.debug("Debug logging enabled")
 
     content = get_content(args, parser)
{content_core-0.1.2 → content_core-0.3.0}/src/content_core/__init__.py
@@ -136,10 +138,9 @@ async def cclean_main():
 
     args = parser.parse_args()
 
-    # Adjust logging level based on debug flag
+    # Adjust logging level based on debug flag using centralized configuration
+    configure_logging(debug=args.debug)
    if args.debug:
-        logger.remove()
-        logger.add(sys.stderr, level="DEBUG")
         logger.debug("Debug logging enabled")
 
     content = get_content(args, parser)
{content_core-0.1.2 → content_core-0.3.0}/src/content_core/__init__.py
@@ -176,10 +177,9 @@ async def csum_main():
 
     args = parser.parse_args()
 
-    # Adjust logging level based on debug flag
+    # Adjust logging level based on debug flag using centralized configuration
+    configure_logging(debug=args.debug)
     if args.debug:
-        logger.remove()
-        logger.add(sys.stderr, level="DEBUG")
         logger.debug("Debug logging enabled")
 
     content = get_content(args, parser)
content_core-0.3.0/src/content_core/config.py (new file)
@@ -0,0 +1,27 @@
+import os
+import pkgutil
+
+import yaml
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+
+def load_config():
+    config_path = os.environ.get("CCORE_MODEL_CONFIG_PATH")
+    if config_path and os.path.exists(config_path):
+        try:
+            with open(config_path, "r") as file:
+                return yaml.safe_load(file)
+        except Exception as e:
+            print(f"Erro ao carregar o arquivo de configuração de {config_path}: {e}")
+            print("Usando configurações padrão internas.")
+
+    default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
+    if default_config_data:
+        return yaml.safe_load(default_config_data)
+    return {}
+
+
+CONFIG = load_config()
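Note the failure modes of `load_config()` above: a `CCORE_MODEL_CONFIG_PATH` pointing at a missing file is skipped silently by the `os.path.exists` guard, while a file that exists but fails to parse triggers the warning prints; both cases fall back to the packaged defaults. A quick check of that fallback, assuming the package is installed:

```python
import os

from content_core import config

os.environ["CCORE_MODEL_CONFIG_PATH"] = "/nonexistent/models.yaml"
cfg = config.load_config()  # silently falls back to the packaged models_config.yaml
assert "default_model" in cfg and "speech_to_text" in cfg
```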
{content_core-0.1.2 → content_core-0.3.0}/src/content_core/content/cleanup/core.py
@@ -1,11 +1,11 @@
 from functools import partial
 
-from content_core.
+from content_core.models import ModelFactory
 from content_core.templated_message import TemplatedMessageInput, templated_message
 
 
 async def cleanup_content(content) -> str:
-    templated_summary_fn = partial(templated_message, model=
+    templated_summary_fn = partial(templated_message, model=ModelFactory.get_model('cleanup_model'))
     input = TemplatedMessageInput(
         system_prompt_template="content/cleanup",
         user_prompt_text=content,
{content_core-0.1.2 → content_core-0.3.0}/src/content_core/content/extraction/graph.py
@@ -3,13 +3,13 @@ from typing import Any, Dict, Optional
 
 import magic
 from langgraph.graph import END, START, StateGraph
-from loguru import logger
 
 from content_core.common import (
     ProcessSourceInput,
     ProcessSourceState,
     UnsupportedTypeException,
 )
+from content_core.logging import logger
 from content_core.processors.audio import extract_audio  # type: ignore
 from content_core.processors.office import (
     SUPPORTED_OFFICE_TYPES,
{content_core-0.1.2 → content_core-0.3.0}/src/content_core/content/summary/core.py
@@ -1,11 +1,11 @@
 from functools import partial
 
-from content_core.
+from content_core.models import ModelFactory
 from content_core.templated_message import TemplatedMessageInput, templated_message
 
 
 async def summarize(content: str, context: str) -> str:
-    templated_message_fn = partial(templated_message, model=
+    templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
     response = await templated_message_fn(
         TemplatedMessageInput(
             user_prompt_template="content/summarize",
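Together with `cleanup_content` above, the two helpers chain naturally. A minimal sketch, assuming an `OPENAI_API_KEY` is configured for the default models:

```python
import asyncio

from content_core.content.cleanup import cleanup_content
from content_core.content.summary import summarize

async def main():
    tidy = await cleanup_content("raw   text [with noise]")
    print(await summarize(tidy, context="one sentence, plain language"))

asyncio.run(main())
```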
content_core-0.3.0/src/content_core/logging.py (new file)
@@ -0,0 +1,15 @@
+import sys
+from loguru import logger
+
+def configure_logging(debug=False):
+    """
+    Configure the global logger for the application.
+
+    Args:
+        debug (bool): If True, set logging level to DEBUG; otherwise, set to INFO.
+    """
+    logger.remove()  # Remove any existing handlers
+    logger.add(sys.stderr, level="DEBUG" if debug else "INFO")
+
+# Initial configuration with default level (INFO)
+configure_logging(debug=False)
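With this module, callers toggle verbosity through one helper instead of managing loguru handlers themselves, which is exactly what the CLI entry points above now do:

```python
from content_core.logging import configure_logging, logger

configure_logging(debug=True)   # replace the stderr sink with a DEBUG-level one
logger.debug("now visible")
configure_logging(debug=False)  # back to INFO
```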
content_core-0.3.0/src/content_core/models.py (new file)
@@ -0,0 +1,24 @@
+from esperanto import AIFactory
+from esperanto.providers.stt import SpeechToTextModel
+from .config import CONFIG
+
+class ModelFactory:
+    _instances = {}
+
+    @staticmethod
+    def get_model(model_alias):
+        if model_alias not in ModelFactory._instances:
+            config = CONFIG.get(model_alias, {})
+            if not config:
+                raise ValueError(f"Configuração para o modelo {model_alias} não encontrada.")
+
+            provider = config.get('provider')
+            model_name = config.get('model_name')
+            model_config = config.get('config', {})
+
+            if model_alias == 'speech_to_text':
+                ModelFactory._instances[model_alias] = AIFactory.create_speech_to_text(provider, model_name)
+            else:
+                ModelFactory._instances[model_alias] = AIFactory.create_language(provider, model_name, config=model_config)
+
+        return ModelFactory._instances[model_alias]
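`ModelFactory` memoizes one instance per alias, so repeated lookups return the same object; `speech_to_text` is routed to `AIFactory.create_speech_to_text`, every other alias to `AIFactory.create_language`:

```python
from content_core.models import ModelFactory

summary_llm = ModelFactory.get_model("summary_model")
assert ModelFactory.get_model("summary_model") is summary_llm  # cached per alias

stt = ModelFactory.get_model("speech_to_text")  # speech-to-text provider instance
```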
content_core-0.3.0/src/content_core/models_config.yaml (new file)
@@ -0,0 +1,27 @@
+speech_to_text:
+  provider: openai
+  model_name: whisper-1
+
+default_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0.5
+    top_p: 1
+    max_tokens: 2000
+
+cleanup_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0
+    max_tokens: 8000
+    output_format: json
+
+summary_model:
+  provider: openai
+  model_name: gpt-4o-mini
+  config:
+    temperature: 0
+    top_p: 1
+    max_tokens: 2000