content-core: content_core-0.2.0-py3-none-any.whl → content_core-0.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/__init__.py +13 -13
- content_core/config.py +23 -33
- content_core/content/__init__.py +5 -0
- content_core/content/cleanup/core.py +2 -2
- content_core/content/extraction/graph.py +1 -1
- content_core/content/summary/core.py +2 -2
- content_core/logging.py +15 -0
- content_core/models.py +24 -0
- content_core/models_config.yaml +27 -0
- content_core/notebooks/run.ipynb +101 -145
- content_core/processors/audio.py +5 -3
- content_core/processors/office.py +1 -1
- content_core/processors/pdf.py +2 -4
- content_core/processors/text.py +1 -2
- content_core/processors/url.py +1 -1
- content_core/processors/video.py +1 -2
- content_core/processors/youtube.py +1 -1
- content_core/prompter.py +3 -1
- content_core/templated_message.py +2 -2
- {content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/METADATA +22 -1
- content_core-0.3.1.dist-info/RECORD +38 -0
- content_core-0.2.0.dist-info/RECORD +0 -35
- {content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/WHEEL +0 -0
- {content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/entry_points.txt +0 -0
- {content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/licenses/LICENSE +0 -0
content_core/__init__.py
CHANGED
|
@@ -7,18 +7,21 @@ from xml.etree import ElementTree as ET
|
|
|
7
7
|
|
|
8
8
|
from dicttoxml import dicttoxml # type: ignore
|
|
9
9
|
from dotenv import load_dotenv
|
|
10
|
-
from loguru import logger
|
|
11
10
|
|
|
12
11
|
from content_core.common import ProcessSourceInput
|
|
13
12
|
from content_core.content.cleanup import cleanup_content
|
|
14
13
|
from content_core.content.extraction import extract_content
|
|
15
14
|
from content_core.content.summary import summarize
|
|
15
|
+
from content_core.logging import configure_logging, logger
|
|
16
|
+
|
|
17
|
+
# Exposing functions for direct access when importing content_core as cc
|
|
18
|
+
extract = extract_content
|
|
19
|
+
clean = cleanup_content
|
|
16
20
|
|
|
17
21
|
load_dotenv()
|
|
18
22
|
|
|
19
|
-
# Configure loguru logger
|
|
20
|
-
|
|
21
|
-
logger.add(sys.stderr, level="INFO") # Default to INFO level
|
|
23
|
+
# Configure loguru logger using centralized configuration
|
|
24
|
+
configure_logging(debug=False)
|
|
22
25
|
|
|
23
26
|
|
|
24
27
|
def parse_content_format(content: str) -> str:
|
|
@@ -94,10 +97,9 @@ async def ccore_main():
|
|
|
94
97
|
|
|
95
98
|
args = parser.parse_args()
|
|
96
99
|
|
|
97
|
-
# Adjust logging level based on debug flag
|
|
100
|
+
# Adjust logging level based on debug flag using centralized configuration
|
|
101
|
+
configure_logging(debug=args.debug)
|
|
98
102
|
if args.debug:
|
|
99
|
-
logger.remove()
|
|
100
|
-
logger.add(sys.stderr, level="DEBUG")
|
|
101
103
|
logger.debug("Debug logging enabled")
|
|
102
104
|
|
|
103
105
|
content = get_content(args, parser)
|
|
@@ -136,10 +138,9 @@ async def cclean_main():
|
|
|
136
138
|
|
|
137
139
|
args = parser.parse_args()
|
|
138
140
|
|
|
139
|
-
# Adjust logging level based on debug flag
|
|
141
|
+
# Adjust logging level based on debug flag using centralized configuration
|
|
142
|
+
configure_logging(debug=args.debug)
|
|
140
143
|
if args.debug:
|
|
141
|
-
logger.remove()
|
|
142
|
-
logger.add(sys.stderr, level="DEBUG")
|
|
143
144
|
logger.debug("Debug logging enabled")
|
|
144
145
|
|
|
145
146
|
content = get_content(args, parser)
|
|
@@ -176,10 +177,9 @@ async def csum_main():
|
|
|
176
177
|
|
|
177
178
|
args = parser.parse_args()
|
|
178
179
|
|
|
179
|
-
# Adjust logging level based on debug flag
|
|
180
|
+
# Adjust logging level based on debug flag using centralized configuration
|
|
181
|
+
configure_logging(debug=args.debug)
|
|
180
182
|
if args.debug:
|
|
181
|
-
logger.remove()
|
|
182
|
-
logger.add(sys.stderr, level="DEBUG")
|
|
183
183
|
logger.debug("Debug logging enabled")
|
|
184
184
|
|
|
185
185
|
content = get_content(args, parser)
|
content_core/config.py
CHANGED
|
@@ -1,37 +1,27 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
import os
|
|
2
|
+
import pkgutil
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
)
|
|
4
|
+
import yaml
|
|
5
|
+
from dotenv import load_dotenv
|
|
7
6
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
"gpt-4o-mini",
|
|
11
|
-
config={
|
|
12
|
-
"temperature": 0.5,
|
|
13
|
-
"top_p": 1,
|
|
14
|
-
"max_tokens": 2000,
|
|
15
|
-
},
|
|
16
|
-
)
|
|
7
|
+
# Load environment variables from .env file
|
|
8
|
+
load_dotenv()
|
|
17
9
|
|
|
18
|
-
CLEANUP_MODEL = AIFactory.create_language(
|
|
19
|
-
"openai",
|
|
20
|
-
"gpt-4o-mini",
|
|
21
|
-
config={
|
|
22
|
-
"temperature": 0,
|
|
23
|
-
"max_tokens": 8000,
|
|
24
|
-
"output_format": "json",
|
|
25
|
-
# "stream": True, # TODO: handle streaming
|
|
26
|
-
},
|
|
27
|
-
) # Fix deprecation
|
|
28
10
|
|
|
29
|
-
|
|
30
|
-
"
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
)
|
|
11
|
+
def load_config():
|
|
12
|
+
config_path = os.environ.get("CCORE_MODEL_CONFIG_PATH")
|
|
13
|
+
if config_path and os.path.exists(config_path):
|
|
14
|
+
try:
|
|
15
|
+
with open(config_path, "r") as file:
|
|
16
|
+
return yaml.safe_load(file)
|
|
17
|
+
except Exception as e:
|
|
18
|
+
print(f"Erro ao carregar o arquivo de configuração de {config_path}: {e}")
|
|
19
|
+
print("Usando configurações padrão internas.")
|
|
20
|
+
|
|
21
|
+
default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
|
|
22
|
+
if default_config_data:
|
|
23
|
+
return yaml.safe_load(default_config_data)
|
|
24
|
+
return {}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
CONFIG = load_config()
|
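content_core/content/__init__.py
CHANGED
(The +5 -0 diff body for this file is not present in this view.)
content_core/content/cleanup/core.py
CHANGED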
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
from functools import partial
|
|
2
2
|
|
|
3
|
-
from content_core.
|
|
3
|
+
from content_core.models import ModelFactory
|
|
4
4
|
from content_core.templated_message import TemplatedMessageInput, templated_message
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
async def cleanup_content(content) -> str:
|
|
8
|
-
templated_summary_fn = partial(templated_message, model=
|
|
8
|
+
templated_summary_fn = partial(templated_message, model=ModelFactory.get_model('cleanup_model'))
|
|
9
9
|
input = TemplatedMessageInput(
|
|
10
10
|
system_prompt_template="content/cleanup",
|
|
11
11
|
user_prompt_text=content,
|
|
content_core/content/extraction/graph.py
CHANGED
|
@@ -3,13 +3,13 @@ from typing import Any, Dict, Optional
|
|
|
3
3
|
|
|
4
4
|
import magic
|
|
5
5
|
from langgraph.graph import END, START, StateGraph
|
|
6
|
-
from loguru import logger
|
|
7
6
|
|
|
8
7
|
from content_core.common import (
|
|
9
8
|
ProcessSourceInput,
|
|
10
9
|
ProcessSourceState,
|
|
11
10
|
UnsupportedTypeException,
|
|
12
11
|
)
|
|
12
|
+
from content_core.logging import logger
|
|
13
13
|
from content_core.processors.audio import extract_audio # type: ignore
|
|
14
14
|
from content_core.processors.office import (
|
|
15
15
|
SUPPORTED_OFFICE_TYPES,
|
|
content_core/content/summary/core.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
from functools import partial
|
|
2
2
|
|
|
3
|
-
from content_core.
|
|
3
|
+
from content_core.models import ModelFactory
|
|
4
4
|
from content_core.templated_message import TemplatedMessageInput, templated_message
|
|
5
5
|
|
|
6
6
|
|
|
7
7
|
async def summarize(content: str, context: str) -> str:
|
|
8
|
-
templated_message_fn = partial(templated_message, model=
|
|
8
|
+
templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
|
|
9
9
|
response = await templated_message_fn(
|
|
10
10
|
TemplatedMessageInput(
|
|
11
11
|
user_prompt_template="content/summarize",
|
content_core/logging.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
from loguru import logger
|
|
3
|
+
|
|
4
|
+
def configure_logging(debug=False):
|
|
5
|
+
"""
|
|
6
|
+
Configure the global logger for the application.
|
|
7
|
+
|
|
8
|
+
Args:
|
|
9
|
+
debug (bool): If True, set logging level to DEBUG; otherwise, set to INFO.
|
|
10
|
+
"""
|
|
11
|
+
logger.remove() # Remove any existing handlers
|
|
12
|
+
logger.add(sys.stderr, level="DEBUG" if debug else "INFO")
|
|
13
|
+
|
|
14
|
+
# Initial configuration with default level (INFO)
|
|
15
|
+
configure_logging(debug=False)
|
content_core/models.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from esperanto import AIFactory
|
|
2
|
+
from esperanto.providers.stt import SpeechToTextModel
|
|
3
|
+
from .config import CONFIG
|
|
4
|
+
|
|
5
|
+
class ModelFactory:
|
|
6
|
+
_instances = {}
|
|
7
|
+
|
|
8
|
+
@staticmethod
|
|
9
|
+
def get_model(model_alias):
|
|
10
|
+
if model_alias not in ModelFactory._instances:
|
|
11
|
+
config = CONFIG.get(model_alias, {})
|
|
12
|
+
if not config:
|
|
13
|
+
raise ValueError(f"Configuração para o modelo {model_alias} não encontrada.")
|
|
14
|
+
|
|
15
|
+
provider = config.get('provider')
|
|
16
|
+
model_name = config.get('model_name')
|
|
17
|
+
model_config = config.get('config', {})
|
|
18
|
+
|
|
19
|
+
if model_alias == 'speech_to_text':
|
|
20
|
+
ModelFactory._instances[model_alias] = AIFactory.create_speech_to_text(provider, model_name)
|
|
21
|
+
else:
|
|
22
|
+
ModelFactory._instances[model_alias] = AIFactory.create_language(provider, model_name, config=model_config)
|
|
23
|
+
|
|
24
|
+
return ModelFactory._instances[model_alias]
|
|
content_core/models_config.yaml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
speech_to_text:
|
|
2
|
+
provider: openai
|
|
3
|
+
model_name: whisper-1
|
|
4
|
+
|
|
5
|
+
default_model:
|
|
6
|
+
provider: openai
|
|
7
|
+
model_name: gpt-4o-mini
|
|
8
|
+
config:
|
|
9
|
+
temperature: 0.5
|
|
10
|
+
top_p: 1
|
|
11
|
+
max_tokens: 2000
|
|
12
|
+
|
|
13
|
+
cleanup_model:
|
|
14
|
+
provider: openai
|
|
15
|
+
model_name: gpt-4o-mini
|
|
16
|
+
config:
|
|
17
|
+
temperature: 0
|
|
18
|
+
max_tokens: 8000
|
|
19
|
+
output_format: json
|
|
20
|
+
|
|
21
|
+
summary_model:
|
|
22
|
+
provider: openai
|
|
23
|
+
model_name: gpt-4o-mini
|
|
24
|
+
config:
|
|
25
|
+
temperature: 0
|
|
26
|
+
top_p: 1
|
|
27
|
+
max_tokens: 2000
|