content-core 0.2.0__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

Files changed (61) hide show
  1. content_core-0.3.1/.windsurfrules +13 -0
  2. {content_core-0.2.0 → content_core-0.3.1}/PKG-INFO +22 -1
  3. {content_core-0.2.0 → content_core-0.3.1}/README.md +21 -0
  4. content_core-0.3.1/docs/usage.md +81 -0
  5. {content_core-0.2.0 → content_core-0.3.1}/pyproject.toml +2 -1
  6. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/__init__.py +13 -13
  7. content_core-0.3.1/src/content_core/config.py +27 -0
  8. content_core-0.3.1/src/content_core/content/__init__.py +5 -0
  9. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/content/cleanup/core.py +2 -2
  10. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/content/extraction/graph.py +1 -1
  11. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/content/summary/core.py +2 -2
  12. content_core-0.3.1/src/content_core/logging.py +15 -0
  13. content_core-0.3.1/src/content_core/models.py +24 -0
  14. content_core-0.3.1/src/content_core/models_config.yaml +27 -0
  15. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/notebooks/run.ipynb +101 -145
  16. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/processors/audio.py +5 -3
  17. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/processors/office.py +1 -1
  18. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/processors/pdf.py +2 -4
  19. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/processors/text.py +1 -2
  20. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/processors/url.py +1 -1
  21. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/processors/video.py +1 -2
  22. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/processors/youtube.py +1 -1
  23. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/prompter.py +3 -1
  24. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/templated_message.py +2 -2
  25. {content_core-0.2.0 → content_core-0.3.1}/uv.lock +3 -4
  26. content_core-0.2.0/.windsurfrules +0 -1
  27. content_core-0.2.0/src/content_core/config.py +0 -37
  28. content_core-0.2.0/src/content_core/content/__init__.py +0 -0
  29. {content_core-0.2.0 → content_core-0.3.1}/.github/PULL_REQUEST_TEMPLATE.md +0 -0
  30. {content_core-0.2.0 → content_core-0.3.1}/.github/workflows/publish.yml +0 -0
  31. {content_core-0.2.0 → content_core-0.3.1}/.gitignore +0 -0
  32. {content_core-0.2.0 → content_core-0.3.1}/.python-version +0 -0
  33. {content_core-0.2.0 → content_core-0.3.1}/CONTRIBUTING.md +0 -0
  34. {content_core-0.2.0 → content_core-0.3.1}/LICENSE +0 -0
  35. {content_core-0.2.0 → content_core-0.3.1}/Makefile +0 -0
  36. {content_core-0.2.0 → content_core-0.3.1}/docs/processors.md +0 -0
  37. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/common/__init__.py +0 -0
  38. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/common/exceptions.py +0 -0
  39. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/common/state.py +0 -0
  40. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/common/utils.py +0 -0
  41. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/content/cleanup/__init__.py +0 -0
  42. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/content/extraction/__init__.py +0 -0
  43. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/content/summary/__init__.py +0 -0
  44. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/prompts/content/cleanup.jinja +0 -0
  45. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/prompts/content/summarize.jinja +0 -0
  46. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/py.typed +0 -0
  47. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/tools/__init__.py +0 -0
  48. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/tools/cleanup.py +0 -0
  49. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/tools/extract.py +0 -0
  50. {content_core-0.2.0 → content_core-0.3.1}/src/content_core/tools/summarize.py +0 -0
  51. {content_core-0.2.0 → content_core-0.3.1}/tests/input_content/file.docx +0 -0
  52. {content_core-0.2.0 → content_core-0.3.1}/tests/input_content/file.epub +0 -0
  53. {content_core-0.2.0 → content_core-0.3.1}/tests/input_content/file.md +0 -0
  54. {content_core-0.2.0 → content_core-0.3.1}/tests/input_content/file.mp3 +0 -0
  55. {content_core-0.2.0 → content_core-0.3.1}/tests/input_content/file.mp4 +0 -0
  56. {content_core-0.2.0 → content_core-0.3.1}/tests/input_content/file.pdf +0 -0
  57. {content_core-0.2.0 → content_core-0.3.1}/tests/input_content/file.pptx +0 -0
  58. {content_core-0.2.0 → content_core-0.3.1}/tests/input_content/file.txt +0 -0
  59. {content_core-0.2.0 → content_core-0.3.1}/tests/input_content/file.xlsx +0 -0
  60. {content_core-0.2.0 → content_core-0.3.1}/tests/input_content/file_audio.mp3 +0 -0
  61. {content_core-0.2.0 → content_core-0.3.1}/tests/integration/test_extraction.py +0 -0
@@ -0,0 +1,13 @@
1
+ Also use uv as the package manager: uv run, uv sync, uv add.
2
+
3
+ All documentation (code or READMEs) must be in English.
4
+ Whenever I ask you to tag and release, make sure to run `make test` as part of the process.
5
+
6
+ The full release process is:
7
+ - Run `make test` to make sure everything is working
8
+ - Update version on pyproject.toml
9
+ - Run `uv sync` to update the lock file
10
+ - Commit all that's needed
11
+ - Merge to main
12
+ - Tag the release
13
+ - Push to GitHub
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 0.2.0
3
+ Version: 0.3.1
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -161,6 +161,27 @@ csum https://example.com
161
161
  csum document.txt
162
162
  ```
163
163
 
164
+ ## Quick Start
165
+
166
+ You can quickly integrate `content-core` into your Python projects to extract, clean, and summarize content from various sources.
167
+
168
+ ```python
169
+ import content_core as cc
170
+
171
+ # Extract content from a URL, file, or text
172
+ result = await cc.extract("https://example.com/article")
173
+
174
+ # Clean messy content
175
+ cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...")
176
+
177
+ # Summarize content with optional context
178
+ summary = await cc.summarize_content("long article text", context="explain to a child")
179
+ ```
180
+
181
+ ## Documentation
182
+
183
+ For more information on how to use the Content Core library, including details on AI model configuration and customization, refer to our [Usage Documentation](docs/usage.md).
184
+
164
185
  ## Using with Langchain
165
186
 
166
187
  For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
@@ -132,6 +132,27 @@ csum https://example.com
132
132
  csum document.txt
133
133
  ```
134
134
 
135
+ ## Quick Start
136
+
137
+ You can quickly integrate `content-core` into your Python projects to extract, clean, and summarize content from various sources.
138
+
139
+ ```python
140
+ import content_core as cc
141
+
142
+ # Extract content from a URL, file, or text
143
+ result = await cc.extract("https://example.com/article")
144
+
145
+ # Clean messy content
146
+ cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...")
147
+
148
+ # Summarize content with optional context
149
+ summary = await cc.summarize_content("long article text", context="explain to a child")
150
+ ```
151
+
152
+ ## Documentation
153
+
154
+ For more information on how to use the Content Core library, including details on AI model configuration and customization, refer to our [Usage Documentation](docs/usage.md).
155
+
135
156
  ## Using with Langchain
136
157
 
137
158
  For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
@@ -0,0 +1,81 @@
1
+ # Using the Content Core Library
2
+
3
+ This documentation explains how to configure and use the **Content Core** library in your projects. The library allows customization of AI model settings through a YAML file and environment variables.
4
+
5
+ ## Environment Variable for Configuration
6
+
7
+ The library uses the `CCORE_MODEL_CONFIG_PATH` environment variable to locate the custom YAML configuration file. If this variable is not set or the specified file is not found, the library will fall back to internal default settings.
8
+
9
+ To set the environment variable, add the following line to your `.env` file or set it directly in your environment:
10
+
11
+ ```
12
+ CCORE_MODEL_CONFIG_PATH=/path/to/your/models_config.yaml
13
+ ```
14
+
15
+ ## YAML File Schema
16
+
17
+ The YAML configuration file defines the AI models that the library will use. The structure of the file is as follows:
18
+
19
+ - **speech_to_text**: Configuration for the speech-to-text model.
20
+   - **provider**: Model provider (example: `openai`).
21
+   - **model_name**: Model name (example: `whisper-1`).
22
+ - **default_model**: Configuration for the default language model.
23
+   - **provider**: Model provider.
24
+   - **model_name**: Model name.
25
+   - **config**: Additional parameters like `temperature`, `top_p`, `max_tokens`.
26
+ - **cleanup_model**: Configuration for the content cleanup model.
27
+   - **provider**: Model provider.
28
+   - **model_name**: Model name.
29
+   - **config**: Additional parameters.
30
+ - **summary_model**: Configuration for the summary model.
31
+   - **provider**: Model provider.
32
+   - **model_name**: Model name.
33
+   - **config**: Additional parameters.
34
+
35
+ ### Default YAML File
36
+
37
+ Here is the content of the default YAML file used by the library:
38
+
39
+ ```yaml
40
+ speech_to_text:
41
+   provider: openai
42
+   model_name: whisper-1
43
+
44
+ default_model:
45
+   provider: openai
46
+   model_name: gpt-4o-mini
47
+   config:
48
+     temperature: 0.5
49
+     top_p: 1
50
+     max_tokens: 2000
51
+
52
+ cleanup_model:
53
+   provider: openai
54
+   model_name: gpt-4o-mini
55
+   config:
56
+     temperature: 0
57
+     max_tokens: 8000
58
+     output_format: json
59
+
60
+ summary_model:
61
+   provider: openai
62
+   model_name: gpt-4o-mini
63
+   config:
64
+     temperature: 0
65
+     top_p: 1
66
+     max_tokens: 2000
67
+ ```
68
+
69
+ ## Customization
70
+
71
+ You can customize any aspect of the YAML file to suit your needs. Change the providers, model names, or configuration parameters as desired.
72
+
73
+ To simplify setup, we suggest copying the provided sample files:
74
+ - Copy `.env.sample` to `.env` and adjust the environment variables, including `CCORE_MODEL_CONFIG_PATH`.
75
+ - Copy `models_config.yaml.sample` to your desired location and modify it as needed.
76
+
77
+ This will allow you to quickly start with customized settings without needing to create the files from scratch.
78
+
79
+ ## Support
80
+
81
+ If you have questions or encounter issues while using the library, open an issue in the repository or contact the support team.
@@ -1,8 +1,9 @@
1
1
  [project]
2
2
  name = "content-core"
3
- version = "0.2.0"
3
+ version = "0.3.1"
4
4
  description = "Extract what matters from any media source"
5
5
  readme = "README.md"
6
+ homepage = "https://github.com/lfnovo/content-core"
6
7
  authors = [
7
8
  { name = "LUIS NOVO", email = "lfnovo@gmail.com" }
8
9
  ]
@@ -7,18 +7,21 @@ from xml.etree import ElementTree as ET
7
7
 
8
8
  from dicttoxml import dicttoxml # type: ignore
9
9
  from dotenv import load_dotenv
10
- from loguru import logger
11
10
 
12
11
  from content_core.common import ProcessSourceInput
13
12
  from content_core.content.cleanup import cleanup_content
14
13
  from content_core.content.extraction import extract_content
15
14
  from content_core.content.summary import summarize
15
+ from content_core.logging import configure_logging, logger
16
+
17
+ # Exposing functions for direct access when importing content_core as cc
18
+ extract = extract_content
19
+ clean = cleanup_content
16
20
 
17
21
  load_dotenv()
18
22
 
19
- # Configure loguru logger
20
- logger.remove() # Remove default handler
21
- logger.add(sys.stderr, level="INFO") # Default to INFO level
23
+ # Configure loguru logger using centralized configuration
24
+ configure_logging(debug=False)
22
25
 
23
26
 
24
27
  def parse_content_format(content: str) -> str:
@@ -94,10 +97,9 @@ async def ccore_main():
94
97
 
95
98
  args = parser.parse_args()
96
99
 
97
- # Adjust logging level based on debug flag
100
+ # Adjust logging level based on debug flag using centralized configuration
101
+ configure_logging(debug=args.debug)
98
102
  if args.debug:
99
- logger.remove()
100
- logger.add(sys.stderr, level="DEBUG")
101
103
  logger.debug("Debug logging enabled")
102
104
 
103
105
  content = get_content(args, parser)
@@ -136,10 +138,9 @@ async def cclean_main():
136
138
 
137
139
  args = parser.parse_args()
138
140
 
139
- # Adjust logging level based on debug flag
141
+ # Adjust logging level based on debug flag using centralized configuration
142
+ configure_logging(debug=args.debug)
140
143
  if args.debug:
141
- logger.remove()
142
- logger.add(sys.stderr, level="DEBUG")
143
144
  logger.debug("Debug logging enabled")
144
145
 
145
146
  content = get_content(args, parser)
@@ -176,10 +177,9 @@ async def csum_main():
176
177
 
177
178
  args = parser.parse_args()
178
179
 
179
- # Adjust logging level based on debug flag
180
+ # Adjust logging level based on debug flag using centralized configuration
181
+ configure_logging(debug=args.debug)
180
182
  if args.debug:
181
- logger.remove()
182
- logger.add(sys.stderr, level="DEBUG")
183
183
  logger.debug("Debug logging enabled")
184
184
 
185
185
  content = get_content(args, parser)
@@ -0,0 +1,27 @@
1
+ import os
2
+ import pkgutil
3
+
4
+ import yaml
5
+ from dotenv import load_dotenv
6
+
7
+ # Load environment variables from .env file
8
+ load_dotenv()
9
+
10
+
11
def load_config():
    """Load the model configuration mapping used by the library.

    Sources are tried in this order:

    1. The YAML file named by the ``CCORE_MODEL_CONFIG_PATH`` environment
       variable, when the variable is set, the path exists, and the document
       parses to a mapping.
    2. The ``models_config.yaml`` resource bundled with the ``content_core``
       package.
    3. An empty dict when neither source yields a mapping.

    A custom file that cannot be read, cannot be parsed, or does not contain
    a top-level mapping is reported on stdout and the bundled defaults are
    used instead, so a broken user file never leaves callers with ``None``.

    Returns:
        dict: The parsed configuration (possibly empty).
    """
    config_path = os.environ.get("CCORE_MODEL_CONFIG_PATH")
    if config_path and os.path.exists(config_path):
        try:
            # Binary mode lets PyYAML pick the encoding itself (BOM, else
            # UTF-8) instead of relying on the platform's locale default.
            with open(config_path, "rb") as file:
                custom_config = yaml.safe_load(file)
            if not isinstance(custom_config, dict):
                # An empty file parses to None and a list/scalar document is
                # unusable here; treat both as a load failure so the handler
                # below reports it and the defaults take over.
                raise ValueError(
                    "expected a YAML mapping at the top level, got "
                    f"{type(custom_config).__name__}"
                )
            return custom_config
        except Exception as e:
            # Deliberately broad: any problem with the user's file must
            # degrade to the bundled defaults rather than abort the import.
            print(f"Erro ao carregar o arquivo de configuração de {config_path}: {e}")
            print("Usando configurações padrão internas.")

    default_config_data = pkgutil.get_data("content_core", "models_config.yaml")
    if default_config_data:
        default_config = yaml.safe_load(default_config_data)
        # Guard against an empty or malformed bundled resource as well.
        if isinstance(default_config, dict):
            return default_config
    return {}
25
+
26
+
27
+ CONFIG = load_config()
@@ -0,0 +1,5 @@
1
+ from .cleanup import cleanup_content
2
+ from .extraction import extract_content
3
+ from .summary import summarize
4
+
5
+ __all__ = ["extract_content", "cleanup_content", "summarize"]
@@ -1,11 +1,11 @@
1
1
  from functools import partial
2
2
 
3
- from content_core.config import CLEANUP_MODEL
3
+ from content_core.models import ModelFactory
4
4
  from content_core.templated_message import TemplatedMessageInput, templated_message
5
5
 
6
6
 
7
7
  async def cleanup_content(content) -> str:
8
- templated_summary_fn = partial(templated_message, model=CLEANUP_MODEL)
8
+ templated_summary_fn = partial(templated_message, model=ModelFactory.get_model('cleanup_model'))
9
9
  input = TemplatedMessageInput(
10
10
  system_prompt_template="content/cleanup",
11
11
  user_prompt_text=content,
@@ -3,13 +3,13 @@ from typing import Any, Dict, Optional
3
3
 
4
4
  import magic
5
5
  from langgraph.graph import END, START, StateGraph
6
- from loguru import logger
7
6
 
8
7
  from content_core.common import (
9
8
  ProcessSourceInput,
10
9
  ProcessSourceState,
11
10
  UnsupportedTypeException,
12
11
  )
12
+ from content_core.logging import logger
13
13
  from content_core.processors.audio import extract_audio # type: ignore
14
14
  from content_core.processors.office import (
15
15
  SUPPORTED_OFFICE_TYPES,
@@ -1,11 +1,11 @@
1
1
  from functools import partial
2
2
 
3
- from content_core.config import SUMMARY_MODEL
3
+ from content_core.models import ModelFactory
4
4
  from content_core.templated_message import TemplatedMessageInput, templated_message
5
5
 
6
6
 
7
7
  async def summarize(content: str, context: str) -> str:
8
- templated_message_fn = partial(templated_message, model=SUMMARY_MODEL)
8
+ templated_message_fn = partial(templated_message, model=ModelFactory.get_model('summary_model'))
9
9
  response = await templated_message_fn(
10
10
  TemplatedMessageInput(
11
11
  user_prompt_template="content/summarize",
@@ -0,0 +1,15 @@
1
+ import sys
2
+ from loguru import logger
3
+
4
def configure_logging(debug=False):
    """
    Reset the shared loguru logger and attach a single stderr sink.

    Args:
        debug (bool): When truthy the sink is added at DEBUG level;
            otherwise it is added at INFO level.
    """
    # Drop whatever handlers are currently registered before adding ours.
    logger.remove()
    if debug:
        level = "DEBUG"
    else:
        level = "INFO"
    logger.add(sys.stderr, level=level)
13
+
14
+ # Initial configuration with default level (INFO)
15
+ configure_logging(debug=False)
@@ -0,0 +1,24 @@
1
+ from esperanto import AIFactory
2
+ from esperanto.providers.stt import SpeechToTextModel
3
+ from .config import CONFIG
4
+
5
class ModelFactory:
    """Build model objects on first request and reuse them afterwards."""

    # Models already created, keyed by their configuration alias.
    _instances = {}

    @staticmethod
    def get_model(model_alias):
        """Return the model registered under ``model_alias``.

        The first call for an alias reads its entry from ``CONFIG`` and
        creates the model through ``AIFactory``; later calls return the
        stored object.

        Args:
            model_alias: Key of the model entry in ``CONFIG``.

        Returns:
            The object produced by ``AIFactory.create_speech_to_text`` for
            the ``speech_to_text`` alias, or by
            ``AIFactory.create_language`` for any other alias.

        Raises:
            ValueError: If ``CONFIG`` has no non-empty entry for the alias.
        """
        # Fast path: the model was already built.
        if model_alias in ModelFactory._instances:
            return ModelFactory._instances[model_alias]

        settings = CONFIG.get(model_alias, {})
        if not settings:
            raise ValueError(f"Configuração para o modelo {model_alias} não encontrada.")

        provider = settings.get('provider')
        model_name = settings.get('model_name')
        extra_options = settings.get('config', {})

        # Speech-to-text goes through its own factory call and takes no
        # extra options; every other alias is created as a language model.
        if model_alias == 'speech_to_text':
            model = AIFactory.create_speech_to_text(provider, model_name)
        else:
            model = AIFactory.create_language(provider, model_name, config=extra_options)

        ModelFactory._instances[model_alias] = model
        return ModelFactory._instances[model_alias]
@@ -0,0 +1,27 @@
1
+ speech_to_text:
2
+   provider: openai
3
+   model_name: whisper-1
4
+
5
+ default_model:
6
+   provider: openai
7
+   model_name: gpt-4o-mini
8
+   config:
9
+     temperature: 0.5
10
+     top_p: 1
11
+     max_tokens: 2000
12
+
13
+ cleanup_model:
14
+   provider: openai
15
+   model_name: gpt-4o-mini
16
+   config:
17
+     temperature: 0
18
+     max_tokens: 8000
19
+     output_format: json
20
+
21
+ summary_model:
22
+   provider: openai
23
+   model_name: gpt-4o-mini
24
+   config:
25
+     temperature: 0
26
+     top_p: 1
27
+     max_tokens: 2000