content-core 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

content_core/prompter.py CHANGED
@@ -10,13 +10,27 @@ from typing import Any, Dict, Optional, Union
10
10
  from dotenv import load_dotenv
11
11
  from jinja2 import Environment, FileSystemLoader, Template
12
12
  from langchain_core.prompts import ChatPromptTemplate
13
+ from loguru import logger
13
14
  from pydantic import BaseModel
14
15
 
15
16
  load_dotenv()
16
17
 
17
- prompt_path = "/Users/luisnovo/dev/projetos/content-core/prompts"
18
+ prompt_path_default = os.path.join(
19
+ os.path.dirname(os.path.abspath(__file__)), "prompts"
20
+ )
21
+ prompt_path_custom = os.getenv("PROMPT_PATH")
18
22
 
19
- env = Environment(loader=FileSystemLoader(prompt_path))
23
+ logger.debug(
24
+ f"Pasta de prompts personalizada: {prompt_path_custom if prompt_path_custom else 'Não definida'}"
25
+ )
26
+ logger.debug(f"Pasta de prompts padrão: {prompt_path_default}")
27
+
28
+ env_custom = (
29
+ Environment(loader=FileSystemLoader(prompt_path_custom))
30
+ if prompt_path_custom and os.path.exists(prompt_path_custom)
31
+ else None
32
+ )
33
+ env_default = Environment(loader=FileSystemLoader(prompt_path_default))
20
34
 
21
35
 
22
36
  @dataclass
@@ -57,7 +71,33 @@ class Prompter:
57
71
  ValueError: If neither prompt_template nor prompt_text is provided.
58
72
  """
59
73
  if self.prompt_template:
60
- self.template = env.get_template(f"{self.prompt_template}.jinja")
74
+ # First try to load from the custom folder, if available
75
+ if env_custom:
76
+ try:
77
+ self.template = env_custom.get_template(
78
+ f"{self.prompt_template}.jinja"
79
+ )
80
+ logger.debug(
81
+ f"Template {self.prompt_template} carregado da pasta personalizada"
82
+ )
83
+ return
84
+ except Exception as e:
85
+ logger.debug(
86
+ f"Template {self.prompt_template} não encontrado na pasta personalizada: {e}"
87
+ )
88
+
89
+ # If it was not found in the custom folder, or there is no custom folder, try the default one
90
+ try:
91
+ self.template = env_default.get_template(
92
+ f"{self.prompt_template}.jinja"
93
+ )
94
+ logger.debug(
95
+ f"Template {self.prompt_template} carregado da pasta padrão"
96
+ )
97
+ except Exception as e:
98
+ raise ValueError(
99
+ f"Template {self.prompt_template} não encontrado na pasta padrão: {e}"
100
+ )
61
101
  elif self.prompt_text:
62
102
  self.template = Template(self.prompt_text)
63
103
  else:
@@ -105,11 +145,13 @@ class Prompter:
105
145
  """
106
146
  # Convert Pydantic model to dict if necessary
107
147
  data_dict = data.model_dump() if isinstance(data, BaseModel) else data
108
- data_dict["current_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
148
+ # Create a new mutable dictionary with the original data
149
+ render_data = dict(data_dict)
150
+ render_data["current_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
109
151
  if self.parser:
110
- data_dict["format_instructions"] = self.parser.get_format_instructions()
152
+ render_data["format_instructions"] = self.parser.get_format_instructions()
111
153
  assert self.template, "Prompter template is not defined"
112
- assert isinstance(self.template, Template), (
113
- "Prompter template is not a Jinja2 Template"
114
- )
115
- return self.template.render(data_dict)
154
+ assert isinstance(
155
+ self.template, Template
156
+ ), "Prompter template is not a Jinja2 Template"
157
+ return self.template.render(render_data)
@@ -0,0 +1,16 @@
1
+ # GOAL
2
+
3
+ Adjust the content below to make it clean and readable:
4
+ Remove repeated strings that do not add value to the text.
5
+
6
+ Remove any content unrelated to the text itself (e.g., metadata, artifacts, or extraction errors).
7
+
8
+ Format the output as unstructured but clear text.
9
+
10
+ Do not add extra text, introductions, conclusions, or commentary—only rewrite the provided content as it is.
11
+
12
+ Do not interpret, analyze, or alter the meaning, intent, or narrative of the text—just reformat it for clarity and readability.
13
+
14
+ Do not change the text structure, do not write conclusions about it. Your only job is to make it readable.
15
+
16
+ Keep the text in its original language, regardless of what it is.
@@ -0,0 +1,25 @@
1
+ You are an AI assistant for a personal study platform.
2
+
3
+ In this platform, your user collects various articles and content from the Internet for reference and study.
4
+
5
+ Your role is to summarize the selected content as densely as possible, helping the reader extract maximum value from it without reading the full text.
6
+ Focus solely on the content's value, avoiding unnecessary comments or messages.
7
+
8
+ The summary should be dense, rich in characters, and designed to create a powerful vector representation.
9
+ If the user provided additional context, follow its instructions. Otherwise, summarize the whole content.
10
+
11
+ Do not return any acknowledgments or greetings—only the summary.
12
+
13
+ CONTENT:
14
+
15
+ {{ content }}
16
+
17
+ {% if context %}
18
+ CONTEXT:
19
+
20
+ The user has provided additional context for your task:
21
+ {{context}}
22
+ {% endif %}
23
+
24
+
25
+ SUMMARY:
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 0.1.1
3
+ Version: 0.2.0
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
7
7
  Requires-Python: >=3.10
8
- Requires-Dist: aiohttp>=3.11.16
8
+ Requires-Dist: aiohttp>=3.11
9
9
  Requires-Dist: bs4>=0.0.2
10
10
  Requires-Dist: dicttoxml>=1.7.16
11
11
  Requires-Dist: esperanto>=1.2.0
@@ -43,7 +43,7 @@ The primary goal of Content Core is to simplify the process of ingesting content
43
43
  * Direct text strings.
44
44
  * Web URLs (using robust extraction methods).
45
45
  * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
46
- * **Intelligent Processing:** Applies appropriate extraction techniques based on the source type.
46
+ * **Intelligent Processing:** Applies appropriate extraction techniques based on the source type. See the [Processors Documentation](./docs/processors.md) for detailed information on how different content types are handled.
47
47
  * **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
48
48
  * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
49
49
 
@@ -220,6 +220,20 @@ OPENAI_API_KEY=your-key-here
220
220
  GOOGLE_API_KEY=your-key-here
221
221
  ```
222
222
 
223
+ ### Custom Prompt Templates
224
+
225
+ Content Core allows you to define custom prompt templates for content processing. By default, the library uses built-in prompts located in the `prompts` directory. However, you can create your own prompt templates and store them in a dedicated directory. To specify the location of your custom prompts, set the `PROMPT_PATH` environment variable in your `.env` file or system environment.
226
+
227
+ Example `.env` with custom prompt path:
228
+
229
+ ```plaintext
230
+ OPENAI_API_KEY=your-key-here
231
+ GOOGLE_API_KEY=your-key-here
232
+ PROMPT_PATH=/path/to/your/custom/prompts
233
+ ```
234
+
235
+ When a prompt template is requested, Content Core will first look in the custom directory specified by `PROMPT_PATH` (if it is set and the directory exists). If the template is not found there, it will fall back to the default built-in prompts. This allows you to override specific prompts while still using the default ones for others.
236
+
223
237
  ## Development
224
238
 
225
239
  To set up a development environment:
@@ -1,6 +1,6 @@
1
1
  content_core/__init__.py,sha256=CAPVVm3mDl8dH3j7Pn7t7UJqdwwpNjy77SzF7acssFw,6352
2
2
  content_core/config.py,sha256=5nFWb-g7DG__OuxSvwK4yCFEC1YCKdE6-rX5Z7c6JSo,794
3
- content_core/prompter.py,sha256=aUm_Bz_pkQuXIMKB1Xe6OEE-y4AUNoNsfFy82fAU-Ss,4049
3
+ content_core/prompter.py,sha256=oXDBww_V-_NR1rQvpEpZwf6NNBlsAMk-hj6yMdkKXRk,5729
4
4
  content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
5
5
  content_core/templated_message.py,sha256=NSttaX1jL5LYIDlJnabx7baDuySIFIPjFjAX5NZt9pM,1704
6
6
  content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
@@ -14,7 +14,7 @@ content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU
14
14
  content_core/content/extraction/graph.py,sha256=qrTEl9YDUJJJg7TbBqPjueSvV9oo4_WwAJ-VpWKOYec,4621
15
15
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
16
16
  content_core/content/summary/core.py,sha256=BTzTYhB-5sQmvvDaxnnWwBYcVinMHcnG-fBApcf2cyg,517
17
- content_core/notebooks/run.ipynb,sha256=MmZGVl62b8S7FpsFPAUKbB7ndEtyFXY-JeihaeT5CII,375888
17
+ content_core/notebooks/run.ipynb,sha256=s4mIIiYdMfTlutaVlsYjFGwwvVnoVF83UbGT9rgizCA,340220
18
18
  content_core/processors/audio.py,sha256=ox-ScigfbBrN9B4MCvdgbYn2d3GBYqf6His0HPrzXDs,3459
19
19
  content_core/processors/office.py,sha256=13qNAfqqLwXUT6HNmF8OnxjbfvkhnTRCPoVUctw4w1k,12139
20
20
  content_core/processors/pdf.py,sha256=yndt8EGvV5_IxcFbFp4lb4g9T84w6cJ4LPdiTduO7aM,5296
@@ -22,12 +22,14 @@ content_core/processors/text.py,sha256=MiXNILDKcLO5sWTzp-LNJ8yC764_gmUEvDB7GUd7W
22
22
  content_core/processors/url.py,sha256=LcDB_FcmJMcwqM75DX2ERXcOry98e63WEjwd6l8u3ho,6268
23
23
  content_core/processors/video.py,sha256=aEe3M_POENPwI1tK4mUNxSeGewsmjex7lvMmELdcQeo,5183
24
24
  content_core/processors/youtube.py,sha256=3DioyLZT-wHmLcJ-vnONjMdZ1qWpeuHhQtsZw2nIE5M,5784
25
+ content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
26
+ content_core/prompts/content/summarize.jinja,sha256=zLPbomfjA-tQZr-c_rOqvKhd55R8NN3Q2gLyLR1sKso,817
25
27
  content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
26
28
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
27
29
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
28
30
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
29
- content_core-0.1.1.dist-info/METADATA,sha256=U6Z7hfiSC83FCrX6zl7M_36L6As9CPKXXJ0T5QLQLFw,7372
30
- content_core-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
31
- content_core-0.1.1.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
32
- content_core-0.1.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
33
- content_core-0.1.1.dist-info/RECORD,,
31
+ content_core-0.2.0.dist-info/METADATA,sha256=fXtKo9M6oBAxKPDavDDtMRPyptAqgcK8Qi_PKR1xMTo,8390
32
+ content_core-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
33
+ content_core-0.2.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
34
+ content_core-0.2.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
35
+ content_core-0.2.0.dist-info/RECORD,,