content-core 0.4.0__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

@@ -0,0 +1,72 @@
1
+ """
2
+ Docling-based document extraction processor.
3
+ """
4
+
5
+ try:
6
+ from docling.document_converter import DocumentConverter
7
+ except ImportError:
8
+
9
+ class DocumentConverter:
10
+ """Stub when docling is not installed."""
11
+
12
+ def __init__(self):
13
+ raise ImportError("Docling not installed")
14
+
15
+ def convert(self, source: str):
16
+ raise ImportError("Docling not installed")
17
+
18
+
19
+ from content_core.common.state import ProcessSourceState
20
+ from content_core.config import CONFIG
21
+
22
+ # Supported MIME types for Docling extraction
23
+ DOCLING_SUPPORTED = {
24
+ "application/pdf",
25
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
26
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
27
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
28
+ "text/markdown",
29
+ "text/plain",
30
+ "text/x-markdown",
31
+ "text/csv",
32
+ "text/html",
33
+ "image/png",
34
+ "image/jpeg",
35
+ "image/tiff",
36
+ "image/bmp",
37
+ }
38
+
39
+
40
+ async def extract_with_docling(state: ProcessSourceState) -> ProcessSourceState:
41
+ """
42
+ Use Docling to parse files, URLs, or content into the desired format.
43
+ """
44
+ # Initialize Docling converter
45
+ converter = DocumentConverter()
46
+
47
+ # Determine source: file path, URL, or direct content
48
+ source = state.file_path or state.url or state.content
49
+ if not source:
50
+ raise ValueError("No input provided for Docling extraction.")
51
+
52
+ # Convert document
53
+ result = converter.convert(source)
54
+ doc = result.document
55
+
56
+ # Determine output format (per-execution override, then metadata, then config)
57
+ cfg_fmt = (
58
+ CONFIG.get("extraction", {}).get("docling", {}).get("output_format", "markdown")
59
+ )
60
+ fmt = state.output_format or state.metadata.get("docling_format") or cfg_fmt
61
+ # Record the format used
62
+ state.metadata["docling_format"] = fmt
63
+ if fmt == "html":
64
+ output = doc.export_to_html()
65
+ elif fmt == "json":
66
+ output = doc.export_to_json()
67
+ else:
68
+ output = doc.export_to_markdown()
69
+
70
+ # Update state
71
+ state.content = output
72
+ return state
@@ -1,18 +1,18 @@
1
1
  from typing import Dict, Optional, Union
2
2
 
3
+ from ai_prompter import Prompter
3
4
  from esperanto import LanguageModel
4
5
  from esperanto.common_types import Message
5
6
  from pydantic import BaseModel, Field
6
7
 
7
8
  from content_core.models import ModelFactory
8
- from content_core.prompter import Prompter
9
9
 
10
10
 
11
11
  class TemplatedMessageInput(BaseModel):
12
- system_prompt_template: Optional[str] = ""
13
- system_prompt_text: Optional[str] = ""
14
- user_prompt_template: Optional[str] = ""
15
- user_prompt_text: Optional[str] = ""
12
+ system_prompt_template: Optional[str] = None
13
+ system_prompt_text: Optional[str] = None
14
+ user_prompt_template: Optional[str] = None
15
+ user_prompt_text: Optional[str] = None
16
16
  data: Optional[Union[Dict, BaseModel]] = Field(default_factory=lambda: {})
17
17
  config: Dict = Field(
18
18
  description="The config for the LLM",
@@ -28,30 +28,22 @@ async def templated_message(
28
28
  input: TemplatedMessageInput, model: Optional[LanguageModel] = None
29
29
  ) -> str:
30
30
  if not model:
31
- model = ModelFactory.get_model('default_model')
31
+ model = ModelFactory.get_model("default_model")
32
32
 
33
33
  msgs = []
34
34
  if input.system_prompt_template or input.system_prompt_text:
35
- msgs.append(
36
- Message(
37
- role="system",
38
- content=Prompter(
39
- prompt_template=input.system_prompt_template,
40
- prompt_text=input.system_prompt_text,
41
- ).render(data=input.data),
42
- )
43
- )
35
+ system_prompt = Prompter(
36
+ prompt_template=input.system_prompt_template,
37
+ template_text=input.system_prompt_text,
38
+ ).render(data=input.data)
39
+ msgs.append(Message(role="system", content=system_prompt))
44
40
 
45
41
  if input.user_prompt_template or input.user_prompt_text:
46
- msgs.append(
47
- Message(
48
- role="user",
49
- content=Prompter(
50
- prompt_template=input.user_prompt_template,
51
- prompt_text=input.user_prompt_text,
52
- ).render(data=input.data),
53
- )
54
- )
42
+ user_prompt = Prompter(
43
+ prompt_template=input.user_prompt_template,
44
+ template_text=input.user_prompt_text,
45
+ ).render(data=input.data)
46
+ msgs.append(Message(role="user", content=user_prompt))
55
47
 
56
48
  result = await model.achat_complete(msgs)
57
49
  return result.content
@@ -1,10 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 0.4.0
3
+ Version: 0.5.1
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
7
7
  Requires-Python: >=3.10
8
+ Requires-Dist: ai-prompter>=0.2.3
8
9
  Requires-Dist: aiohttp>=3.11
9
10
  Requires-Dist: bs4>=0.0.2
10
11
  Requires-Dist: dicttoxml>=1.7.16
@@ -25,6 +26,11 @@ Requires-Dist: python-magic>=0.4.27
25
26
  Requires-Dist: python-pptx>=1.0.2
26
27
  Requires-Dist: validators>=0.34.0
27
28
  Requires-Dist: youtube-transcript-api>=1.0.3
29
+ Provides-Extra: docling
30
+ Requires-Dist: asciidoc; extra == 'docling'
31
+ Requires-Dist: docling[ocr]; extra == 'docling'
32
+ Requires-Dist: pandas; extra == 'docling'
33
+ Requires-Dist: pillow; extra == 'docling'
28
34
  Description-Content-Type: text/markdown
29
35
 
30
36
  # Content Core
@@ -54,8 +60,10 @@ The primary goal of Content Core is to simplify the process of ingesting content
54
60
  Install Content Core using `pip`:
55
61
 
56
62
  ```bash
57
- # Install the package
63
+ # Install the package (without Docling)
58
64
  pip install content-core
65
+ # Install with Docling support
66
+ pip install content-core[docling]
59
67
  ```
60
68
 
61
69
  Alternatively, if you’re developing locally:
@@ -224,12 +232,58 @@ async def main():
224
232
  md_data = await extract_content({"file_path": "path/to/your/document.md"})
225
233
  print(md_data)
226
234
 
235
+ # Per-execution override with Docling
236
+ doc_data = await extract_content({
237
+ "file_path": "path/to/your/document.pdf",
238
+ "engine": "docling",
239
+ "output_format": "html"
240
+ })
241
+ print(doc_data)
242
+
227
243
  if __name__ == "__main__":
228
244
  asyncio.run(main())
229
245
  ```
230
246
 
231
247
  (See `src/content_core/notebooks/run.ipynb` for more detailed examples.)
232
248
 
249
+ ## Docling Integration
250
+
251
+ Content Core supports an optional Docling-based extraction engine for rich document formats (PDF, DOCX, PPTX, XLSX, Markdown, AsciiDoc, HTML, CSV, Images).
252
+
253
+ ### Installation
254
+
255
+ ```bash
256
+ # Install with Docling support
257
+ pip install content-core[docling]
258
+ ```
259
+
260
+ ### Enabling Docling
261
+
262
+ #### Via configuration file
263
+
264
+ In your `cc_config.yaml` or custom config, set:
265
+ ```yaml
266
+ extraction:
267
+ engine: docling # 'legacy' (default) or 'docling'
268
+ docling:
269
+ output_format: markdown # markdown | html | json
270
+ ```
271
+
272
+ #### Programmatically in Python
273
+
274
+ ```python
275
+ from content_core.config import set_extraction_engine, set_docling_output_format
276
+
277
+ # switch engine to Docling
278
+ set_extraction_engine("docling")
279
+
280
+ # choose output format: 'markdown', 'html', or 'json'
281
+ set_docling_output_format("html")
282
+
283
+ # now run an extraction as usual, e.g. via cc.extract
284
+ result = await cc.extract("document.pdf")
285
+ ```
286
+
233
287
  ## Configuration
234
288
 
235
289
  Configuration settings (like API keys for external services, logging levels) can be managed through environment variables or `.env` files, loaded automatically via `python-dotenv`.
@@ -1,24 +1,26 @@
1
- content_core/__init__.py,sha256=sBCcvRJ-9u5htV5AdptlYPNO0R8NmAex2K1XAkJAoL0,6474
2
- content_core/config.py,sha256=sy7UtMhMldLVzg-tvGQYV3pkv9OdokIZQ0jp9RXd06g,749
1
+ content_core/__init__.py,sha256=ANKeslNXOGumwrkjqgRik23e5PdGps2C0FSup8_XH2Y,6515
2
+ content_core/cc_config.yaml,sha256=w66fo5ut6TPaU3o4hkjnroqg2hkr8YuOG3BRtI50j1s,701
3
+ content_core/config.py,sha256=-aUsTB6Z3fa_XIWdHNXhMgWkVLWjEW1kfyQXXB_-j54,1632
3
4
  content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
4
5
  content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
5
6
  content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
6
- content_core/prompter.py,sha256=-ShuSyHvK50xlgsAFfA9AnAJV-LlzWwmbPDq2wUZRcI,5793
7
7
  content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
8
- content_core/templated_message.py,sha256=iWz-TwWq08mspgZW3EgIGf7HqtW1tXuTDpo9FkNwixQ,1729
8
+ content_core/templated_message.py,sha256=KbI2rcvgGM5oRIcsG68zAZfgNsC97fR16D61683ZSnY,1617
9
9
  content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
10
10
  content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
11
- content_core/common/state.py,sha256=GrBtXKCz3c9SlMBfVxcTZ9szCrtQtz8PEo6E7Ihy6vY,867
11
+ content_core/common/state.py,sha256=cJvIwqvrvGxuk1t51bTOvPV-RM5Nbd8F8C4o0dawIXo,1185
12
12
  content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
13
13
  content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCrefU,171
14
14
  content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
15
15
  content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
16
16
  content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
17
- content_core/content/extraction/graph.py,sha256=W_mpGcR_Vw6cMh56U-YONzVxFMbhY9aU8rt3Pdta6Bg,5526
17
+ content_core/content/extraction/graph.py,sha256=Sp9XJ6AoLXA_FUFWhmfTMzOC2gkarp1Qg8MsIScLCok,6213
18
18
  content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
19
19
  content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
20
- content_core/notebooks/run.ipynb,sha256=U_-SXsEmMNiNhFiZXtQeEeSnVn1NF4q9Xd6XOUpcjqg,330371
20
+ content_core/notebooks/docling.ipynb,sha256=aTad8NORNd-TUMlbX58DURJ4-QCeplTeTT0vUj301m0,631
21
+ content_core/notebooks/run.ipynb,sha256=lV8n1fx_kgIQHBnk1vR6ChBjMS5luAEuDDljsTBNjrQ,369490
21
22
  content_core/processors/audio.py,sha256=jDn0_6F5dLcmz_C-iR80uOqOIAz49ELya2R5JeM15vo,3538
23
+ content_core/processors/docling.py,sha256=wQ8ThAcyrCy-c95QtgplQ9UZtjCZTddLD9y1_CrRtSQ,2111
22
24
  content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
23
25
  content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
24
26
  content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
@@ -31,8 +33,8 @@ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8j
31
33
  content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
32
34
  content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
33
35
  content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
34
- content_core-0.4.0.dist-info/METADATA,sha256=sXLcda5ZXi4ibpBxrIlC_YT3DuJcNiqk_FFR_LgMISQ,9111
35
- content_core-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
36
- content_core-0.4.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
37
- content_core-0.4.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
38
- content_core-0.4.0.dist-info/RECORD,,
36
+ content_core-0.5.1.dist-info/METADATA,sha256=mkvdVcLsiBDGiobgswCVQF8Xkceq5VpIRZspniB61PY,10533
37
+ content_core-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
38
+ content_core-0.5.1.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
39
+ content_core-0.5.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
40
+ content_core-0.5.1.dist-info/RECORD,,
content_core/prompter.py DELETED
@@ -1,159 +0,0 @@
1
- """
2
- A prompt management module using Jinja to generate complex prompts with simple templates.
3
- """
4
-
5
- import os
6
- from dataclasses import dataclass
7
- from datetime import datetime
8
- from typing import Any, Dict, Optional, Union
9
-
10
- from dotenv import load_dotenv
11
- from jinja2 import Environment, FileSystemLoader, Template
12
- from langchain_core.prompts import ChatPromptTemplate
13
- from pydantic import BaseModel
14
-
15
- from content_core.logging import logger
16
-
17
- load_dotenv()
18
-
19
- prompt_path_default = os.path.join(
20
- os.path.dirname(os.path.abspath(__file__)), "prompts"
21
- )
22
- prompt_path_custom = os.getenv("PROMPT_PATH")
23
-
24
- logger.debug(
25
- f"Pasta de prompts personalizada: {prompt_path_custom if prompt_path_custom else 'Não definida'}"
26
- )
27
- logger.debug(f"Pasta de prompts padrão: {prompt_path_default}")
28
-
29
- env_custom = (
30
- Environment(loader=FileSystemLoader(prompt_path_custom))
31
- if prompt_path_custom and os.path.exists(prompt_path_custom)
32
- else None
33
- )
34
- env_default = Environment(loader=FileSystemLoader(prompt_path_default))
35
-
36
-
37
- @dataclass
38
- class Prompter:
39
- """
40
- A class for managing and rendering prompt templates.
41
-
42
- Attributes:
43
- prompt_template (str, optional): The name of the prompt template file.
44
- prompt_variation (str, optional): The variation of the prompt template.
45
- prompt_text (str, optional): The raw prompt text.
46
- template (Union[str, Template], optional): The Jinja2 template object.
47
- """
48
-
49
- prompt_template: Optional[str] = None
50
- prompt_variation: Optional[str] = "default"
51
- prompt_text: Optional[str] = None
52
- template: Optional[Union[str, Template]] = None
53
- parser: Optional[Any] = None
54
-
55
- def __init__(self, prompt_template=None, prompt_text=None, parser=None):
56
- """
57
- Initialize the Prompter with either a template file or raw text.
58
-
59
- Args:
60
- prompt_template (str, optional): The name of the prompt template file.
61
- prompt_text (str, optional): The raw prompt text.
62
- """
63
- self.prompt_template = prompt_template
64
- self.prompt_text = prompt_text
65
- self.parser = parser
66
- self.setup()
67
-
68
- def setup(self):
69
- """
70
- Set up the Jinja2 template based on the provided template file or text.
71
- Raises:
72
- ValueError: If neither prompt_template nor prompt_text is provided.
73
- """
74
- if self.prompt_template:
75
- # Primeiro tenta carregar da pasta personalizada, se disponível
76
- if env_custom:
77
- try:
78
- self.template = env_custom.get_template(
79
- f"{self.prompt_template}.jinja"
80
- )
81
- logger.debug(
82
- f"Template {self.prompt_template} carregado da pasta personalizada"
83
- )
84
- return
85
- except Exception as e:
86
- logger.debug(
87
- f"Template {self.prompt_template} não encontrado na pasta personalizada: {e}"
88
- )
89
-
90
- # Se não encontrou na personalizada ou não há pasta personalizada, tenta a padrão
91
- try:
92
- self.template = env_default.get_template(
93
- f"{self.prompt_template}.jinja"
94
- )
95
- logger.debug(
96
- f"Template {self.prompt_template} carregado da pasta padrão"
97
- )
98
- except Exception as e:
99
- raise ValueError(
100
- f"Template {self.prompt_template} não encontrado na pasta padrão: {e}"
101
- )
102
- elif self.prompt_text:
103
- self.template = Template(self.prompt_text)
104
- else:
105
- raise ValueError("Prompter must have a prompt_template or prompt_text")
106
-
107
- assert self.prompt_template or self.prompt_text, "Prompt is required"
108
-
109
- def to_langchain(self):
110
- if isinstance(self.template, str):
111
- template_text = self.template
112
- else:
113
- # For file-based templates, read the raw content
114
- template_path = os.path.join("prompts", f"{self.prompt_template}.jinja")
115
- with open(template_path, "r") as f:
116
- template_text = f.read()
117
- return ChatPromptTemplate.from_template(template_text, template_format="jinja2")
118
-
119
- @classmethod
120
- def from_text(cls, text: str):
121
- """
122
- Create a Prompter instance from raw text, which can contain Jinja code.
123
-
124
- Args:
125
- text (str): The raw prompt text.
126
-
127
- Returns:
128
- Prompter: A new Prompter instance.
129
- """
130
-
131
- return cls(prompt_text=text)
132
-
133
- def render(self, data: Optional[Union[Dict, BaseModel]] = {}) -> str:
134
- """
135
- Render the prompt template with the given data.
136
-
137
- Args:
138
- data (Union[Dict, BaseModel]): The data to be used in rendering the template.
139
- Can be either a dictionary or a Pydantic BaseModel.
140
-
141
- Returns:
142
- str: The rendered prompt text.
143
-
144
- Raises:
145
- AssertionError: If the template is not defined or not a Jinja2 Template.
146
- """
147
- # Convert Pydantic model to dict if necessary
148
- data_dict = data.model_dump() if isinstance(data, BaseModel) else data
149
- # Create a new mutable dictionary with the original data
150
- render_data = dict(data_dict)
151
- render_data["current_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
152
- if self.parser:
153
- render_data["format_instructions"] = self.parser.get_format_instructions()
154
- assert self.template, "Prompter template is not defined"
155
- assert isinstance(
156
- self.template, Template
157
- ), "Prompter template is not a Jinja2 Template"
158
- return self.template.render(render_data)
159
- return self.template.render(render_data)