content-core 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

@@ -2,11 +2,12 @@ import asyncio
2
2
  import os
3
3
  from functools import partial
4
4
  from math import ceil
5
- from content_core.config import SPEECH_TO_TEXT_MODEL
6
- from loguru import logger
5
+
7
6
  from pydub import AudioSegment
8
7
 
9
8
  from content_core.common import ProcessSourceState
9
+ from content_core.logging import logger
10
+ from content_core.models import ModelFactory
10
11
 
11
12
  # todo: remove reference to model_manager
12
13
  # future: parallelize the transcription process
@@ -79,8 +80,9 @@ async def extract_audio(data: ProcessSourceState):
79
80
  audio_files = await split_audio(input_audio_path)
80
81
 
81
82
  # Transcribe all segments concurrently
83
+ speech_to_text_model = ModelFactory.get_model("speech_to_text")
82
84
  transcribe_tasks = [
83
- transcribe_audio_segment(audio_file, SPEECH_TO_TEXT_MODEL)
85
+ transcribe_audio_segment(audio_file, speech_to_text_model)
84
86
  for audio_file in audio_files
85
87
  ]
86
88
  transcriptions = await asyncio.gather(*transcribe_tasks)
@@ -2,11 +2,11 @@ import asyncio
2
2
  from functools import partial
3
3
 
4
4
  from docx import Document # type: ignore
5
- from loguru import logger
6
5
  from openpyxl import load_workbook # type: ignore
7
6
  from pptx import Presentation # type: ignore
8
7
 
9
8
  from content_core.common import ProcessSourceState
9
+ from content_core.logging import logger
10
10
 
11
11
  SUPPORTED_OFFICE_TYPES = [
12
12
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -3,11 +3,9 @@ import re
3
3
  import unicodedata
4
4
 
5
5
  import fitz # type: ignore
6
- from loguru import logger
7
6
 
8
- from content_core.common import (
9
- ProcessSourceState,
10
- )
7
+ from content_core.common import ProcessSourceState
8
+ from content_core.logging import logger
11
9
 
12
10
  # todo: find tables - https://pymupdf.readthedocs.io/en/latest/the-basics.html#extracting-tables-from-a-page
13
11
  # todo: what else can we do to make the text more readable?
@@ -1,8 +1,7 @@
1
1
  import asyncio
2
2
 
3
- from loguru import logger
4
-
5
3
  from content_core.common import ProcessSourceState
4
+ from content_core.logging import logger
6
5
 
7
6
 
8
7
  async def extract_txt(state: ProcessSourceState):
@@ -3,9 +3,9 @@ from urllib.parse import urlparse
3
3
 
4
4
  import aiohttp
5
5
  from bs4 import BeautifulSoup, Comment
6
- from loguru import logger
7
6
 
8
7
  from content_core.common import ProcessSourceState
8
+ from content_core.logging import logger
9
9
 
10
10
  # future: better extraction methods
11
11
  # https://github.com/buriy/python-readability
@@ -4,9 +4,8 @@ import os
4
4
  import subprocess
5
5
  from functools import partial
6
6
 
7
- from loguru import logger
8
-
9
7
  from content_core.common import ProcessSourceState
8
+ from content_core.logging import logger
10
9
 
11
10
 
12
11
  async def extract_audio_from_video(input_file, output_file, stream_index):
@@ -3,12 +3,12 @@ import ssl
3
3
 
4
4
  import aiohttp
5
5
  from bs4 import BeautifulSoup
6
- from loguru import logger
7
6
  from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
8
7
  from youtube_transcript_api.formatters import TextFormatter # type: ignore
9
8
 
10
9
  from content_core.common import ProcessSourceState
11
10
  from content_core.common.exceptions import NoTranscriptFound
11
+ from content_core.logging import logger
12
12
 
13
13
  ssl._create_default_https_context = ssl._create_unverified_context
14
14
 
content_core/prompter.py CHANGED
@@ -10,9 +10,10 @@ from typing import Any, Dict, Optional, Union
10
10
  from dotenv import load_dotenv
11
11
  from jinja2 import Environment, FileSystemLoader, Template
12
12
  from langchain_core.prompts import ChatPromptTemplate
13
- from loguru import logger
14
13
  from pydantic import BaseModel
15
14
 
15
+ from content_core.logging import logger
16
+
16
17
  load_dotenv()
17
18
 
18
19
  prompt_path_default = os.path.join(
@@ -155,3 +156,4 @@ class Prompter:
155
156
  self.template, Template
156
157
  ), "Prompter template is not a Jinja2 Template"
157
158
  return self.template.render(render_data)
159
+ return self.template.render(render_data)
@@ -4,7 +4,7 @@ from esperanto import LanguageModel
4
4
  from esperanto.common_types import Message
5
5
  from pydantic import BaseModel, Field
6
6
 
7
- from content_core.config import DEFAULT_MODEL
7
+ from content_core.models import ModelFactory
8
8
  from content_core.prompter import Prompter
9
9
 
10
10
 
@@ -28,7 +28,7 @@ async def templated_message(
28
28
  input: TemplatedMessageInput, model: Optional[LanguageModel] = None
29
29
  ) -> str:
30
30
  if not model:
31
- model = DEFAULT_MODEL
31
+ model = ModelFactory.get_model('default_model')
32
32
 
33
33
  msgs = []
34
34
  if input.system_prompt_template or input.system_prompt_text:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: content-core
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Extract what matters from any media source
5
5
  Author-email: LUIS NOVO <lfnovo@gmail.com>
6
6
  License-File: LICENSE
@@ -161,6 +161,27 @@ csum https://example.com
161
161
  csum document.txt
162
162
  ```
163
163
 
164
+ ## Quick Start
165
+
166
+ You can quickly integrate `content-core` into your Python projects to extract, clean, and summarize content from various sources.
167
+
168
+ ```python
169
+ import content_core as cc
170
+
171
+ # Extract content from a URL, file, or text
172
+ result = await cc.extract("https://example.com/article")
173
+
174
+ # Clean messy content
175
+ cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...")
176
+
177
+ # Summarize content with optional context
178
+ summary = await cc.summarize_content("long article text", context="explain to a child")
179
+ ```
180
+
181
+ ## Documentation
182
+
183
+ For more information on how to use the Content Core library, including details on AI model configuration and customization, refer to our [Usage Documentation](docs/usage.md).
184
+
164
185
  ## Using with Langchain
165
186
 
166
187
  For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
@@ -0,0 +1,38 @@
1
+ content_core/__init__.py,sha256=sBCcvRJ-9u5htV5AdptlYPNO0R8NmAex2K1XAkJAoL0,6474
2
+ content_core/config.py,sha256=sy7UtMhMldLVzg-tvGQYV3pkv9OdokIZQ0jp9RXd06g,749
3
+ content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
4
+ content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
5
+ content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
6
+ content_core/prompter.py,sha256=-ShuSyHvK50xlgsAFfA9AnAJV-LlzWwmbPDq2wUZRcI,5793
7
+ content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
8
+ content_core/templated_message.py,sha256=iWz-TwWq08mspgZW3EgIGf7HqtW1tXuTDpo9FkNwixQ,1729
9
+ content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
10
+ content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
11
+ content_core/common/state.py,sha256=GrBtXKCz3c9SlMBfVxcTZ9szCrtQtz8PEo6E7Ihy6vY,867
12
+ content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
13
+ content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCrefU,171
14
+ content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
15
+ content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
16
+ content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
17
+ content_core/content/extraction/graph.py,sha256=4-yZDYErUvnPsgoBM3zmpGFg347-cbwJ4_VeyMmAYj4,4635
18
+ content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
19
+ content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
20
+ content_core/notebooks/run.ipynb,sha256=U_-SXsEmMNiNhFiZXtQeEeSnVn1NF4q9Xd6XOUpcjqg,330371
21
+ content_core/processors/audio.py,sha256=jDn0_6F5dLcmz_C-iR80uOqOIAz49ELya2R5JeM15vo,3538
22
+ content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
23
+ content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
24
+ content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
25
+ content_core/processors/url.py,sha256=u2qgGLe9n58RtGXMBf1d31rwMIgyogg7Btn-AEl8KQU,6282
26
+ content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
27
+ content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
28
+ content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
29
+ content_core/prompts/content/summarize.jinja,sha256=zLPbomfjA-tQZr-c_rOqvKhd55R8NN3Q2gLyLR1sKso,817
30
+ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
31
+ content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
32
+ content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
33
+ content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
34
+ content_core-0.3.0.dist-info/METADATA,sha256=nBKxYD0J8db7zO9ZVSyyxwl7VxtVGMrRREpt-HPgbBg,9111
35
+ content_core-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
36
+ content_core-0.3.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
37
+ content_core-0.3.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
38
+ content_core-0.3.0.dist-info/RECORD,,
@@ -1,35 +0,0 @@
1
- content_core/__init__.py,sha256=CAPVVm3mDl8dH3j7Pn7t7UJqdwwpNjy77SzF7acssFw,6352
2
- content_core/config.py,sha256=5nFWb-g7DG__OuxSvwK4yCFEC1YCKdE6-rX5Z7c6JSo,794
3
- content_core/prompter.py,sha256=oXDBww_V-_NR1rQvpEpZwf6NNBlsAMk-hj6yMdkKXRk,5729
4
- content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
5
- content_core/templated_message.py,sha256=NSttaX1jL5LYIDlJnabx7baDuySIFIPjFjAX5NZt9pM,1704
6
- content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
7
- content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
8
- content_core/common/state.py,sha256=GrBtXKCz3c9SlMBfVxcTZ9szCrtQtz8PEo6E7Ihy6vY,867
9
- content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
10
- content_core/content/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
12
- content_core/content/cleanup/core.py,sha256=FJupJGiIcZfAEX8Usn8ob2REfOVYFhcL3JmGovdnJOM,506
13
- content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
14
- content_core/content/extraction/graph.py,sha256=qrTEl9YDUJJJg7TbBqPjueSvV9oo4_WwAJ-VpWKOYec,4621
15
- content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
16
- content_core/content/summary/core.py,sha256=BTzTYhB-5sQmvvDaxnnWwBYcVinMHcnG-fBApcf2cyg,517
17
- content_core/notebooks/run.ipynb,sha256=s4mIIiYdMfTlutaVlsYjFGwwvVnoVF83UbGT9rgizCA,340220
18
- content_core/processors/audio.py,sha256=ox-ScigfbBrN9B4MCvdgbYn2d3GBYqf6His0HPrzXDs,3459
19
- content_core/processors/office.py,sha256=13qNAfqqLwXUT6HNmF8OnxjbfvkhnTRCPoVUctw4w1k,12139
20
- content_core/processors/pdf.py,sha256=yndt8EGvV5_IxcFbFp4lb4g9T84w6cJ4LPdiTduO7aM,5296
21
- content_core/processors/text.py,sha256=MiXNILDKcLO5sWTzp-LNJ8yC764_gmUEvDB7GUd7Wys,1145
22
- content_core/processors/url.py,sha256=LcDB_FcmJMcwqM75DX2ERXcOry98e63WEjwd6l8u3ho,6268
23
- content_core/processors/video.py,sha256=aEe3M_POENPwI1tK4mUNxSeGewsmjex7lvMmELdcQeo,5183
24
- content_core/processors/youtube.py,sha256=3DioyLZT-wHmLcJ-vnONjMdZ1qWpeuHhQtsZw2nIE5M,5784
25
- content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
26
- content_core/prompts/content/summarize.jinja,sha256=zLPbomfjA-tQZr-c_rOqvKhd55R8NN3Q2gLyLR1sKso,817
27
- content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
28
- content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
29
- content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
30
- content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
31
- content_core-0.2.0.dist-info/METADATA,sha256=fXtKo9M6oBAxKPDavDDtMRPyptAqgcK8Qi_PKR1xMTo,8390
32
- content_core-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
33
- content_core-0.2.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
34
- content_core-0.2.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
35
- content_core-0.2.0.dist-info/RECORD,,