PyPI - content-core - Versions diffs - 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl - Mend

content-core 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of content-core might be problematic. Click here for more details.

Files changed (25)

content_core/__init__.py +13 -13
content_core/config.py +23 -33
content_core/content/__init__.py +5 -0
content_core/content/cleanup/core.py +2 -2
content_core/content/extraction/graph.py +1 -1
content_core/content/summary/core.py +2 -2
content_core/logging.py +15 -0
content_core/models.py +24 -0
content_core/models_config.yaml +27 -0
content_core/notebooks/run.ipynb +101 -145
content_core/processors/audio.py +5 -3
content_core/processors/office.py +1 -1
content_core/processors/pdf.py +2 -4
content_core/processors/text.py +1 -2
content_core/processors/url.py +1 -1
content_core/processors/video.py +1 -2
content_core/processors/youtube.py +1 -1
content_core/prompter.py +3 -1
content_core/templated_message.py +2 -2
{content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/METADATA +22 -1
content_core-0.3.1.dist-info/RECORD +38 -0
content_core-0.2.0.dist-info/RECORD +0 -35
{content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/WHEEL +0 -0
{content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/entry_points.txt +0 -0
{content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/licenses/LICENSE +0 -0

content_core/processors/audio.py CHANGED Viewed

@@ -2,11 +2,12 @@ import asyncio
 import os
 from functools import partial
 from math import ceil
-from content_core.config import SPEECH_TO_TEXT_MODEL
-from loguru import logger
 from pydub import AudioSegment
 from content_core.common import ProcessSourceState
+from content_core.logging import logger
+from content_core.models import ModelFactory
 # todo: remove reference to model_manager
 # future: parallelize the transcription process
@@ -79,8 +80,9 @@ async def extract_audio(data: ProcessSourceState):
         audio_files = await split_audio(input_audio_path)
         # Transcribe all segments concurrently
+        speech_to_text_model = ModelFactory.get_model("speech_to_text")
         transcribe_tasks = [
-            transcribe_audio_segment(audio_file, SPEECH_TO_TEXT_MODEL)
+            transcribe_audio_segment(audio_file, speech_to_text_model)
             for audio_file in audio_files
         ]
         transcriptions = await asyncio.gather(*transcribe_tasks)

content_core/processors/office.py CHANGED Viewed

@@ -2,11 +2,11 @@ import asyncio
 from functools import partial
 from docx import Document  # type: ignore
-from loguru import logger
 from openpyxl import load_workbook  # type: ignore
 from pptx import Presentation  # type: ignore
 from content_core.common import ProcessSourceState
+from content_core.logging import logger
 SUPPORTED_OFFICE_TYPES = [
     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",

content_core/processors/pdf.py CHANGED Viewed

@@ -3,11 +3,9 @@ import re
 import unicodedata
 import fitz  # type: ignore
-from loguru import logger
-from content_core.common import (
-    ProcessSourceState,
-)
+from content_core.common import ProcessSourceState
+from content_core.logging import logger
 # todo: find tables - https://pymupdf.readthedocs.io/en/latest/the-basics.html#extracting-tables-from-a-page
 # todo: what else can we do to make the text more readable?

content_core/processors/text.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import asyncio
-from loguru import logger
 from content_core.common import ProcessSourceState
+from content_core.logging import logger
 async def extract_txt(state: ProcessSourceState):

content_core/processors/url.py CHANGED Viewed

@@ -3,9 +3,9 @@ from urllib.parse import urlparse
 import aiohttp
 from bs4 import BeautifulSoup, Comment
-from loguru import logger
 from content_core.common import ProcessSourceState
+from content_core.logging import logger
 # future: better extraction methods
 # https://github.com/buriy/python-readability

content_core/processors/video.py CHANGED Viewed

@@ -4,9 +4,8 @@ import os
 import subprocess
 from functools import partial
-from loguru import logger
 from content_core.common import ProcessSourceState
+from content_core.logging import logger
 async def extract_audio_from_video(input_file, output_file, stream_index):

content_core/processors/youtube.py CHANGED Viewed

@@ -3,12 +3,12 @@ import ssl
 import aiohttp
 from bs4 import BeautifulSoup
-from loguru import logger
 from youtube_transcript_api import YouTubeTranscriptApi  # type: ignore
 from youtube_transcript_api.formatters import TextFormatter  # type: ignore
 from content_core.common import ProcessSourceState
 from content_core.common.exceptions import NoTranscriptFound
+from content_core.logging import logger
 ssl._create_default_https_context = ssl._create_unverified_context

content_core/prompter.py CHANGED Viewed

@@ -10,9 +10,10 @@ from typing import Any, Dict, Optional, Union
 from dotenv import load_dotenv
 from jinja2 import Environment, FileSystemLoader, Template
 from langchain_core.prompts import ChatPromptTemplate
-from loguru import logger
 from pydantic import BaseModel
+from content_core.logging import logger
 load_dotenv()
 prompt_path_default = os.path.join(
@@ -155,3 +156,4 @@ class Prompter:
             self.template, Template
         ), "Prompter template is not a Jinja2 Template"
         return self.template.render(render_data)
+        return self.template.render(render_data)

content_core/templated_message.py CHANGED Viewed

@@ -4,7 +4,7 @@ from esperanto import LanguageModel
 from esperanto.common_types import Message
 from pydantic import BaseModel, Field
-from content_core.config import DEFAULT_MODEL
+from content_core.models import ModelFactory
 from content_core.prompter import Prompter
@@ -28,7 +28,7 @@ async def templated_message(
     input: TemplatedMessageInput, model: Optional[LanguageModel] = None
 ) -> str:
     if not model:
-        model = DEFAULT_MODEL
+        model = ModelFactory.get_model('default_model')
     msgs = []
     if input.system_prompt_template or input.system_prompt_text:

{content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: content-core
-Version: 0.2.0
+Version: 0.3.1
 Summary: Extract what matters from any media source
 Author-email: LUIS NOVO <lfnovo@gmail.com>
 License-File: LICENSE
@@ -161,6 +161,27 @@ csum https://example.com
 csum document.txt
 ```
+## Quick Start
+You can quickly integrate `content-core` into your Python projects to extract, clean, and summarize content from various sources.
+```python
+import content_core as cc
+# Extract content from a URL, file, or text
+result = await cc.extract("https://example.com/article")
+# Clean messy content
+cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...")
+# Summarize content with optional context
+summary = await cc.summarize_content("long article text", context="explain to a child")
+```
+## Documentation
+For more information on how to use the Content Core library, including details on AI model configuration and customization, refer to our [Usage Documentation](docs/usage.md).
 ## Using with Langchain
 For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.

content_core-0.3.1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,38 @@
+content_core/__init__.py,sha256=sBCcvRJ-9u5htV5AdptlYPNO0R8NmAex2K1XAkJAoL0,6474
+content_core/config.py,sha256=sy7UtMhMldLVzg-tvGQYV3pkv9OdokIZQ0jp9RXd06g,749
+content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
+content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
+content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
+content_core/prompter.py,sha256=-ShuSyHvK50xlgsAFfA9AnAJV-LlzWwmbPDq2wUZRcI,5793
+content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
+content_core/templated_message.py,sha256=iWz-TwWq08mspgZW3EgIGf7HqtW1tXuTDpo9FkNwixQ,1729
+content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
+content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
+content_core/common/state.py,sha256=GrBtXKCz3c9SlMBfVxcTZ9szCrtQtz8PEo6E7Ihy6vY,867
+content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
+content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCrefU,171
+content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
+content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
+content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
+content_core/content/extraction/graph.py,sha256=4-yZDYErUvnPsgoBM3zmpGFg347-cbwJ4_VeyMmAYj4,4635
+content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
+content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
+content_core/notebooks/run.ipynb,sha256=U_-SXsEmMNiNhFiZXtQeEeSnVn1NF4q9Xd6XOUpcjqg,330371
+content_core/processors/audio.py,sha256=jDn0_6F5dLcmz_C-iR80uOqOIAz49ELya2R5JeM15vo,3538
+content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
+content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
+content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
+content_core/processors/url.py,sha256=u2qgGLe9n58RtGXMBf1d31rwMIgyogg7Btn-AEl8KQU,6282
+content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
+content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
+content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
+content_core/prompts/content/summarize.jinja,sha256=zLPbomfjA-tQZr-c_rOqvKhd55R8NN3Q2gLyLR1sKso,817
+content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
+content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
+content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
+content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
+content_core-0.3.1.dist-info/METADATA,sha256=NZ3ELDGt4ZTQtkpW5Mmnz-susXKBf1aFA9SEesgvC-I,9111
+content_core-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+content_core-0.3.1.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
+content_core-0.3.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
+content_core-0.3.1.dist-info/RECORD,,

content_core-0.2.0.dist-info/RECORD DELETED Viewed

@@ -1,35 +0,0 @@
-content_core/__init__.py,sha256=CAPVVm3mDl8dH3j7Pn7t7UJqdwwpNjy77SzF7acssFw,6352
-content_core/config.py,sha256=5nFWb-g7DG__OuxSvwK4yCFEC1YCKdE6-rX5Z7c6JSo,794
-content_core/prompter.py,sha256=oXDBww_V-_NR1rQvpEpZwf6NNBlsAMk-hj6yMdkKXRk,5729
-content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
-content_core/templated_message.py,sha256=NSttaX1jL5LYIDlJnabx7baDuySIFIPjFjAX5NZt9pM,1704
-content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
-content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
-content_core/common/state.py,sha256=GrBtXKCz3c9SlMBfVxcTZ9szCrtQtz8PEo6E7Ihy6vY,867
-content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
-content_core/content/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
-content_core/content/cleanup/core.py,sha256=FJupJGiIcZfAEX8Usn8ob2REfOVYFhcL3JmGovdnJOM,506
-content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
-content_core/content/extraction/graph.py,sha256=qrTEl9YDUJJJg7TbBqPjueSvV9oo4_WwAJ-VpWKOYec,4621
-content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
-content_core/content/summary/core.py,sha256=BTzTYhB-5sQmvvDaxnnWwBYcVinMHcnG-fBApcf2cyg,517
-content_core/notebooks/run.ipynb,sha256=s4mIIiYdMfTlutaVlsYjFGwwvVnoVF83UbGT9rgizCA,340220
-content_core/processors/audio.py,sha256=ox-ScigfbBrN9B4MCvdgbYn2d3GBYqf6His0HPrzXDs,3459
-content_core/processors/office.py,sha256=13qNAfqqLwXUT6HNmF8OnxjbfvkhnTRCPoVUctw4w1k,12139
-content_core/processors/pdf.py,sha256=yndt8EGvV5_IxcFbFp4lb4g9T84w6cJ4LPdiTduO7aM,5296
-content_core/processors/text.py,sha256=MiXNILDKcLO5sWTzp-LNJ8yC764_gmUEvDB7GUd7Wys,1145
-content_core/processors/url.py,sha256=LcDB_FcmJMcwqM75DX2ERXcOry98e63WEjwd6l8u3ho,6268
-content_core/processors/video.py,sha256=aEe3M_POENPwI1tK4mUNxSeGewsmjex7lvMmELdcQeo,5183
-content_core/processors/youtube.py,sha256=3DioyLZT-wHmLcJ-vnONjMdZ1qWpeuHhQtsZw2nIE5M,5784
-content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
-content_core/prompts/content/summarize.jinja,sha256=zLPbomfjA-tQZr-c_rOqvKhd55R8NN3Q2gLyLR1sKso,817
-content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
-content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
-content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
-content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
-content_core-0.2.0.dist-info/METADATA,sha256=fXtKo9M6oBAxKPDavDDtMRPyptAqgcK8Qi_PKR1xMTo,8390
-content_core-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-content_core-0.2.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
-content_core-0.2.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
-content_core-0.2.0.dist-info/RECORD,,

{content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

content-core 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

Potentially problematic release.

content-core 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl