content-core 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/__init__.py +13 -13
- content_core/config.py +23 -33
- content_core/content/__init__.py +5 -0
- content_core/content/cleanup/core.py +2 -2
- content_core/content/extraction/graph.py +1 -1
- content_core/content/summary/core.py +2 -2
- content_core/logging.py +15 -0
- content_core/models.py +24 -0
- content_core/models_config.yaml +27 -0
- content_core/notebooks/run.ipynb +101 -145
- content_core/processors/audio.py +5 -3
- content_core/processors/office.py +1 -1
- content_core/processors/pdf.py +2 -4
- content_core/processors/text.py +1 -2
- content_core/processors/url.py +1 -1
- content_core/processors/video.py +1 -2
- content_core/processors/youtube.py +1 -1
- content_core/prompter.py +3 -1
- content_core/templated_message.py +2 -2
- {content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/METADATA +22 -1
- content_core-0.3.1.dist-info/RECORD +38 -0
- content_core-0.2.0.dist-info/RECORD +0 -35
- {content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/WHEEL +0 -0
- {content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/entry_points.txt +0 -0
- {content_core-0.2.0.dist-info → content_core-0.3.1.dist-info}/licenses/LICENSE +0 -0
content_core/processors/audio.py
CHANGED
|
@@ -2,11 +2,12 @@ import asyncio
|
|
|
2
2
|
import os
|
|
3
3
|
from functools import partial
|
|
4
4
|
from math import ceil
|
|
5
|
-
|
|
6
|
-
from loguru import logger
|
|
5
|
+
|
|
7
6
|
from pydub import AudioSegment
|
|
8
7
|
|
|
9
8
|
from content_core.common import ProcessSourceState
|
|
9
|
+
from content_core.logging import logger
|
|
10
|
+
from content_core.models import ModelFactory
|
|
10
11
|
|
|
11
12
|
# todo: remove reference to model_manager
|
|
12
13
|
# future: parallelize the transcription process
|
|
@@ -79,8 +80,9 @@ async def extract_audio(data: ProcessSourceState):
|
|
|
79
80
|
audio_files = await split_audio(input_audio_path)
|
|
80
81
|
|
|
81
82
|
# Transcribe all segments concurrently
|
|
83
|
+
speech_to_text_model = ModelFactory.get_model("speech_to_text")
|
|
82
84
|
transcribe_tasks = [
|
|
83
|
-
transcribe_audio_segment(audio_file,
|
|
85
|
+
transcribe_audio_segment(audio_file, speech_to_text_model)
|
|
84
86
|
for audio_file in audio_files
|
|
85
87
|
]
|
|
86
88
|
transcriptions = await asyncio.gather(*transcribe_tasks)
|
|
@@ -2,11 +2,11 @@ import asyncio
|
|
|
2
2
|
from functools import partial
|
|
3
3
|
|
|
4
4
|
from docx import Document # type: ignore
|
|
5
|
-
from loguru import logger
|
|
6
5
|
from openpyxl import load_workbook # type: ignore
|
|
7
6
|
from pptx import Presentation # type: ignore
|
|
8
7
|
|
|
9
8
|
from content_core.common import ProcessSourceState
|
|
9
|
+
from content_core.logging import logger
|
|
10
10
|
|
|
11
11
|
SUPPORTED_OFFICE_TYPES = [
|
|
12
12
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
content_core/processors/pdf.py
CHANGED
|
@@ -3,11 +3,9 @@ import re
|
|
|
3
3
|
import unicodedata
|
|
4
4
|
|
|
5
5
|
import fitz # type: ignore
|
|
6
|
-
from loguru import logger
|
|
7
6
|
|
|
8
|
-
from content_core.common import (
|
|
9
|
-
|
|
10
|
-
)
|
|
7
|
+
from content_core.common import ProcessSourceState
|
|
8
|
+
from content_core.logging import logger
|
|
11
9
|
|
|
12
10
|
# todo: find tables - https://pymupdf.readthedocs.io/en/latest/the-basics.html#extracting-tables-from-a-page
|
|
13
11
|
# todo: what else can we do to make the text more readable?
|
content_core/processors/text.py
CHANGED
content_core/processors/url.py
CHANGED
|
@@ -3,9 +3,9 @@ from urllib.parse import urlparse
|
|
|
3
3
|
|
|
4
4
|
import aiohttp
|
|
5
5
|
from bs4 import BeautifulSoup, Comment
|
|
6
|
-
from loguru import logger
|
|
7
6
|
|
|
8
7
|
from content_core.common import ProcessSourceState
|
|
8
|
+
from content_core.logging import logger
|
|
9
9
|
|
|
10
10
|
# future: better extraction methods
|
|
11
11
|
# https://github.com/buriy/python-readability
|
content_core/processors/video.py
CHANGED
|
@@ -4,9 +4,8 @@ import os
|
|
|
4
4
|
import subprocess
|
|
5
5
|
from functools import partial
|
|
6
6
|
|
|
7
|
-
from loguru import logger
|
|
8
|
-
|
|
9
7
|
from content_core.common import ProcessSourceState
|
|
8
|
+
from content_core.logging import logger
|
|
10
9
|
|
|
11
10
|
|
|
12
11
|
async def extract_audio_from_video(input_file, output_file, stream_index):
|
|
@@ -3,12 +3,12 @@ import ssl
|
|
|
3
3
|
|
|
4
4
|
import aiohttp
|
|
5
5
|
from bs4 import BeautifulSoup
|
|
6
|
-
from loguru import logger
|
|
7
6
|
from youtube_transcript_api import YouTubeTranscriptApi # type: ignore
|
|
8
7
|
from youtube_transcript_api.formatters import TextFormatter # type: ignore
|
|
9
8
|
|
|
10
9
|
from content_core.common import ProcessSourceState
|
|
11
10
|
from content_core.common.exceptions import NoTranscriptFound
|
|
11
|
+
from content_core.logging import logger
|
|
12
12
|
|
|
13
13
|
ssl._create_default_https_context = ssl._create_unverified_context
|
|
14
14
|
|
content_core/prompter.py
CHANGED
|
@@ -10,9 +10,10 @@ from typing import Any, Dict, Optional, Union
|
|
|
10
10
|
from dotenv import load_dotenv
|
|
11
11
|
from jinja2 import Environment, FileSystemLoader, Template
|
|
12
12
|
from langchain_core.prompts import ChatPromptTemplate
|
|
13
|
-
from loguru import logger
|
|
14
13
|
from pydantic import BaseModel
|
|
15
14
|
|
|
15
|
+
from content_core.logging import logger
|
|
16
|
+
|
|
16
17
|
load_dotenv()
|
|
17
18
|
|
|
18
19
|
prompt_path_default = os.path.join(
|
|
@@ -155,3 +156,4 @@ class Prompter:
|
|
|
155
156
|
self.template, Template
|
|
156
157
|
), "Prompter template is not a Jinja2 Template"
|
|
157
158
|
return self.template.render(render_data)
|
|
159
|
+
return self.template.render(render_data)
|
|
@@ -4,7 +4,7 @@ from esperanto import LanguageModel
|
|
|
4
4
|
from esperanto.common_types import Message
|
|
5
5
|
from pydantic import BaseModel, Field
|
|
6
6
|
|
|
7
|
-
from content_core.
|
|
7
|
+
from content_core.models import ModelFactory
|
|
8
8
|
from content_core.prompter import Prompter
|
|
9
9
|
|
|
10
10
|
|
|
@@ -28,7 +28,7 @@ async def templated_message(
|
|
|
28
28
|
input: TemplatedMessageInput, model: Optional[LanguageModel] = None
|
|
29
29
|
) -> str:
|
|
30
30
|
if not model:
|
|
31
|
-
model =
|
|
31
|
+
model = ModelFactory.get_model('default_model')
|
|
32
32
|
|
|
33
33
|
msgs = []
|
|
34
34
|
if input.system_prompt_template or input.system_prompt_text:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: content-core
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Extract what matters from any media source
|
|
5
5
|
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
6
|
License-File: LICENSE
|
|
@@ -161,6 +161,27 @@ csum https://example.com
|
|
|
161
161
|
csum document.txt
|
|
162
162
|
```
|
|
163
163
|
|
|
164
|
+
## Quick Start
|
|
165
|
+
|
|
166
|
+
You can quickly integrate `content-core` into your Python projects to extract, clean, and summarize content from various sources.
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
import content_core as cc
|
|
170
|
+
|
|
171
|
+
# Extract content from a URL, file, or text
|
|
172
|
+
result = await cc.extract("https://example.com/article")
|
|
173
|
+
|
|
174
|
+
# Clean messy content
|
|
175
|
+
cleaned_text = await cc.clean("...messy text with [brackets] and extra spaces...")
|
|
176
|
+
|
|
177
|
+
# Summarize content with optional context
|
|
178
|
+
summary = await cc.summarize_content("long article text", context="explain to a child")
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Documentation
|
|
182
|
+
|
|
183
|
+
For more information on how to use the Content Core library, including details on AI model configuration and customization, refer to our [Usage Documentation](docs/usage.md).
|
|
184
|
+
|
|
164
185
|
## Using with Langchain
|
|
165
186
|
|
|
166
187
|
For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
content_core/__init__.py,sha256=sBCcvRJ-9u5htV5AdptlYPNO0R8NmAex2K1XAkJAoL0,6474
|
|
2
|
+
content_core/config.py,sha256=sy7UtMhMldLVzg-tvGQYV3pkv9OdokIZQ0jp9RXd06g,749
|
|
3
|
+
content_core/logging.py,sha256=oeRdWKknEolptopxF1IvnEGEc0ZUw45QXYUEZ71GcdY,438
|
|
4
|
+
content_core/models.py,sha256=FBV_tV6cmI0F82WfcA6xHag-YMsxI1dIbDGWG-3Eq_Y,935
|
|
5
|
+
content_core/models_config.yaml,sha256=Yr-GS94ffxnkaWojUfpErUMM7m_MShsYjR6QuDjMzwo,444
|
|
6
|
+
content_core/prompter.py,sha256=-ShuSyHvK50xlgsAFfA9AnAJV-LlzWwmbPDq2wUZRcI,5793
|
|
7
|
+
content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
|
|
8
|
+
content_core/templated_message.py,sha256=iWz-TwWq08mspgZW3EgIGf7HqtW1tXuTDpo9FkNwixQ,1729
|
|
9
|
+
content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
|
|
10
|
+
content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
|
|
11
|
+
content_core/common/state.py,sha256=GrBtXKCz3c9SlMBfVxcTZ9szCrtQtz8PEo6E7Ihy6vY,867
|
|
12
|
+
content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
|
|
13
|
+
content_core/content/__init__.py,sha256=ymocLXXwWnnhQFHCB3jXanNvJ2m27TVs1yO8EhCrefU,171
|
|
14
|
+
content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
|
|
15
|
+
content_core/content/cleanup/core.py,sha256=AXUGUWxGob8si5uKRnDrreOcHV_gbGJr4YnRsNm2GX0,531
|
|
16
|
+
content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
|
|
17
|
+
content_core/content/extraction/graph.py,sha256=4-yZDYErUvnPsgoBM3zmpGFg347-cbwJ4_VeyMmAYj4,4635
|
|
18
|
+
content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
|
|
19
|
+
content_core/content/summary/core.py,sha256=LejUbPxnRD0sbO6MupiIb-IHLxEUGU5beBZwmIiBncc,542
|
|
20
|
+
content_core/notebooks/run.ipynb,sha256=U_-SXsEmMNiNhFiZXtQeEeSnVn1NF4q9Xd6XOUpcjqg,330371
|
|
21
|
+
content_core/processors/audio.py,sha256=jDn0_6F5dLcmz_C-iR80uOqOIAz49ELya2R5JeM15vo,3538
|
|
22
|
+
content_core/processors/office.py,sha256=DXkfmjqUhmhP6rJaO5Z5Y9sv-iK0zaPZ3waynFIPtsk,12153
|
|
23
|
+
content_core/processors/pdf.py,sha256=9jf-eROAqw6yQwdlbsxPXsaJXY26hVG7nSTPH9n4afY,5301
|
|
24
|
+
content_core/processors/text.py,sha256=kKHA60-NYjLmCTYUnk8TdJxQQ0Shkg-K61Ezqaelz7k,1158
|
|
25
|
+
content_core/processors/url.py,sha256=u2qgGLe9n58RtGXMBf1d31rwMIgyogg7Btn-AEl8KQU,6282
|
|
26
|
+
content_core/processors/video.py,sha256=3WnZwTswvTLm8PtQhKwoqJ2BH6YZi62dMUjALwJiebo,5196
|
|
27
|
+
content_core/processors/youtube.py,sha256=nM286Km7FLN0r1f-n-dRkqs6mSXxCo4YOhTeGzj7Suo,5798
|
|
28
|
+
content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
|
|
29
|
+
content_core/prompts/content/summarize.jinja,sha256=zLPbomfjA-tQZr-c_rOqvKhd55R8NN3Q2gLyLR1sKso,817
|
|
30
|
+
content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
|
|
31
|
+
content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
|
|
32
|
+
content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
|
|
33
|
+
content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
|
|
34
|
+
content_core-0.3.1.dist-info/METADATA,sha256=NZ3ELDGt4ZTQtkpW5Mmnz-susXKBf1aFA9SEesgvC-I,9111
|
|
35
|
+
content_core-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
36
|
+
content_core-0.3.1.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
|
|
37
|
+
content_core-0.3.1.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
|
|
38
|
+
content_core-0.3.1.dist-info/RECORD,,
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
content_core/__init__.py,sha256=CAPVVm3mDl8dH3j7Pn7t7UJqdwwpNjy77SzF7acssFw,6352
|
|
2
|
-
content_core/config.py,sha256=5nFWb-g7DG__OuxSvwK4yCFEC1YCKdE6-rX5Z7c6JSo,794
|
|
3
|
-
content_core/prompter.py,sha256=oXDBww_V-_NR1rQvpEpZwf6NNBlsAMk-hj6yMdkKXRk,5729
|
|
4
|
-
content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
|
|
5
|
-
content_core/templated_message.py,sha256=NSttaX1jL5LYIDlJnabx7baDuySIFIPjFjAX5NZt9pM,1704
|
|
6
|
-
content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
|
|
7
|
-
content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
|
|
8
|
-
content_core/common/state.py,sha256=GrBtXKCz3c9SlMBfVxcTZ9szCrtQtz8PEo6E7Ihy6vY,867
|
|
9
|
-
content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
|
|
10
|
-
content_core/content/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
-
content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
|
|
12
|
-
content_core/content/cleanup/core.py,sha256=FJupJGiIcZfAEX8Usn8ob2REfOVYFhcL3JmGovdnJOM,506
|
|
13
|
-
content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
|
|
14
|
-
content_core/content/extraction/graph.py,sha256=qrTEl9YDUJJJg7TbBqPjueSvV9oo4_WwAJ-VpWKOYec,4621
|
|
15
|
-
content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
|
|
16
|
-
content_core/content/summary/core.py,sha256=BTzTYhB-5sQmvvDaxnnWwBYcVinMHcnG-fBApcf2cyg,517
|
|
17
|
-
content_core/notebooks/run.ipynb,sha256=s4mIIiYdMfTlutaVlsYjFGwwvVnoVF83UbGT9rgizCA,340220
|
|
18
|
-
content_core/processors/audio.py,sha256=ox-ScigfbBrN9B4MCvdgbYn2d3GBYqf6His0HPrzXDs,3459
|
|
19
|
-
content_core/processors/office.py,sha256=13qNAfqqLwXUT6HNmF8OnxjbfvkhnTRCPoVUctw4w1k,12139
|
|
20
|
-
content_core/processors/pdf.py,sha256=yndt8EGvV5_IxcFbFp4lb4g9T84w6cJ4LPdiTduO7aM,5296
|
|
21
|
-
content_core/processors/text.py,sha256=MiXNILDKcLO5sWTzp-LNJ8yC764_gmUEvDB7GUd7Wys,1145
|
|
22
|
-
content_core/processors/url.py,sha256=LcDB_FcmJMcwqM75DX2ERXcOry98e63WEjwd6l8u3ho,6268
|
|
23
|
-
content_core/processors/video.py,sha256=aEe3M_POENPwI1tK4mUNxSeGewsmjex7lvMmELdcQeo,5183
|
|
24
|
-
content_core/processors/youtube.py,sha256=3DioyLZT-wHmLcJ-vnONjMdZ1qWpeuHhQtsZw2nIE5M,5784
|
|
25
|
-
content_core/prompts/content/cleanup.jinja,sha256=elyjbm9O_AeOcxkG-kui5wjBIRiOQCicjm92I4NmoVA,693
|
|
26
|
-
content_core/prompts/content/summarize.jinja,sha256=zLPbomfjA-tQZr-c_rOqvKhd55R8NN3Q2gLyLR1sKso,817
|
|
27
|
-
content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
|
|
28
|
-
content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
|
|
29
|
-
content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
|
|
30
|
-
content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
|
|
31
|
-
content_core-0.2.0.dist-info/METADATA,sha256=fXtKo9M6oBAxKPDavDDtMRPyptAqgcK8Qi_PKR1xMTo,8390
|
|
32
|
-
content_core-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
33
|
-
content_core-0.2.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
|
|
34
|
-
content_core-0.2.0.dist-info/licenses/LICENSE,sha256=myj0z2T4qIkenCgLsRfx7Wk6UqCQNj5c7O14Qx4zpGg,1066
|
|
35
|
-
content_core-0.2.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|