content-core 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core/__init__.py +214 -0
- content_core/common/__init__.py +21 -0
- content_core/common/exceptions.py +70 -0
- content_core/common/state.py +30 -0
- content_core/common/utils.py +31 -0
- content_core/config.py +37 -0
- content_core/content/__init__.py +0 -0
- content_core/content/cleanup/__init__.py +5 -0
- content_core/content/cleanup/core.py +15 -0
- content_core/content/extraction/__init__.py +13 -0
- content_core/content/extraction/graph.py +148 -0
- content_core/content/summary/__init__.py +5 -0
- content_core/content/summary/core.py +15 -0
- content_core/notebooks/run.ipynb +558 -0
- content_core/processors/audio.py +106 -0
- content_core/processors/office.py +331 -0
- content_core/processors/pdf.py +170 -0
- content_core/processors/text.py +37 -0
- content_core/processors/url.py +191 -0
- content_core/processors/video.py +167 -0
- content_core/processors/youtube.py +159 -0
- content_core/prompter.py +115 -0
- content_core/py.typed +2 -0
- content_core/templated_message.py +57 -0
- content_core/tools/__init__.py +9 -0
- content_core/tools/cleanup.py +15 -0
- content_core/tools/extract.py +21 -0
- content_core/tools/summarize.py +17 -0
- content_core-0.1.0.dist-info/METADATA +250 -0
- content_core-0.1.0.dist-info/RECORD +32 -0
- content_core-0.1.0.dist-info/WHEEL +4 -0
- content_core-0.1.0.dist-info/entry_points.txt +4 -0
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from langchain_core.tools import tool
|
|
2
|
+
|
|
3
|
+
from content_core.content_cleanup import cleanup_content
|
|
4
|
+
from content_core.common import process_input_content
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@tool
async def cleanup_content_tool(content: str) -> str:
    """
    Clean content. Rewrite paragraphs. Fix grammar and spelling.
    Accepts direct text, URLs, or file paths. If a URL or file path is provided,
    the content will be extracted first before cleaning.
    """
    # Resolve the input first: URLs and file paths are fetched/read so that
    # the cleanup step always operates on plain text.
    resolved_text = await process_input_content(content)
    cleaned = await cleanup_content(resolved_text)
    return cleaned
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
from langchain_core.tools import tool
|
|
4
|
+
|
|
5
|
+
from content_core.extraction import extract_content
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@tool
async def extract_content_tool(file_path_or_url: str) -> Dict:
    """
    Extract title, content and metadata from URLs and Links.

    Args:
        file_path_or_url: URL or file path to extract content from.

    Returns:
        Dict: Extracted content and metadata.
    """
    # Only treat the input as a URL when it carries an explicit http(s)
    # scheme. The previous check, startswith("http"), also matched local
    # file names such as "httpd.conf" and misrouted them to URL extraction.
    if file_path_or_url.startswith(("http://", "https://")):
        return await extract_content({"url": file_path_or_url})
    return await extract_content({"file_path": file_path_or_url})
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
from langchain_core.tools import tool
|
|
4
|
+
|
|
5
|
+
from content_core.content_summary import summarize
|
|
6
|
+
from content_core.common import process_input_content
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@tool
async def summarize_content_tool(content: str, context: Optional[str] = None) -> str:
    """
    Summarize content according to instructions provided via context.
    Accepts direct text, URLs, or file paths. If a URL or file path is provided,
    the content will be extracted first before summarizing.
    """
    # Normalize the input (fetch URL / read file) before summarizing;
    # a missing context is passed through as an empty instruction string.
    resolved_text = await process_input_content(content)
    instructions = context if context is not None else ""
    return await summarize(resolved_text, instructions)
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: content-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract what matters from any media source
|
|
5
|
+
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: aiohttp>=3.11.16
|
|
8
|
+
Requires-Dist: bs4>=0.0.2
|
|
9
|
+
Requires-Dist: dicttoxml>=1.7.16
|
|
10
|
+
Requires-Dist: esperanto>=1.2.0
|
|
11
|
+
Requires-Dist: google-genai>=1.10.0
|
|
12
|
+
Requires-Dist: jinja2>=3.1.6
|
|
13
|
+
Requires-Dist: langdetect>=1.0.9
|
|
14
|
+
Requires-Dist: langgraph>=0.3.29
|
|
15
|
+
Requires-Dist: loguru>=0.7.3
|
|
16
|
+
Requires-Dist: openai>=1.73.0
|
|
17
|
+
Requires-Dist: openpyxl>=3.1.5
|
|
18
|
+
Requires-Dist: pandas>=2.2.3
|
|
19
|
+
Requires-Dist: pydub>=0.25.1
|
|
20
|
+
Requires-Dist: pymupdf>=1.25.5
|
|
21
|
+
Requires-Dist: python-docx>=1.1.2
|
|
22
|
+
Requires-Dist: python-dotenv>=1.1.0
|
|
23
|
+
Requires-Dist: python-magic>=0.4.27
|
|
24
|
+
Requires-Dist: python-pptx>=1.0.2
|
|
25
|
+
Requires-Dist: validators>=0.34.0
|
|
26
|
+
Requires-Dist: youtube-transcript-api>=1.0.3
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# Content Core
|
|
30
|
+
|
|
31
|
+
[](https://opensource.org/licenses/MIT)
|
|
32
|
+
|
|
33
|
+
**Content Core** is a versatile Python library designed to extract and process content from various sources, providing a unified interface for handling text, web pages, and local files.
|
|
34
|
+
|
|
35
|
+
## Overview
|
|
36
|
+
|
|
37
|
+
The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
|
|
38
|
+
|
|
39
|
+
## Key Features
|
|
40
|
+
|
|
41
|
+
* **Multi-Source Extraction:** Handles content from:
|
|
42
|
+
* Direct text strings.
|
|
43
|
+
* Web URLs (using robust extraction methods).
|
|
44
|
+
* Local files (including automatic transcription for video/audio files and parsing for text-based formats).
|
|
45
|
+
* **Intelligent Processing:** Applies appropriate extraction techniques based on the source type.
|
|
46
|
+
* **Content Cleaning (Optional):** Likely integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
|
|
47
|
+
* **Asynchronous:** Built with `asyncio` for efficient I/O operations.
|
|
48
|
+
|
|
49
|
+
## Getting Started
|
|
50
|
+
|
|
51
|
+
### Installation
|
|
52
|
+
|
|
53
|
+
Install Content Core using `pip`:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
# Install the package
|
|
57
|
+
pip install content-core
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Alternatively, if you’re developing locally:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# Clone the repository
|
|
64
|
+
git clone https://github.com/lfnovo/content-core
|
|
65
|
+
cd content-core
|
|
66
|
+
|
|
67
|
+
# Install with uv
|
|
68
|
+
uv sync
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Command-Line Interface
|
|
72
|
+
|
|
73
|
+
Content Core provides three CLI commands for extracting, cleaning, and summarizing content:
|
|
74
|
+
ccore, cclean, and csum. These commands support input from text, URLs, files, or piped data (e.g., via cat file | command).
|
|
75
|
+
|
|
76
|
+
#### ccore - Extract Content
|
|
77
|
+
|
|
78
|
+
Extracts content from text, URLs, or files, with optional formatting.
|
|
79
|
+
Usage:
|
|
80
|
+
```bash
|
|
81
|
+
ccore [-f|--format xml|json|text] [-d|--debug] [content]
|
|
82
|
+
```
|
|
83
|
+
Options:
|
|
84
|
+
- `-f`, `--format`: Output format (xml, json, or text). Default: text.
|
|
85
|
+
- `-d`, `--debug`: Enable debug logging.
|
|
86
|
+
- `content`: Input content (text, URL, or file path). If omitted, reads from stdin.
|
|
87
|
+
|
|
88
|
+
Examples:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
# Extract from a URL as text
|
|
92
|
+
ccore https://example.com
|
|
93
|
+
|
|
94
|
+
# Extract from a file as JSON
|
|
95
|
+
ccore -f json document.pdf
|
|
96
|
+
|
|
97
|
+
# Extract from piped text as XML
|
|
98
|
+
echo "Sample text" | ccore --format xml
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
#### cclean - Clean Content
|
|
102
|
+
Cleans content by removing unnecessary formatting, spaces, or artifacts. Accepts text, JSON, XML input, URLs, or file paths.
|
|
103
|
+
Usage:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
cclean [-d|--debug] [content]
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Options:
|
|
110
|
+
- `-d`, `--debug`: Enable debug logging.
|
|
111
|
+
- `content`: Input content to clean (text, URL, file path, JSON, or XML). If omitted, reads from stdin.
|
|
112
|
+
|
|
113
|
+
Examples:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Clean a text string
|
|
117
|
+
cclean " messy text "
|
|
118
|
+
|
|
119
|
+
# Clean piped JSON
|
|
120
|
+
echo '{"content": " messy text "}' | cclean
|
|
121
|
+
|
|
122
|
+
# Clean content from a URL
|
|
123
|
+
cclean https://example.com
|
|
124
|
+
|
|
125
|
+
# Clean a file’s content
|
|
126
|
+
cclean document.txt
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
#### csum - Summarize Content
|
|
130
|
+
|
|
131
|
+
Summarizes content with an optional context to guide the summary style. Accepts text, JSON, XML input, URLs, or file paths.
|
|
132
|
+
|
|
133
|
+
Usage:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
csum [--context "context text"] [-d|--debug] [content]
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Options:
|
|
140
|
+
- `--context`: Context for summarization (e.g., "explain to a child"). Default: none.
|
|
141
|
+
- `-d`, `--debug`: Enable debug logging.
|
|
142
|
+
- `content`: Input content to summarize (text, URL, file path, JSON, or XML). If omitted, reads from stdin.
|
|
143
|
+
|
|
144
|
+
Examples:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
# Summarize text
|
|
148
|
+
csum "AI is transforming industries."
|
|
149
|
+
|
|
150
|
+
# Summarize with context
|
|
151
|
+
csum --context "in bullet points" "AI is transforming industries."
|
|
152
|
+
|
|
153
|
+
# Summarize piped content
|
|
154
|
+
cat article.txt | csum --context "one sentence"
|
|
155
|
+
|
|
156
|
+
# Summarize content from URL
|
|
157
|
+
csum https://example.com
|
|
158
|
+
|
|
159
|
+
# Summarize a file's content
|
|
160
|
+
csum document.txt
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Using with Langchain
|
|
164
|
+
|
|
165
|
+
For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
|
|
166
|
+
|
|
167
|
+
You can import and use these tools like any other Langchain tool. For example:
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
from content_core.tools import extract_content_tool, cleanup_content_tool, summarize_content_tool
|
|
171
|
+
from langchain.agents import initialize_agent, AgentType
|
|
172
|
+
|
|
173
|
+
tools = [extract_content_tool, cleanup_content_tool, summarize_content_tool]
|
|
174
|
+
agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
|
|
175
|
+
agent.run("Extract the content from https://example.com and then summarize it.")
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Refer to the source code in `src/content_core/tools` for specific tool implementations and usage details.
|
|
179
|
+
|
|
180
|
+
## Basic Usage
|
|
181
|
+
|
|
182
|
+
The core functionality revolves around the `extract_content` function.
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
import asyncio
|
|
186
|
+
from content_core.extraction import extract_content
|
|
187
|
+
|
|
188
|
+
async def main():
|
|
189
|
+
# Extract from raw text
|
|
190
|
+
text_data = await extract_content({"content": "This is my sample text content."})
|
|
191
|
+
print(text_data)
|
|
192
|
+
|
|
193
|
+
# Extract from a URL
|
|
194
|
+
url_data = await extract_content({"url": "https://www.example.com"})
|
|
195
|
+
print(url_data)
|
|
196
|
+
|
|
197
|
+
# Extract from a local video file (gets transcript)
|
|
198
|
+
video_data = await extract_content({"file_path": "path/to/your/video.mp4"})
|
|
199
|
+
print(video_data)
|
|
200
|
+
|
|
201
|
+
# Extract from a local markdown file
|
|
202
|
+
md_data = await extract_content({"file_path": "path/to/your/document.md"})
|
|
203
|
+
print(md_data)
|
|
204
|
+
|
|
205
|
+
if __name__ == "__main__":
|
|
206
|
+
asyncio.run(main())
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
(See `src/content_core/notebooks/run.ipynb` for more detailed examples.)
|
|
210
|
+
|
|
211
|
+
## Configuration
|
|
212
|
+
|
|
213
|
+
Configuration settings (like API keys for external services, logging levels) can be managed through environment variables or `.env` files, loaded automatically via `python-dotenv`.
|
|
214
|
+
|
|
215
|
+
Example `.env`:
|
|
216
|
+
|
|
217
|
+
```plaintext
|
|
218
|
+
OPENAI_API_KEY=your-key-here
|
|
219
|
+
GOOGLE_API_KEY=your-key-here
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## Development
|
|
223
|
+
|
|
224
|
+
To set up a development environment:
|
|
225
|
+
|
|
226
|
+
```bash
|
|
227
|
+
# Clone the repository
|
|
228
|
+
git clone <repository-url>
|
|
229
|
+
cd content-core
|
|
230
|
+
|
|
231
|
+
# Create virtual environment and install dependencies
|
|
232
|
+
uv venv
|
|
233
|
+
source .venv/bin/activate
|
|
234
|
+
uv sync --group dev
|
|
235
|
+
|
|
236
|
+
# Run tests
|
|
237
|
+
make test
|
|
238
|
+
|
|
239
|
+
# Lint code
|
|
240
|
+
make lint
|
|
241
|
+
|
|
242
|
+
# See all commands
|
|
243
|
+
make help
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Contributing
|
|
247
|
+
Contributions are welcome! Please follow standard practices (fork, feature branch, pull request).
|
|
248
|
+
|
|
249
|
+
## License
|
|
250
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
content_core/__init__.py,sha256=CAPVVm3mDl8dH3j7Pn7t7UJqdwwpNjy77SzF7acssFw,6352
|
|
2
|
+
content_core/config.py,sha256=5nFWb-g7DG__OuxSvwK4yCFEC1YCKdE6-rX5Z7c6JSo,794
|
|
3
|
+
content_core/prompter.py,sha256=aUm_Bz_pkQuXIMKB1Xe6OEE-y4AUNoNsfFy82fAU-Ss,4049
|
|
4
|
+
content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
|
|
5
|
+
content_core/templated_message.py,sha256=NSttaX1jL5LYIDlJnabx7baDuySIFIPjFjAX5NZt9pM,1704
|
|
6
|
+
content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
|
|
7
|
+
content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
|
|
8
|
+
content_core/common/state.py,sha256=GrBtXKCz3c9SlMBfVxcTZ9szCrtQtz8PEo6E7Ihy6vY,867
|
|
9
|
+
content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
|
|
10
|
+
content_core/content/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
|
|
12
|
+
content_core/content/cleanup/core.py,sha256=FJupJGiIcZfAEX8Usn8ob2REfOVYFhcL3JmGovdnJOM,506
|
|
13
|
+
content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
|
|
14
|
+
content_core/content/extraction/graph.py,sha256=qrTEl9YDUJJJg7TbBqPjueSvV9oo4_WwAJ-VpWKOYec,4621
|
|
15
|
+
content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
|
|
16
|
+
content_core/content/summary/core.py,sha256=BTzTYhB-5sQmvvDaxnnWwBYcVinMHcnG-fBApcf2cyg,517
|
|
17
|
+
content_core/notebooks/run.ipynb,sha256=MmZGVl62b8S7FpsFPAUKbB7ndEtyFXY-JeihaeT5CII,375888
|
|
18
|
+
content_core/processors/audio.py,sha256=ox-ScigfbBrN9B4MCvdgbYn2d3GBYqf6His0HPrzXDs,3459
|
|
19
|
+
content_core/processors/office.py,sha256=13qNAfqqLwXUT6HNmF8OnxjbfvkhnTRCPoVUctw4w1k,12139
|
|
20
|
+
content_core/processors/pdf.py,sha256=yndt8EGvV5_IxcFbFp4lb4g9T84w6cJ4LPdiTduO7aM,5296
|
|
21
|
+
content_core/processors/text.py,sha256=MiXNILDKcLO5sWTzp-LNJ8yC764_gmUEvDB7GUd7Wys,1145
|
|
22
|
+
content_core/processors/url.py,sha256=LcDB_FcmJMcwqM75DX2ERXcOry98e63WEjwd6l8u3ho,6268
|
|
23
|
+
content_core/processors/video.py,sha256=aEe3M_POENPwI1tK4mUNxSeGewsmjex7lvMmELdcQeo,5183
|
|
24
|
+
content_core/processors/youtube.py,sha256=RdkMWVV3iy4HMZfkT_eq_fUfSbwbimN8LrKTcdti0JA,5700
|
|
25
|
+
content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
|
|
26
|
+
content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
|
|
27
|
+
content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
|
|
28
|
+
content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
|
|
29
|
+
content_core-0.1.0.dist-info/METADATA,sha256=tkjNmmkdSfe3Mt64qOFicBwmHDAnHCEC0fbZW7wFMJo,7305
|
|
30
|
+
content_core-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
31
|
+
content_core-0.1.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
|
|
32
|
+
content_core-0.1.0.dist-info/RECORD,,
|