content-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of content-core might be problematic. Click here for more details.
- content_core-0.1.0/.github/PULL_REQUEST_TEMPLATE.md +22 -0
- content_core-0.1.0/.github/workflows/publish.yml +38 -0
- content_core-0.1.0/.gitignore +21 -0
- content_core-0.1.0/.python-version +1 -0
- content_core-0.1.0/.windsurfrules +1 -0
- content_core-0.1.0/Makefile +8 -0
- content_core-0.1.0/PKG-INFO +250 -0
- content_core-0.1.0/README.md +222 -0
- content_core-0.1.0/prompts/content/cleanup.jinja +16 -0
- content_core-0.1.0/prompts/content/summarize.jinja +25 -0
- content_core-0.1.0/pyproject.toml +60 -0
- content_core-0.1.0/src/content_core/__init__.py +214 -0
- content_core-0.1.0/src/content_core/common/__init__.py +21 -0
- content_core-0.1.0/src/content_core/common/exceptions.py +70 -0
- content_core-0.1.0/src/content_core/common/state.py +30 -0
- content_core-0.1.0/src/content_core/common/utils.py +31 -0
- content_core-0.1.0/src/content_core/config.py +37 -0
- content_core-0.1.0/src/content_core/content/__init__.py +0 -0
- content_core-0.1.0/src/content_core/content/cleanup/__init__.py +5 -0
- content_core-0.1.0/src/content_core/content/cleanup/core.py +15 -0
- content_core-0.1.0/src/content_core/content/extraction/__init__.py +13 -0
- content_core-0.1.0/src/content_core/content/extraction/graph.py +148 -0
- content_core-0.1.0/src/content_core/content/summary/__init__.py +5 -0
- content_core-0.1.0/src/content_core/content/summary/core.py +15 -0
- content_core-0.1.0/src/content_core/notebooks/run.ipynb +558 -0
- content_core-0.1.0/src/content_core/processors/audio.py +106 -0
- content_core-0.1.0/src/content_core/processors/office.py +331 -0
- content_core-0.1.0/src/content_core/processors/pdf.py +170 -0
- content_core-0.1.0/src/content_core/processors/text.py +37 -0
- content_core-0.1.0/src/content_core/processors/url.py +191 -0
- content_core-0.1.0/src/content_core/processors/video.py +167 -0
- content_core-0.1.0/src/content_core/processors/youtube.py +159 -0
- content_core-0.1.0/src/content_core/prompter.py +115 -0
- content_core-0.1.0/src/content_core/py.typed +2 -0
- content_core-0.1.0/src/content_core/templated_message.py +57 -0
- content_core-0.1.0/src/content_core/tools/__init__.py +9 -0
- content_core-0.1.0/src/content_core/tools/cleanup.py +15 -0
- content_core-0.1.0/src/content_core/tools/extract.py +21 -0
- content_core-0.1.0/src/content_core/tools/summarize.py +17 -0
- content_core-0.1.0/tests/input_content/file.docx +0 -0
- content_core-0.1.0/tests/input_content/file.epub +0 -0
- content_core-0.1.0/tests/input_content/file.md +73 -0
- content_core-0.1.0/tests/input_content/file.mp3 +0 -0
- content_core-0.1.0/tests/input_content/file.mp4 +0 -0
- content_core-0.1.0/tests/input_content/file.pdf +0 -0
- content_core-0.1.0/tests/input_content/file.pptx +0 -0
- content_core-0.1.0/tests/input_content/file.txt +73 -0
- content_core-0.1.0/tests/input_content/file.xlsx +0 -0
- content_core-0.1.0/tests/input_content/file_audio.mp3 +0 -0
- content_core-0.1.0/tests/integration/test_extraction.py +202 -0
- content_core-0.1.0/uv.lock +2714 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
<!--
|
|
2
|
+
Thanks for contributing a pull request! Please ensure you have taken a look at CONTRIBUTING.md
|
|
3
|
+
-->
|
|
4
|
+
|
|
5
|
+
#### Reference Issues/PRs
|
|
6
|
+
<!--
|
|
7
|
+
Example: Fixes #1234. See also #3456.
|
|
8
|
+
Please use keywords (e.g., Fixes) to create link to the issues or pull requests
|
|
9
|
+
you resolved, so that they will automatically be closed when your pull request
|
|
10
|
+
is merged. See https://github.com/blog/1506-closing-issues-via-pull-requests
|
|
11
|
+
-->
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
#### What does this implement/fix? Explain your changes.
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
#### Any other comments?
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
<!--
|
|
21
|
+
Thanks for contributing!
|
|
22
|
+
-->
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- 'v*'
|
|
7
|
+
- '[0-9]*.[0-9]*.[0-9]*'
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
publish:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
|
|
13
|
+
steps:
|
|
14
|
+
- uses: actions/checkout@v4
|
|
15
|
+
|
|
16
|
+
- name: Set up Python
|
|
17
|
+
uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.10"
|
|
20
|
+
|
|
21
|
+
- name: Install uv
|
|
22
|
+
run: |
|
|
23
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
24
|
+
echo "$HOME/.cargo/bin" >> $GITHUB_PATH
|
|
25
|
+
|
|
26
|
+
- name: Install dependencies and build tools
|
|
27
|
+
run: |
|
|
28
|
+
uv venv
|
|
29
|
+
rm -rf dist
|
|
30
|
+
uv sync
|
|
31
|
+
|
|
32
|
+
- name: Build package
|
|
33
|
+
run: uv build
|
|
34
|
+
|
|
35
|
+
- name: Publish to PyPI
|
|
36
|
+
env:
|
|
37
|
+
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
|
|
38
|
+
run: uv publish --token "$PYPI_TOKEN"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.10
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Also use uv as the package manager: uv run, uv sync, uv add.
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: content-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract what matters from any media source
|
|
5
|
+
Author-email: LUIS NOVO <lfnovo@gmail.com>
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: aiohttp>=3.11.16
|
|
8
|
+
Requires-Dist: bs4>=0.0.2
|
|
9
|
+
Requires-Dist: dicttoxml>=1.7.16
|
|
10
|
+
Requires-Dist: esperanto>=1.2.0
|
|
11
|
+
Requires-Dist: google-genai>=1.10.0
|
|
12
|
+
Requires-Dist: jinja2>=3.1.6
|
|
13
|
+
Requires-Dist: langdetect>=1.0.9
|
|
14
|
+
Requires-Dist: langgraph>=0.3.29
|
|
15
|
+
Requires-Dist: loguru>=0.7.3
|
|
16
|
+
Requires-Dist: openai>=1.73.0
|
|
17
|
+
Requires-Dist: openpyxl>=3.1.5
|
|
18
|
+
Requires-Dist: pandas>=2.2.3
|
|
19
|
+
Requires-Dist: pydub>=0.25.1
|
|
20
|
+
Requires-Dist: pymupdf>=1.25.5
|
|
21
|
+
Requires-Dist: python-docx>=1.1.2
|
|
22
|
+
Requires-Dist: python-dotenv>=1.1.0
|
|
23
|
+
Requires-Dist: python-magic>=0.4.27
|
|
24
|
+
Requires-Dist: python-pptx>=1.0.2
|
|
25
|
+
Requires-Dist: validators>=0.34.0
|
|
26
|
+
Requires-Dist: youtube-transcript-api>=1.0.3
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# Content Core
|
|
30
|
+
|
|
31
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
32
|
+
|
|
33
|
+
**Content Core** is a versatile Python library designed to extract and process content from various sources, providing a unified interface for handling text, web pages, and local files.
|
|
34
|
+
|
|
35
|
+
## Overview
|
|
36
|
+
|
|
37
|
+
The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
|
|
38
|
+
|
|
39
|
+
## Key Features
|
|
40
|
+
|
|
41
|
+
* **Multi-Source Extraction:** Handles content from:
|
|
42
|
+
* Direct text strings.
|
|
43
|
+
* Web URLs (using robust extraction methods).
|
|
44
|
+
* Local files (including automatic transcription for video/audio files and parsing for text-based formats).
|
|
45
|
+
* **Intelligent Processing:** Applies appropriate extraction techniques based on the source type.
|
|
46
|
+
* **Content Cleaning (Optional):** Integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
|
|
47
|
+
* **Asynchronous:** Built with `asyncio` for efficient I/O operations.
|
|
48
|
+
|
|
49
|
+
## Getting Started
|
|
50
|
+
|
|
51
|
+
### Installation
|
|
52
|
+
|
|
53
|
+
Install Content Core using `pip`:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
# Install the package
|
|
57
|
+
pip install content-core
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Alternatively, if you’re developing locally:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# Clone the repository
|
|
64
|
+
git clone https://github.com/lfnovo/content-core
|
|
65
|
+
cd content-core
|
|
66
|
+
|
|
67
|
+
# Install with uv
|
|
68
|
+
uv sync
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Command-Line Interface
|
|
72
|
+
|
|
73
|
+
Content Core provides three CLI commands for extracting, cleaning, and summarizing content:
|
|
74
|
+
`ccore`, `cclean`, and `csum`. These commands support input from text, URLs, files, or piped data (e.g., via `cat file | command`).
|
|
75
|
+
|
|
76
|
+
#### ccore - Extract Content
|
|
77
|
+
|
|
78
|
+
Extracts content from text, URLs, or files, with optional formatting.
|
|
79
|
+
Usage:
|
|
80
|
+
```bash
|
|
81
|
+
ccore [-f|--format xml|json|text] [-d|--debug] [content]
|
|
82
|
+
```
|
|
83
|
+
Options:
|
|
84
|
+
- `-f`, `--format`: Output format (xml, json, or text). Default: text.
|
|
85
|
+
- `-d`, `--debug`: Enable debug logging.
|
|
86
|
+
- `content`: Input content (text, URL, or file path). If omitted, reads from stdin.
|
|
87
|
+
|
|
88
|
+
Examples:
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
# Extract from a URL as text
|
|
92
|
+
ccore https://example.com
|
|
93
|
+
|
|
94
|
+
# Extract from a file as JSON
|
|
95
|
+
ccore -f json document.pdf
|
|
96
|
+
|
|
97
|
+
# Extract from piped text as XML
|
|
98
|
+
echo "Sample text" | ccore --format xml
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
#### cclean - Clean Content
|
|
102
|
+
Cleans content by removing unnecessary formatting, spaces, or artifacts. Accepts text, JSON, XML input, URLs, or file paths.
|
|
103
|
+
Usage:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
cclean [-d|--debug] [content]
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Options:
|
|
110
|
+
- `-d`, `--debug`: Enable debug logging.
|
|
111
|
+
- `content`: Input content to clean (text, URL, file path, JSON, or XML). If omitted, reads from stdin.
|
|
112
|
+
|
|
113
|
+
Examples:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
# Clean a text string
|
|
117
|
+
cclean " messy text "
|
|
118
|
+
|
|
119
|
+
# Clean piped JSON
|
|
120
|
+
echo '{"content": " messy text "}' | cclean
|
|
121
|
+
|
|
122
|
+
# Clean content from a URL
|
|
123
|
+
cclean https://example.com
|
|
124
|
+
|
|
125
|
+
# Clean a file’s content
|
|
126
|
+
cclean document.txt
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
#### csum - Summarize Content
|
|
130
|
+
|
|
131
|
+
Summarizes content with an optional context to guide the summary style. Accepts text, JSON, XML input, URLs, or file paths.
|
|
132
|
+
|
|
133
|
+
Usage:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
csum [--context "context text"] [-d|--debug] [content]
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Options:
|
|
140
|
+
- `--context`: Context for summarization (e.g., "explain to a child"). Default: none.
|
|
141
|
+
- `-d`, `--debug`: Enable debug logging.
|
|
142
|
+
- `content`: Input content to summarize (text, URL, file path, JSON, or XML). If omitted, reads from stdin.
|
|
143
|
+
|
|
144
|
+
Examples:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
# Summarize text
|
|
148
|
+
csum "AI is transforming industries."
|
|
149
|
+
|
|
150
|
+
# Summarize with context
|
|
151
|
+
csum --context "in bullet points" "AI is transforming industries."
|
|
152
|
+
|
|
153
|
+
# Summarize piped content
|
|
154
|
+
cat article.txt | csum --context "one sentence"
|
|
155
|
+
|
|
156
|
+
# Summarize content from URL
|
|
157
|
+
csum https://example.com
|
|
158
|
+
|
|
159
|
+
# Summarize a file's content
|
|
160
|
+
csum document.txt
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Using with Langchain
|
|
164
|
+
|
|
165
|
+
For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
|
|
166
|
+
|
|
167
|
+
You can import and use these tools like any other Langchain tool. For example:
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
from content_core.tools import extract_content_tool, cleanup_content_tool, summarize_content_tool
|
|
171
|
+
from langchain.agents import initialize_agent, AgentType
|
|
172
|
+
|
|
173
|
+
tools = [extract_content_tool, cleanup_content_tool, summarize_content_tool]
|
|
174
|
+
agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
|
|
175
|
+
agent.run("Extract the content from https://example.com and then summarize it.")
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Refer to the source code in `src/content_core/tools` for specific tool implementations and usage details.
|
|
179
|
+
|
|
180
|
+
## Basic Usage
|
|
181
|
+
|
|
182
|
+
The core functionality revolves around the `extract_content` function.
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
import asyncio
|
|
186
|
+
from content_core.content.extraction import extract_content
|
|
187
|
+
|
|
188
|
+
async def main():
|
|
189
|
+
# Extract from raw text
|
|
190
|
+
text_data = await extract_content({"content": "This is my sample text content."})
|
|
191
|
+
print(text_data)
|
|
192
|
+
|
|
193
|
+
# Extract from a URL
|
|
194
|
+
url_data = await extract_content({"url": "https://www.example.com"})
|
|
195
|
+
print(url_data)
|
|
196
|
+
|
|
197
|
+
# Extract from a local video file (gets transcript)
|
|
198
|
+
video_data = await extract_content({"file_path": "path/to/your/video.mp4"})
|
|
199
|
+
print(video_data)
|
|
200
|
+
|
|
201
|
+
# Extract from a local markdown file
|
|
202
|
+
md_data = await extract_content({"file_path": "path/to/your/document.md"})
|
|
203
|
+
print(md_data)
|
|
204
|
+
|
|
205
|
+
if __name__ == "__main__":
|
|
206
|
+
asyncio.run(main())
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
(See `src/content_core/notebooks/run.ipynb` for more detailed examples.)
|
|
210
|
+
|
|
211
|
+
## Configuration
|
|
212
|
+
|
|
213
|
+
Configuration settings (like API keys for external services, logging levels) can be managed through environment variables or `.env` files, loaded automatically via `python-dotenv`.
|
|
214
|
+
|
|
215
|
+
Example `.env`:
|
|
216
|
+
|
|
217
|
+
```plaintext
|
|
218
|
+
OPENAI_API_KEY=your-key-here
|
|
219
|
+
GOOGLE_API_KEY=your-key-here
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## Development
|
|
223
|
+
|
|
224
|
+
To set up a development environment:
|
|
225
|
+
|
|
226
|
+
```bash
|
|
227
|
+
# Clone the repository
|
|
228
|
+
git clone https://github.com/lfnovo/content-core
|
|
229
|
+
cd content-core
|
|
230
|
+
|
|
231
|
+
# Create virtual environment and install dependencies
|
|
232
|
+
uv venv
|
|
233
|
+
source .venv/bin/activate
|
|
234
|
+
uv sync --group dev
|
|
235
|
+
|
|
236
|
+
# Run tests
|
|
237
|
+
make test
|
|
238
|
+
|
|
239
|
+
# Lint code
|
|
240
|
+
make lint
|
|
241
|
+
|
|
242
|
+
# See all commands
|
|
243
|
+
make help
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
## Contributing
|
|
247
|
+
Contributions are welcome! Please follow standard practices (fork, feature branch, pull request).
|
|
248
|
+
|
|
249
|
+
## License
|
|
250
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
# Content Core
|
|
2
|
+
|
|
3
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
4
|
+
|
|
5
|
+
**Content Core** is a versatile Python library designed to extract and process content from various sources, providing a unified interface for handling text, web pages, and local files.
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
|
|
10
|
+
|
|
11
|
+
## Key Features
|
|
12
|
+
|
|
13
|
+
* **Multi-Source Extraction:** Handles content from:
|
|
14
|
+
* Direct text strings.
|
|
15
|
+
* Web URLs (using robust extraction methods).
|
|
16
|
+
* Local files (including automatic transcription for video/audio files and parsing for text-based formats).
|
|
17
|
+
* **Intelligent Processing:** Applies appropriate extraction techniques based on the source type.
|
|
18
|
+
* **Content Cleaning (Optional):** Integrates with LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
|
|
19
|
+
* **Asynchronous:** Built with `asyncio` for efficient I/O operations.
|
|
20
|
+
|
|
21
|
+
## Getting Started
|
|
22
|
+
|
|
23
|
+
### Installation
|
|
24
|
+
|
|
25
|
+
Install Content Core using `pip`:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
# Install the package
|
|
29
|
+
pip install content-core
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Alternatively, if you’re developing locally:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# Clone the repository
|
|
36
|
+
git clone https://github.com/lfnovo/content-core
|
|
37
|
+
cd content-core
|
|
38
|
+
|
|
39
|
+
# Install with uv
|
|
40
|
+
uv sync
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Command-Line Interface
|
|
44
|
+
|
|
45
|
+
Content Core provides three CLI commands for extracting, cleaning, and summarizing content:
|
|
46
|
+
`ccore`, `cclean`, and `csum`. These commands support input from text, URLs, files, or piped data (e.g., via `cat file | command`).
|
|
47
|
+
|
|
48
|
+
#### ccore - Extract Content
|
|
49
|
+
|
|
50
|
+
Extracts content from text, URLs, or files, with optional formatting.
|
|
51
|
+
Usage:
|
|
52
|
+
```bash
|
|
53
|
+
ccore [-f|--format xml|json|text] [-d|--debug] [content]
|
|
54
|
+
```
|
|
55
|
+
Options:
|
|
56
|
+
- `-f`, `--format`: Output format (xml, json, or text). Default: text.
|
|
57
|
+
- `-d`, `--debug`: Enable debug logging.
|
|
58
|
+
- `content`: Input content (text, URL, or file path). If omitted, reads from stdin.
|
|
59
|
+
|
|
60
|
+
Examples:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# Extract from a URL as text
|
|
64
|
+
ccore https://example.com
|
|
65
|
+
|
|
66
|
+
# Extract from a file as JSON
|
|
67
|
+
ccore -f json document.pdf
|
|
68
|
+
|
|
69
|
+
# Extract from piped text as XML
|
|
70
|
+
echo "Sample text" | ccore --format xml
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
#### cclean - Clean Content
|
|
74
|
+
Cleans content by removing unnecessary formatting, spaces, or artifacts. Accepts text, JSON, XML input, URLs, or file paths.
|
|
75
|
+
Usage:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
cclean [-d|--debug] [content]
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Options:
|
|
82
|
+
- `-d`, `--debug`: Enable debug logging.
|
|
83
|
+
- `content`: Input content to clean (text, URL, file path, JSON, or XML). If omitted, reads from stdin.
|
|
84
|
+
|
|
85
|
+
Examples:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# Clean a text string
|
|
89
|
+
cclean " messy text "
|
|
90
|
+
|
|
91
|
+
# Clean piped JSON
|
|
92
|
+
echo '{"content": " messy text "}' | cclean
|
|
93
|
+
|
|
94
|
+
# Clean content from a URL
|
|
95
|
+
cclean https://example.com
|
|
96
|
+
|
|
97
|
+
# Clean a file’s content
|
|
98
|
+
cclean document.txt
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
#### csum - Summarize Content
|
|
102
|
+
|
|
103
|
+
Summarizes content with an optional context to guide the summary style. Accepts text, JSON, XML input, URLs, or file paths.
|
|
104
|
+
|
|
105
|
+
Usage:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
csum [--context "context text"] [-d|--debug] [content]
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Options:
|
|
112
|
+
- `--context`: Context for summarization (e.g., "explain to a child"). Default: none.
|
|
113
|
+
- `-d`, `--debug`: Enable debug logging.
|
|
114
|
+
- `content`: Input content to summarize (text, URL, file path, JSON, or XML). If omitted, reads from stdin.
|
|
115
|
+
|
|
116
|
+
Examples:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# Summarize text
|
|
120
|
+
csum "AI is transforming industries."
|
|
121
|
+
|
|
122
|
+
# Summarize with context
|
|
123
|
+
csum --context "in bullet points" "AI is transforming industries."
|
|
124
|
+
|
|
125
|
+
# Summarize piped content
|
|
126
|
+
cat article.txt | csum --context "one sentence"
|
|
127
|
+
|
|
128
|
+
# Summarize content from URL
|
|
129
|
+
csum https://example.com
|
|
130
|
+
|
|
131
|
+
# Summarize a file's content
|
|
132
|
+
csum document.txt
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Using with Langchain
|
|
136
|
+
|
|
137
|
+
For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
|
|
138
|
+
|
|
139
|
+
You can import and use these tools like any other Langchain tool. For example:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from content_core.tools import extract_content_tool, cleanup_content_tool, summarize_content_tool
|
|
143
|
+
from langchain.agents import initialize_agent, AgentType
|
|
144
|
+
|
|
145
|
+
tools = [extract_content_tool, cleanup_content_tool, summarize_content_tool]
|
|
146
|
+
agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
|
|
147
|
+
agent.run("Extract the content from https://example.com and then summarize it.")
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
Refer to the source code in `src/content_core/tools` for specific tool implementations and usage details.
|
|
151
|
+
|
|
152
|
+
## Basic Usage
|
|
153
|
+
|
|
154
|
+
The core functionality revolves around the `extract_content` function.
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
import asyncio
|
|
158
|
+
from content_core.content.extraction import extract_content
|
|
159
|
+
|
|
160
|
+
async def main():
|
|
161
|
+
# Extract from raw text
|
|
162
|
+
text_data = await extract_content({"content": "This is my sample text content."})
|
|
163
|
+
print(text_data)
|
|
164
|
+
|
|
165
|
+
# Extract from a URL
|
|
166
|
+
url_data = await extract_content({"url": "https://www.example.com"})
|
|
167
|
+
print(url_data)
|
|
168
|
+
|
|
169
|
+
# Extract from a local video file (gets transcript)
|
|
170
|
+
video_data = await extract_content({"file_path": "path/to/your/video.mp4"})
|
|
171
|
+
print(video_data)
|
|
172
|
+
|
|
173
|
+
# Extract from a local markdown file
|
|
174
|
+
md_data = await extract_content({"file_path": "path/to/your/document.md"})
|
|
175
|
+
print(md_data)
|
|
176
|
+
|
|
177
|
+
if __name__ == "__main__":
|
|
178
|
+
asyncio.run(main())
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
(See `src/content_core/notebooks/run.ipynb` for more detailed examples.)
|
|
182
|
+
|
|
183
|
+
## Configuration
|
|
184
|
+
|
|
185
|
+
Configuration settings (like API keys for external services, logging levels) can be managed through environment variables or `.env` files, loaded automatically via `python-dotenv`.
|
|
186
|
+
|
|
187
|
+
Example `.env`:
|
|
188
|
+
|
|
189
|
+
```plaintext
|
|
190
|
+
OPENAI_API_KEY=your-key-here
|
|
191
|
+
GOOGLE_API_KEY=your-key-here
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Development
|
|
195
|
+
|
|
196
|
+
To set up a development environment:
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
# Clone the repository
|
|
200
|
+
git clone https://github.com/lfnovo/content-core
|
|
201
|
+
cd content-core
|
|
202
|
+
|
|
203
|
+
# Create virtual environment and install dependencies
|
|
204
|
+
uv venv
|
|
205
|
+
source .venv/bin/activate
|
|
206
|
+
uv sync --group dev
|
|
207
|
+
|
|
208
|
+
# Run tests
|
|
209
|
+
make test
|
|
210
|
+
|
|
211
|
+
# Lint code
|
|
212
|
+
make lint
|
|
213
|
+
|
|
214
|
+
# See all commands
|
|
215
|
+
make help
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## Contributing
|
|
219
|
+
Contributions are welcome! Please follow standard practices (fork, feature branch, pull request).
|
|
220
|
+
|
|
221
|
+
## License
|
|
222
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# GOAL
|
|
2
|
+
|
|
3
|
+
Adjust the content below to make it clean and readable:
|
|
4
|
+
Remove repeated strings that do not add value to the text.
|
|
5
|
+
|
|
6
|
+
Remove any content unrelated to the text itself (e.g., metadata, artifacts, or extraction errors).
|
|
7
|
+
|
|
8
|
+
Format the output as unstructured but clear text.
|
|
9
|
+
|
|
10
|
+
Do not add extra text, introductions, conclusions, or commentary—only rewrite the provided content as it is.
|
|
11
|
+
|
|
12
|
+
Do not interpret, analyze, or alter the meaning, intent, or narrative of the text—just reformat it for clarity and readability.
|
|
13
|
+
|
|
14
|
+
Do not change the text structure, do not write conclusions about it. Your only job is to make it readable.
|
|
15
|
+
|
|
16
|
+
Keep the text in its original language, regardless of what it is.
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
You are an AI assistant for a personal study platform.
|
|
2
|
+
|
|
3
|
+
In this platform, your user collects various articles and content from the Internet for reference and study.
|
|
4
|
+
|
|
5
|
+
Your role is to summarize the selected content as densely as possible, helping the reader extract maximum value from it without reading the full text.
|
|
6
|
+
Focus solely on the content's value, avoiding unnecessary comments or messages.
|
|
7
|
+
|
|
8
|
+
The summary should be dense, rich in characters, and designed to create a powerful vector representation.
|
|
9
|
+
If the user provided additional context, follow its instructions. Otherwise, summarize the whole content.
|
|
10
|
+
|
|
11
|
+
Do not return any acknowledgments or greetings—only the summary.
|
|
12
|
+
|
|
13
|
+
CONTENT:
|
|
14
|
+
|
|
15
|
+
{{ content }}
|
|
16
|
+
|
|
17
|
+
{% if context %}
|
|
18
|
+
CONTEXT:
|
|
19
|
+
|
|
20
|
+
The user has provided the following additional context for your task:
|
|
21
|
+
{{context}}
|
|
22
|
+
{% endif%}
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
SUMMARY:
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "content-core"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Extract what matters from any media source"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "LUIS NOVO", email = "lfnovo@gmail.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"aiohttp>=3.11.16",
|
|
12
|
+
"bs4>=0.0.2",
|
|
13
|
+
"esperanto>=1.2.0",
|
|
14
|
+
"google-genai>=1.10.0",
|
|
15
|
+
"jinja2>=3.1.6",
|
|
16
|
+
"langdetect>=1.0.9",
|
|
17
|
+
"loguru>=0.7.3",
|
|
18
|
+
"openai>=1.73.0",
|
|
19
|
+
"openpyxl>=3.1.5",
|
|
20
|
+
"pandas>=2.2.3",
|
|
21
|
+
"pydub>=0.25.1",
|
|
22
|
+
"pymupdf>=1.25.5",
|
|
23
|
+
"python-docx>=1.1.2",
|
|
24
|
+
"python-dotenv>=1.1.0",
|
|
25
|
+
"python-magic>=0.4.27",
|
|
26
|
+
"python-pptx>=1.0.2",
|
|
27
|
+
"youtube-transcript-api>=1.0.3",
|
|
28
|
+
"langgraph>=0.3.29",
|
|
29
|
+
"dicttoxml>=1.7.16",
|
|
30
|
+
"validators>=0.34.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
ccore = "content_core:ccore"
|
|
35
|
+
cclean = "content_core:cclean"
|
|
36
|
+
csum = "content_core:csum"
|
|
37
|
+
|
|
38
|
+
[tool.hatch.metadata]
|
|
39
|
+
allow-direct-references = true
|
|
40
|
+
|
|
41
|
+
[build-system]
|
|
42
|
+
requires = ["hatchling", "pip"]
|
|
43
|
+
build-backend = "hatchling.build"
|
|
44
|
+
|
|
45
|
+
[tool.setuptools]
|
|
46
|
+
package-dir = {"content_core" = "src/content_core"}
|
|
47
|
+
|
|
48
|
+
[dependency-groups]
|
|
49
|
+
dev = [
|
|
50
|
+
"ipykernel>=4.0.1",
|
|
51
|
+
"ipywidgets>=4.0.0",
|
|
52
|
+
"pyperclip>=1.9.0",
|
|
53
|
+
"pytest>=7.2.0",
|
|
54
|
+
"pytest-asyncio>=0.21.0",
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
[tool.pytest.ini_options]
|
|
58
|
+
pythonpath = ["src"]
|
|
59
|
+
asyncio_mode = "auto"
|
|
60
|
+
asyncio_default_fixture_loop_scope = "function"
|