content-core 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

@@ -0,0 +1,9 @@
1
+ from .cleanup import cleanup_content_tool
2
+ from .extract import extract_content_tool
3
+ from .summarize import summarize_content_tool
4
+
5
+ __all__ = [
6
+ "cleanup_content_tool",
7
+ "extract_content_tool",
8
+ "summarize_content_tool",
9
+ ]
@@ -0,0 +1,15 @@
1
+ from langchain_core.tools import tool
2
+
3
+ from content_core.content_cleanup import cleanup_content
4
+ from content_core.common import process_input_content
5
+
6
+
7
@tool
async def cleanup_content_tool(content: str) -> str:
    """
    Clean content. Rewrite paragraphs. Fix grammar and spelling.
    Accepts direct text, URLs, or file paths. If a URL or file path is provided,
    the content will be extracted first before cleaning.
    """
    # NOTE: the docstring above is kept verbatim on purpose -- the `@tool`
    # decorator publishes it as the tool description seen by the agent.

    # Step 1: hand the raw input to the shared input helper (per the docstring,
    # this is where URLs / file paths get extracted into text).
    resolved_text = await process_input_content(content)

    # Step 2: run the cleanup routine on the resolved text and return its result.
    cleaned_text = await cleanup_content(resolved_text)
    return cleaned_text
@@ -0,0 +1,21 @@
1
+ from typing import Dict
2
+
3
+ from langchain_core.tools import tool
4
+
5
+ from content_core.extraction import extract_content
6
+
7
+
8
@tool
async def extract_content_tool(file_path_or_url: str) -> Dict:
    """
    Extract title, content and metadata from a URL or a local file path.

    Args:
        file_path_or_url: URL or file path to extract content from.

    Returns:
        Dict: Extracted content and metadata.
    """
    # Match the complete scheme prefix, case-insensitively. A bare "http"
    # prefix check would misroute local paths such as "http_dump/page.html"
    # to the URL branch and would send "HTTPS://..." inputs to the file branch.
    is_web_url = file_path_or_url.lower().startswith(("http://", "https://"))

    # The original value is forwarded untouched; only the routing key differs.
    source_key = "url" if is_web_url else "file_path"
    return await extract_content({source_key: file_path_or_url})
@@ -0,0 +1,17 @@
1
+ from typing import Optional
2
+
3
+ from langchain_core.tools import tool
4
+
5
+ from content_core.content_summary import summarize
6
+ from content_core.common import process_input_content
7
+
8
+
9
@tool
async def summarize_content_tool(content: str, context: Optional[str] = None) -> str:
    """
    Summarize content according to instructions provided via context.
    Accepts direct text, URLs, or file paths. If a URL or file path is provided,
    the content will be extracted first before summarizing.
    """
    # NOTE: the docstring above is kept verbatim on purpose -- the `@tool`
    # decorator publishes it as the tool description seen by the agent.

    # Hand the raw input to the shared input helper before summarizing.
    source_text = await process_input_content(content)

    # A missing (None) or empty context is forwarded as an empty string.
    instructions = context if context else ""
    return await summarize(source_text, instructions)
@@ -0,0 +1,250 @@
1
+ Metadata-Version: 2.4
2
+ Name: content-core
3
+ Version: 0.1.0
4
+ Summary: Extract what matters from any media source
5
+ Author-email: LUIS NOVO <lfnovo@gmail.com>
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: aiohttp>=3.11.16
8
+ Requires-Dist: bs4>=0.0.2
9
+ Requires-Dist: dicttoxml>=1.7.16
10
+ Requires-Dist: esperanto>=1.2.0
11
+ Requires-Dist: google-genai>=1.10.0
12
+ Requires-Dist: jinja2>=3.1.6
13
+ Requires-Dist: langdetect>=1.0.9
14
+ Requires-Dist: langgraph>=0.3.29
15
+ Requires-Dist: loguru>=0.7.3
16
+ Requires-Dist: openai>=1.73.0
17
+ Requires-Dist: openpyxl>=3.1.5
18
+ Requires-Dist: pandas>=2.2.3
19
+ Requires-Dist: pydub>=0.25.1
20
+ Requires-Dist: pymupdf>=1.25.5
21
+ Requires-Dist: python-docx>=1.1.2
22
+ Requires-Dist: python-dotenv>=1.1.0
23
+ Requires-Dist: python-magic>=0.4.27
24
+ Requires-Dist: python-pptx>=1.0.2
25
+ Requires-Dist: validators>=0.34.0
26
+ Requires-Dist: youtube-transcript-api>=1.0.3
27
+ Description-Content-Type: text/markdown
28
+
29
+ # Content Core
30
+
31
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
32
+
33
+ **Content Core** is a versatile Python library designed to extract and process content from various sources, providing a unified interface for handling text, web pages, and local files.
34
+
35
+ ## Overview
36
+
37
+ The primary goal of Content Core is to simplify the process of ingesting content from diverse origins. Whether you have raw text, a URL pointing to an article, or a local file like a video or markdown document, Content Core aims to extract the meaningful content for further use.
38
+
39
+ ## Key Features
40
+
41
+ * **Multi-Source Extraction:** Handles content from:
42
+ * Direct text strings.
43
+ * Web URLs (using robust extraction methods).
44
+ * Local files (including automatic transcription for video/audio files and parsing for text-based formats).
45
+ * **Intelligent Processing:** Applies appropriate extraction techniques based on the source type.
46
+ * **Content Cleaning (Optional):** Uses LLMs (via `prompter.py` and Jinja templates) to refine and clean the extracted content.
47
+ * **Asynchronous:** Built with `asyncio` for efficient I/O operations.
48
+
49
+ ## Getting Started
50
+
51
+ ### Installation
52
+
53
+ Install Content Core using `pip`:
54
+
55
+ ```bash
56
+ # Install the package
57
+ pip install content-core
58
+ ```
59
+
60
+ Alternatively, if you’re developing locally:
61
+
62
+ ```bash
63
+ # Clone the repository
64
+ git clone https://github.com/lfnovo/content-core
65
+ cd content-core
66
+
67
+ # Install with uv
68
+ uv sync
69
+ ```
70
+
71
+ ### Command-Line Interface
72
+
73
+ Content Core provides three CLI commands for extracting, cleaning, and summarizing content:
74
+ `ccore`, `cclean`, and `csum`. These commands support input from text, URLs, files, or piped data (e.g., via `cat file | command`).
75
+
76
+ #### ccore - Extract Content
77
+
78
+ Extracts content from text, URLs, or files, with optional formatting.
79
+ Usage:
80
+ ```bash
81
+ ccore [-f|--format xml|json|text] [-d|--debug] [content]
82
+ ```
83
+ Options:
84
+ - `-f`, `--format`: Output format (xml, json, or text). Default: text.
85
+ - `-d`, `--debug`: Enable debug logging.
86
+ - `content`: Input content (text, URL, or file path). If omitted, reads from stdin.
87
+
88
+ Examples:
89
+
90
+ ```bash
91
+ # Extract from a URL as text
92
+ ccore https://example.com
93
+
94
+ # Extract from a file as JSON
95
+ ccore -f json document.pdf
96
+
97
+ # Extract from piped text as XML
98
+ echo "Sample text" | ccore --format xml
99
+ ```
100
+
101
+ #### cclean - Clean Content
102
+ Cleans content by removing unnecessary formatting, spaces, or artifacts. Accepts text, JSON, XML input, URLs, or file paths.
103
+ Usage:
104
+
105
+ ```bash
106
+ cclean [-d|--debug] [content]
107
+ ```
108
+
109
+ Options:
110
+ - `-d`, `--debug`: Enable debug logging.
111
+ - `content`: Input content to clean (text, URL, file path, JSON, or XML). If omitted, reads from stdin.
112
+
113
+ Examples:
114
+
115
+ ```bash
116
+ # Clean a text string
117
+ cclean " messy text "
118
+
119
+ # Clean piped JSON
120
+ echo '{"content": " messy text "}' | cclean
121
+
122
+ # Clean content from a URL
123
+ cclean https://example.com
124
+
125
+ # Clean a file’s content
126
+ cclean document.txt
127
+ ```
128
+
129
+ #### csum - Summarize Content
130
+
131
+ Summarizes content with an optional context to guide the summary style. Accepts text, JSON, XML input, URLs, or file paths.
132
+
133
+ Usage:
134
+
135
+ ```bash
136
+ csum [--context "context text"] [-d|--debug] [content]
137
+ ```
138
+
139
+ Options:
140
+ - `--context`: Context for summarization (e.g., "explain to a child"). Default: none.
141
+ - `-d`, `--debug`: Enable debug logging.
142
+ - `content`: Input content to summarize (text, URL, file path, JSON, or XML). If omitted, reads from stdin.
143
+
144
+ Examples:
145
+
146
+ ```bash
147
+ # Summarize text
148
+ csum "AI is transforming industries."
149
+
150
+ # Summarize with context
151
+ csum --context "in bullet points" "AI is transforming industries."
152
+
153
+ # Summarize piped content
154
+ cat article.txt | csum --context "one sentence"
155
+
156
+ # Summarize content from URL
157
+ csum https://example.com
158
+
159
+ # Summarize a file's content
160
+ csum document.txt
161
+ ```
162
+
163
+ ## Using with Langchain
164
+
165
+ For users integrating with the [Langchain](https://python.langchain.com/) framework, `content-core` exposes a set of compatible tools. These tools, located in the `src/content_core/tools` directory, allow you to leverage `content-core` extraction, cleaning, and summarization capabilities directly within your Langchain agents and chains.
166
+
167
+ You can import and use these tools like any other Langchain tool. For example:
168
+
169
+ ```python
170
+ from content_core.tools import extract_content_tool, cleanup_content_tool, summarize_content_tool
171
+ from langchain.agents import initialize_agent, AgentType
172
+
173
+ tools = [extract_content_tool, cleanup_content_tool, summarize_content_tool]
174
+ agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)
175
+ agent.run("Extract the content from https://example.com and then summarize it.")
176
+ ```
177
+
178
+ Refer to the source code in `src/content_core/tools` for specific tool implementations and usage details.
179
+
180
+ ## Basic Usage
181
+
182
+ The core functionality revolves around the `extract_content` function.
183
+
184
+ ```python
185
+ import asyncio
186
+ from content_core.extraction import extract_content
187
+
188
+ async def main():
189
+ # Extract from raw text
190
+ text_data = await extract_content({"content": "This is my sample text content."})
191
+ print(text_data)
192
+
193
+ # Extract from a URL
194
+ url_data = await extract_content({"url": "https://www.example.com"})
195
+ print(url_data)
196
+
197
+ # Extract from a local video file (gets transcript)
198
+ video_data = await extract_content({"file_path": "path/to/your/video.mp4"})
199
+ print(video_data)
200
+
201
+ # Extract from a local markdown file
202
+ md_data = await extract_content({"file_path": "path/to/your/document.md"})
203
+ print(md_data)
204
+
205
+ if __name__ == "__main__":
206
+ asyncio.run(main())
207
+ ```
208
+
209
+ (See `src/content_core/notebooks/run.ipynb` for more detailed examples.)
210
+
211
+ ## Configuration
212
+
213
+ Configuration settings (like API keys for external services, logging levels) can be managed through environment variables or `.env` files, loaded automatically via `python-dotenv`.
214
+
215
+ Example `.env`:
216
+
217
+ ```plaintext
218
+ OPENAI_API_KEY=your-key-here
219
+ GOOGLE_API_KEY=your-key-here
220
+ ```
221
+
222
+ ## Development
223
+
224
+ To set up a development environment:
225
+
226
+ ```bash
227
+ # Clone the repository
228
+ git clone https://github.com/lfnovo/content-core
229
+ cd content-core
230
+
231
+ # Create virtual environment and install dependencies
232
+ uv venv
233
+ source .venv/bin/activate
234
+ uv sync --group dev
235
+
236
+ # Run tests
237
+ make test
238
+
239
+ # Lint code
240
+ make lint
241
+
242
+ # See all commands
243
+ make help
244
+ ```
245
+
246
+ ## Contributing
247
+ Contributions are welcome! Please follow standard practices (fork, feature branch, pull request).
248
+
249
+ ## License
250
+ This project is licensed under the MIT License - see the LICENSE file for details.
@@ -0,0 +1,32 @@
1
+ content_core/__init__.py,sha256=CAPVVm3mDl8dH3j7Pn7t7UJqdwwpNjy77SzF7acssFw,6352
2
+ content_core/config.py,sha256=5nFWb-g7DG__OuxSvwK4yCFEC1YCKdE6-rX5Z7c6JSo,794
3
+ content_core/prompter.py,sha256=aUm_Bz_pkQuXIMKB1Xe6OEE-y4AUNoNsfFy82fAU-Ss,4049
4
+ content_core/py.typed,sha256=pLuU3XTTeVpXo4UomOjcvAIQqOrzIotlWlJ3KFo2lxQ,154
5
+ content_core/templated_message.py,sha256=NSttaX1jL5LYIDlJnabx7baDuySIFIPjFjAX5NZt9pM,1704
6
+ content_core/common/__init__.py,sha256=SjDp-0QRjX9PMubyTjv77_GrUqm6eC4gBuXr593JVK4,525
7
+ content_core/common/exceptions.py,sha256=NpYedVbckIq4kP2wek7bicMVgGGn0fkhCvid5cIxfy4,1304
8
+ content_core/common/state.py,sha256=GrBtXKCz3c9SlMBfVxcTZ9szCrtQtz8PEo6E7Ihy6vY,867
9
+ content_core/common/utils.py,sha256=0o4jovPEw_6wu7EcPPbDNZskbhhfLUBJBvRmp0Yc4R4,1182
10
+ content_core/content/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ content_core/content/cleanup/__init__.py,sha256=wymD24WLDDdsZrv-5WhparSiHBK9SJCcqBHmokuZqk4,121
12
+ content_core/content/cleanup/core.py,sha256=FJupJGiIcZfAEX8Usn8ob2REfOVYFhcL3JmGovdnJOM,506
13
+ content_core/content/extraction/__init__.py,sha256=TaYw6CAcG62GZfsJxeZ6VJDLP85BU2a7_G271v6WWPk,446
14
+ content_core/content/extraction/graph.py,sha256=qrTEl9YDUJJJg7TbBqPjueSvV9oo4_WwAJ-VpWKOYec,4621
15
+ content_core/content/summary/__init__.py,sha256=ReKCZWKfDtqlInKeh87Y1DEfiNzVWabGybEz3hS2FrI,114
16
+ content_core/content/summary/core.py,sha256=BTzTYhB-5sQmvvDaxnnWwBYcVinMHcnG-fBApcf2cyg,517
17
+ content_core/notebooks/run.ipynb,sha256=MmZGVl62b8S7FpsFPAUKbB7ndEtyFXY-JeihaeT5CII,375888
18
+ content_core/processors/audio.py,sha256=ox-ScigfbBrN9B4MCvdgbYn2d3GBYqf6His0HPrzXDs,3459
19
+ content_core/processors/office.py,sha256=13qNAfqqLwXUT6HNmF8OnxjbfvkhnTRCPoVUctw4w1k,12139
20
+ content_core/processors/pdf.py,sha256=yndt8EGvV5_IxcFbFp4lb4g9T84w6cJ4LPdiTduO7aM,5296
21
+ content_core/processors/text.py,sha256=MiXNILDKcLO5sWTzp-LNJ8yC764_gmUEvDB7GUd7Wys,1145
22
+ content_core/processors/url.py,sha256=LcDB_FcmJMcwqM75DX2ERXcOry98e63WEjwd6l8u3ho,6268
23
+ content_core/processors/video.py,sha256=aEe3M_POENPwI1tK4mUNxSeGewsmjex7lvMmELdcQeo,5183
24
+ content_core/processors/youtube.py,sha256=RdkMWVV3iy4HMZfkT_eq_fUfSbwbimN8LrKTcdti0JA,5700
25
+ content_core/tools/__init__.py,sha256=DuJmd7fE-NpDvLP8IW1XY5MUkAQcdks52rn2jk4N8jQ,231
26
+ content_core/tools/cleanup.py,sha256=5IdKedsFyRQMdYzgFSKtsfyxJldbroXQXHesHICNENI,523
27
+ content_core/tools/extract.py,sha256=-r2_jsuMMXyXxGVqWhh1ilNPo_UMYAbw3Pkp1FzPy5g,577
28
+ content_core/tools/summarize.py,sha256=DPfeglLWB08q8SvHrsKpOKZ35XjduUDs2J02ISwjdj0,596
29
+ content_core-0.1.0.dist-info/METADATA,sha256=tkjNmmkdSfe3Mt64qOFicBwmHDAnHCEC0fbZW7wFMJo,7305
30
+ content_core-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
31
+ content_core-0.1.0.dist-info/entry_points.txt,sha256=9fGQUk6bxBVXj9PRwfWVPn54ClSEJV7J-KBLXtjOhQw,99
32
+ content_core-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,4 @@
1
+ [console_scripts]
2
+ cclean = content_core:cclean
3
+ ccore = content_core:ccore
4
+ csum = content_core:csum