content-core 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of content-core might be problematic. Click here for more details.

@@ -0,0 +1,214 @@
1
+ import argparse
2
+ import asyncio
3
+ import json
4
+ import os
5
+ import sys
6
+ from xml.etree import ElementTree as ET
7
+
8
+ from dicttoxml import dicttoxml # type: ignore
9
+ from dotenv import load_dotenv
10
+ from loguru import logger
11
+
12
+ from content_core.common import ProcessSourceInput
13
+ from content_core.content.cleanup import cleanup_content
14
+ from content_core.content.extraction import extract_content
15
+ from content_core.content.summary import summarize
16
+
17
+ load_dotenv()  # Load settings from a .env file (python-dotenv) when this module is imported.
18
+
19
+ # Configure loguru: replace its default handler with a single stderr sink.
20
+ logger.remove() # Remove default handler
21
+ logger.add(sys.stderr, level="INFO") # Default to INFO level; the CLI --debug flag re-adds the sink at DEBUG
22
+
23
+
24
def parse_content_format(content: str) -> str:
    """Return the ``content`` field of a JSON or XML payload, else the input.

    The input is tried as JSON first and, only when it is not valid JSON,
    as XML. Text that matches neither shape is returned unchanged.

    Args:
        content: Raw text that may be a JSON object or an XML document.

    Returns:
        The stringified ``"content"`` value of a JSON object, or the text of
        the first ``<content>`` element of an XML document (the root element
        included). The original ``content`` is returned when no usable field
        is found or when parsing fails.
    """
    try:
        try:
            payload = json.loads(content)
        except json.JSONDecodeError:
            pass  # Not JSON; fall through to the XML attempt below.
        else:
            # Valid JSON is never retried as XML, whatever its shape.
            if isinstance(payload, dict) and payload.get("content") is not None:
                return str(payload["content"])
            return content

        # NOTE(review): xml.etree.ElementTree is not hardened against
        # maliciously constructed XML; consider defusedxml if this input
        # can come from untrusted sources.
        try:
            root = ET.fromstring(content)
        except ET.ParseError:
            return content

        # iter() also visits the root itself, so a bare
        # <content>...</content> document is handled; the previous
        # find(".//content") only searched below the root.
        node = next(root.iter("content"), None)
        if node is not None and node.text is not None:
            return node.text
        return content
    except Exception as e:
        # Best effort: any unexpected failure (e.g. non-string input) is
        # logged and the input is handed back untouched.
        logger.error(f"Error parsing content: {e}")
        return content
46
+
47
+
48
def get_content(args, parser, allow_empty=False):
    """Return the content given on the command line, or read it from stdin.

    Args:
        args: Parsed arguments exposing a ``content`` attribute.
        parser: The argparse parser, used to report usage errors.
        allow_empty: When true, an empty result is returned instead of
            being reported as an error.

    Returns:
        ``args.content`` as given, or the stripped text read from stdin
        when no positional content was supplied.
    """
    content = args.content
    if content is None:
        # Nothing on the command line: only piped input can supply content.
        if sys.stdin.isatty():
            parser.error("No content provided. Provide content or pipe input.")
        content = sys.stdin.read().strip()

    if not (content or allow_empty):
        parser.error("Empty input provided.")
    return content
61
+
62
+
63
async def process_input_content(content: str) -> str:
    """Resolve a URL or file path to its extracted text; pass other text through.

    Args:
        content: A URL, a path to an existing file, or literal text.

    Returns:
        The extracted content for a URL or an existing file path (or
        ``str()`` of the extraction result when its content is empty);
        otherwise ``content`` unchanged.
    """
    candidate = content.strip()
    # Only a single whitespace-free token with an http(s) scheme counts as a
    # URL. A plain substring test ("http" in content) misfires on ordinary
    # prose that merely mentions http or embeds a link.
    looks_like_url = (
        candidate.lower().startswith(("http://", "https://"))
        and len(candidate.split()) == 1
    )
    if looks_like_url:
        result = await extract_content(ProcessSourceInput(url=candidate))
        return result.content if result.content else str(result)

    if os.path.exists(content):
        result = await extract_content(ProcessSourceInput(file_path=content))
        return result.content if result.content else str(result)

    return content
72
+
73
+
74
async def ccore_main():
    """CLI logic for ccore (extract).

    Parses the command line, obtains the input from the positional argument
    or stdin, resolves URLs and file paths to text, runs extraction and
    prints the result as text, JSON or XML. Exits with status 1 when
    extraction fails.
    """
    parser = argparse.ArgumentParser(
        description="Content Core CLI: Extract content with formatting options."
    )
    parser.add_argument(
        "-f",
        "--format",
        choices=["xml", "json", "text"],
        default="text",
        help="Output format (xml, json, or text). Default: text",
    )
    parser.add_argument(
        "-d", "--debug", action="store_true", help="Enable debug logging."
    )
    parser.add_argument(
        "content",
        nargs="?",
        help="Content to process (URL, file path, or text). If not provided, reads from stdin.",
    )

    args = parser.parse_args()

    # Adjust logging level based on debug flag
    if args.debug:
        logger.remove()
        logger.add(sys.stderr, level="DEBUG")
        logger.debug("Debug logging enabled")

    content = get_content(args, parser)

    content = await process_input_content(content)

    try:
        result = await extract_content(ProcessSourceInput(content=content))
        if args.format == "xml":
            result = dicttoxml(
                result.model_dump(), custom_root="result", attr_type=False
            )
            # dicttoxml hands back bytes; decode so print() emits the XML
            # document itself rather than its b'...' repr.
            if isinstance(result, bytes):
                result = result.decode("utf-8")
        elif args.format == "json":
            result = result.model_dump_json()
        else:  # text
            result = result.content
        print(result)
    except Exception as e:
        logger.error(f"Error extracting content: {e}")
        sys.exit(1)
121
+
122
+
123
async def cclean_main():
    """Run the ``cclean`` command: read content, clean it, print the result.

    The input comes from the positional argument or stdin, is resolved when
    it is a URL or file path, and is unwrapped when it is a JSON or XML
    payload carrying a ``content`` field. Exits with status 1 when cleaning
    fails.
    """
    cli = argparse.ArgumentParser(
        description="Content Core CLI: Clean content string."
    )
    cli.add_argument(
        "-d", "--debug", action="store_true", help="Enable debug logging."
    )
    cli.add_argument(
        "content",
        nargs="?",
        help="Content to clean (URL, file path, text, JSON, or XML). If not provided, reads from stdin.",
    )
    options = cli.parse_args()

    if options.debug:
        # Re-register the stderr sink at the more verbose level.
        logger.remove()
        logger.add(sys.stderr, level="DEBUG")
        logger.debug("Debug logging enabled")

    raw = get_content(options, cli)
    resolved = await process_input_content(raw)
    text = parse_content_format(resolved)

    try:
        print(await cleanup_content(text))
    except Exception as e:
        logger.error(f"Error cleaning content: {e}")
        sys.exit(1)
156
+
157
+
158
async def csum_main():
    """Run the ``csum`` command: read content, summarize it, print the result.

    The input comes from the positional argument or stdin, is resolved when
    it is a URL or file path, and is unwrapped when it is a JSON or XML
    payload carrying a ``content`` field. ``--context`` is forwarded to the
    summarizer. Exits with status 1 when summarization fails.
    """
    cli = argparse.ArgumentParser(
        description="Content Core CLI: Summarize content with optional context."
    )
    cli.add_argument(
        "--context",
        default="",
        help="Optional context for summarization (e.g., 'summarize as if explaining to a child').",
    )
    cli.add_argument(
        "-d", "--debug", action="store_true", help="Enable debug logging."
    )
    cli.add_argument(
        "content",
        nargs="?",
        help="Content to summarize (URL, file path, text, JSON, or XML). If not provided, reads from stdin.",
    )
    options = cli.parse_args()

    if options.debug:
        # Re-register the stderr sink at the more verbose level.
        logger.remove()
        logger.add(sys.stderr, level="DEBUG")
        logger.debug("Debug logging enabled")

    raw = get_content(options, cli)
    resolved = await process_input_content(raw)
    text = parse_content_format(resolved)

    try:
        print(await summarize(text, options.context))
    except Exception as e:
        logger.error(f"Error summarizing content: {e}")
        sys.exit(1)
196
+
197
+
198
def ccore():
    """Blocking entry point: drive the async ``ccore_main`` to completion."""
    coroutine = ccore_main()
    asyncio.run(coroutine)
201
+
202
+
203
def cclean():
    """Blocking entry point: drive the async ``cclean_main`` to completion."""
    coroutine = cclean_main()
    asyncio.run(coroutine)
206
+
207
+
208
def csum():
    """Blocking entry point: drive the async ``csum_main`` to completion."""
    coroutine = csum_main()
    asyncio.run(coroutine)
211
+
212
+
213
+ if __name__ == "__main__":  # Executing this module as a script runs the ccore (extract) command.
214
+ ccore()
@@ -0,0 +1,21 @@
1
+ """Common utilities and shared code for content-core."""
2
+
3
+ from .exceptions import (
4
+ ContentCoreError,
5
+ InvalidInputError,
6
+ NotFoundError,
7
+ UnsupportedTypeException,
8
+ )
9
+ from .state import ProcessSourceInput, ProcessSourceOutput, ProcessSourceState
10
+ from .utils import process_input_content
11
+
12
+ __all__ = [  # Public names re-exported from the exceptions, state and utils submodules.
13
+ "ContentCoreError",
14
+ "UnsupportedTypeException",
15
+ "InvalidInputError",
16
+ "NotFoundError",
17
+ "ProcessSourceInput",
18
+ "ProcessSourceState",
19
+ "ProcessSourceOutput",
20
+ "process_input_content",
21
+ ]
@@ -0,0 +1,70 @@
1
class ContentCoreError(Exception):
    """Root of the content-core exception hierarchy.

    Every exception class defined in this module derives from it.
    """
5
+
6
+
7
class DatabaseOperationError(ContentCoreError):
    """Signals that a database operation failed."""
11
+
12
+
13
class UnsupportedTypeException(ContentCoreError):
    """Signals that the supplied type is not supported."""
17
+
18
+
19
class InvalidInputError(ContentCoreError):
    """Signals that the supplied input is invalid."""
23
+
24
+
25
class NotFoundError(ContentCoreError):
    """Signals that a requested resource could not be found."""
29
+
30
+
31
class AuthenticationError(ContentCoreError):
    """Signals an authentication problem."""
35
+
36
+
37
class ConfigurationError(ContentCoreError):
    """Signals a configuration problem."""
41
+
42
+
43
class ExternalServiceError(ContentCoreError):
    """Signals that an external service (e.g., an AI model) failed."""
47
+
48
+
49
class RateLimitError(ContentCoreError):
    """Signals that a rate limit was exceeded."""
53
+
54
+
55
class FileOperationError(ContentCoreError):
    """Signals that a file operation failed."""
59
+
60
+
61
class NetworkError(ContentCoreError):
    """Signals that a network operation failed."""
65
+
66
+
67
class NoTranscriptFound(ContentCoreError):
    """Signals that no transcript was found for a video."""
@@ -0,0 +1,30 @@
1
+ from typing import Optional
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
class ProcessSourceState(BaseModel):
    """Full set of fields describing a source while it is being processed.

    Every field is optional and defaults to an empty value.
    """

    # Where the source comes from.
    file_path: Optional[str] = ""
    url: Optional[str] = ""
    # Whether the file at ``file_path`` should be removed after processing.
    delete_source: bool = False
    # Descriptive and classification fields.
    title: Optional[str] = ""
    source_type: Optional[str] = ""
    identified_type: Optional[str] = ""
    identified_provider: Optional[str] = ""
    # A fresh dict per instance, so instances never share metadata.
    metadata: Optional[dict] = Field(default_factory=dict)
    content: Optional[str] = ""
16
+
17
+
18
class ProcessSourceInput(BaseModel):
    """Caller-supplied source: literal content, a file path, or a URL.

    All three fields default to the empty string.
    """

    content: Optional[str] = ""
    file_path: Optional[str] = ""
    url: Optional[str] = ""
22
+
23
+
24
class ProcessSourceOutput(BaseModel):
    """Result fields reported for a processed source.

    Every field is optional and defaults to an empty value.
    """

    title: Optional[str] = ""
    source_type: Optional[str] = ""
    identified_type: Optional[str] = ""
    identified_provider: Optional[str] = ""
    # A fresh dict per instance, so instances never share metadata.
    metadata: Optional[dict] = Field(default_factory=dict)
    content: Optional[str] = ""
@@ -0,0 +1,31 @@
1
+ import os
2
+ import re
3
+ import validators
4
+
5
+ from .state import ProcessSourceInput
6
+
7
+
8
async def process_input_content(content: str) -> str:
    """Resolve a URL or file path to its extracted text; pass other text through.

    Args:
        content: A URL, a file path, or literal text.

    Returns:
        The extracted content for a URL or existing file (or ``str()`` of
        the extraction result when its content is empty); otherwise
        ``content`` unchanged.

    Raises:
        ValueError: If ``content`` looks like a file path (matches the
            path pattern below) but no such file exists.
    """
    if validators.url(content):
        source = ProcessSourceInput(url=content)
    # Simplified file-path check: path-like characters ending in ".ext".
    elif re.match(r"^[a-zA-Z0-9_/\-\.]+\.[a-zA-Z0-9]+$", content):
        if not os.path.exists(content):
            raise ValueError(f"File not found: {content}")
        source = ProcessSourceInput(file_path=content)
    else:
        # Neither a URL nor a file path: return the text as is.
        return content

    # Imported lazily because the extraction package itself imports
    # content_core.common. The package lives under content_core.content;
    # the previous "content_core.extraction" path does not match the
    # package layout used by the other modules.
    from content_core.content.extraction import extract_content

    extracted = await extract_content(source)
    return extracted.content if extracted.content else str(extracted)
content_core/config.py ADDED
@@ -0,0 +1,37 @@
1
+ from esperanto import AIFactory
2
+ from esperanto.providers.stt import SpeechToTextModel
3
+
4
+ SPEECH_TO_TEXT_MODEL: SpeechToTextModel = AIFactory.create_speech_to_text(  # Speech-to-text model, created once when this module is imported.
5
+ "openai", "whisper-1"
6
+ )
7
+
8
+ DEFAULT_MODEL = AIFactory.create_language(  # Default language model, created once when this module is imported.
9
+ "openai",
10
+ "gpt-4o-mini",
11
+ config={
12
+ "temperature": 0.5,
13
+ "top_p": 1,
14
+ "max_tokens": 2000,
15
+ },
16
+ )
17
+
18
+ CLEANUP_MODEL = AIFactory.create_language(  # Language model used by cleanup_content; created at import time.
19
+ "openai",
20
+ "gpt-4o-mini",
21
+ config={
22
+ "temperature": 0,
23
+ "max_tokens": 8000,
24
+ "output_format": "json",  # NOTE(review): cleanup_content is annotated to return str - confirm JSON output is intended.
25
+ # "stream": True, # TODO: handle streaming
26
+ },
27
+ ) # TODO: fix deprecation
28
+
29
+ SUMMARY_MODEL = AIFactory.create_language(  # Language model used by summarize; created at import time.
30
+ "openai",
31
+ "gpt-4o-mini",
32
+ config={
33
+ "temperature": 0,
34
+ "top_p": 1,
35
+ "max_tokens": 2000,
36
+ },
37
+ )
File without changes
@@ -0,0 +1,5 @@
1
+ """Content cleaning functionality for content-core."""
2
+
3
+ from .core import cleanup_content
4
+
5
+ __all__ = ["cleanup_content"]  # The only public name of this package.
@@ -0,0 +1,15 @@
1
+ from functools import partial
2
+
3
+ from content_core.config import CLEANUP_MODEL
4
+ from content_core.templated_message import TemplatedMessageInput, templated_message
5
+
6
+
7
async def cleanup_content(content) -> str:
    """Run ``content`` through the cleanup prompt on ``CLEANUP_MODEL``.

    Args:
        content: Text to clean. It is sent as the user prompt and is also
            made available to the ``content/cleanup`` system prompt template.

    Returns:
        Whatever ``templated_message`` returns for the request.
    """
    request = TemplatedMessageInput(
        system_prompt_template="content/cleanup",
        user_prompt_text=content,
        data={"content": content},
    )
    return await templated_message(request, model=CLEANUP_MODEL)
@@ -0,0 +1,13 @@
1
+ from typing import Dict, Union
2
+
3
+ from content_core.common import ProcessSourceInput, ProcessSourceOutput
4
+ from content_core.content.extraction.graph import graph
5
+
6
+ # TODO: use langgraph's input/output schema
7
+
8
+
9
async def extract_content(data: Union[ProcessSourceInput, Dict]) -> ProcessSourceOutput:
    """Run the extraction graph on a source and return its output model.

    Args:
        data: A ``ProcessSourceInput``, or a dict of its fields.

    Returns:
        A ``ProcessSourceOutput`` built from the graph's final state.
    """
    source = ProcessSourceInput(**data) if isinstance(data, dict) else data
    final_state = await graph.ainvoke(source)
    return ProcessSourceOutput(**final_state)
@@ -0,0 +1,148 @@
1
+ import os
2
+ from typing import Any, Dict, Optional
3
+
4
+ import magic
5
+ from langgraph.graph import END, START, StateGraph
6
+ from loguru import logger
7
+
8
+ from content_core.common import (
9
+ ProcessSourceInput,
10
+ ProcessSourceState,
11
+ UnsupportedTypeException,
12
+ )
13
+ from content_core.processors.audio import extract_audio # type: ignore
14
+ from content_core.processors.office import (
15
+ SUPPORTED_OFFICE_TYPES,
16
+ extract_office_content,
17
+ )
18
+ from content_core.processors.pdf import SUPPORTED_FITZ_TYPES, extract_pdf
19
+ from content_core.processors.text import extract_txt
20
+ from content_core.processors.url import extract_url, url_provider
21
+ from content_core.processors.video import extract_best_audio_from_video
22
+ from content_core.processors.youtube import extract_youtube_transcript
23
+
24
+
25
async def source_identification(state: ProcessSourceState) -> Dict[str, str]:
    """Classify the source as ``text``, ``file`` or ``url``.

    Inline content takes precedence over a file path, which takes
    precedence over a URL.

    Raises:
        ValueError: If none of the three fields is set.
    """
    if state.content:
        return {"source_type": "text"}
    if state.file_path:
        return {"source_type": "file"}
    if state.url:
        return {"source_type": "url"}
    raise ValueError("No source provided.")
39
+
40
+
41
async def file_type(state: ProcessSourceState) -> Dict[str, Any]:
    """Detect the file's MIME type with python-magic and derive its title.

    Returns:
        ``identified_type`` (the MIME type) and ``title`` (the file's base
        name), or an empty dict when ``file_path`` is ``None``.
    """
    path = state.file_path
    if path is None:
        return {}
    return {
        "identified_type": magic.from_file(path, mime=True),
        "title": os.path.basename(path),
    }
51
+
52
+
53
async def file_type_edge(data: ProcessSourceState) -> str:
    """Pick the extraction node that handles the identified MIME type.

    Returns:
        The name of the graph node to run next.

    Raises:
        UnsupportedTypeException: If no type was identified, or the type
            has no matching extractor.
    """
    identified_type = data.identified_type
    # An explicit check instead of ``assert``: assertions are stripped when
    # Python runs with -O, which would let a missing type slip through.
    if not identified_type:
        raise UnsupportedTypeException("Type not identified")

    if identified_type == "text/plain":
        return "extract_txt"
    if identified_type in SUPPORTED_FITZ_TYPES:
        return "extract_pdf"
    if identified_type in SUPPORTED_OFFICE_TYPES:
        return "extract_office_content"
    if identified_type.startswith("video"):
        # Video is routed to the node that extracts its best audio track.
        return "extract_best_audio_from_video"
    if identified_type.startswith("audio"):
        return "extract_audio"
    raise UnsupportedTypeException(f"Unsupported file type: {data.identified_type}")
69
+
70
+
71
async def delete_file(data: ProcessSourceState) -> Dict[str, Any]:
    """Remove the source file when ``delete_source`` is set.

    Returns:
        ``{"file_path": None}`` after a successful removal; an empty dict
        when deletion was not requested, no path is set, or the file was
        already gone.
    """
    if not data.delete_source:
        logger.debug("Not deleting file")
        return {}

    logger.debug(f"Deleting file: {data.file_path}")
    target = data.file_path
    if target is None:
        return {}

    try:
        os.remove(target)
    except FileNotFoundError:
        # A missing file is only worth a warning; the state is left as is.
        logger.warning(f"File not found while trying to delete: {target}")
        return {}
    return {"file_path": None}
84
+
85
+
86
async def url_type_router(x: ProcessSourceState) -> Optional[str]:
    """Route on the state's ``identified_type``."""
    route = x.identified_type
    return route
88
+
89
+
90
async def source_type_router(x: ProcessSourceState) -> Optional[str]:
    """Route on the state's ``source_type``."""
    route = x.source_type
    return route
92
+
93
+
94
+ # Create workflow
95
# Build the extraction workflow: identify the source, route it to the
# matching extractor, then optionally delete the source file.
workflow = StateGraph(
    ProcessSourceState, input=ProcessSourceInput, output=ProcessSourceState
)

# Add nodes
workflow.add_node("source", source_identification)
workflow.add_node("url_provider", url_provider)
workflow.add_node("file_type", file_type)
workflow.add_node("extract_txt", extract_txt)
workflow.add_node("extract_pdf", extract_pdf)
workflow.add_node("extract_url", extract_url)
workflow.add_node("extract_office_content", extract_office_content)
workflow.add_node("extract_best_audio_from_video", extract_best_audio_from_video)
workflow.add_node("extract_audio", extract_audio)
workflow.add_node("extract_youtube_transcript", extract_youtube_transcript)
workflow.add_node("delete_file", delete_file)

# Add edges
workflow.add_edge(START, "source")
# Inline text needs no extraction and ends the run immediately.
workflow.add_conditional_edges(
    "source",
    source_type_router,
    {
        "url": "url_provider",
        "file": "file_type",
        "text": END,
    },
)
# file_type_edge returns the name of the extractor node directly.
workflow.add_conditional_edges(
    "file_type",
    file_type_edge,
)
workflow.add_conditional_edges(
    "url_provider",
    url_type_router,
    {"article": "extract_url", "youtube": "extract_youtube_transcript"},
)
# NOTE(review): "url_provider" and "file_type" have both conditional edges
# and a direct edge to END - confirm the direct edges are intended.
workflow.add_edge("url_provider", END)
workflow.add_edge("file_type", END)
workflow.add_edge("extract_url", END)
workflow.add_edge("extract_txt", END)
workflow.add_edge("extract_youtube_transcript", END)

# These extractors pass through delete_file before finishing; video goes
# through audio extraction first.
workflow.add_edge("extract_pdf", "delete_file")
workflow.add_edge("extract_office_content", "delete_file")
workflow.add_edge("extract_best_audio_from_video", "extract_audio")
workflow.add_edge("extract_audio", "delete_file")
workflow.add_edge("delete_file", END)

# Compile graph (once; the duplicated second compile call was redundant).
graph = workflow.compile()
@@ -0,0 +1,5 @@
1
+ """Content summarization functionality for content-core."""
2
+
3
+ from .core import summarize
4
+
5
+ __all__ = ["summarize"]  # The only public name of this package.
@@ -0,0 +1,15 @@
1
+ from functools import partial
2
+
3
+ from content_core.config import SUMMARY_MODEL
4
+ from content_core.templated_message import TemplatedMessageInput, templated_message
5
+
6
+
7
async def summarize(content: str, context: str) -> str:
    """Summarize ``content`` with the summarize prompt on ``SUMMARY_MODEL``.

    Args:
        content: Text to summarize.
        context: Extra context passed to the prompt template alongside the
            content.

    Returns:
        Whatever ``templated_message`` returns for the request.
    """
    request = TemplatedMessageInput(
        user_prompt_template="content/summarize",
        data={"content": content, "context": context},
    )
    return await templated_message(request, model=SUMMARY_MODEL)