PyPI - churovoice - Versions diffs - 0.1.0__tar.gz - Mend

churovoice 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

churovoice-0.1.0/PKG-INFO +167 -0
churovoice-0.1.0/README.md +139 -0
churovoice-0.1.0/churovoice/__init__.py +5 -0
churovoice-0.1.0/churovoice/assistant.py +317 -0
churovoice-0.1.0/churovoice/cli.py +9 -0
churovoice-0.1.0/churovoice.egg-info/PKG-INFO +167 -0
churovoice-0.1.0/churovoice.egg-info/SOURCES.txt +11 -0
churovoice-0.1.0/churovoice.egg-info/dependency_links.txt +1 -0
churovoice-0.1.0/churovoice.egg-info/entry_points.txt +2 -0
churovoice-0.1.0/churovoice.egg-info/requires.txt +13 -0
churovoice-0.1.0/churovoice.egg-info/top_level.txt +1 -0
churovoice-0.1.0/pyproject.toml +49 -0
churovoice-0.1.0/setup.cfg +4 -0

churovoice-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,167 @@
+Metadata-Version: 2.4
+Name: churovoice
+Version: 0.1.0
+Summary: A multimodal voice assistant with web search, vision, and image generation.
+Author: Lakshya Prajapati
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Operating System :: MacOS
+Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: torch
+Requires-Dist: ollama
+Requires-Dist: SpeechRecognition
+Requires-Dist: edge-tts
+Requires-Dist: ddgs
+Requires-Dist: opencv-python
+Requires-Dist: rich
+Requires-Dist: diffusers
+Requires-Dist: transformers
+Requires-Dist: accelerate
+Requires-Dist: safetensors
+Requires-Dist: Pillow
+Requires-Dist: term-image
+# V1.8 Speech Agent
+V1.8 is an experimental voice-first AI assistant. It listens to spoken prompts, responds out loud, can launch apps on macOS, can search the web for current context, can inspect images from a webcam, and can generate images when the user asks for a visual result.
+## What It Does
+This project combines several assistant behaviors into one loop:
+- Speech-to-text using Whisper through `speech_recognition`
+- Text-to-speech using `edge-tts`
+- App launching on macOS for commands such as `open Safari`
+- Web query simplification and search retrieval through DDGS
+- Webcam-based vision analysis for appearance or environment questions
+- Image generation with Stable Diffusion
+- Terminal-friendly output formatting with `rich`
+## Who This Is For
+This repository is intended for developers and hobbyists who want to explore a local voice assistant workflow. It is especially useful if you are interested in:
+- voice interfaces
+- multimodal AI interactions
+- local automation on macOS
+- image generation pipelines
+- combining web search, vision, and speech in a single assistant
+## Requirements
+- macOS
+- Python 3.11 or newer (the package metadata requires `>=3.11`)
+- A microphone with system permission enabled
+- A camera with system permission enabled if you want vision features
+- Ollama installed and available on the machine running the script
+- `chafa` installed if you want terminal previews for generated images
+- Hardware that can run the configured Stable Diffusion pipeline; the device is selected automatically, preferring `mps`, then `cuda`, then `cpu`
+## Python Dependencies
+The script uses the following Python packages:
+- `torch`
+- `ollama`
+- `speech_recognition`
+- `edge_tts`
+- `ddgs`
+- `opencv-python`
+- `rich`
+- `diffusers`
+- `term-image`
+## Installation
+1. Clone the repository and open the `V1.8` folder.
+2. Create a virtual environment:
+```bash
+python3 -m venv venv
+source venv/bin/activate
+```
+3. Install the dependencies:
+```bash
+pip install torch ollama SpeechRecognition edge-tts ddgs opencv-python rich diffusers term-image
+```
+4. Make sure Ollama can access the models referenced in `main.py`.
+## Usage
+Run the assistant with:
+```bash
+python main.py
+```
+On startup, the program asks you to choose a voice unless one was supplied with `--voice male` or `--voice female`:
+- `Male` selects `en-US-SteffanNeural`
+- Any other input selects `en-US-AvaNeural`
+Then the assistant will:
+1. Prompt you to speak
+2. Transcribe your speech
+3. Decide whether the request is for app launching, image generation, vision analysis, or a normal answer
+4. Speak the response back to you
+5. Ask whether you want to continue the conversation
+## How It Works
+### App Launching
+If the transcription includes `open`, the assistant tries to find a matching application on macOS. If no local app is found, it falls back to opening a website based on the target name.
+### Web Answers
+For general questions, the assistant first simplifies the query and fetches recent search results. The response model can use those results when the request is about news, current events, or recent information.
+### Vision Mode
+If the prompt seems to require visual context, the assistant captures a frame from the webcam, saves it locally, and sends it to a vision-capable model for analysis.
+### Image Generation
+If the prompt is recognized as an image request, the assistant converts it into a short image prompt, generates an image with Stable Diffusion, saves the result as `generated_image.png`, and displays it in the terminal.
+## Limitations
+- The current implementation is macOS-focused.
+- The assistant depends on several external models and services.
+- The Stable Diffusion pipeline is loaded on the first image request and then cached, so that first request may be slow on lower-powered machines.
+- The current code stores generated and captured images in the working directory.
+- The app-launching behavior is intentionally simple and may not match every app name perfectly.
+## Future Opportunities
+This version leaves room for several improvements:
+- Add cross-platform support beyond macOS
+- Make the model names and device selection configurable through environment variables or a config file
+- Add a proper command parser for app launching instead of relying on keyword matching
+- Add a conversation history file or database
+- Add streaming responses so users hear partial answers sooner
+- Add a richer UI for desktop or web use
+- Add safer image handling and cleanup for generated files
+- Add a setup script or dependency file for easier installation
+## Troubleshooting
+- If microphone input fails, check system permissions and verify `speech_recognition` is installed correctly.
+- If camera capture fails, check camera permissions and confirm OpenCV can access the device.
+- If image generation fails, verify that your hardware supports the configured device target or update the pipeline configuration.
+- If terminal image preview fails, install `chafa` and confirm it is available in your PATH.
+## License
+No license has been added yet. Add one before publishing or distributing the project widely.

churovoice-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,139 @@
+# V1.8 Speech Agent
+V1.8 is an experimental voice-first AI assistant. It listens to spoken prompts, responds out loud, can launch apps on macOS, can search the web for current context, can inspect images from a webcam, and can generate images when the user asks for a visual result.
+## What It Does
+This project combines several assistant behaviors into one loop:
+- Speech-to-text using Whisper through `speech_recognition`
+- Text-to-speech using `edge-tts`
+- App launching on macOS for commands such as `open Safari`
+- Web query simplification and search retrieval through DDGS
+- Webcam-based vision analysis for appearance or environment questions
+- Image generation with Stable Diffusion
+- Terminal-friendly output formatting with `rich`
+## Who This Is For
+This repository is intended for developers and hobbyists who want to explore a local voice assistant workflow. It is especially useful if you are interested in:
+- voice interfaces
+- multimodal AI interactions
+- local automation on macOS
+- image generation pipelines
+- combining web search, vision, and speech in a single assistant
+## Requirements
+- macOS
+- Python 3.11 or newer (the package metadata requires `>=3.11`)
+- A microphone with system permission enabled
+- A camera with system permission enabled if you want vision features
+- Ollama installed and available on the machine running the script
+- `chafa` installed if you want terminal previews for generated images
+- Hardware that can run the configured Stable Diffusion pipeline; the device is selected automatically, preferring `mps`, then `cuda`, then `cpu`
+## Python Dependencies
+The script uses the following Python packages:
+- `torch`
+- `ollama`
+- `speech_recognition`
+- `edge_tts`
+- `ddgs`
+- `opencv-python`
+- `rich`
+- `diffusers`
+- `term-image`
+## Installation
+1. Clone the repository and open the `V1.8` folder.
+2. Create a virtual environment:
+```bash
+python3 -m venv venv
+source venv/bin/activate
+```
+3. Install the dependencies:
+```bash
+pip install torch ollama SpeechRecognition edge-tts ddgs opencv-python rich diffusers term-image
+```
+4. Make sure Ollama can access the models referenced in `main.py`.
+## Usage
+Run the assistant with:
+```bash
+python main.py
+```
+On startup, the program asks you to choose a voice unless one was supplied with `--voice male` or `--voice female`:
+- `Male` selects `en-US-SteffanNeural`
+- Any other input selects `en-US-AvaNeural`
+Then the assistant will:
+1. Prompt you to speak
+2. Transcribe your speech
+3. Decide whether the request is for app launching, image generation, vision analysis, or a normal answer
+4. Speak the response back to you
+5. Ask whether you want to continue the conversation
+## How It Works
+### App Launching
+If the transcription includes `open`, the assistant tries to find a matching application on macOS. If no local app is found, it falls back to opening a website based on the target name.
+### Web Answers
+For general questions, the assistant first simplifies the query and fetches recent search results. The response model can use those results when the request is about news, current events, or recent information.
+### Vision Mode
+If the prompt seems to require visual context, the assistant captures a frame from the webcam, saves it locally, and sends it to a vision-capable model for analysis.
+### Image Generation
+If the prompt is recognized as an image request, the assistant converts it into a short image prompt, generates an image with Stable Diffusion, saves the result as `generated_image.png`, and displays it in the terminal.
+## Limitations
+- The current implementation is macOS-focused.
+- The assistant depends on several external models and services.
+- The Stable Diffusion pipeline is loaded on the first image request and then cached, so that first request may be slow on lower-powered machines.
+- The current code stores generated and captured images in the working directory.
+- The app-launching behavior is intentionally simple and may not match every app name perfectly.
+## Future Opportunities
+This version leaves room for several improvements:
+- Add cross-platform support beyond macOS
+- Make the model names and device selection configurable through environment variables or a config file
+- Add a proper command parser for app launching instead of relying on keyword matching
+- Add a conversation history file or database
+- Add streaming responses so users hear partial answers sooner
+- Add a richer UI for desktop or web use
+- Add safer image handling and cleanup for generated files
+- Add a setup script or dependency file for easier installation
+## Troubleshooting
+- If microphone input fails, check system permissions and verify `speech_recognition` is installed correctly.
+- If camera capture fails, check camera permissions and confirm OpenCV can access the device.
+- If image generation fails, verify that your hardware supports the configured device target or update the pipeline configuration.
+- If terminal image preview fails, install `chafa` and confirm it is available in your PATH.
+## License
+No license has been added yet. Add one before publishing or distributing the project widely.

churovoice-0.1.0/churovoice/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""ChuroVoice package."""
+__all__ = ["__version__"]
+__version__ = "0.1.0"

churovoice-0.1.0/churovoice/assistant.py ADDED Viewed

@@ -0,0 +1,317 @@
+"""Core ChuroVoice assistant implementation."""
+from __future__ import annotations
+import argparse
+import asyncio
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+import time
+from functools import lru_cache
+import cv2
+import edge_tts
+import speech_recognition as sr
+import torch
+from ddgs import DDGS
+from diffusers import StableDiffusionPipeline
+from ollama import chat
+from rich.console import Console
+from rich.text import Text
+# Model names handed to ``ollama.chat`` by the helpers below. Each value is read
+# once at import time and can be overridden through the environment variable
+# named in the corresponding ``os.getenv`` call.
+DEFAULT_CHAT_MODEL = os.getenv("CHUROVOICE_CHAT_MODEL", "gemma4:31b-cloud")
+DEFAULT_IMAGE_TRIGGER_MODEL = os.getenv("CHUROVOICE_IMAGE_TRIGGER_MODEL", "ministral-3:14b-cloud")
+DEFAULT_IMAGE_PROMPT_MODEL = os.getenv("CHUROVOICE_IMAGE_PROMPT_MODEL", "ministral-3:3b-cloud")
+DEFAULT_WEB_MODEL = os.getenv("CHUROVOICE_WEB_MODEL", "ministral-3:3b-cloud")
+DEFAULT_VISION_MODEL = os.getenv("CHUROVOICE_VISION_MODEL", "ministral-3:14b-cloud")
+DEFAULT_IMAGE_ANALYSIS_MODEL = os.getenv("CHUROVOICE_IMAGE_ANALYSIS_MODEL", "ministral-3:8b-cloud")
+# Model identifier passed to ``StableDiffusionPipeline.from_pretrained``.
+DEFAULT_STABLE_DIFFUSION_MODEL = os.getenv("CHUROVOICE_SD_MODEL", "nota-ai/bk-sdm-small")
+# ANSI escape sequences used by ``format_for_terminal`` to style emphasis.
+ANSI_BOLD = "\033[1m"
+ANSI_ITALIC = "\033[3m"
+ANSI_RESET = "\033[0m"
+def format_for_terminal(text: str | None) -> str:
+    if text is None:
+        return ""
+    text = re.sub(r"\*\*(.*?)\*\*", f"{ANSI_BOLD}\\1{ANSI_RESET}", text)
+    text = re.sub(r"\*(.*?)\*", f"{ANSI_ITALIC}\\1{ANSI_RESET}", text)
+    return text
+def clean_for_speech(text: str | None) -> str:
+    if text is None:
+        return ""
+    return re.sub(r"\*\*|\*", "", text)
+def resolve_voice(choice: str) -> str:
+    return "en-US-SteffanNeural" if choice.lower().strip() == "male" else "en-US-AvaNeural"
+def resolve_device() -> str:
+    if torch.backends.mps.is_available():
+        return "mps"
+    if torch.cuda.is_available():
+        return "cuda"
+    return "cpu"
+@lru_cache(maxsize=1)
+def load_image_pipeline() -> StableDiffusionPipeline:
+    device = resolve_device()
+    dtype = torch.float16 if device in {"mps", "cuda"} else torch.float32
+    pipe = StableDiffusionPipeline.from_pretrained(DEFAULT_STABLE_DIFFUSION_MODEL, torch_dtype=dtype)
+    return pipe.to(device)
+async def speak_async(text: str, voice: str) -> None:
+    clean_text = clean_for_speech(text)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
+        temp_path = fp.name
+    communicate = edge_tts.Communicate(clean_text, voice)
+    await communicate.save(temp_path)
+    os.system(f'afplay "{temp_path}"')
+    os.remove(temp_path)
+def speak(text: str, voice: str) -> None:
+    asyncio.run(speak_async(text, voice))
+def launch_target(target: str) -> bool:
+    finder = subprocess.run(
+        ["mdfind", 'kMDItemKind == "Application"'],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    matches = [line for line in finder.stdout.splitlines() if target.lower() in line.lower()]
+    if matches:
+        subprocess.run(["open", matches[0]], check=False)
+        return True
+    subprocess.run(["open", f"https://{target.replace(' ', '')}.com"], check=False)
+    return False
+def simplify_query(text: str) -> str:
+    response = chat(
+        model=DEFAULT_WEB_MODEL,
+        messages=[
+            {"role": "user", "content": text},
+            {
+                "role": "system",
+                "content": f'''You are a web-search query simplifier.Your job:Convert the user's message into ONE concise web search query.User message:"{text}"Rules:- Keep only important keywords- Remove filler words and stop words- No emojis- No explanations- Keep the meaning accurate- Make it optimized for a search engine- Output ONLY the final search queryFormat:latest details on <simplified topic> as of 2026''',
+            },
+        ],
+    )
+    return response.message.content.strip()
+def detect_trigger(text: str, prompt: str, model: str) -> str:
+    response = chat(
+        model=model,
+        messages=[
+            {"role": "user", "content": text},
+            {"role": "system", "content": prompt.format(text=text)},
+        ],
+    )
+    return response.message.content.strip().lower()
+def build_image_prompt(text: str) -> str:
+    response = chat(
+        model=DEFAULT_IMAGE_PROMPT_MODEL,
+        messages=[
+            {"role": "user", "content": text},
+            {
+                "role": "system",
+                "content": f'''You are an expert prompt to image prompt generator. Today your goal is to convert "{text}" into a proper prompt for an image model. This is an example you can follow:- User:"Can you generate an image of a sunset over mountains?" this is what you have to do: "Generate a realistic image of a sunset over mountains". You are maximum only allowed to use 10 words, anything higher will not be tollerated.''',
+            },
+        ],
+    )
+    return response.message.content.strip()
+def analyze_image(text: str, photo_path: str) -> str:
+    response = chat(
+        model=DEFAULT_IMAGE_ANALYSIS_MODEL,
+        messages=[
+            {"role": "system", "content": text},
+            {
+                "role": "user",
+                "content": f'''Analyze the image based on the user's request.User request:"{text}"Instructions:- Focus mainly on the requested subject- If no subject is specified, analyze the surroundings- Be concise but useful- Be truthful and accurate- Mention important visible details- Do not hallucinate- No emojis- No unnecessary formatting- Make the response natural and clear''',
+                "images": [photo_path],
+            },
+        ],
+    )
+    return response.message.content.strip()
+def answer_with_chat(text: str, memory: list[str], search_results: list[dict[str, str]]) -> str:
+    response = chat(
+        model=DEFAULT_CHAT_MODEL,
+        messages=[
+            {"role": "user", "content": text},
+            {
+                "role": "system",
+                "content": f'''You are Churo.Personality:- helpful- professional- intelligent- concise- accurate- natural soundingRules:- Use simple language- Keep responses concise- No emojis- Do not ramble- Answer directly- Be conversational but efficientMemory context:{memory}, use this when you feel that the query lacks context. Current user query:{text}Available web search results:{search_results}Use the web results ONLY if the user explicitly asks for:- latest news- recent updates- current information- newest details- web searchesOtherwise answer normally without relying on web results.If image analysis was already provided,do not repeat the analysis.Simply continue the conversation naturally.Never say:"Analysis provided"Instead continue naturally and intelligently.''',
+            },
+        ],
+    )
+    return response.message.content.strip()
+def print_block(console: Console, text: str, *, style: str = "cornsilk1 on gray15") -> None:
+    console.print(" ")
+    console.print(" ", style=style, justify="left")
+    console.print(Text.from_ansi(text), style=style, justify="left")
+    console.print(" ", style=style, justify="left")
+    console.print()
+def run_assistant(voice_choice: str | None = None) -> None:
+    console = Console()
+    terminal_width = shutil.get_terminal_size((100, 20)).columns
+    voice = resolve_voice(voice_choice or input("Choose a voice (Male/Female): "))
+    answer_history: list[str] = []
+    recognizer = sr.Recognizer()
+    yes_words = {"y", "yes", "yep", "yeah", "yup", "sure", "ok", "okay", "affirmative", "certainly", "definitely", "absolutely", "indeed", "true", "continue"}
+    while True:
+        is_app_open = False
+        is_recognised = False
+        with sr.Microphone() as source:
+            recognizer.adjust_for_ambient_noise(source, duration=0.2)
+            ask_anything = "*Ask Me Anything...*"
+            speak(ask_anything, voice)
+            console.print(Text.from_ansi(format_for_terminal(ask_anything)))
+            audio = recognizer.listen(source)
+        try:
+            text = recognizer.recognize_whisper(audio, model="small.en")
+            print_block(console, format_for_terminal(text), style="cornsilk1 on gray19")
+        except sr.UnknownValueError:
+            console.print(Text.from_ansi(format_for_terminal("Could not understand audio")))
+            text = ""
+        normalized_text = text.strip(".,!?").lower()
+        if "open" in normalized_text:
+            parts = text.split(maxsplit=1)
+            if len(parts) > 1:
+                target = parts[1].strip()
+                console.print(Text.from_ansi(format_for_terminal(f"**Opening {target}**")))
+                is_app_open = launch_target(target)
+        if text == "":
+            console.print(Text.from_ansi(format_for_terminal("No input detected. Please try again.")))
+            continue
+        web_query = simplify_query(text)
+        image_trigger = detect_trigger(
+            text,
+            '''You are an image generation trigger detector.Determine whether the user's query requires generating an image or not.User query:"{text}"Respond ONLY with:yesornoSay YES only if:- the user explicitly asks for an image- the user requests a visual representation of something- the answer requires generating an imageExamples of YES:- "Generate an image of a sunset over mountains"- "Create a picture of a futuristic city skyline"- "I want to see a visual representation of a dragon"- "Can you make an illustration of a robot?"Examples of NO:- news- coding- facts- explanations- web searches- math- history- general questionsBe accurate.Do not guess.Output ONLY yes or no.No punctuation.No emojis.''',
+            DEFAULT_IMAGE_TRIGGER_MODEL,
+        )
+        if "yes" in image_trigger:
+            image_prompt = build_image_prompt(text)
+            image = load_image_pipeline()(image_prompt, num_inference_steps=20).images[0]
+            image_path = os.path.join(os.getcwd(), "generated_image.png")
+            image.save(image_path)
+            chafa = shutil.which("chafa")
+            if chafa:
+                subprocess.run([
+                    chafa,
+                    image_path,
+                    "--symbols",
+                    "block",
+                    "--size=60",
+                ], check=False)
+            else:
+                console.print(f"Generated image saved to {image_path}")
+        else:
+            vision_trigger = detect_trigger(
+                text,
+                '''You are a vision-context detector.Determine whether answering the user's query requires:- a camera image- surroundings analysis- appearance analysis- object inspection- environmental contextUser query:"{text}"Respond ONLY with:yesornoSay YES only if:- the user refers to themselves- the user refers to their surroundings- the user asks about appearance- the user asks to inspect something visible- the answer requires visual contextExamples of YES:- "How do I look?"- "What's in front of me?"- "Analyze my room"- "What is this object?"- "Does my hair look good?"Examples of NO:- news- coding- facts- explanations- web searches- math- history- general questionsBe accurate.Do not guess.Output ONLY yes or no.No punctuation.No emojis.''',
+                DEFAULT_VISION_MODEL,
+            )
+            if "yes" in vision_trigger:
+                is_recognised = True
+                console.print(Text.from_ansi(format_for_terminal("**Capturing photo...**")))
+                cam = cv2.VideoCapture(0)
+                ret, frame = cam.read()
+                if ret:
+                    photo_path = os.path.join(os.getcwd(), "instant_photo.png")
+                    cv2.imwrite(photo_path, frame)
+                    console.print(Text.from_ansi(format_for_terminal("**Photo captured successfully!**")))
+                else:
+                    console.print(Text.from_ansi(format_for_terminal("**Error: Could not access camera.**")))
+                    photo_path = ""
+                cam.release()
+                if photo_path:
+                    image_answer = analyze_image(text, photo_path)
+                    console.print(Text.from_ansi(format_for_terminal(image_answer)))
+                    answer_history.append(image_answer)
+                    speak(image_answer, voice)
+            if is_recognised and is_app_open:
+                continue
+            search_results = list(DDGS().text(web_query, max_results=3))
+            output = answer_with_chat(text, answer_history, search_results)
+            formatted_output = format_for_terminal(output)
+            speech_output = clean_for_speech(output)
+            aligned = formatted_output.rjust(terminal_width)
+            print_block(console, aligned)
+            answer_history.append(output)
+            speak(speech_output, voice)
+            voice_recognizer = sr.Recognizer()
+            with sr.Microphone() as source1:
+                recognizer.adjust_for_ambient_noise(source1, duration=0.2)
+                prompt_text = "*Do you want to continue the conversation? Yes or No?*"
+                console.print(Text.from_ansi(format_for_terminal(prompt_text)))
+                speak("Do you want to continue the conversation? Yes or No?", voice)
+                time.sleep(0.01)
+                console.print(Text.from_ansi(format_for_terminal("*Listening for your response...*")))
+                audio1 = voice_recognizer.listen(source1)
+            try:
+                voice_continue = voice_recognizer.recognize_whisper(audio1, model="small.en").strip()
+                print_block(console, format_for_terminal(voice_continue), style="cornsilk1 on gray19")
+            except sr.UnknownValueError:
+                console.print(Text.from_ansi(format_for_terminal("Could not understand audio")))
+                voice_continue = ""
+            normalized_continue = re.sub(r"[^\w]", "", voice_continue).lower()
+            if normalized_continue == "":
+                console.print(Text.from_ansi(format_for_terminal("**No input detected.**")))
+            elif normalized_continue not in yes_words:
+                speak("Bye! Please Visit Again!", voice)
+                break
+def build_parser() -> argparse.ArgumentParser:
+    parser = argparse.ArgumentParser(description="Run the ChuroVoice assistant.")
+    parser.add_argument("--voice", choices=["male", "female"], help="Choose the spoken voice.")
+    return parser
+def main(argv: list[str] | None = None) -> None:
+    parser = build_parser()
+    args = parser.parse_args(argv)
+    run_assistant(args.voice)

churovoice-0.1.0/churovoice/cli.py ADDED Viewed

@@ -0,0 +1,9 @@
+"""Console entry point for ChuroVoice."""
+from __future__ import annotations
+# Imported at module level so ``main`` is an attribute of this module, which
+# is what the ``churovoice.cli:main`` console-script entry point resolves.
+from .assistant import main
+# Also allow running this module directly as a script.
+if __name__ == "__main__":
+    main()

churovoice-0.1.0/churovoice.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,167 @@
+Metadata-Version: 2.4
+Name: churovoice
+Version: 0.1.0
+Summary: A multimodal voice assistant with web search, vision, and image generation.
+Author: Lakshya Prajapati
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Operating System :: MacOS
+Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+Requires-Dist: torch
+Requires-Dist: ollama
+Requires-Dist: SpeechRecognition
+Requires-Dist: edge-tts
+Requires-Dist: ddgs
+Requires-Dist: opencv-python
+Requires-Dist: rich
+Requires-Dist: diffusers
+Requires-Dist: transformers
+Requires-Dist: accelerate
+Requires-Dist: safetensors
+Requires-Dist: Pillow
+Requires-Dist: term-image
+# V1.8 Speech Agent
+V1.8 is an experimental voice-first AI assistant. It listens to spoken prompts, responds out loud, can launch apps on macOS, can search the web for current context, can inspect images from a webcam, and can generate images when the user asks for a visual result.
+## What It Does
+This project combines several assistant behaviors into one loop:
+- Speech-to-text using Whisper through `speech_recognition`
+- Text-to-speech using `edge-tts`
+- App launching on macOS for commands such as `open Safari`
+- Web query simplification and search retrieval through DDGS
+- Webcam-based vision analysis for appearance or environment questions
+- Image generation with Stable Diffusion
+- Terminal-friendly output formatting with `rich`
+## Who This Is For
+This repository is intended for developers and hobbyists who want to explore a local voice assistant workflow. It is especially useful if you are interested in:
+- voice interfaces
+- multimodal AI interactions
+- local automation on macOS
+- image generation pipelines
+- combining web search, vision, and speech in a single assistant
+## Requirements
+- macOS
+- Python 3.11 or newer (the package metadata requires `>=3.11`)
+- A microphone with system permission enabled
+- A camera with system permission enabled if you want vision features
+- Ollama installed and available on the machine running the script
+- `chafa` installed if you want terminal previews for generated images
+- Hardware that can run the configured Stable Diffusion pipeline; the device is selected automatically, preferring `mps`, then `cuda`, then `cpu`
+## Python Dependencies
+The script uses the following Python packages:
+- `torch`
+- `ollama`
+- `speech_recognition`
+- `edge_tts`
+- `ddgs`
+- `opencv-python`
+- `rich`
+- `diffusers`
+- `term-image`
+## Installation
+1. Clone the repository and open the `V1.8` folder.
+2. Create a virtual environment:
+```bash
+python3 -m venv venv
+source venv/bin/activate
+```
+3. Install the dependencies:
+```bash
+pip install torch ollama SpeechRecognition edge-tts ddgs opencv-python rich diffusers term-image
+```
+4. Make sure Ollama can access the models referenced in `main.py`.
+## Usage
+Run the assistant with:
+```bash
+python main.py
+```
+On startup, the program asks you to choose a voice unless one was supplied with `--voice male` or `--voice female`:
+- `Male` selects `en-US-SteffanNeural`
+- Any other input selects `en-US-AvaNeural`
+Then the assistant will:
+1. Prompt you to speak
+2. Transcribe your speech
+3. Decide whether the request is for app launching, image generation, vision analysis, or a normal answer
+4. Speak the response back to you
+5. Ask whether you want to continue the conversation
+## How It Works
+### App Launching
+If the transcription includes `open`, the assistant tries to find a matching application on macOS. If no local app is found, it falls back to opening a website based on the target name.
+### Web Answers
+For general questions, the assistant first simplifies the query and fetches recent search results. The response model can use those results when the request is about news, current events, or recent information.
+### Vision Mode
+If the prompt seems to require visual context, the assistant captures a frame from the webcam, saves it locally, and sends it to a vision-capable model for analysis.
+### Image Generation
+If the prompt is recognized as an image request, the assistant converts it into a short image prompt, generates an image with Stable Diffusion, saves the result as `generated_image.png`, and displays it in the terminal.
+## Limitations
+- The current implementation is macOS-focused.
+- The assistant depends on several external models and services.
+- The Stable Diffusion pipeline is loaded on the first image request and then cached, so that first request may be slow on lower-powered machines.
+- The current code stores generated and captured images in the working directory.
+- The app-launching behavior is intentionally simple and may not match every app name perfectly.
+## Future Opportunities
+This version leaves room for several improvements:
+- Add cross-platform support beyond macOS
+- Make the model names and device selection configurable through environment variables or a config file
+- Add a proper command parser for app launching instead of relying on keyword matching
+- Add a conversation history file or database
+- Add streaming responses so users hear partial answers sooner
+- Add a richer UI for desktop or web use
+- Add safer image handling and cleanup for generated files
+- Add a setup script or dependency file for easier installation
+## Troubleshooting
+- If microphone input fails, check system permissions and verify `speech_recognition` is installed correctly.
+- If camera capture fails, check camera permissions and confirm OpenCV can access the device.
+- If image generation fails, verify that your hardware supports the configured device target or update the pipeline configuration.
+- If terminal image preview fails, install `chafa` and confirm it is available in your PATH.
+## License
+No license has been added yet. Add one before publishing or distributing the project widely.

churovoice-0.1.0/churovoice.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,11 @@
+README.md
+pyproject.toml
+churovoice/__init__.py
+churovoice/assistant.py
+churovoice/cli.py
+churovoice.egg-info/PKG-INFO
+churovoice.egg-info/SOURCES.txt
+churovoice.egg-info/dependency_links.txt
+churovoice.egg-info/entry_points.txt
+churovoice.egg-info/requires.txt
+churovoice.egg-info/top_level.txt

churovoice-0.1.0/churovoice.egg-info/dependency_links.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+

churovoice-0.1.0/churovoice.egg-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ churovoice = churovoice.cli:main

churovoice-0.1.0/churovoice.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,13 @@
+torch
+ollama
+SpeechRecognition
+edge-tts
+ddgs
+opencv-python
+rich
+diffusers
+transformers
+accelerate
+safetensors
+Pillow
+term-image

churovoice-0.1.0/churovoice.egg-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ churovoice

churovoice-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,49 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "churovoice"
+version = "0.1.0"
+description = "A multimodal voice assistant with web search, vision, and image generation."
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+  "torch",
+  "ollama",
+  "SpeechRecognition",
+  "edge-tts",
+  "ddgs",
+  "opencv-python",
+  "rich",
+  "diffusers",
+  "transformers",
+  "accelerate",
+  "safetensors",
+  "Pillow",
+  "term-image",
+]
+authors = [
+  { name = "Lakshya Prajapati" }
+]
+classifiers = [
+  "Development Status :: 3 - Alpha",
+  "Intended Audience :: Developers",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.11",
+  "Operating System :: MacOS",
+  "Topic :: Multimedia :: Sound/Audio :: Speech",
+  "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+[project.scripts]
+churovoice = "churovoice.cli:main"
+[tool.setuptools]
+include-package-data = true
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["churovoice*"]

churovoice-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0